reiser4-for-2.6.19.patch

   1 Andrew Wade (3):
   2       Reiser4: fix use after free in jrelse_tail
   3       Reiser4: release d_ref
   4       Reiser4: release d_ref (fix)
   5
   6 Edward Shishkin (2):
   7       Reiser4 for 2.6.18
   8       reiser4-generic_file_read-fix
   9
  10 Laurent Riffard (5):
  11       Reiser4: cometics changes in mm/filemap.c.
  12       Reiser4: fix calls to kmem_cache_destroy
  13       Reiser4: Replace inode.u.generic_ip with inode.i_private
  14       Reiser4: inode.i_blksize suppression
  15       Reiser4: remove unnecessary config.h includes.
  16
  17  Documentation/Changes                         |   12 +
  18  Documentation/filesystems/reiser4.txt         |   75 +
  19  arch/i386/lib/usercopy.c                      |    2 +
  20  fs/Kconfig                                    |    2 +
  21  fs/Makefile                                   |    1 +
  22  fs/fs-writeback.c                             |   26 +-
  23  fs/reiser4/Kconfig                            |   31 +
  24  fs/reiser4/Makefile                           |  100 +
  25  fs/reiser4/README                             |  125 +
  26  fs/reiser4/as_ops.c                           |  391 +++
  27  fs/reiser4/block_alloc.c                      | 1139 ++++++++
  28  fs/reiser4/block_alloc.h                      |  175 ++
  29  fs/reiser4/blocknrset.c                       |  368 +++
  30  fs/reiser4/carry.c                            | 1381 +++++++++
  31  fs/reiser4/carry.h                            |  442 +++
  32  fs/reiser4/carry_ops.c                        | 2103 ++++++++++++++
  33  fs/reiser4/carry_ops.h                        |   42 +
  34  fs/reiser4/context.c                          |  278 ++
  35  fs/reiser4/context.h                          |  228 ++
  36  fs/reiser4/coord.c                            |  937 ++++++
  37  fs/reiser4/coord.h                            |  389 +++
  38  fs/reiser4/debug.c                            |  300 ++
  39  fs/reiser4/debug.h                            |  350 +++
  40  fs/reiser4/dformat.h                          |   71 +
  41  fs/reiser4/dscale.c                           |  174 ++
  42  fs/reiser4/dscale.h                           |   27 +
  43  fs/reiser4/entd.c                             |  354 +++
  44  fs/reiser4/entd.h                             |   90 +
  45  fs/reiser4/eottl.c                            |  510 ++++
  46  fs/reiser4/estimate.c                         |  111 +
  47  fs/reiser4/export_ops.c                       |  296 ++
  48  fs/reiser4/flush.c                            | 3626 +++++++++++++++++++++++
  49  fs/reiser4/flush.h                            |  274 ++
  50  fs/reiser4/flush_queue.c                      |  681 +++++
  51  fs/reiser4/forward.h                          |  258 ++
  52  fs/reiser4/fsdata.c                           |  803 ++++++
  53  fs/reiser4/fsdata.h                           |  218 ++
  54  fs/reiser4/init_super.c                       |  739 +++++
  55  fs/reiser4/inode.c                            |  727 +++++
  56  fs/reiser4/inode.h                            |  430 +++
  57  fs/reiser4/ioctl.h                            |   41 +
  58  fs/reiser4/jnode.c                            | 1922 +++++++++++++
  59  fs/reiser4/jnode.h                            |  707 +++++
  60  fs/reiser4/kassign.c                          |  659 +++++
  61  fs/reiser4/kassign.h                          |  110 +
  62  fs/reiser4/key.c                              |  137 +
  63  fs/reiser4/key.h                              |  384 +++
  64  fs/reiser4/ktxnmgrd.c                         |  214 ++
  65  fs/reiser4/ktxnmgrd.h                         |   52 +
  66  fs/reiser4/lock.c                             | 1261 ++++++++
  67  fs/reiser4/lock.h                             |  272 ++
  68  fs/reiser4/oid.c                              |  141 +
  69  fs/reiser4/page_cache.c                       |  712 +++++
  70  fs/reiser4/page_cache.h                       |   62 +
  71  fs/reiser4/plugin/Makefile                    |   26 +
  72  fs/reiser4/plugin/cluster.c                   |   66 +
  73  fs/reiser4/plugin/cluster.h                   |  315 ++
  74  fs/reiser4/plugin/compress/Makefile           |    6 +
  75  fs/reiser4/plugin/compress/compress.c         |  369 +++
  76  fs/reiser4/plugin/compress/compress.h         |   38 +
  77  fs/reiser4/plugin/compress/compress_mode.c    |  163 ++
  78  fs/reiser4/plugin/compress/lzoconf.h          |  420 +++
  79  fs/reiser4/plugin/compress/minilzo.c          | 2155 ++++++++++++++
  80  fs/reiser4/plugin/compress/minilzo.h          |   94 +
  81  fs/reiser4/plugin/crypto/cipher.c             |  116 +
  82  fs/reiser4/plugin/crypto/cipher.h             |   67 +
  83  fs/reiser4/plugin/crypto/digest.c             |   58 +
  84  fs/reiser4/plugin/dir/Makefile                |    5 +
  85  fs/reiser4/plugin/dir/dir.h                   |   36 +
  86  fs/reiser4/plugin/dir/hashed_dir.c            |   81 +
  87  fs/reiser4/plugin/dir/seekable_dir.c          |   46 +
  88  fs/reiser4/plugin/dir_plugin_common.c         |  864 ++++++
  89  fs/reiser4/plugin/disk_format/Makefile        |    5 +
  90  fs/reiser4/plugin/disk_format/disk_format.c   |   37 +
  91  fs/reiser4/plugin/disk_format/disk_format.h   |   27 +
  92  fs/reiser4/plugin/disk_format/disk_format40.c |  556 ++++
  93  fs/reiser4/plugin/disk_format/disk_format40.h |   99 +
  94  fs/reiser4/plugin/fibration.c                 |  174 ++
  95  fs/reiser4/plugin/fibration.h                 |   37 +
  96  fs/reiser4/plugin/file/Makefile               |    7 +
  97  fs/reiser4/plugin/file/cryptcompress.c        | 3819 +++++++++++++++++++++++++
  98  fs/reiser4/plugin/file/cryptcompress.h        |  549 ++++
  99  fs/reiser4/plugin/file/file.c                 | 2713 ++++++++++++++++++
 100  fs/reiser4/plugin/file/file.h                 |  257 ++
 101  fs/reiser4/plugin/file/invert.c               |  493 ++++
 102  fs/reiser4/plugin/file/symfile.c              |   87 +
 103  fs/reiser4/plugin/file/symlink.c              |   92 +
 104  fs/reiser4/plugin/file/tail_conversion.c      |  728 +++++
 105  fs/reiser4/plugin/file_ops.c                  |  167 ++
 106  fs/reiser4/plugin/file_ops_readdir.c          |  655 +++++
 107  fs/reiser4/plugin/file_plugin_common.c        |  929 ++++++
 108  fs/reiser4/plugin/hash.c                      |  350 +++
 109  fs/reiser4/plugin/inode_ops.c                 |  886 ++++++
 110  fs/reiser4/plugin/inode_ops_rename.c          |  904 ++++++
 111  fs/reiser4/plugin/item/Makefile               |   18 +
 112  fs/reiser4/plugin/item/acl.h                  |   66 +
 113  fs/reiser4/plugin/item/blackbox.c             |  142 +
 114  fs/reiser4/plugin/item/blackbox.h             |   33 +
 115  fs/reiser4/plugin/item/cde.c                  | 1007 +++++++
 116  fs/reiser4/plugin/item/cde.h                  |   87 +
 117  fs/reiser4/plugin/item/ctail.c                | 1588 ++++++++++
 118  fs/reiser4/plugin/item/ctail.h                |   89 +
 119  fs/reiser4/plugin/item/extent.c               |  197 ++
 120  fs/reiser4/plugin/item/extent.h               |  228 ++
 121  fs/reiser4/plugin/item/extent_file_ops.c      | 1716 +++++++++++
 122  fs/reiser4/plugin/item/extent_flush_ops.c     | 1018 +++++++
 123  fs/reiser4/plugin/item/extent_item_ops.c      |  882 ++++++
 124  fs/reiser4/plugin/item/internal.c             |  392 +++
 125  fs/reiser4/plugin/item/internal.h             |   57 +
 126  fs/reiser4/plugin/item/item.c                 |  727 +++++
 127  fs/reiser4/plugin/item/item.h                 |  399 +++
 128  fs/reiser4/plugin/item/sde.c                  |  190 ++
 129  fs/reiser4/plugin/item/sde.h                  |   66 +
 130  fs/reiser4/plugin/item/static_stat.c          | 1038 +++++++
 131  fs/reiser4/plugin/item/static_stat.h          |  219 ++
 132  fs/reiser4/plugin/item/tail.c                 |  805 ++++++
 133  fs/reiser4/plugin/item/tail.h                 |   58 +
 134  fs/reiser4/plugin/node/Makefile               |    5 +
 135  fs/reiser4/plugin/node/node.c                 |  131 +
 136  fs/reiser4/plugin/node/node.h                 |  272 ++
 137  fs/reiser4/plugin/node/node40.c               | 2924 +++++++++++++++++++
 138  fs/reiser4/plugin/node/node40.h               |  125 +
 139  fs/reiser4/plugin/object.c                    |  502 ++++
 140  fs/reiser4/plugin/object.h                    |  121 +
 141  fs/reiser4/plugin/plugin.c                    |  535 ++++
 142  fs/reiser4/plugin/plugin.h                    |  935 ++++++
 143  fs/reiser4/plugin/plugin_header.h             |  136 +
 144  fs/reiser4/plugin/plugin_set.c                |  378 +++
 145  fs/reiser4/plugin/plugin_set.h                |   82 +
 146  fs/reiser4/plugin/regular.c                   |   44 +
 147  fs/reiser4/plugin/security/Makefile           |    4 +
 148  fs/reiser4/plugin/security/perm.c             |   44 +
 149  fs/reiser4/plugin/security/perm.h             |   82 +
 150  fs/reiser4/plugin/space/Makefile              |    4 +
 151  fs/reiser4/plugin/space/bitmap.c              | 1592 +++++++++++
 152  fs/reiser4/plugin/space/bitmap.h              |   47 +
 153  fs/reiser4/plugin/space/space_allocator.h     |   80 +
 154  fs/reiser4/plugin/tail_policy.c               |  113 +
 155  fs/reiser4/pool.c                             |  236 ++
 156  fs/reiser4/pool.h                             |   54 +
 157  fs/reiser4/readahead.c                        |  138 +
 158  fs/reiser4/readahead.h                        |   48 +
 159  fs/reiser4/reiser4.h                          |  275 ++
 160  fs/reiser4/safe_link.c                        |  351 +++
 161  fs/reiser4/safe_link.h                        |   29 +
 162  fs/reiser4/seal.c                             |  217 ++
 163  fs/reiser4/seal.h                             |   49 +
 164  fs/reiser4/search.c                           | 1611 +++++++++++
 165  fs/reiser4/status_flags.c                     |  176 ++
 166  fs/reiser4/status_flags.h                     |   43 +
 167  fs/reiser4/super.c                            |  313 ++
 168  fs/reiser4/super.h                            |  467 +++
 169  fs/reiser4/super_ops.c                        |  720 +++++
 170  fs/reiser4/tap.c                              |  377 +++
 171  fs/reiser4/tap.h                              |   69 +
 172  fs/reiser4/tree.c                             | 1875 ++++++++++++
 173  fs/reiser4/tree.h                             |  579 ++++
 174  fs/reiser4/tree_mod.c                         |  383 +++
 175  fs/reiser4/tree_mod.h                         |   29 +
 176  fs/reiser4/tree_walk.c                        |  926 ++++++
 177  fs/reiser4/tree_walk.h                        |  125 +
 178  fs/reiser4/txnmgr.c                           | 3158 ++++++++++++++++++++
 179  fs/reiser4/txnmgr.h                           |  703 +++++
 180  fs/reiser4/type_safe_hash.h                   |  320 +++
 181  fs/reiser4/vfs_ops.c                          |  267 ++
 182  fs/reiser4/vfs_ops.h                          |   58 +
 183  fs/reiser4/wander.c                           | 1799 ++++++++++++
 184  fs/reiser4/wander.h                           |  135 +
 185  fs/reiser4/writeout.h                         |   21 +
 186  fs/reiser4/znode.c                            | 1028 +++++++
 187  fs/reiser4/znode.h                            |  434 +++
 188  include/linux/fs.h                            |    3 +
 189  lib/radix-tree.c                              |    1 +
 190  mm/filemap.c                                  |    5 +
 191  mm/readahead.c                                |    1 +
 192  175 files changed, 79647 insertions(+), 10 deletions(-)
 193
 194 diff --git a/Documentation/Changes b/Documentation/Changes
 195 index abee7f5..c8323c2 100644
 196 --- a/Documentation/Changes
 197 +++ b/Documentation/Changes
 198 @@ -36,6 +36,7 @@ o  module-init-tools      0.9.10                  # depmod -V
 199  o  e2fsprogs              1.29                    # tune2fs
 200  o  jfsutils               1.1.3                   # fsck.jfs -V
 201  o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
 202 +o  reiser4progs           1.0.0                   # fsck.reiser4 -V
 203  o  xfsprogs               2.6.0                   # xfs_db -V
 204  o  pcmciautils            004                     # pccardctl -V
 205  o  quota-tools            3.09                    # quota -V
 206 @@ -144,6 +145,13 @@ The reiserfsprogs package should be used for reiserfs-3.6.x
 207  versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 208  reiserfsck. These utils work on both i386 and alpha platforms.
 209
 210 +Reiser4progs
 211 +------------
 212 +
 213 +The reiser4progs package contains utilities for the reiser4 file system.
 214 +Detailed instructions are provided in the README file located at:
 215 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
 216 +
 217  Xfsprogs
 218  --------
 219
 220 @@ -322,6 +330,10 @@ Reiserfsprogs
 221  -------------
 222  o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
 223
 224 +Reiser4progs
 225 +------------
 226 +o  <ftp://ftp.namesys.com/pub/reiser4progs/>
 227 +
 228  Xfsprogs
 229  --------
 230  o  <ftp://oss.sgi.com/projects/xfs/download/>
 231 diff --git a/Documentation/filesystems/reiser4.txt b/Documentation/filesystems/reiser4.txt
 232 new file mode 100644
 233 index 0000000..8e07c9e
 234 --- /dev/null
 235 +++ b/Documentation/filesystems/reiser4.txt
 236 @@ -0,0 +1,75 @@
 237 +Reiser4 filesystem
 238 +==================
 239 +Reiser4 is a file system based on dancing tree algorithms, and is
 240 +described at http://www.namesys.com
 241 +
 242 +
 243 +References
 244 +==========
 245 +web page               http://namesys.com/v4/v4.html
 246 +source code            ftp://ftp.namesys.com/pub/reiser4-for-2.6/
 247 +userland tools         ftp://ftp.namesys.com/pub/reiser4progs/
 248 +install page           http://www.namesys.com/install_v4.html
 249 +
 250 +Compile options
 251 +===============
 252 +Enable reiser4 debug mode
 253 +       This checks everything imaginable while reiser4
 254 +       runs
 255 +
 256 +Mount options
 257 +=============
 258 +tmgr.atom_max_size=N
 259 +       Atoms containing more than N blocks will be forced to commit.
 260 +       N is decimal.
 261 +       Default is nr_free_pagecache_pages() / 2 at mount time.
 262 +
 263 +tmgr.atom_max_age=N
 264 +       Atoms older than N seconds will be forced to commit. N is decimal.
 265 +       Default is 600.
 266 +
 267 +tmgr.atom_max_flushers=N
 268 +       Limit of concurrent flushers for one atom. 0 means no limit.
 269 +       Default is 0.
 270 +
 271 +tree.cbk_cache.nr_slots=N
 272 +       Number of slots in the cbk cache.
 273 +
 274 +flush.relocate_threshold=N
 275 +       If flush finds more than N adjacent dirty leaf-level blocks it
 276 +       will force them to be relocated.
 277 +       Default is 64.
 278 +
 279 +flush.relocate_distance=N
 280 +       If flush can find a block allocation no farther than N from
 281 +       the preceder, it will relocate to that position.
 282 +       Default is 64.
 283 +
 284 +flush.scan_maxnodes=N
 285 +       The maximum number of nodes to scan left on a level during
 286 +       flush.
 287 +       Default is 10000.
 288 +
 289 +optimal_io_size=N
 290 +       Preferred IO size. This value is used to set st_blksize of
 291 +       struct stat.
 292 +       Default is 65536.
 293 +
 294 +bsdgroups
 295 +       Turn on BSD-style gid assignment.
 296 +
 297 +32bittimes
 298 +       By default files in reiser4 have 64 bit timestamps. Files
 299 +       created when filesystem is mounted with 32bittimes mount
 300 +       option will get 32 bit timestamps.
 301 +
 302 +mtflush
 303 +       Turn off concurrent flushing.
 304 +
 305 +nopseudo
 306 +       Disable pseudo files support. See
 307 +       http://namesys.com/v4/pseudo.html for more about pseudo files.
 308 +
 309 +dont_load_bitmap
 310 +       Don't load all bitmap blocks at mount time. This is useful for
 311 +       machines with tiny RAM and large disks.
 312 diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
 313 index d22cfc9..bb4a75a 100644
 314 --- a/arch/i386/lib/usercopy.c
 315 +++ b/arch/i386/lib/usercopy.c
 316 @@ -812,6 +812,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
 317  #endif
 318         return n;
 319  }
 320 +EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 321
 322  unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 323                                         unsigned long n)
 324 @@ -827,6 +828,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
 325  #endif
 326         return n;
 327  }
 328 +EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 329
 330  /**
 331   * copy_to_user: - Copy a block of data into user space.
 332 diff --git a/fs/Kconfig b/fs/Kconfig
 333 index 7b1511d..4673de9 100644
 334 --- a/fs/Kconfig
 335 +++ b/fs/Kconfig
 336 @@ -278,6 +278,8 @@ config FS_MBCACHE
 337         default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 338         default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 339
 340 +source "fs/reiser4/Kconfig"
 341 +
 342  config REISERFS_FS
 343         tristate "Reiserfs support"
 344         help
 345 diff --git a/fs/Makefile b/fs/Makefile
 346 index 9a5ce93..671aeaf 100644
 347 --- a/fs/Makefile
 348 +++ b/fs/Makefile
 349 @@ -61,6 +61,7 @@ obj-$(CONFIG_DLM)             += dlm/
 350
 351  # Do not add any filesystems before this line
 352  obj-$(CONFIG_REISERFS_FS)      += reiserfs/
 353 +obj-$(CONFIG_REISER4_FS)       += reiser4/
 354  obj-$(CONFIG_EXT3_FS)          += ext3/ # Before ext2 so root fs can be ext3
 355  obj-$(CONFIG_EXT4DEV_FS)       += ext4/ # Before ext2 so root fs can be ext4dev
 356  obj-$(CONFIG_JBD)              += jbd/
 357 diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
 358 index c403b66..e1a4b0b 100644
 359 --- a/fs/fs-writeback.c
 360 +++ b/fs/fs-writeback.c
 361 @@ -285,8 +285,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 362   * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
 363   * that it can be located for waiting on in __writeback_single_inode().
 364   *
 365 - * Called under inode_lock.
 366 - *
 367   * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 368   * This function assumes that the blockdev superblock's inodes are backed by
 369   * a variety of queues, so all inodes are searched.  For other superblocks,
 370 @@ -302,11 +300,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 371   * on the writer throttling path, and we get decent balancing between many
 372   * throttled threads: we don't want them all piling up on __wait_on_inode.
 373   */
 374 -static void
 375 -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 376 +void
 377 +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 378  {
 379         const unsigned long start = jiffies;    /* livelock avoidance */
 380
 381 +       spin_lock(&inode_lock);
 382 +
 383         if (!wbc->for_kupdate || list_empty(&sb->s_io))
 384                 list_splice_init(&sb->s_dirty, &sb->s_io);
 385
 386 @@ -386,8 +386,19 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 387                 if (wbc->nr_to_write <= 0)
 388                         break;
 389         }
 390 +       spin_unlock(&inode_lock);
 391         return;         /* Leave any unwritten inodes on s_io */
 392  }
 393 +EXPORT_SYMBOL(generic_sync_sb_inodes);
 394 +
 395 +static void
 396 +sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 397 +{
 398 +       if (sb->s_op->sync_inodes)
 399 +               sb->s_op->sync_inodes(sb, wbc);
 400 +       else
 401 +               generic_sync_sb_inodes(sb, wbc);
 402 +}
 403
 404  /*
 405   * Start writeback of dirty pagecache data against all unlocked inodes.
 406 @@ -428,11 +439,8 @@ restart:
 407                          * be unmounted by the time it is released.
 408                          */
 409                         if (down_read_trylock(&sb->s_umount)) {
 410 -                               if (sb->s_root) {
 411 -                                       spin_lock(&inode_lock);
 412 +                               if (sb->s_root)
 413                                         sync_sb_inodes(sb, wbc);
 414 -                                       spin_unlock(&inode_lock);
 415 -                               }
 416                                 up_read(&sb->s_umount);
 417                         }
 418                         spin_lock(&sb_lock);
 419 @@ -470,9 +478,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 420                         (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 421                         nr_dirty + nr_unstable;
 422         wbc.nr_to_write += wbc.nr_to_write / 2;         /* Bit more for luck */
 423 -       spin_lock(&inode_lock);
 424         sync_sb_inodes(sb, &wbc);
 425 -       spin_unlock(&inode_lock);
 426  }
 427
 428  /*
 429 diff --git a/fs/reiser4/Kconfig b/fs/reiser4/Kconfig
 430 new file mode 100644
 431 index 0000000..37aa01c
 432 --- /dev/null
 433 +++ b/fs/reiser4/Kconfig
 434 @@ -0,0 +1,31 @@
 435 +config REISER4_FS
 436 +       tristate "Reiser4 (EXPERIMENTAL)"
 437 +       depends on EXPERIMENTAL
 438 +       select ZLIB_INFLATE
 439 +       select ZLIB_DEFLATE
 440 +       help
 441 +         Reiser4 is a filesystem that performs all filesystem operations
 442 +         as atomic transactions, which means that it either performs a
 443 +         write, or it does not, and in the event of a crash it does not
 444 +         partially perform it or corrupt it.
 445 +
 446 +         It stores files in dancing trees, which are like balanced trees but
 447 +         faster.  It packs small files together so that they share blocks
 448 +         without wasting space.  This means you can use it to store really
 449 +         small files.  It also means that it saves you disk space.  It avoids
 450 +         hassling you with anachronisms like having a maximum number of
 451 +         inodes, and wasting space if you use less than that number.
 452 +
 453 +         Reiser4 is a distinct filesystem type from reiserfs (V3).
 454 +         It's therefore not possible to use reiserfs file systems
 455 +         with reiser4.
 456 +
 457 +         To learn more about reiser4, go to http://www.namesys.com
 458 +
 459 +config REISER4_DEBUG
 460 +       bool "Enable reiser4 debug mode"
 461 +       depends on REISER4_FS
 462 +       help
 463 +         Don't use this unless you are debugging reiser4.
 464 +
 465 +         If unsure, say N.
 466 diff --git a/fs/reiser4/Makefile b/fs/reiser4/Makefile
 467 new file mode 100644
 468 index 0000000..07318c7
 469 --- /dev/null
 470 +++ b/fs/reiser4/Makefile
 471 @@ -0,0 +1,100 @@
 472 +#
 473 +# reiser4/Makefile
 474 +#
 475 +
 476 +obj-$(CONFIG_REISER4_FS) += reiser4.o
 477 +
 478 +reiser4-y := \
 479 +                  debug.o \
 480 +                  jnode.o \
 481 +                  znode.o \
 482 +                  key.o \
 483 +                  pool.o \
 484 +                  tree_mod.o \
 485 +                  estimate.o \
 486 +                  carry.o \
 487 +                  carry_ops.o \
 488 +                  lock.o \
 489 +                  tree.o \
 490 +                  context.o \
 491 +                  tap.o \
 492 +                  coord.o \
 493 +                  block_alloc.o \
 494 +                  txnmgr.o \
 495 +                  kassign.o \
 496 +                  flush.o \
 497 +                  wander.o \
 498 +                  eottl.o \
 499 +                  search.o \
 500 +                  page_cache.o \
 501 +                  seal.o \
 502 +                  dscale.o \
 503 +                  flush_queue.o \
 504 +                  ktxnmgrd.o \
 505 +                  blocknrset.o \
 506 +                  super.o \
 507 +                  super_ops.o \
 508 +                  fsdata.o \
 509 +                  export_ops.o \
 510 +                  oid.o \
 511 +                  tree_walk.o \
 512 +                  inode.o \
 513 +                  vfs_ops.o \
 514 +                  as_ops.o \
 515 +                  entd.o\
 516 +                  readahead.o \
 517 +                  status_flags.o \
 518 +                  init_super.o \
 519 +                  safe_link.o \
 520 +           \
 521 +                  plugin/plugin.o \
 522 +                  plugin/plugin_set.o \
 523 +                  plugin/node/node.o \
 524 +                  plugin/object.o \
 525 +                  plugin/cluster.o \
 526 +                  plugin/inode_ops.o \
 527 +                  plugin/inode_ops_rename.o \
 528 +                  plugin/file_ops.o \
 529 +                  plugin/file_ops_readdir.o \
 530 +                  plugin/file_plugin_common.o \
 531 +                  plugin/file/file.o \
 532 +                  plugin/file/tail_conversion.o \
 533 +                  plugin/file/symlink.o \
 534 +                  plugin/file/cryptcompress.o \
 535 +                  plugin/dir_plugin_common.o \
 536 +                  plugin/dir/hashed_dir.o \
 537 +                  plugin/dir/seekable_dir.o \
 538 +                  plugin/node/node40.o \
 539 +           \
 540 +                  plugin/crypto/cipher.o \
 541 +                  plugin/crypto/digest.o \
 542 +           \
 543 +                  plugin/compress/minilzo.o \
 544 +                  plugin/compress/compress.o \
 545 +                  plugin/compress/compress_mode.o \
 546 +           \
 547 +                  plugin/item/static_stat.o \
 548 +                  plugin/item/sde.o \
 549 +                  plugin/item/cde.o \
 550 +                  plugin/item/blackbox.o \
 551 +                  plugin/item/internal.o \
 552 +                  plugin/item/tail.o \
 553 +                  plugin/item/ctail.o \
 554 +                  plugin/item/extent.o \
 555 +                  plugin/item/extent_item_ops.o \
 556 +                  plugin/item/extent_file_ops.o \
 557 +                  plugin/item/extent_flush_ops.o \
 558 +           \
 559 +                  plugin/hash.o \
 560 +                  plugin/fibration.o \
 561 +                  plugin/tail_policy.o \
 562 +                  plugin/item/item.o \
 563 +           \
 564 +                  plugin/security/perm.o \
 565 +                  plugin/space/bitmap.o \
 566 +           \
 567 +                  plugin/disk_format/disk_format40.o \
 568 +                  plugin/disk_format/disk_format.o \
 569 +          \
 570 +                  plugin/regular.o
 571 +
 572 diff --git a/fs/reiser4/README b/fs/reiser4/README
 573 new file mode 100644
 574 index 0000000..4637f59
 575 --- /dev/null
 576 +++ b/fs/reiser4/README
 577 @@ -0,0 +1,125 @@
 578 +[LICENSING]
 579 +
 580 +Reiser4 is hereby licensed under the GNU General
 581 +Public License version 2.
 582 +
 583 +Source code files that contain the phrase "licensing governed by
 584 +reiser4/README" are "governed files" throughout this file.  Governed
 585 +files are licensed under the GPL.  The portions of them owned by Hans
 586 +Reiser, or authorized to be licensed by him, have been in the past,
 587 +and likely will be in the future, licensed to other parties under
 588 +other licenses.  If you add your code to governed files, and don't
 589 +want it to be owned by Hans Reiser, put your copyright label on that
 590 +code so the poor blight and his customers can keep things straight.
 591 +All portions of governed files not labeled otherwise are owned by Hans
 592 +Reiser, and by adding your code to it, widely distributing it to
 593 +others or sending us a patch, and leaving the sentence in stating that
 594 +licensing is governed by the statement in this file, you accept this.
 595 +It will be a kindness if you identify whether Hans Reiser is allowed
 596 +to license code labeled as owned by you on your behalf other than
 597 +under the GPL, because he wants to know if it is okay to do so and put
 598 +a check in the mail to you (for non-trivial improvements) when he
 599 +makes his next sale.  He makes no guarantees as to the amount if any,
 600 +though he feels motivated to motivate contributors, and you can surely
 601 +discuss this with him before or after contributing.  You have the
 602 +right to decline to allow him to license your code contribution other
 603 +than under the GPL.
 604 +
 605 +Further licensing options are available for commercial and/or other
 606 +interests directly from Hans Reiser: reiser@namesys.com.  If you interpret
 607 +the GPL as not allowing those additional licensing options, you read
 608 +it wrongly, and Richard Stallman agrees with me, when carefully read
 609 +you can see that those restrictions on additional terms do not apply
 610 +to the owner of the copyright, and my interpretation of this shall
 611 +govern for this license.
 612 +
 613 +[END LICENSING]
 614 +
 615 +Reiser4 is a file system based on dancing tree algorithms, and is
 616 +described at http://www.namesys.com
 617 +
 618 +mkfs.reiser4 and other utilities are on our webpage or wherever your
 619 +Linux provider put them.  You really want to be running the latest
 620 +version off the website if you use fsck.
 621 +
 622 +Yes, if you update your reiser4 kernel module you do have to
 623 +recompile your kernel, most of the time.  The errors you get will be
 624 +quite cryptic if you forget to do so.
 625 +
 626 +Hideous Commercial Pitch: Spread your development costs across other OS
 627 +vendors.  Select from the best in the world, not the best in your
 628 +building, by buying from third party OS component suppliers.  Leverage
 629 +the software component development power of the internet.  Be the most
 630 +aggressive in taking advantage of the commercial possibilities of
 631 +decentralized internet development, and add value through your branded
 632 +integration that you sell as an operating system.  Let your competitors
 633 +be the ones to compete against the entire internet by themselves.  Be
 634 +hip, get with the new economic trend, before your competitors do.  Send
 635 +email to reiser@namesys.com
 636 +
 637 +Hans Reiser was the primary architect of Reiser4, but a whole team
 638 +chipped their ideas in.  He invested everything he had into Namesys
 639 +for 5.5 dark years of no money before Reiser3 finally started to work well
 640 +enough to bring in money.  He owns the copyright.
 641 +
 642 +DARPA was the primary sponsor of Reiser4.  DARPA does not endorse
 643 +Reiser4, it merely sponsors it.  DARPA is, in solely Hans's personal
 644 +opinion, unique in its willingness to invest into things more
 645 +theoretical than the VC community can readily understand, and more
 646 +longterm than allows them to be sure that they will be the ones to
 647 +extract the economic benefits from.  DARPA also integrated us into a
 648 +security community that transformed our security worldview.
 649 +
 650 +Vladimir Saveliev is our lead programmer, with us from the beginning,
 651 +and he worked long hours writing the cleanest code.  This is why he is
 652 +now the lead programmer after years of commitment to our work.  He
 653 +always made the effort to be the best he could be, and to make his
 654 +code the best that it could be.  What resulted was quite remarkable. I
 655 +don't think that money can ever motivate someone to work the way he
 656 +did, he is one of the most selfless men I know.
 657 +
 658 +Alexander Lyamin was our sysadmin, and helped to educate us in
 659 +security issues.  Moscow State University and IMT were very generous
 660 +in the internet access they provided us, and in lots of other little
 661 +ways that a generous institution can be.
 662 +
 663 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
 664 +locking code, the block allocator, and finished the flushing code.
 665 +His code is always crystal clean and well structured.
 666 +
 667 +Nikita Danilov wrote the core of the balancing code, the core of the
 668 +plugins code, and the directory code.  He worked a steady pace of long
 669 +hours that produced a whole lot of well abstracted code.  He is our
 670 +senior computer scientist.
 671 +
 672 +Vladimir Demidov wrote the parser.  Writing an in-kernel parser is
 673 +something very few persons have the skills for, and it is thanks to
 674 +him that we can say that the parser is really not so big compared to
 675 +various bits of our other code, and making a parser work in the kernel
 676 +was not so complicated as everyone would imagine mainly because it was
 677 +him doing it...
 678 +
 679 +Joshua McDonald wrote the transaction manager, and the flush code.
 680 +The flush code unexpectedly turned out to be extremely hairy for reasons
 681 +you can read about on our web page, and he did a great job on an
 682 +extremely difficult task.
 683 +
 684 +Nina Reiser handled our accounting, government relations, and much
 685 +more.
 686 +
 687 +Ramon Reiser developed our website.
 688 +
 689 +Beverly Palmer drew our graphics.
 690 +
 691 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
 692 +and worked with Umka on developing libreiser4 and userspace plugins.
 693 +
 694 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
 695 +userspace tools (reiser4progs).
 696 +
 697 +Oleg Drokin (aka Green) is the release manager who fixes everything.
 698 +It is so nice to have someone like that on the team.  He (plus Chris
 699 +and Jeff) make it possible for the entire rest of the Namesys team to
 700 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also.  It
 701 +is just amazing to watch his talent for spotting bugs in action.
 702 +
 703 diff --git a/fs/reiser4/as_ops.c b/fs/reiser4/as_ops.c
 704 new file mode 100644
 705 index 0000000..54f7b2a
 706 --- /dev/null
 707 +++ b/fs/reiser4/as_ops.c
 708 @@ -0,0 +1,391 @@
 709 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
 710 +
 711 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
 712 +
 713 +#include "forward.h"
 714 +#include "debug.h"
 715 +#include "dformat.h"
 716 +#include "coord.h"
 717 +#include "plugin/item/item.h"
 718 +#include "plugin/file/file.h"
 719 +#include "plugin/security/perm.h"
 720 +#include "plugin/disk_format/disk_format.h"
 721 +#include "plugin/plugin.h"
 722 +#include "plugin/plugin_set.h"
 723 +#include "plugin/object.h"
 724 +#include "txnmgr.h"
 725 +#include "jnode.h"
 726 +#include "znode.h"
 727 +#include "block_alloc.h"
 728 +#include "tree.h"
 729 +#include "vfs_ops.h"
 730 +#include "inode.h"
 731 +#include "page_cache.h"
 732 +#include "ktxnmgrd.h"
 733 +#include "super.h"
 734 +#include "reiser4.h"
 735 +#include "entd.h"
 736 +
 737 +#include <linux/profile.h>
 738 +#include <linux/types.h>
 739 +#include <linux/mount.h>
 740 +#include <linux/vfs.h>
 741 +#include <linux/mm.h>
 742 +#include <linux/buffer_head.h>
 743 +#include <linux/dcache.h>
 744 +#include <linux/list.h>
 745 +#include <linux/pagemap.h>
 746 +#include <linux/slab.h>
 747 +#include <linux/seq_file.h>
 748 +#include <linux/init.h>
 749 +#include <linux/module.h>
 750 +#include <linux/writeback.h>
 751 +#include <linux/backing-dev.h>
 752 +#include <linux/quotaops.h>
 753 +#include <linux/security.h>
 754 +
 755 +/* address space operations */
 756 +
 757 +/**
 758 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
 759 + * @page: page to be dirtied
 760 + *
 761 + * Operation of struct address_space_operations. This implementation is used by
 762 + * unix and crc file plugins.
 763 + *
 764 + * This is called when reiser4 page gets dirtied outside of reiser4, for
 765 + * example, when dirty bit is moved from pte to physical page.
 766 + *
 767 + * Tags page in the mapping's page tree with special tag so that it is possible
 768 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
 769 + * capturing by an atom) later because it can not be done in the contexts where
 770 + * set_page_dirty is called.
 771 + */
 772 +int reiser4_set_page_dirty(struct page *page)
 773 +{
 774 +       /* this page can be unformatted only */
 775 +       assert("vs-1734", (page->mapping &&
 776 +                          page->mapping->host &&
 777 +                          get_super_fake(page->mapping->host->i_sb) !=
 778 +                          page->mapping->host
 779 +                          && get_cc_fake(page->mapping->host->i_sb) !=
 780 +                          page->mapping->host
 781 +                          && get_bitmap_fake(page->mapping->host->i_sb) !=
 782 +                          page->mapping->host));
 783 +
 784 +       if (!TestSetPageDirty(page)) {
 785 +               struct address_space *mapping = page->mapping;
 786 +
 787 +               if (mapping) {
 788 +                       write_lock_irq(&mapping->tree_lock);
 789 +
 790 +                       /* check for race with truncate */
 791 +                       if (page->mapping) {
 792 +                               assert("vs-1652", page->mapping == mapping);
 793 +                               if (mapping_cap_account_dirty(mapping))
 794 +                                       inc_zone_page_state(page,
 795 +                                                           NR_FILE_DIRTY);
 796 +                               radix_tree_tag_set(&mapping->page_tree,
 797 +                                                  page->index,
 798 +                                                  PAGECACHE_TAG_REISER4_MOVED);
 799 +                       }
 800 +                       write_unlock_irq(&mapping->tree_lock);
 801 +                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 802 +               }
 803 +       }
 804 +       return 0;
 805 +}
 806 +
 807 +static int filler(void *vp, struct page *page)
 808 +{
 809 +       return page->mapping->a_ops->readpage(vp, page);
 810 +}
 811 +
 812 +/**
 813 + * reiser4_readpages - submit read for a set of pages
 814 + * @file: file to read
 815 + * @mapping: address space
 816 + * @pages: list of pages to submit read for
 817 + * @nr_pages: number of pages on the list (not used by this implementation)
 818 + *
 819 + * Operation of struct address_space_operations. This implementation is used by
 820 + * unix and crc file plugins.
 821 + *
 822 + * Calls the readpages hook if it is set, read_cache_pages() otherwise.
 823 + */
 824 +int
 825 +reiser4_readpages(struct file *file, struct address_space *mapping,
 826 +                 struct list_head *pages, unsigned nr_pages)
 827 +{
 828 +       reiser4_context *ctx;
 829 +       reiser4_file_fsdata *fsdata;
 830 +
 831 +       ctx = init_context(mapping->host->i_sb);
 832 +       if (IS_ERR(ctx))
 833 +               return PTR_ERR(ctx);
 834 +
 835 +       fsdata = reiser4_get_file_fsdata(file);
 836 +       if (IS_ERR(fsdata)) {
 837 +               reiser4_exit_context(ctx);
 838 +               return PTR_ERR(fsdata);
 839 +       }
 840 +
 841 +       if (fsdata->ra2.readpages)
 842 +               fsdata->ra2.readpages(mapping, pages, fsdata->ra2.data);
 843 +       else {
 844 +               /*
 845 +                * filler (reiser4 readpage method) may involve tree search
 846 +                * which is not allowed when lock stack is not clean. If lock
 847 +                * stack is not clean - release the pages without reading.
 848 +                */
 849 +               if (lock_stack_isclean(get_current_lock_stack()))
 850 +                       read_cache_pages(mapping, pages, filler, file);
 851 +               else {
 852 +                       while (!list_empty(pages)) {
 853 +                               struct page *victim;
 854 +
 855 +                               victim = list_entry(pages->prev, struct page, lru);
 856 +                               list_del(&victim->lru);
 857 +                               page_cache_release(victim);
 858 +                       }
 859 +               }
 860 +       }
 861 +       reiser4_exit_context(ctx);
 862 +       return 0;
 863 +}
 864 +
 865 +/* ->invalidatepage method for reiser4 */
 866 +
 867 +/*
 868 + * this is called for each truncated page from
 869 + * truncate_inode_pages()->truncate_{complete,partial}_page().
 870 + *
 871 + * At the moment of call, page is under lock, and outstanding io (if any) has
 872 + * completed.
 873 + */
 874 +
 875 +/**
 876 + * reiser4_invalidatepage
 877 + * @page: page to invalidate
 878 + * @offset: starting offset for partial invalidation
 879 + *
 880 + */
 881 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
 882 +{
 883 +       int ret = 0;
 884 +       reiser4_context *ctx;
 885 +       struct inode *inode;
 886 +       jnode *node;
 887 +
 888 +       /*
 889 +        * This is called to truncate file's page.
 890 +        *
 891 +        * Originally, reiser4 implemented truncate in a standard way
 892 +        * (vmtruncate() calls ->invalidatepage() on all truncated pages
 893 +        * first, then file system ->truncate() call-back is invoked).
 894 +        *
 895 +        * This led to the problem when ->invalidatepage() was called on a
 896 +        * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
 897 +        * process. That is, truncate was bypassing transactions. To avoid
 898 +        * this, try_capture_page_to_invalidate() call was added here.
 899 +        *
 900 +        * After many troubles with vmtruncate() based truncate (including
 901 +        * races with flush, tail conversion, etc.) it was re-written in the
 902 +        * top-to-bottom style: items are killed in cut_tree_object() and
 903 +        * pages belonging to extent are invalidated in kill_hook_extent(). So
 904 +        * probably now additional call to capture is not needed here.
 905 +        */
 906 +
 907 +       assert("nikita-3137", PageLocked(page));
 908 +       assert("nikita-3138", !PageWriteback(page));
 909 +       inode = page->mapping->host;
 910 +
 911 +       /*
 912 +        * ->invalidatepage() should only be called for the unformatted
 913 +        * jnodes. Destruction of all other types of jnodes is performed
 914 +        * separately. But, during some corner cases (like handling errors
 915 +        * during mount) it is simpler to let ->invalidatepage to be called on
 916 +        * them. Check for this, and do nothing.
 917 +        */
 918 +       if (get_super_fake(inode->i_sb) == inode)
 919 +               return;
 920 +       if (get_cc_fake(inode->i_sb) == inode)
 921 +               return;
 922 +       if (get_bitmap_fake(inode->i_sb) == inode)
 923 +               return;
 924 +       assert("vs-1426", PagePrivate(page));
 925 +       assert("vs-1427",
 926 +              page->mapping == jnode_get_mapping(jnode_by_page(page)));
 927 +       assert("", jprivate(page) != NULL);
 928 +       assert("", ergo(inode_file_plugin(inode) !=
 929 +                       file_plugin_by_id(CRC_FILE_PLUGIN_ID), offset == 0));
 930 +
 931 +       ctx = init_context(inode->i_sb);
 932 +       if (IS_ERR(ctx))
 933 +               return;
 934 +
 935 +       node = jprivate(page);
 936 +       spin_lock_jnode(node);
 937 +       if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
 938 +                         (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
 939 +               /* there is no need to capture */
 940 +               jref(node);
 941 +               JF_SET(node, JNODE_HEARD_BANSHEE);
 942 +               page_clear_jnode(page, node);
 943 +               uncapture_jnode(node);
 944 +               unhash_unformatted_jnode(node);
 945 +               jput(node);
 946 +               reiser4_exit_context(ctx);
 947 +               return;
 948 +       }
 949 +       spin_unlock_jnode(node);
 950 +
 951 +       /* capture page being truncated. */
 952 +       ret = try_capture_page_to_invalidate(page);
 953 +       if (ret != 0)
 954 +               warning("nikita-3141", "Cannot capture: %i", ret);
 955 +
 956 +       if (offset == 0) {
 957 +               /* remove jnode from transaction and detach it from page. */
 958 +               jref(node);
 959 +               JF_SET(node, JNODE_HEARD_BANSHEE);
 960 +               /* page cannot be detached from jnode concurrently, because it
 961 +                * is locked */
 962 +               uncapture_page(page);
 963 +
 964 +               /* this detaches page from jnode, so that jdelete will not try
 965 +                * to lock page which is already locked */
 966 +               spin_lock_jnode(node);
 967 +               page_clear_jnode(page, node);
 968 +               spin_unlock_jnode(node);
 969 +               unhash_unformatted_jnode(node);
 970 +
 971 +               jput(node);
 972 +       }
 973 +
 974 +       reiser4_exit_context(ctx);
 975 +}
 976 +
 977 +/* helper function called from reiser4_releasepage(). It returns true if jnode
 978 + * can be detached from its page and page released. */
 979 +int jnode_is_releasable(jnode * node /* node to check */ )
 980 +{
 981 +       assert("nikita-2781", node != NULL);
 982 +       assert_spin_locked(&(node->guard));
 983 +       assert_spin_locked(&(node->load));
 984 +
 985 +       /* if some thread is currently using jnode page, the latter cannot be
 986 +        * detached */
 987 +       if (atomic_read(&node->d_count) != 0) {
 988 +               return 0;
 989 +       }
 990 +
 991 +       assert("vs-1214", !jnode_is_loaded(node));
 992 +
 993 +       /*
 994 +        * can only release page if real block number is assigned to it. Simple
 995 +        * check for ->atom wouldn't do, because it is possible for node to be
 996 +        * clean, not in atom yet, and still having fake block number. For
 997 +        * example, node just created in jinit_new().
 998 +        */
 999 +       if (blocknr_is_fake(jnode_get_block(node)))
1000 +               return 0;
1001 +
1002 +       /*
1003 +        * pages prepared for write can not be released anyway, so avoid
1004 +        * detaching jnode from the page
1005 +        */
1006 +       if (JF_ISSET(node, JNODE_WRITE_PREPARED))
1007 +               return 0;
1008 +
1009 +       /*
1010 +        * dirty jnode cannot be released. It can however be submitted to disk
1011 +        * as part of early flushing, but only after getting flush-prepped.
1012 +        */
1013 +       if (JF_ISSET(node, JNODE_DIRTY))
1014 +               return 0;
1015 +
1016 +       /* overwrite set is only written by log writer. */
1017 +       if (JF_ISSET(node, JNODE_OVRWR))
1018 +               return 0;
1019 +
1020 +       /* jnode is already under writeback */
1021 +       if (JF_ISSET(node, JNODE_WRITEBACK))
1022 +               return 0;
1023 +
1024 +       /* don't flush bitmaps or journal records */
1025 +       if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
1026 +               return 0;
1027 +
1028 +       return 1;
1029 +}
1030 +
1031 +/*
1032 + * ->releasepage method for reiser4
1033 + *
1034 + * This is called by VM scanner when it comes across clean page.  What we have
1035 + * to do here is to check whether page can really be released (freed that is)
1036 + * and if so, detach jnode from it and remove page from the page cache.
1037 + *
1038 + * Check for releasability is done by jnode_is_releasable() function.
1039 + */
1040 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
1041 +{
1042 +       jnode *node;
1043 +
1044 +       assert("nikita-2257", PagePrivate(page));
1045 +       assert("nikita-2259", PageLocked(page));
1046 +       assert("nikita-2892", !PageWriteback(page));
1047 +       assert("nikita-3019", schedulable());
1048 +
1049 +       /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
1050 +          is not clear what to do in this case. A lot of deadlocks seem to be
1051 +          possible. */
1052 +       if (page_count(page) > 3)
1053 +               return 0;
1054 +
1055 +       node = jnode_by_page(page);
1056 +       assert("nikita-2258", node != NULL);
1057 +       assert("reiser4-4", page->mapping != NULL);
1058 +       assert("reiser4-5", page->mapping->host != NULL);
1059 +
1060 +       if (PageDirty(page))
1061 +               return 0;
1062 +
1063 +       /* jnode_is_releasable() needs jnode lock, because it looks at the jnode
1064 +        * fields and we need jload_lock here to avoid races with jload(). */
1065 +       spin_lock_jnode(node);
1066 +       spin_lock(&(node->load));
1067 +       if (jnode_is_releasable(node)) {
1068 +               struct address_space *mapping;
1069 +
1070 +               mapping = page->mapping;
1071 +               jref(node);
1072 +               /* there is no need to synchronize against
1073 +                * jnode_extent_write() here, because pages seen by
1074 +                * jnode_extent_write() are !jnode_is_releasable(). */
1075 +               page_clear_jnode(page, node);
1076 +               spin_unlock(&(node->load));
1077 +               spin_unlock_jnode(node);
1078 +
1079 +               /* we are under memory pressure so release jnode also. */
1080 +               jput(node);
1081 +
1082 +               return 1;
1083 +       } else {
1084 +               spin_unlock(&(node->load));
1085 +               spin_unlock_jnode(node);
1086 +               assert("nikita-3020", schedulable());
1087 +               return 0;
1088 +       }
1089 +}
1090 +
1091 +/* Make Linus happy.
1092 +   Local variables:
1093 +   c-indentation-style: "K&R"
1094 +   mode-name: "LC"
1095 +   c-basic-offset: 8
1096 +   tab-width: 8
1097 +   fill-column: 120
1098 +   End:
1099 +*/
1100 diff --git a/fs/reiser4/block_alloc.c b/fs/reiser4/block_alloc.c
1101 new file mode 100644
1102 index 0000000..9650ac3
1103 --- /dev/null
1104 +++ b/fs/reiser4/block_alloc.c
1105 @@ -0,0 +1,1139 @@
1106 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1107 +
1108 +#include "debug.h"
1109 +#include "dformat.h"
1110 +#include "plugin/plugin.h"
1111 +#include "txnmgr.h"
1112 +#include "znode.h"
1113 +#include "block_alloc.h"
1114 +#include "tree.h"
1115 +#include "super.h"
1116 +
1117 +#include <linux/types.h>       /* for __u??  */
1118 +#include <linux/fs.h>          /* for struct super_block  */
1119 +#include <linux/spinlock.h>
1120 +
1121 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
1122 +
1123 +/* We need to be able to reserve enough disk space to ensure that an atomic
1124 +   operation will have enough disk space to flush (see flush.c and
1125 +   http://namesys.com/v4/v4.html) and commit it once it is started.
1126 +
1127 +   In our design a call for reserving disk space may fail but not an actual
1128 +   block allocation.
1129 +
1130 +   All free blocks, already allocated blocks, and all kinds of reserved blocks
1131 +   are counted in different per-fs block counters.
1132 +
1133 +   A reiser4 super block's set of block counters currently is:
1134 +
1135 +   free -- free blocks,
1136 +   used -- already allocated blocks,
1137 +
1138 +   grabbed -- initially reserved for performing an fs operation, those blocks
1139 +          are taken from free blocks, then grabbed disk space leaks from grabbed
1140 +          blocks counter to other counters like "fake allocated", "flush
1141 +          reserved", "used", the rest of not used grabbed space is returned to
1142 +          free space at the end of fs operation;
1143 +
1144 +   fake allocated -- counts all nodes without real disk block numbers assigned,
1145 +                     we have separate accounting for formatted and unformatted
1146 +                     nodes (for easier debugging);
1147 +
1148 +   flush reserved -- disk space needed for flushing and committing an atom.
1149 +                     Each dirty already allocated block could be written as a
1150 +                     part of atom's overwrite set or as a part of atom's
1151 +                     relocate set.  In both cases one additional block is needed,
1152 +                     it is used as a wandered block if we do overwrite or as a
1153 +                    new location for a relocated block.
1154 +
1155 +   In addition, blocks in some states are counted on per-thread and per-atom
1156 +   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
1157 +   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
1158 +   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
1159 +   blocks, which are reserved for flush processing and atom commit. */
1160 +
1161 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
1162 +   number of blocks to grab for most expensive case of balancing when the leaf
1163 +   node we insert new item to gets split and new leaf node is allocated.
1164 +
1165 +   So, we need to grab blocks for
1166 +
1167 +   1) one block for possible dirtying the node we insert an item to. That block
1168 +      would be used for node relocation at flush time or for allocating of a
1169 +      wandered one, it depends what will be a result (what set, relocate or
1170 +      overwrite the node gets assigned to) of the node processing by the flush
1171 +      algorithm.
1172 +
1173 +   2) one block for either allocating a new node, or dirtying of right or left
1174 +      clean neighbor, only one case may happen.
1175 +
1176 +   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
1177 +   node, and creation of new node.  have I forgotten something?  email me.
1178 +
1179 +   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
1180 +   counter and in the fs-wide one (both ctx->grabbed_blocks and
1181 +   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
1182 +   decremented by 2.
1183 +
1184 +   Suppose both two blocks were spent for dirtying of an already allocated clean
1185 +   node (one block went from "grabbed" to "flush reserved") and for new block
1186 +   allocating (one block went from "grabbed" to "fake allocated formatted").
1187 +
1188 +   Inserting of a child pointer to the parent node caused parent node to be
1189 +   split, the balancing code takes care about this grabbing necessary space
1190 +   immediately by calling reiser4_grab with BA_RESERVED flag set which means
1191 +   "can use the 5% reserved disk space".
1192 +
1193 +   At this moment insertion completes and grabbed blocks (if they were not used)
1194 +   should be returned to the free space counter.
1195 +
1196 +   However the atom life-cycle is not completed.  The atom had one "flush
1197 +   reserved" block added by our insertion and the new fake allocated node is
1198 +   counted as a "fake allocated formatted" one.  The atom has to be fully
1199 +   processed by flush before commit.  Suppose that the flush moved the first,
1200 +   already allocated node to the atom's overwrite list, the new fake allocated
1201 +   node, obviously, went into the atom relocate set.  The reiser4 flush
1202 +   allocates the new node using one unit from "fake allocated formatted"
1203 +   counter, the log writer uses one from "flush reserved" for wandered block
1204 +   allocation.
1205 +
1206 +   And, it is not the end.  When the wandered block is deallocated after the
1207 +   atom gets fully played (see wander.c for term description), the disk space
1208 +   occupied for it is returned to free blocks. */
1209 +
1210 +/* BLOCK NUMBERS */
1211 +
1212 +/* Any reiser4 node has a block number assigned to it.  We use these numbers for
1213 +   indexing in hash tables, so if a block has not yet been assigned a location
1214 +   on disk we need to give it a temporary fake block number.
1215 +
1216 +   Current implementation of reiser4 uses 64-bit integers for block numbers. We
1217 +   use highest bit in 64-bit block number to distinguish fake and real block
1218 +   numbers. So, only 63 bits may be used for addressing of real device
1219 +   blocks. That "fake" block numbers space is divided into subspaces of fake
1220 +   block numbers for data blocks and for shadow (working) bitmap blocks.
1221 +
1222 +   Fake block numbers for data blocks are generated by a cyclic counter, which
1223 +   gets incremented after each real block allocation. We assume that it is
1224 +   impossible to overload this counter during one transaction life. */
1225 +
1226 +/* Initialize a blocknr hint. */
1227 +void blocknr_hint_init(reiser4_blocknr_hint * hint)
1228 +{
1229 +       memset(hint, 0, sizeof(reiser4_blocknr_hint));
1230 +}
1231 +
1232 +/* Release any resources of a blocknr hint. */
1233 +void blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
1234 +{
1235 +       /* No resources should be freed in current blocknr_hint implementation. */
1236 +}
1237 +
1238 +/* see above for explanation of fake block number.  */
1239 +/* Audited by: green(2002.06.11) */
1240 +int blocknr_is_fake(const reiser4_block_nr * da)
1241 +{
1242 +       /* The reason for not simply returning result of '&' operation is that
1243 +          while return value is (possibly 32bit) int,  the reiser4_block_nr is
1244 +          at least 64 bits long, and high bit (which is the only possible
1245 +          non zero bit after the masking) would be stripped off */
1246 +       return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
1247 +}
1248 +
1249 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
1250 +   arithmetic. Mostly, they are isolated so that the same assertions are not
1251 +   coded in several places. */
1252 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
1253 +{
1254 +       BUG_ON(ctx->grabbed_blocks < count);
1255 +       assert("zam-527", ctx->grabbed_blocks >= count);
1256 +       ctx->grabbed_blocks -= count;
1257 +}
1258 +
1259 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
1260 +{
1261 +       ctx->grabbed_blocks += count;
1262 +}
1263 +
1264 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
1265 +{
1266 +       assert("zam-525", sbinfo->blocks_grabbed >= count);
1267 +       sbinfo->blocks_grabbed -= count;
1268 +}
1269 +
1270 +/* Decrease the counter of blocks reserved for flush in super block. */
1271 +static void
1272 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1273 +{
1274 +       assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
1275 +       sbinfo->blocks_flush_reserved -= count;
1276 +}
1277 +
1278 +static void
1279 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1280 +                          reiser4_ba_flags_t flags)
1281 +{
1282 +       if (flags & BA_FORMATTED) {
1283 +               assert("zam-806", sbinfo->blocks_fake_allocated >= count);
1284 +               sbinfo->blocks_fake_allocated -= count;
1285 +       } else {
1286 +               assert("zam-528",
1287 +                      sbinfo->blocks_fake_allocated_unformatted >= count);
1288 +               sbinfo->blocks_fake_allocated_unformatted -= count;
1289 +       }
1290 +}
1291 +
1292 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
1293 +{
1294 +       assert("zam-530",
1295 +              sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
1296 +       sbinfo->blocks_used -= count;
1297 +}
1298 +
1299 +static void
1300 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
1301 +{
1302 +       assert("edward-501", sbinfo->blocks_clustered >= count);
1303 +       sbinfo->blocks_clustered -= count;
1304 +}
1305 +
1306 +/* Increase the counter of blocks reserved for flush in atom. */
1307 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1308 +{
1309 +       assert("zam-772", atom != NULL);
1310 +       assert_spin_locked(&(atom->alock));
1311 +       atom->flush_reserved += count;
1312 +}
1313 +
1314 +/* Decrease the counter of blocks reserved for flush in atom. */
1315 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
1316 +{
1317 +       assert("zam-774", atom != NULL);
1318 +       assert_spin_locked(&(atom->alock));
1319 +       assert("nikita-2790", atom->flush_reserved >= count);
1320 +       atom->flush_reserved -= count;
1321 +}
1322 +
1323 +/* super block has 7 counters: free, used, grabbed, fake allocated (formatted
1324 +   and unformatted), flush reserved and clustered. Their sum must equal the
1325 +   block count of the device; returns 1 if so, else prints them, returns 0 */
1326 +int check_block_counters(const struct super_block *super)
1327 +{
1328 +       __u64 sum;
1329 +
1330 +       sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
1331 +           reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
1332 +           reiser4_fake_allocated_unformatted(super) + flush_reserved(super) +
1333 +           reiser4_clustered_blocks(super);
1334 +       if (reiser4_block_count(super) != sum) {
1335 +               printk("super block counters: "
1336 +                      "used %llu, free %llu, "
1337 +                      "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
1338 +                      "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
1339 +                      (unsigned long long)reiser4_data_blocks(super),
1340 +                      (unsigned long long)reiser4_free_blocks(super),
1341 +                      (unsigned long long)reiser4_grabbed_blocks(super),
1342 +                      (unsigned long long)reiser4_fake_allocated(super),
1343 +                      (unsigned long long)
1344 +                      reiser4_fake_allocated_unformatted(super),
1345 +                      (unsigned long long)flush_reserved(super),
1346 +                      (unsigned long long)reiser4_clustered_blocks(super),
1347 +                      (unsigned long long)sum,
1348 +                      (unsigned long long)reiser4_block_count(super));
1349 +               return 0;
1350 +       }
1351 +       return 1;
1352 +}
1353 +
1354 +/* Adjust "working" free blocks counter for number of blocks we are going to
1355 +   allocate.  Record number of grabbed blocks in fs-wide and per-thread
1356 +   counters.  This function should be called before bitmap scanning or
1357 +   allocating fake block numbers
1358 +
1359 +   @ctx             -- current reiser4 context (its super block is used);
1360 +   @count           -- number of blocks we reserve;
1361 +   @flags           -- BA_RESERVED permits use of the reserved blocks;
1362 +   @return          -- 0 if success,  -ENOSPC, if all
1363 +                       free blocks are reserved or already allocated.
1364 +*/
1365 +
1366 +static int
1367 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
1368 +{
1369 +       __u64 free_blocks;
1370 +       int ret = 0, use_reserved = flags & BA_RESERVED;
1371 +       reiser4_super_info_data *sbinfo;
1372 +
1373 +       assert("vs-1276", ctx == get_current_context());
1374 +
1375 +       /* Do not grab anything on ro-mounted fs. */
1376 +       if (rofs_super(ctx->super)) {
1377 +               ctx->grab_enabled = 0;
1378 +               return 0;
1379 +       }
1380 +
1381 +       sbinfo = get_super_private(ctx->super);
1382 +
1383 +       spin_lock_reiser4_super(sbinfo);
1384 +
1385 +       free_blocks = sbinfo->blocks_free;
1386 +
1387 +       if ((use_reserved && free_blocks < count) ||
1388 +           (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
1389 +               ret = RETERR(-ENOSPC);
1390 +               goto unlock_and_ret;
1391 +       }
1392 +
1393 +       add_to_ctx_grabbed(ctx, count);
1394 +
1395 +       sbinfo->blocks_grabbed += count;
1396 +       sbinfo->blocks_free -= count;
1397 +
1398 +#if REISER4_DEBUG
1399 +       if (ctx->grabbed_initially == 0)
1400 +               ctx->grabbed_initially = count;
1401 +#endif
1402 +
1403 +       assert("nikita-2986", check_block_counters(ctx->super));
1404 +
1405 +       /* disable grab space in current context */
1406 +       ctx->grab_enabled = 0;
1407 +
1408 +      unlock_and_ret:
1409 +       spin_unlock_reiser4_super(sbinfo);
1410 +
1411 +       return ret;
1412 +}
1413 +
1414 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
1415 +{
1416 +       int ret;
1417 +       reiser4_context *ctx;
1418 +
1419 +       assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
1420 +                                  lock_stack_isclean(get_current_lock_stack
1421 +                                                     ())));
1422 +       ctx = get_current_context();
1423 +       if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
1424 +               return 0;
1425 +       }
1426 +
1427 +       ret = reiser4_grab(ctx, count, flags);
1428 +       if (ret == -ENOSPC) {
1429 +
1430 +               /* Try to commit all transactions if BA_CAN_COMMIT flag is present */
1431 +               if (flags & BA_CAN_COMMIT) {
1432 +                       txnmgr_force_commit_all(ctx->super, 0);
1433 +                       ctx->grab_enabled = 1;
1434 +                       ret = reiser4_grab(ctx, count, flags);
1435 +               }
1436 +       }
1437 +       /*
1438 +        * allocation from reserved pool cannot fail. This is severe error.
1439 +        */
1440 +       assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
1441 +       return ret;
1442 +}
1443 +
1444 +/*
1445 + * SPACE RESERVED FOR UNLINK/TRUNCATE
1446 + *
1447 + * Unlink and truncate require space in transaction (to update stat data, at
1448 + * least). But we don't want rm(1) to fail with "No space left on device".
1449 + *
1450 + * Solution is to reserve 5% of disk space for truncates and
1451 + * unlinks. Specifically, normal space grabbing requests don't grab space from
1452 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
1453 + * drain it. Per super block delete_sema semaphore is used to allow only one
1454 + * thread at a time to grab from reserved area.
1455 + *
1456 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
1457 + * flag.
1458 + *
1459 + */
1460 +
1461 +int reiser4_grab_reserved(struct super_block *super,
1462 +                         __u64 count, reiser4_ba_flags_t flags)
1463 +{
1464 +       reiser4_super_info_data *sbinfo = get_super_private(super);
1465 +
1466 +       assert("nikita-3175", flags & BA_CAN_COMMIT);
1467 +
1468 +       /* Check whether the delete semaphore is already taken by us; we
1469 +        * assume that reading a machine word is atomic. */
1470 +       if (sbinfo->delete_sema_owner == current) {
1471 +               if (reiser4_grab_space
1472 +                   (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
1473 +                       warning("zam-1003",
1474 +                               "nested call of grab_reserved fails count=(%llu)",
1475 +                               (unsigned long long)count);
1476 +                       reiser4_release_reserved(super);
1477 +                       return RETERR(-ENOSPC);
1478 +               }
1479 +               return 0;
1480 +       }
1481 +
1482 +       if (reiser4_grab_space(count, flags)) {
1483 +               down(&sbinfo->delete_sema);
1484 +               assert("nikita-2929", sbinfo->delete_sema_owner == NULL);
1485 +               sbinfo->delete_sema_owner = current;
1486 +
1487 +               if (reiser4_grab_space(count, flags | BA_RESERVED)) {
1488 +                       warning("zam-833",
1489 +                               "reserved space is not enough (%llu)",
1490 +                               (unsigned long long)count);
1491 +                       reiser4_release_reserved(super);
1492 +                       return RETERR(-ENOSPC);
1493 +               }
1494 +       }
1495 +       return 0;
1496 +}
1497 +
1498 +void reiser4_release_reserved(struct super_block *super)
1499 +{
1500 +       reiser4_super_info_data *info;
1501 +
1502 +       info = get_super_private(super);
1503 +       if (info->delete_sema_owner == current) {
1504 +               info->delete_sema_owner = NULL;
1505 +               up(&info->delete_sema);
1506 +       }
1507 +}
1508 +
1509 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
1510 +{
1511 +       reiser4_context *ctx;
1512 +       reiser4_super_info_data *sbinfo;
1513 +
1514 +       ctx = get_current_context();
1515 +       sub_from_ctx_grabbed(ctx, count);
1516 +
1517 +       sbinfo = get_super_private(ctx->super);
1518 +       spin_lock_reiser4_super(sbinfo);
1519 +
1520 +       sub_from_sb_grabbed(sbinfo, count);
1521 +       /* return sbinfo locked */
1522 +       return sbinfo;
1523 +}
1524 +
1525 +/* is called after one fake block number is allocated and a pointer to
1526 +   that block is inserted into the tree. */
1527 +static void grabbed2fake_allocated_formatted(void)
1528 +{
1529 +       reiser4_super_info_data *sbinfo;
1530 +
1531 +       sbinfo = grabbed2fake_allocated_head(1);
1532 +       sbinfo->blocks_fake_allocated++;
1533 +
1534 +       assert("vs-922", check_block_counters(reiser4_get_current_sb()));
1535 +
1536 +       spin_unlock_reiser4_super(sbinfo);
1537 +}
1538 +
1539 +/**
1540 + * grabbed2fake_allocated_unformatted
1541 + * @count: number of grabbed blocks to account as fake allocated unformatted
1542 + *
1543 + */
1544 +static void grabbed2fake_allocated_unformatted(int count)
1545 +{
1546 +       reiser4_super_info_data *sbinfo;
1547 +
1548 +       sbinfo = grabbed2fake_allocated_head(count);
1549 +       sbinfo->blocks_fake_allocated_unformatted += count;
1550 +
1551 +       assert("vs-9221", check_block_counters(reiser4_get_current_sb()));
1552 +
1553 +       spin_unlock_reiser4_super(sbinfo);
1554 +}
1555 +
1556 +void grabbed2cluster_reserved(int count)
1557 +{
1558 +       reiser4_context *ctx;
1559 +       reiser4_super_info_data *sbinfo;
1560 +
1561 +       ctx = get_current_context();
1562 +       sub_from_ctx_grabbed(ctx, count);
1563 +
1564 +       sbinfo = get_super_private(ctx->super);
1565 +       spin_lock_reiser4_super(sbinfo);
1566 +
1567 +       sub_from_sb_grabbed(sbinfo, count);
1568 +       sbinfo->blocks_clustered += count;
1569 +
1570 +       assert("edward-504", check_block_counters(ctx->super));
1571 +
1572 +       spin_unlock_reiser4_super(sbinfo);
1573 +}
1574 +
1575 +void cluster_reserved2grabbed(int count)
1576 +{
1577 +       reiser4_context *ctx;
1578 +       reiser4_super_info_data *sbinfo;
1579 +
1580 +       ctx = get_current_context();
1581 +
1582 +       sbinfo = get_super_private(ctx->super);
1583 +       spin_lock_reiser4_super(sbinfo);
1584 +
1585 +       sub_from_cluster_reserved(sbinfo, count);
1586 +       sbinfo->blocks_grabbed += count;
1587 +
1588 +       assert("edward-505", check_block_counters(ctx->super));
1589 +
1590 +       spin_unlock_reiser4_super(sbinfo);
1591 +       add_to_ctx_grabbed(ctx, count);
1592 +}
1593 +
1594 +void cluster_reserved2free(int count)
1595 +{
1596 +       reiser4_context *ctx;
1597 +       reiser4_super_info_data *sbinfo;
1598 +
1599 +       assert("edward-503", get_current_context()->grabbed_blocks == 0);
1600 +
1601 +       ctx = get_current_context();
1602 +       sbinfo = get_super_private(ctx->super);
1603 +       spin_lock_reiser4_super(sbinfo);
1604 +
1605 +       sub_from_cluster_reserved(sbinfo, count);
1606 +       sbinfo->blocks_free += count;
1607 +
1608 +       assert("edward-502", check_block_counters(ctx->super));
1609 +
1610 +       spin_unlock_reiser4_super(sbinfo);
1611 +}
1612 +
1613 +static DEFINE_SPINLOCK(fake_lock);
1614 +static reiser4_block_nr fake_gen = 0;
1615 +
1616 +/**
1617 + * assign_fake_blocknr
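1618 + * @blocknr: where the first of the obtained fake block numbers is stored
1619 + * @count: number of consecutive fake block numbers to obtain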
1620 + *
1621 + * Obtain a fake block number for new node which will be used to refer to
1622 + * this newly allocated node until real allocation is done.
1623 + */
1624 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1625 +{
1626 +       spin_lock(&fake_lock);
1627 +       *blocknr = fake_gen;
1628 +       fake_gen += count;
1629 +       spin_unlock(&fake_lock);
1630 +
1631 +       BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1632 +       /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1633 +       *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1634 +       assert("zam-394", zlook(current_tree, blocknr) == NULL);
1635 +}
1636 +
1637 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1638 +{
1639 +       assign_fake_blocknr(blocknr, 1);
1640 +       grabbed2fake_allocated_formatted();
1641 +       return 0;
1642 +}
1643 +
1644 +/**
1645 + * fake_blocknr_unformatted
1646 + * @count: number of fake numbers to get
1647 + *
1648 + * Allocates @count fake block numbers which will be assigned to jnodes
1649 + */
1650 +reiser4_block_nr fake_blocknr_unformatted(int count)
1651 +{
1652 +       reiser4_block_nr blocknr;
1653 +
1654 +       assign_fake_blocknr(&blocknr, count);
1655 +       grabbed2fake_allocated_unformatted(count);
1656 +
1657 +       return blocknr;
1658 +}
1659 +
1660 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1661 +   follows grabbing of free disk space. */
1662 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1663 +                        __u64 count)
1664 +{
1665 +       sub_from_ctx_grabbed(ctx, count);
1666 +
1667 +       spin_lock_reiser4_super(sbinfo);
1668 +
1669 +       sub_from_sb_grabbed(sbinfo, count);
1670 +       sbinfo->blocks_used += count;
1671 +
1672 +       assert("nikita-2679", check_block_counters(ctx->super));
1673 +
1674 +       spin_unlock_reiser4_super(sbinfo);
1675 +}
1676 +
1677 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1678 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1679 +                               reiser4_ba_flags_t flags)
1680 +{
1681 +       spin_lock_reiser4_super(sbinfo);
1682 +
1683 +       sub_from_sb_fake_allocated(sbinfo, count, flags);
1684 +       sbinfo->blocks_used += count;
1685 +
1686 +       assert("nikita-2680", check_block_counters(reiser4_get_current_sb()));
1687 +
1688 +       spin_unlock_reiser4_super(sbinfo);
1689 +}
1690 +
1691 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1692 +{
1693 +       reiser4_super_info_data *sbinfo;
1694 +
1695 +       assert("zam-787", atom != NULL);
1696 +       assert_spin_locked(&(atom->alock));
1697 +
1698 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1699 +
1700 +       sbinfo = get_current_super_private();
1701 +       spin_lock_reiser4_super(sbinfo);
1702 +
1703 +       sub_from_sb_flush_reserved(sbinfo, count);
1704 +       sbinfo->blocks_used += count;
1705 +
1706 +       assert("zam-789", check_block_counters(reiser4_get_current_sb()));
1707 +
1708 +       spin_unlock_reiser4_super(sbinfo);
1709 +}
1710 +
1711 +/* update the per fs  blocknr hint default value. */
1712 +void
1713 +update_blocknr_hint_default(const struct super_block *s,
1714 +                           const reiser4_block_nr * block)
1715 +{
1716 +       reiser4_super_info_data *sbinfo = get_super_private(s);
1717 +
1718 +       assert("nikita-3342", !blocknr_is_fake(block));
1719 +
1720 +       spin_lock_reiser4_super(sbinfo);
1721 +       if (*block < sbinfo->block_count) {
1722 +               sbinfo->blocknr_hint_default = *block;
1723 +       } else {
1724 +               warning("zam-676",
1725 +                       "block number %llu is too large to be used in a blocknr hint\n",
1726 +                       (unsigned long long)*block);
1727 +               dump_stack();
1728 +               DEBUGON(1);
1729 +       }
1730 +       spin_unlock_reiser4_super(sbinfo);
1731 +}
1732 +
1733 +/* get current value of the default blocknr hint. */
1734 +void get_blocknr_hint_default(reiser4_block_nr * result)
1735 +{
1736 +       reiser4_super_info_data *sbinfo = get_current_super_private();
1737 +
1738 +       spin_lock_reiser4_super(sbinfo);
1739 +       *result = sbinfo->blocknr_hint_default;
1740 +       assert("zam-677", *result < sbinfo->block_count);
1741 +       spin_unlock_reiser4_super(sbinfo);
1742 +}
1743 +
1744 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1745 + * method. Blocks are allocated in one contiguous disk region. The plugin
1746 + * independent part accounts blocks by subtracting allocated amount from grabbed
1747 + * or fake block counter and add the same amount to the counter of allocated
1748 + * blocks.
1749 + *
1750 + * @hint -- a reiser4 blocknr hint object which contains further block
1751 + *          allocation hints and parameters (search start, a stage of block
1752 + *          which will be mapped to disk, etc.),
1753 + * @blk  -- an out parameter for the beginning of the allocated region,
1754 + * @len  -- in/out parameter, it should contain the maximum number of allocated
1755 + *          blocks, after block allocation completes, it contains the length of
1756 + *          allocated disk region.
1757 + * @flags -- see reiser4_ba_flags_t description.
1758 + *
1759 + * @return -- 0 if success, error code otherwise.
1760 + */
1761 +int
1762 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1763 +                    reiser4_block_nr * len, reiser4_ba_flags_t flags)
1764 +{
1765 +       __u64 needed = *len;
1766 +       reiser4_context *ctx;
1767 +       reiser4_super_info_data *sbinfo;
1768 +       int ret;
1769 +
1770 +       assert("zam-986", hint != NULL);
1771 +
1772 +       ctx = get_current_context();
1773 +       sbinfo = get_super_private(ctx->super);
1774 +
1775 +       /* For write-optimized data we use default search start value, which is
1776 +        * close to last write location. */
1777 +       if (flags & BA_USE_DEFAULT_SEARCH_START) {
1778 +               get_blocknr_hint_default(&hint->blk);
1779 +       }
1780 +
1781 +       /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1782 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1783 +       if (hint->block_stage == BLOCK_NOT_COUNTED) {
1784 +               ret = reiser4_grab_space_force(*len, flags);
1785 +               if (ret != 0)
1786 +                       return ret;
1787 +       }
1788 +
1789 +       ret =
1790 +           sa_alloc_blocks(get_space_allocator(ctx->super), hint, (int)needed,
1791 +                           blk, len);
1792 +
1793 +       if (!ret) {
1794 +               assert("zam-680", *blk < reiser4_block_count(ctx->super));
1795 +               assert("zam-681",
1796 +                      *blk + *len <= reiser4_block_count(ctx->super));
1797 +
1798 +               if (flags & BA_PERMANENT) {
1799 +                       /* we assume that current atom exists at this moment */
1800 +                       txn_atom *atom = get_current_atom_locked();
1801 +                       atom->nr_blocks_allocated += *len;
1802 +                       spin_unlock_atom(atom);
1803 +               }
1804 +
1805 +               switch (hint->block_stage) {
1806 +               case BLOCK_NOT_COUNTED:
1807 +               case BLOCK_GRABBED:
1808 +                       grabbed2used(ctx, sbinfo, *len);
1809 +                       break;
1810 +               case BLOCK_UNALLOCATED:
1811 +                       fake_allocated2used(sbinfo, *len, flags);
1812 +                       break;
1813 +               case BLOCK_FLUSH_RESERVED:
1814 +                       {
1815 +                               txn_atom *atom = get_current_atom_locked();
1816 +                               flush_reserved2used(atom, *len);
1817 +                               spin_unlock_atom(atom);
1818 +                       }
1819 +                       break;
1820 +               default:
1821 +                       impossible("zam-531", "wrong block stage");
1822 +               }
1823 +       } else {
1824 +               assert("zam-821",
1825 +                      ergo(hint->max_dist == 0
1826 +                           && !hint->backward, ret != -ENOSPC));
1827 +               if (hint->block_stage == BLOCK_NOT_COUNTED)
1828 +                       grabbed2free(ctx, sbinfo, needed);
1829 +       }
1830 +
1831 +       return ret;
1832 +}
1833 +
1834 +/* used -> fake_allocated -> grabbed -> free */
1835 +
1836 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1837 +   disk */
1838 +static void
1839 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1840 +                   int formatted)
1841 +{
1842 +       spin_lock_reiser4_super(sbinfo);
1843 +
1844 +       if (formatted)
1845 +               sbinfo->blocks_fake_allocated += count;
1846 +       else
1847 +               sbinfo->blocks_fake_allocated_unformatted += count;
1848 +
1849 +       sub_from_sb_used(sbinfo, count);
1850 +
1851 +       assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1852 +
1853 +       spin_unlock_reiser4_super(sbinfo);
1854 +}
1855 +
1856 +static void
1857 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1858 +                   __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1859 +{
1860 +       assert("nikita-2791", atom != NULL);
1861 +       assert_spin_locked(&(atom->alock));
1862 +
1863 +       add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1864 +
1865 +       spin_lock_reiser4_super(sbinfo);
1866 +
1867 +       sbinfo->blocks_flush_reserved += count;
1868 +       /*add_to_sb_flush_reserved(sbinfo, count); */
1869 +       sub_from_sb_used(sbinfo, count);
1870 +
1871 +       assert("nikita-2681", check_block_counters(reiser4_get_current_sb()));
1872 +
1873 +       spin_unlock_reiser4_super(sbinfo);
1874 +}
1875 +
1876 +/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1877 +static void
1878 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1879 +                      __u64 count, reiser4_ba_flags_t flags)
1880 +{
1881 +       add_to_ctx_grabbed(ctx, count);
1882 +
1883 +       spin_lock_reiser4_super(sbinfo);
1884 +
1885 +       assert("nikita-2682", check_block_counters(ctx->super));
1886 +
1887 +       sbinfo->blocks_grabbed += count;
1888 +       sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1889 +
1890 +       assert("nikita-2683", check_block_counters(ctx->super));
1891 +
1892 +       spin_unlock_reiser4_super(sbinfo);
1893 +}
1894 +
1895 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1896 +{
1897 +       reiser4_context *ctx;
1898 +       reiser4_super_info_data *sbinfo;
1899 +
1900 +       ctx = get_current_context();
1901 +       sbinfo = get_super_private(ctx->super);
1902 +
1903 +       fake_allocated2grabbed(ctx, sbinfo, count, flags);
1904 +       grabbed2free(ctx, sbinfo, count);
1905 +}
1906 +
1907 +void grabbed2free_mark(__u64 mark)
1908 +{
1909 +       reiser4_context *ctx;
1910 +       reiser4_super_info_data *sbinfo;
1911 +
1912 +       ctx = get_current_context();
1913 +       sbinfo = get_super_private(ctx->super);
1914 +
1915 +       assert("nikita-3007", (__s64) mark >= 0);
1916 +       assert("nikita-3006", ctx->grabbed_blocks >= mark);
1917 +       grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1918 +}
1919 +
1920 +/**
1921 + * grabbed2free - adjust grabbed and free block counters
1922 + * @ctx: context to update grabbed block counter of
1923 + * @sbinfo: super block to update grabbed and free block counters of
1924 + * @count: number of blocks to adjust counters by
1925 + *
1926 + * Decreases context's and per filesystem's counters of grabbed
1927 + * blocks. Increases per filesystem's counter of free blocks.
1928 + */
1929 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1930 +                 __u64 count)
1931 +{
1932 +       sub_from_ctx_grabbed(ctx, count);
1933 +
1934 +       spin_lock_reiser4_super(sbinfo);
1935 +
1936 +       sub_from_sb_grabbed(sbinfo, count);
1937 +       sbinfo->blocks_free += count;
1938 +       assert("nikita-2684", check_block_counters(ctx->super));
1939 +
1940 +       spin_unlock_reiser4_super(sbinfo);
1941 +}
1942 +
1943 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1944 +{
1945 +       reiser4_context *ctx;
1946 +       reiser4_super_info_data *sbinfo;
1947 +
1948 +       assert("vs-1095", atom);
1949 +
1950 +       ctx = get_current_context();
1951 +       sbinfo = get_super_private(ctx->super);
1952 +
1953 +       sub_from_ctx_grabbed(ctx, count);
1954 +
1955 +       add_to_atom_flush_reserved_nolock(atom, count);
1956 +
1957 +       spin_lock_reiser4_super(sbinfo);
1958 +
1959 +       sbinfo->blocks_flush_reserved += count;
1960 +       sub_from_sb_grabbed(sbinfo, count);
1961 +
1962 +       assert("vpf-292", check_block_counters(ctx->super));
1963 +
1964 +       spin_unlock_reiser4_super(sbinfo);
1965 +}
1966 +
1967 +void grabbed2flush_reserved(__u64 count)
1968 +{
1969 +       txn_atom *atom = get_current_atom_locked();
1970 +
1971 +       grabbed2flush_reserved_nolock(atom, count);
1972 +
1973 +       spin_unlock_atom(atom);
1974 +}
1975 +
1976 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1977 +{
1978 +       reiser4_context *ctx;
1979 +       reiser4_super_info_data *sbinfo;
1980 +
1981 +       assert("nikita-2788", atom != NULL);
1982 +       assert_spin_locked(&(atom->alock));
1983 +
1984 +       ctx = get_current_context();
1985 +       sbinfo = get_super_private(ctx->super);
1986 +
1987 +       add_to_ctx_grabbed(ctx, count);
1988 +
1989 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1990 +
1991 +       spin_lock_reiser4_super(sbinfo);
1992 +
1993 +       sbinfo->blocks_grabbed += count;
1994 +       sub_from_sb_flush_reserved(sbinfo, count);
1995 +
1996 +       assert("vpf-292", check_block_counters(ctx->super));
1997 +
1998 +       spin_unlock_reiser4_super(sbinfo);
1999 +}
2000 +
2001 +/**
2002 + * all_grabbed2free - releases all blocks grabbed in context
2003 + *
2004 + * Decreases context's and super block's grabbed block counters by number of
2005 + * blocks grabbed by current context and increases super block's free block
2006 + * counter correspondingly.
2007 + */
2008 +void all_grabbed2free(void)
2009 +{
2010 +       reiser4_context *ctx = get_current_context();
2011 +
2012 +       grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
2013 +}
2014 +
2015 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
2016 +   after freeing, @count blocks become "grabbed". */
2017 +static void
2018 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
2019 +            __u64 count)
2020 +{
2021 +       add_to_ctx_grabbed(ctx, count);
2022 +
2023 +       spin_lock_reiser4_super(sbinfo);
2024 +
2025 +       sbinfo->blocks_grabbed += count;
2026 +       sub_from_sb_used(sbinfo, count);
2027 +
2028 +       assert("nikita-2685", check_block_counters(ctx->super));
2029 +
2030 +       spin_unlock_reiser4_super(sbinfo);
2031 +}
2032 +
2033 +/* this used to be done through used2grabbed and grabbed2free*/
2034 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
2035 +{
2036 +       spin_lock_reiser4_super(sbinfo);
2037 +
2038 +       sbinfo->blocks_free += count;
2039 +       sub_from_sb_used(sbinfo, count);
2040 +
2041 +       assert("nikita-2685", check_block_counters(reiser4_get_current_sb()));
2042 +
2043 +       spin_unlock_reiser4_super(sbinfo);
2044 +}
2045 +
2046 +#if REISER4_DEBUG
2047 +
2048 +/* check "allocated" state of given block range */
2049 +static void
2050 +reiser4_check_blocks(const reiser4_block_nr * start,
2051 +                    const reiser4_block_nr * len, int desired)
2052 +{
2053 +       sa_check_blocks(start, len, desired);
2054 +}
2055 +
2056 +/* check "allocated" state of given block */
2057 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
2058 +{
2059 +       const reiser4_block_nr one = 1;
2060 +
2061 +       reiser4_check_blocks(block, &one, desired);
2062 +}
2063 +
2064 +#endif
2065 +
2066 +/* Blocks deallocation function may do an actual deallocation through space
2067 +   plugin allocation or store deleted block numbers in atom's delete_set data
2068 +   structure, depending on the BA_DEFER bit in @flags. */
2069 +
2070 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
2071 +   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
2072 +   freed but disk space is still grabbed by current thread, or these blocks must
2073 +   not be counted in any reiser4 sb block counters, see block_stage_t comment */
2074 +
2075 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
2076 +   distinguish blocks allocated for unformatted and formatted nodes */
2077 +
2078 +int
2079 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
2080 +                      const reiser4_block_nr * len,
2081 +                      block_stage_t target_stage, reiser4_ba_flags_t flags)
2082 +{
2083 +       txn_atom *atom = NULL;
2084 +       int ret;
2085 +       reiser4_context *ctx;
2086 +       reiser4_super_info_data *sbinfo;
2087 +
2088 +       ctx = get_current_context();
2089 +       sbinfo = get_super_private(ctx->super);
2090 +
2091 +       if (REISER4_DEBUG) {
2092 +               assert("zam-431", *len != 0);
2093 +               assert("zam-432", *start != 0);
2094 +               assert("zam-558", !blocknr_is_fake(start));
2095 +
2096 +               spin_lock_reiser4_super(sbinfo);
2097 +               assert("zam-562", *start < sbinfo->block_count);
2098 +               spin_unlock_reiser4_super(sbinfo);
2099 +       }
2100 +
2101 +       if (flags & BA_DEFER) {
2102 +               blocknr_set_entry *bsep = NULL;
2103 +
2104 +               /* storing deleted block numbers in a blocknr set
2105 +                  datastructure for further actual deletion */
2106 +               do {
2107 +                       atom = get_current_atom_locked();
2108 +                       assert("zam-430", atom != NULL);
2109 +
2110 +                       ret =
2111 +                           blocknr_set_add_extent(atom, &atom->delete_set,
2112 +                                                  &bsep, start, len);
2113 +
2114 +                       if (ret == -ENOMEM)
2115 +                               return ret;
2116 +
2117 +                       /* This loop might spin at most two times */
2118 +               } while (ret == -E_REPEAT);
2119 +
2120 +               assert("zam-477", ret == 0);
2121 +               assert("zam-433", atom != NULL);
2122 +
2123 +               spin_unlock_atom(atom);
2124 +
2125 +       } else {
2126 +               assert("zam-425", get_current_super_private() != NULL);
2127 +               sa_dealloc_blocks(get_space_allocator(ctx->super), *start,
2128 +                                 *len);
2129 +
2130 +               if (flags & BA_PERMANENT) {
2131 +                       /* These blocks were counted as allocated, we have to revert it
2132 +                        * back if allocation is discarded. */
2133 +                       txn_atom *atom = get_current_atom_locked();
2134 +                       atom->nr_blocks_allocated -= *len;
2135 +                       spin_unlock_atom(atom);
2136 +               }
2137 +
2138 +               switch (target_stage) {
2139 +               case BLOCK_NOT_COUNTED:
2140 +                       assert("vs-960", flags & BA_FORMATTED);
2141 +                       /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
2142 +                       used2free(sbinfo, *len);
2143 +                       break;
2144 +
2145 +               case BLOCK_GRABBED:
2146 +                       used2grabbed(ctx, sbinfo, *len);
2147 +                       break;
2148 +
2149 +               case BLOCK_UNALLOCATED:
2150 +                       used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
2151 +                       break;
2152 +
2153 +               case BLOCK_FLUSH_RESERVED:{
2154 +                               txn_atom *atom;
2155 +
2156 +                               atom = get_current_atom_locked();
2157 +                               used2flush_reserved(sbinfo, atom, *len,
2158 +                                                   flags & BA_FORMATTED);
2159 +                               spin_unlock_atom(atom);
2160 +                               break;
2161 +                       }
2162 +               default:
2163 +                       impossible("zam-532", "wrong block stage");
2164 +               }
2165 +       }
2166 +
2167 +       return 0;
2168 +}
2169 +
2170 +/* wrappers for block allocator plugin methods */
2171 +int pre_commit_hook(void)
2172 +{
2173 +       assert("zam-502", get_current_super_private() != NULL);
2174 +       sa_pre_commit_hook();
2175 +       return 0;
2176 +}
2177 +
2178 +/* an actor which applies delete set to block allocator data */
2179 +static int
2180 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
2181 +          const reiser4_block_nr * b, void *data UNUSED_ARG)
2182 +{
2183 +       reiser4_context *ctx;
2184 +       reiser4_super_info_data *sbinfo;
2185 +
2186 +       __u64 len = 1;
2187 +
2188 +       ctx = get_current_context();
2189 +       sbinfo = get_super_private(ctx->super);
2190 +
2191 +       assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
2192 +       assert("zam-552", sbinfo != NULL);
2193 +
2194 +       if (b != NULL)
2195 +               len = *b;
2196 +
2197 +       if (REISER4_DEBUG) {
2198 +               spin_lock_reiser4_super(sbinfo);
2199 +
2200 +               assert("zam-554", *a < reiser4_block_count(ctx->super));
2201 +               assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
2202 +
2203 +               spin_unlock_reiser4_super(sbinfo);
2204 +       }
2205 +
2206 +       sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
2207 +       /* adjust sb block counters */
2208 +       used2free(sbinfo, len);
2209 +       return 0;
2210 +}
2211 +
2212 +void post_commit_hook(void)
2213 +{
2214 +       txn_atom *atom;
2215 +
2216 +       atom = get_current_atom_locked();
2217 +       assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2218 +       spin_unlock_atom(atom);
2219 +
2220 +       /* do the block deallocation which was deferred
2221 +          until commit is done */
2222 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2223 +
2224 +       assert("zam-504", get_current_super_private() != NULL);
2225 +       sa_post_commit_hook();
2226 +}
2227 +
2228 +void post_write_back_hook(void)
2229 +{
2230 +       assert("zam-504", get_current_super_private() != NULL);
2231 +
2232 +       sa_post_commit_hook();
2233 +}
2234 +
2235 +/*
2236 +   Local variables:
2237 +   c-indentation-style: "K&R"
2238 +   mode-name: "LC"
2239 +   c-basic-offset: 8
2240 +   tab-width: 8
2241 +   fill-column: 120
2242 +   scroll-step: 1
2243 +   End:
2244 +*/
2245 diff --git a/fs/reiser4/block_alloc.h b/fs/reiser4/block_alloc.h
2246 new file mode 100644
2247 index 0000000..b1cb54a
2248 --- /dev/null
2249 +++ b/fs/reiser4/block_alloc.h
2250 @@ -0,0 +1,175 @@
2251 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2252 +
2253 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2254 +#define __FS_REISER4_BLOCK_ALLOC_H__
2255 +
2256 +#include "dformat.h"
2257 +#include "forward.h"
2258 +
2259 +#include <linux/types.h>       /* for __u??  */
2260 +#include <linux/fs.h>
2261 +
2262 +/* Mask which, when applied to a block number, shows whether that block number is a fake one */
2263 +#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
2264 +/* Mask which isolates the type of object this fake block number was assigned to */
2265 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2266 +
2267 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2268 +   against these two values to tell whether the object is unallocated or a bitmap
2269 +   shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
2270 +#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
2271 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
2272 +
2273 +/* stage of a block: how its allocation is accounted for in the sb block counters */
2274 +typedef enum {
2275 +       BLOCK_NOT_COUNTED = 0,  /* reiser4 has no info about this block yet */
2276 +       BLOCK_GRABBED = 1,      /* free space grabbed for further allocation
2277 +                                  of this block */
2278 +       BLOCK_FLUSH_RESERVED = 2,       /* block is reserved for flush needs. */
2279 +       BLOCK_UNALLOCATED = 3,  /* block is used by an existing in-memory object
2280 +                                  (unallocated formatted or unformatted
2281 +                                  node) */
2282 +       BLOCK_ALLOCATED = 4     /* block is mapped to disk, real on-disk block
2283 +                                  number assigned */
2284 +} block_stage_t;
2285 +
2286 +/* a hint for the block allocator */
2287 +struct reiser4_blocknr_hint {
2288 +       /* FIXME: I think we want to add a longterm lock on the bitmap block here.  This
2289 +          is to prevent jnode_flush() calls from interleaving allocations on the same
2290 +          bitmap, once a hint is established. */
2291 +
2292 +       /* search start hint */
2293 +       reiser4_block_nr blk;
2294 +       /* if not zero, it is the size of the region we search for free blocks in */
2295 +       reiser4_block_nr max_dist;
2296 +       /* level for allocation; it may be useful to have branch-level and higher
2297 +          write-optimized. */
2298 +       tree_level level;
2299 +       /* block allocator assumes that blocks, which will be mapped to disk,
2300 +          are in this specified block_stage */
2301 +       block_stage_t block_stage;
2302 +       /* If backward = 1, allocate blocks in the backward direction, from the end
2303 +        * of disk to the beginning of disk.  */
2304 +       unsigned int backward:1;
2305 +
2306 +};
2307 +
2308 +/* These flags control block allocation/deallocation behavior */
2309 +enum reiser4_ba_flags {
2310 +       /* do allocations from reserved (5%) area */
2311 +       BA_RESERVED = (1 << 0),
2312 +
2313 +       /* block allocator can do commit trying to recover free space */
2314 +       BA_CAN_COMMIT = (1 << 1),
2315 +
2316 +       /* if operation will be applied to formatted block */
2317 +       BA_FORMATTED = (1 << 2),
2318 +
2319 +       /* defer actual block freeing until transaction commit */
2320 +       BA_DEFER = (1 << 3),
2321 +
2322 +       /* allocate blocks for permanent fs objects (formatted or unformatted), not
2323 +          wandered or log blocks */
2324 +       BA_PERMANENT = (1 << 4),
2325 +
2326 +       /* grab space even if it was disabled */
2327 +       BA_FORCE = (1 << 5),
2328 +
2329 +       /* use default start value for free blocks search. */
2330 +       BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2331 +};
2332 +
2333 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2334 +
2335 +extern void blocknr_hint_init(reiser4_blocknr_hint * hint);
2336 +extern void blocknr_hint_done(reiser4_blocknr_hint * hint);
2337 +extern void update_blocknr_hint_default(const struct super_block *,
2338 +                                       const reiser4_block_nr *);
2339 +extern void get_blocknr_hint_default(reiser4_block_nr *);
2340 +
2341 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
2342 +
2343 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
2344 +reiser4_block_nr fake_blocknr_unformatted(int);
2345 +
2346 +/* free -> grabbed -> fake_allocated -> used */
2347 +
2348 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
2349 +void all_grabbed2free(void);
2350 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
2351 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
2352 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2353 +void grabbed2flush_reserved(__u64 count);
2354 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
2355 +                        reiser4_block_nr * start,
2356 +                        reiser4_block_nr * len, reiser4_ba_flags_t flags);
2357 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
2358 +                          const reiser4_block_nr *,
2359 +                          block_stage_t, reiser4_ba_flags_t flags);
2360 +/* Allocate a single block: reiser4_alloc_blocks() with a length of one. */
2361 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2362 +                                     reiser4_block_nr * start, reiser4_ba_flags_t flags)
2363 +{
2364 +       reiser4_block_nr nr_blocks = 1;
2365 +
2366 +       return reiser4_alloc_blocks(hint, start, &nr_blocks, flags);
2367 +}
2368 +/* Deallocate a single block: reiser4_dealloc_blocks() with a length of one. */
2369 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2370 +                                       block_stage_t stage, reiser4_ba_flags_t flags)
2371 +{
2372 +       const reiser4_block_nr nr_blocks = 1;
2373 +
2374 +       return reiser4_dealloc_blocks(block, &nr_blocks, stage, flags);
2375 +}
2376 +
2377 +/* reiser4_grab_space() with BA_FORCE or-ed into @flags; arguments are parenthesized so any expression is safe */
2378 +#define reiser4_grab_space_force(count, flags) reiser4_grab_space((count), (flags) | BA_FORCE)
2379 +
2380 +extern void grabbed2free_mark(__u64 mark);
2381 +extern int reiser4_grab_reserved(struct super_block *,
2382 +                                __u64, reiser4_ba_flags_t);
2383 +extern void reiser4_release_reserved(struct super_block *super);
2384 +
2385 +/* grabbed -> fake_allocated */
2386 +
2387 +/* fake_allocated -> used */
2388 +
2389 +/* used -> fake_allocated -> grabbed -> free */
2390 +
2391 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2392 +
2393 +extern int blocknr_is_fake(const reiser4_block_nr * da);
2394 +
2395 +extern void grabbed2cluster_reserved(int count);
2396 +extern void cluster_reserved2grabbed(int count);
2397 +extern void cluster_reserved2free(int count);
2398 +
2399 +extern int check_block_counters(const struct super_block *);
2400 +
2401 +#if REISER4_DEBUG
2402 +
2403 +extern void reiser4_check_block(const reiser4_block_nr *, int);
2404 +
2405 +#else
2406 +
2407 +#  define reiser4_check_block(beg, val)        noop
2408 +
2409 +#endif
2410 +
2411 +extern int pre_commit_hook(void);
2412 +extern void post_commit_hook(void);
2413 +extern void post_write_back_hook(void);
2414 +
2415 +#endif                         /* __FS_REISER4_BLOCK_ALLOC_H__ */
2416 +
2417 +/* Make Linus happy.
2418 +   Local variables:
2419 +   c-indentation-style: "K&R"
2420 +   mode-name: "LC"
2421 +   c-basic-offset: 8
2422 +   tab-width: 8
2423 +   fill-column: 120
2424 +   End:
2425 +*/
2426 diff --git a/fs/reiser4/blocknrset.c b/fs/reiser4/blocknrset.c
2427 new file mode 100644
2428 index 0000000..8bd1402
2429 --- /dev/null
2430 +++ b/fs/reiser4/blocknrset.c
2431 @@ -0,0 +1,368 @@
2432 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2433 +
2434 +/* This file contains code for various block number sets used by the atom to
2435 +   track the deleted set and wandered block mappings. */
2436 +
2437 +#include "debug.h"
2438 +#include "dformat.h"
2439 +#include "txnmgr.h"
2440 +#include "context.h"
2441 +
2442 +#include <linux/slab.h>
2443 +
2444 +/* The data structure for storing unordered block number sets is a list of
2445 +   elements, each of which contains an array of block numbers and/or an
2446 +   array of block number pairs. Such an element, called blocknr_set_entry,
2447 +   stores single block numbers from the beginning and pairs (extents) from
2448 +   the end of its ->entries array. The ->nr_singles and ->nr_pairs fields
2449 +   count the numbers of single blocks and pairs.
2450 +
2451 +   +--------------- blocknr_set_entry->entries -----------+
2452 +   |block1|block2| ... <free space> ... |pair3|pair2|pair1|
2453 +   +------------------------------------------------------+
2454 +
2455 +   When current blocknr_set_entry is full, allocate a new one. */
2456 +
2457 +/* Usage examples: blocknr sets are used in reiser4 for storing an atom's delete
2458 + * set (single blocks and block extents); in that case a blocknr pair represents
2459 + * an extent. An atom's wandered map is also stored as a blocknr set; blocknr
2460 + * pairs there represent a (real block) -> (wandered block) mapping. */
2461 +
2462 +typedef struct blocknr_pair blocknr_pair;
2463 +
2464 +/* The total size of a blocknr_set_entry, in bytes (checked by the cassert in bse_avail()). */
2465 +#define BLOCKNR_SET_ENTRY_SIZE 128
2466 +
2467 +/* The number of block numbers that fit in the entries[] array of a blocknr_set_entry. */
2468 +#define BLOCKNR_SET_ENTRIES_NUMBER             \
2469 +       ((BLOCKNR_SET_ENTRY_SIZE -              \
2470 +         2 * sizeof (unsigned) -               \
2471 +         sizeof(struct list_head)) /           \
2472 +        sizeof(reiser4_block_nr))
2473 +
2474 +/* An entry of the blocknr_set */
2475 +struct blocknr_set_entry {
2476 +       unsigned nr_singles;    /* single block numbers, stored from the start of entries[] */
2477 +       unsigned nr_pairs;      /* pairs, stored from the end of entries[] */
2478 +       struct list_head link;  /* linkage into blocknr_set->entries */
2479 +       reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2480 +};
2481 +
2482 +/* A pair of blocks as recorded in the blocknr_set_entry entries[] array. */
2483 +struct blocknr_pair {
2484 +       reiser4_block_nr a;
2485 +       reiser4_block_nr b;
2486 +};
2487 +
2488 +/* Free blocknr slots left in @bse; a single uses one slot, a pair two. */
2489 +/* Audited by: green(2002.06.11) */
2490 +static unsigned bse_avail(blocknr_set_entry * bse)
2491 +{
2492 +       unsigned occupied = bse->nr_singles + 2 * bse->nr_pairs;
2493 +
2494 +       cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2495 +       assert("jmacd-5088", occupied <= BLOCKNR_SET_ENTRIES_NUMBER);
2496 +
2497 +       return BLOCKNR_SET_ENTRIES_NUMBER - occupied;
2498 +}
2499 +
2500 +/* Reset a blocknr_set_entry: empty list linkage, no singles, no pairs. */
2501 +static void bse_init(blocknr_set_entry *bse)
2502 +{
2503 +       INIT_LIST_HEAD(&bse->link);
2504 +       bse->nr_pairs = 0;
2505 +       bse->nr_singles = 0;
2506 +}
2507 +
2508 +/* Allocate a blocknr_set_entry and initialize it; NULL if kmalloc fails. */
2509 +/* Audited by: green(2002.06.11) */
2510 +static blocknr_set_entry *bse_alloc(void)
2511 +{
2512 +       blocknr_set_entry *e;
2513 +
2514 +       e = kmalloc(sizeof(*e), get_gfp_mask());
2515 +       if (e == NULL)
2516 +               return NULL;
2517 +
2518 +       bse_init(e);
2519 +
2520 +       return e;
2521 +}
2522 +
2523 +/* Free a blocknr_set_entry with kfree(); the entry is not unlinked here. */
2524 +/* Audited by: green(2002.06.11) */
2525 +static void bse_free(blocknr_set_entry * bse)
2526 +{
2527 +       kfree(bse);
2528 +}
2529 +
2530 +/* Append one block number to the singles area at the front of @bse. */
2531 +/* Audited by: green(2002.06.11) */
2532 +static void
2533 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2534 +{
2535 +       assert("jmacd-5099", bse_avail(bse) >= 1);
2536 +       bse->entries[bse->nr_singles] = *block;
2537 +       bse->nr_singles++;
2538 +}
2539 +
2540 +/* Return pair number @pno; pairs are laid out from the end of entries[]. */
2541 +/* Audited by: green(2002.06.11) */
2542 +static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2543 +{
2544 +       unsigned slots = 2 * (pno + 1);
2545 +
2546 +       assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= slots);
2547 +       return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER - slots);
2548 +}
2549 +
2550 +/* Store the pair (@a, @b) in the next free pair slot of @bse. */
2551 +/* Audited by: green(2002.06.11) */
2552 +static void
2553 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2554 +            const reiser4_block_nr * b)
2555 +{
2556 +       blocknr_pair *slot;
2557 +
2558 +       assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2559 +
2560 +       slot = bse_get_pair(bse, bse->nr_pairs);
2561 +       bse->nr_pairs++;
2562 +       slot->a = *a;
2563 +       slot->b = *b;
2564 +}
2565 +
2566 +/* Add either a block or pair of blocks to the block number set.  The first
2567 +   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
2568 +   @b is non-NULL a pair is added.  The block number set belongs to atom, and
2569 +   the call is made with the atom lock held.  If the head blocknr_set_entry
2570 +   lacks room and *new_bsep is non-NULL, that entry is put at the head of the
2571 +   set and *new_bsep is set to NULL.  If *new_bsep is NULL instead, the atom
2572 +   lock is released and a new bse is allocated into *new_bsep: -E_REPEAT is
2573 +   returned (atom unlocked) so the operation can be retried, or -ENOMEM
2574 +   (atom unlocked as well) if the allocation fails.  On success 0 is returned
2575 +   and the atom lock is not released.  If *new_bsep is non-NULL and not used
2576 +   during the call, it is freed and reset to NULL. */
2577 +static int blocknr_set_add(txn_atom *atom, blocknr_set *bset,
2578 +                          blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2579 +                          const reiser4_block_nr *b)
2580 +{
2581 +       blocknr_set_entry *bse;
2582 +       unsigned entries_needed;
2583 +
2584 +       assert("jmacd-5101", a != NULL);
2585 +
2586 +       entries_needed = (b == NULL) ? 1 : 2;
2587 +       if (list_empty(&bset->entries) ||
2588 +           bse_avail(list_entry(bset->entries.next, blocknr_set_entry, link)) < entries_needed) {
2589 +               /* No room in the head entry.  See if a bse was previously allocated. */
2590 +               if (*new_bsep == NULL) {
2591 +                       spin_unlock_atom(atom);
2592 +                       *new_bsep = bse_alloc();
2593 +                       return (*new_bsep != NULL) ? -E_REPEAT :
2594 +                               RETERR(-ENOMEM);
2595 +               }
2596 +
2597 +               /* Put it on the head of the list. */
2598 +               list_add(&((*new_bsep)->link), &bset->entries);
2599 +
2600 +               *new_bsep = NULL;
2601 +       }
2602 +
2603 +       /* Add the single or pair to the head entry. */
2604 +       bse = list_entry(bset->entries.next, blocknr_set_entry, link);
2605 +       if (b == NULL) {
2606 +               bse_put_single(bse, a);
2607 +       } else {
2608 +               bse_put_pair(bse, a, b);
2609 +       }
2610 +
2611 +       /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2612 +       if (*new_bsep != NULL) {
2613 +               bse_free(*new_bsep);
2614 +               *new_bsep = NULL;
2615 +       }
2616 +
2617 +       return 0;
2618 +}
2619 +
2620 +/* Add the extent (@start, @len) to the block set.  If *len is 1, it is stored
2621 +   as a single block (e.g., reiser4_set_add_block), otherwise as a pair. */
2622 +/* Audited by: green(2002.06.11) */
2623 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2624 +   kmalloc might schedule. The only exception is atom spinlock, which is
2625 +   properly released. */
2626 +int
2627 +blocknr_set_add_extent(txn_atom * atom,
2628 +                      blocknr_set * bset,
2629 +                      blocknr_set_entry ** new_bsep,
2630 +                      const reiser4_block_nr * start,
2631 +                      const reiser4_block_nr * len)
2632 +{
2633 +       assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2634 +       return blocknr_set_add(atom, bset, new_bsep, start,
2635 +                              *len == 1 ? NULL : len);
2636 +}
2637 +
2638 +/* Add the block pair (@a, @b) to the block set.  Exactly a pair is added: an
2639 + * assertion checks that both arguments are non-NULL. */
2640 +/* Audited by: green(2002.06.11) */
2641 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2642 +   kmalloc might schedule. The only exception is atom spinlock, which is
2643 +   properly released. */
2644 +int
2645 +blocknr_set_add_pair(txn_atom * atom,
2646 +                    blocknr_set * bset,
2647 +                    blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2648 +                    const reiser4_block_nr * b)
2649 +{
2650 +       assert("jmacd-5103", a != NULL && b != NULL);
2651 +       return blocknr_set_add(atom, bset, new_bsep, a, b);
2652 +}
2653 +
2654 +/* Initialize a blocknr_set to an empty list of entries. */
2655 +void blocknr_set_init(blocknr_set *bset)
2656 +{
2657 +       INIT_LIST_HEAD(&bset->entries);
2658 +}
2659 +
2660 +/* Unlink and free every entry of a blocknr_set, leaving the set empty. */
2661 +void blocknr_set_destroy(blocknr_set *bset)
2662 +{
2663 +       struct list_head *head = &bset->entries;
2664 +
2665 +       while (!list_empty_careful(head)) {
2666 +               blocknr_set_entry *victim = list_entry(head->next, blocknr_set_entry, link);
2667 +               list_del_init(&victim->link);
2668 +               bse_free(victim);
2669 +       }
2670 +}
2671 +
2672 +/* Merge blocknr_set entries out of @from into @into; @from is left empty. */
2673 +/* Audited by: green(2002.06.11) */
2674 +/* Auditor comments: This merge does not know if merged sets contain
2675 +   block pairs (as for wandered sets) or extents, so it cannot really merge
2676 +   overlapping ranges if there are some. So I believe it may lead to
2677 +   some blocks being present several times in one blocknr_set. To help
2678 +   debugging such problems it might help to check for duplicate entries on
2679 +   actual processing of this set. Testing this kind of stuff right here is
2680 +   also complicated by the fact that these sets are not sorted and going
2681 +   through whole set on each element addition is going to be CPU-heavy task */
2682 +void blocknr_set_merge(blocknr_set * from, blocknr_set * into)
2683 +{
2684 +       blocknr_set_entry *bse_into = NULL;
2685 +
2686 +       /* If @from is empty, no work to perform. */
2687 +       if (list_empty_careful(&from->entries)) {
2688 +               return;
2689 +       }
2690 +
2691 +       /* If @into is not empty, try merging partial-entries. */
2692 +       if (!list_empty_careful(&into->entries)) {
2693 +
2694 +               /* Neither set is empty, pop the front member of each and try to combine them. */
2695 +               blocknr_set_entry *bse_from;
2696 +               unsigned into_avail;
2697 +
2698 +               bse_into = list_entry(into->entries.next, blocknr_set_entry, link);
2699 +               list_del_init(&bse_into->link);
2700 +               bse_from = list_entry(from->entries.next, blocknr_set_entry, link);
2701 +               list_del_init(&bse_from->link);
2702 +
2703 +               /* Combine singles: move them from bse_from while bse_into has room. */
2704 +               for (into_avail = bse_avail(bse_into);
2705 +                    into_avail != 0 && bse_from->nr_singles != 0;
2706 +                    into_avail -= 1) {
2707 +                       bse_put_single(bse_into,
2708 +                                      &bse_from->entries[--bse_from->
2709 +                                                         nr_singles]);
2710 +               }
2711 +
2712 +               /* Combine pairs: each pair needs two free slots in bse_into. */
2713 +               for (; into_avail > 1 && bse_from->nr_pairs != 0;
2714 +                    into_avail -= 2) {
2715 +                       blocknr_pair *pair =
2716 +                           bse_get_pair(bse_from, --bse_from->nr_pairs);
2717 +                       bse_put_pair(bse_into, &pair->a, &pair->b);
2718 +               }
2719 +
2720 +               /* If bse_from is empty, delete it now. */
2721 +               if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2722 +                       bse_free(bse_from);
2723 +               } else {
2724 +                       /* Otherwise, bse_into is full or nearly full (e.g.,
2725 +                          it could have one slot avail and bse_from has one
2726 +                          pair left).  Push it back onto the list.  bse_from
2727 +                          becomes bse_into, which will be the new partial. */
2728 +                       list_add(&bse_into->link, &into->entries);
2729 +                       bse_into = bse_from;
2730 +               }
2731 +       }
2732 +
2733 +       /* Splice the remaining @from entries after the last entry of @into. */
2734 +       list_splice_init(&from->entries, into->entries.prev);
2735 +
2736 +       /* Add the partial entry back to the head of the list. */
2737 +       if (bse_into != NULL) {
2738 +               list_add(&bse_into->link, &into->entries);
2739 +       }
2740 +}
2741 +/* Iterate over all blocknr set elements: @actor(atom, a, b, data) is called
2742 +   for each single block (with b == NULL) and for each pair. */
2743 +int blocknr_set_iterator(txn_atom *atom, blocknr_set *bset,
2744 +                        blocknr_set_actor_f actor, void *data, int delete)
2745 +{
2746 +       /* with @delete set, actor errors are ignored and each entry is freed after use */
2747 +       blocknr_set_entry *entry;
2748 +
2749 +       assert("zam-429", atom != NULL);
2750 +       assert("zam-430", atom_is_protected(atom));
2751 +       assert("zam-431", bset != 0);
2752 +       assert("zam-432", actor != NULL);
2753 +
2754 +       entry = list_entry(bset->entries.next, blocknr_set_entry, link);
2755 +       while (&bset->entries != &entry->link) {
2756 +               blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2757 +               unsigned int i;
2758 +               int ret;
2759 +
2760 +               for (i = 0; i < entry->nr_singles; i++) {
2761 +                       ret = actor(atom, &entry->entries[i], NULL, data);
2762 +
2763 +                       /* We can't break the loop if the delete flag is set. */
2764 +                       if (ret != 0 && !delete)
2765 +                               return ret;
2766 +               }
2767 +
2768 +               for (i = 0; i < entry->nr_pairs; i++) {
2769 +                       struct blocknr_pair *ab;
2770 +
2771 +                       ab = bse_get_pair(entry, i);
2772 +
2773 +                       ret = actor(atom, &ab->a, &ab->b, data);
2774 +
2775 +                       if (ret != 0 && !delete)
2776 +                               return ret;
2777 +               }
2778 +
2779 +               if (delete) {
2780 +                       list_del(&entry->link);
2781 +                       bse_free(entry);
2782 +               }
2783 +
2784 +               entry = tmp;
2785 +       }
2786 +
2787 +       return 0;
2788 +}
2789 +
2790 +/*
2791 + * Local variables:
2792 + * c-indentation-style: "K&R"
2793 + * mode-name: "LC"
2794 + * c-basic-offset: 8
2795 + * tab-width: 8
2796 + * fill-column: 79
2797 + * scroll-step: 1
2798 + * End:
2799 + */
2800 diff --git a/fs/reiser4/carry.c b/fs/reiser4/carry.c
2801 new file mode 100644
2802 index 0000000..9ba15c4
2803 --- /dev/null
2804 +++ b/fs/reiser4/carry.c
2805 @@ -0,0 +1,1381 @@
2806 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2807 +/* Functions to "carry" tree modification(s) upward. */
2808 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2809 +   set of changes that need to be propagated to the next level.  We manage
2810 +   node locking such that any searches that collide with carrying are
2811 +   restarted, from the root if necessary.
2812 +
2813 +   Insertion of a new item may result in items being moved among nodes and
2814 +   this requires the delimiting key to be updated at the least common parent
2815 +   of the nodes modified to preserve search tree invariants. Also, insertion
2816 +   may require allocation of a new node. A pointer to the new node has to be
2817 +   inserted into some node on the parent level, etc.
2818 +
2819 +   Tree carrying is meant to be analogous to arithmetic carrying.
2820 +
2821 +   A carry operation is always associated with some node (&carry_node).
2822 +
2823 +   Carry process starts with some initial set of operations to be performed
2824 +   and an initial set of already locked nodes.  Operations are performed one
2825 +   by one. Performing each single operation has following possible effects:
2826 +
2827 +    - content of carry node associated with operation is modified
2828 +    - new carry nodes are locked and involved into carry process on this level
2829 +    - new carry operations are posted to the next level
2830 +
2831 +   After all carry operations on this level are done, process is repeated for
2832 +   the accumulated sequence on carry operations for the next level. This
2833 +   starts by trying to lock (in left to right order) all carry nodes
2834 +   associated with carry operations on the parent level. After this, we decide
2835 +   whether more nodes are required on the left of already locked set. If so,
2836 +   all locks taken on the parent level are released, new carry nodes are
2837 +   added, and locking process repeats.
2838 +
2839 +   It may happen that balancing process fails owing to unrecoverable error on
2840 +   some of upper levels of a tree (possible causes are io error, failure to
2841 +   allocate new node, etc.). In this case we should unmount the filesystem,
2842 +   rebooting if it is the root, and possibly advise the use of fsck.
2843 +
2844 +   USAGE:
2845 +
2846 +    int some_tree_operation( znode *node, ... )
2847 +    {
2848 +       // Allocate on a stack pool of carry objects: operations and nodes.
2849 +       // Most carry processes will only take objects from here, without
2850 +       // dynamic allocation.
2851 +
2852 +I feel uneasy about this pool.  It adds to code complexity, I understand why it exists, but.... -Hans
2853 +
2854 +       carry_pool  pool;
2855 +       carry_level lowest_level;
2856 +       carry_op   *op;
2857 +
2858 +       init_carry_pool( &pool );
2859 +       init_carry_level( &lowest_level, &pool );
2860 +
2861 +       // operation may be one of:
2862 +       //   COP_INSERT    --- insert new item into node
2863 +       //   COP_CUT       --- remove part of or whole node
2864 +       //   COP_PASTE     --- increase size of item
2865 +       //   COP_DELETE    --- delete pointer from parent node
2866 +       //   COP_UPDATE    --- update delimiting key in least
2867 +       //                     common ancestor of two
2868 +
2869 +       op = post_carry( &lowest_level, operation, node, 0 );
2870 +       if( IS_ERR( op ) || ( op == NULL ) ) {
2871 +           handle error
2872 +       } else {
2873 +           // fill in remaining fields in @op, according to carry.h:carry_op
2874 +           result = carry( &lowest_level, NULL );
2875 +       }
2876 +       done_carry_pool( &pool );
2877 +    }
2878 +
2879 +   When you are implementing node plugin method that participates in carry
2880 +   (shifting, insertion, deletion, etc.), do the following:
2881 +
2882 +   int foo_node_method( znode *node, ..., carry_level *todo )
2883 +   {
2884 +       carry_op   *op;
2885 +
2886 +       ....
2887 +
2888 +       // note, that last argument to post_carry() is non-null
2889 +       // here, because @op is to be applied to the parent of @node, rather
2890 +       // than to the @node itself as in the previous case.
2891 +
2892 +       op = node_post_carry( todo, operation, node, 1 );
2893 +       // fill in remaining fields in @op, according to carry.h:carry_op
2894 +
2895 +       ....
2896 +
2897 +   }
2898 +
2899 +   BATCHING:
2900 +
2901 +   One of the main advantages of level-by-level balancing implemented here is
2902 +   the ability to batch updates on a parent level and to perform them more
2903 +   efficiently as a result.
2904 +
2905 +   Description To Be Done (TBD).
2906 +
2907 +   DIFFICULTIES AND SUBTLE POINTS:
2908 +
2909 +   1. complex plumbing is required, because:
2910 +
2911 +       a. effective allocation through pools is needed
2912 +
2913 +       b. target of operation is not exactly known when operation is
2914 +       posted. This is worked around through bitfields in &carry_node and
2915 +       logic in lock_carry_node()
2916 +
2917 +       c. of interaction with locking code: node should be added into sibling
2918 +       list when pointer to it is inserted into its parent, which is some time
2919 +       after node was created. Between these moments, node is somewhat in
2920 +       suspended state and is only registered in the carry lists
2921 +
2922 +    2. whole balancing logic is implemented here, in particular, insertion
2923 +    logic is coded in make_space().
2924 +
2925 +    3. special cases like insertion (add_tree_root()) or deletion
2926 +    (kill_tree_root()) of tree root and morphing of paste into insert
2927 +    (insert_paste()) have to be handled.
2928 +
2929 +    4. there is non-trivial interdependency between allocation of new nodes
2930 +    and almost everything else. This is mainly due to the (1.c) above. I shall
2931 +    write about this later.
2932 +
2933 +*/
2934 +
2935 +#include "forward.h"
2936 +#include "debug.h"
2937 +#include "key.h"
2938 +#include "coord.h"
2939 +#include "plugin/item/item.h"
2940 +#include "plugin/item/extent.h"
2941 +#include "plugin/node/node.h"
2942 +#include "jnode.h"
2943 +#include "znode.h"
2944 +#include "tree_mod.h"
2945 +#include "tree_walk.h"
2946 +#include "block_alloc.h"
2947 +#include "pool.h"
2948 +#include "tree.h"
2949 +#include "carry.h"
2950 +#include "carry_ops.h"
2951 +#include "super.h"
2952 +#include "reiser4.h"
2953 +
2954 +#include <linux/types.h>
2955 +
2956 +/* level locking/unlocking */
2957 +static int lock_carry_level(carry_level * level);
2958 +static void unlock_carry_level(carry_level * level, int failure);
2959 +static void done_carry_level(carry_level * level);
2960 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2961 +
2962 +int lock_carry_node(carry_level * level, carry_node * node);
2963 +int lock_carry_node_tail(carry_node * node);
2964 +
2965 +/* carry processing proper */
2966 +static int carry_on_level(carry_level * doing, carry_level * todo);
2967 +
2968 +static carry_op *add_op(carry_level * level, pool_ordering order,
2969 +                       carry_op * reference);
2970 +
2971 +/* handlers for carry operations. */
2972 +
2973 +static void fatal_carry_error(carry_level * doing, int ecode);
2974 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2975 +
2976 +
2977 +static void print_level(const char *prefix, carry_level * level);
2978 +
2979 +#if REISER4_DEBUG
2980 +typedef enum {
2981 +       CARRY_TODO,
2982 +       CARRY_DOING
2983 +} carry_queue_state;
2984 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2985 +#endif
2986 +
2987 +/* main entry point for tree balancing.
2988 +
2989 +   Tree carry performs operations from @doing and while doing so accumulates
2990 +   information about operations to be performed on the next level ("carried"
2991 +   to the parent level). Carried operations are performed, causing possibly
2992 +   more operations to be carried upward etc. carry() takes care of
2993 +   locking and pinning znodes while operating on them.
2994 +
2995 +   For usage, see comment at the top of fs/reiser4/carry.c
2996 +
2997 +*/
2998 +int carry(carry_level * doing /* set of carry operations to be performed */ ,
2999 +         carry_level * done    /* must be NULL on entry (BUG_ON below); reused
3000 +                                * as the queue of the previous level */ )
3001 +{
3002 +       int result = 0;
3003 +       /* queue of new requests; placed in the slot right after @doing */
3004 +       carry_level *todo;
3005 +       ON_DEBUG(STORE_COUNTERS);
3006 +
3007 +       assert("nikita-888", doing != NULL);
3008 +       BUG_ON(done != NULL);
3009 +
3010 +       todo = doing + 1;
3011 +       init_carry_level(todo, doing->pool);
3012 +
3013 +       /* queue of requests performed on the previous level */
3014 +       done = todo + 1;
3015 +       init_carry_level(done, doing->pool);
3016 +
3017 +       /* iterate until there is nothing more to do */
3018 +       while (result == 0 && doing->ops_num > 0) {
3019 +               carry_level *tmp;
3020 +
3021 +               /* at this point @done is locked. */
3022 +               /* repeat lock/do/unlock while
3023 +
3024 +                  (1) lock_carry_level() fails due to deadlock avoidance, or
3025 +
3026 +                  (2) carry_on_level() decides that more nodes have to
3027 +                  be involved.
3028 +
3029 +                  (3) some unexpected error occurred while balancing on the
3030 +                  upper levels. In this case all changes are rolled back.
3031 +
3032 +                */
3033 +               while (1) {
3034 +                       result = lock_carry_level(doing);
3035 +                       if (result == 0) {
3036 +                               /* perform operations from @doing and
3037 +                                  accumulate new requests in @todo */
3038 +                               result = carry_on_level(doing, todo);
3039 +                               if (result == 0)
3040 +                                       break;
3041 +                               else if (result != -E_REPEAT ||
3042 +                                        !doing->restartable) {
3043 +                                       warning("nikita-1043",
3044 +                                               "Fatal error during carry: %i",
3045 +                                               result);
3046 +                                       print_level("done", done);
3047 +                                       print_level("doing", doing);
3048 +                                       print_level("todo", todo);
3049 +                                       /* do some rough stuff like aborting
3050 +                                          all pending transcrashes and thus
3051 +                                          pushing tree back to the consistent
3052 +                                          state. Alternatively, just panic.
3053 +                                        */
3054 +                                       fatal_carry_error(doing, result);
3055 +                                       return result;
3056 +                               }
3057 +                       } else if (result != -E_REPEAT) {
3058 +                               fatal_carry_error(doing, result);
3059 +                               return result;
3060 +                       }
3061 +                       unlock_carry_level(doing, 1);
3062 +               }
3063 +               /* at this point @done can be safely unlocked */
3064 +               done_carry_level(done);
3065 +
3066 +               /* cyclically shift queues */
3067 +               tmp = done;
3068 +               done = doing;
3069 +               doing = todo;
3070 +               todo = tmp;
3071 +               init_carry_level(todo, doing->pool);
3072 +
3073 +               /* give other threads chance to run */
3074 +               preempt_point();
3075 +       }
3076 +       done_carry_level(done);
3077 +
3078 +       /* all counters, but x_refs should remain the same. x_refs can change
3079 +          owing to transaction manager */
3080 +       ON_DEBUG(CHECK_COUNTERS);
3081 +       return result;
3082 +}
3083 +
3084 +/* perform carry operations on given level.
3085 +
3086 +   Optimizations proposed by pooh:
3087 +
3088 +   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
3089 +   required;
3090 +
3091 +   (2) unlock node if there are no more operations to be performed upon it and
3092 +   node didn't add any operation to @todo. This can be implemented by
3093 +   attaching to each node two counters: counter of operations working on this
3094 +   node and counter of operations carried upward from this node.
3095 +
3096 +*/
3097 +static int carry_on_level(carry_level * doing  /* queue of carry operations to
3098 +                                                * do on this level */ ,
3099 +                         carry_level * todo    /* queue where new carry
3100 +                                                * operations to be performed on
3101 +                                                * the parent level are
3102 +                                                * accumulated during @doing
3103 +                                                * processing. */ )
3104 +{
3105 +       int result;
3106 +       int (*f) (carry_op *, carry_level *, carry_level *);
3107 +       carry_op *op;
3108 +       carry_op *tmp_op;
3109 +
3110 +       assert("nikita-1034", doing != NULL);
3111 +       assert("nikita-1035", todo != NULL);
3112 +
3113 +       /* @doing->nodes are locked. */
3114 +
3115 +       /* This function can be split into two phases: analysis and modification.
3116 +
3117 +          Analysis calculates precisely what items should be moved between
3118 +          nodes. This information is gathered in some structures attached to
3119 +          each carry_node in a @doing queue. Analysis also determines whether
3120 +          new nodes are to be allocated etc.
3121 +
3122 +          After analysis is completed, actual modification is performed. Here
3123 +          we can take advantage of "batch modification": if there are several
3124 +          operations acting on the same node, modifications can be performed
3125 +          more efficiently when batched together.
3126 +
3127 +          Above is an optimization left for the future.
3128 +        */
3129 +       /* Important, but delayed optimization: it's possible to batch
3130 +          operations together and perform them more efficiently as a
3131 +          result. For example, deletion of several neighboring items from a
3132 +          node can be converted to a single ->cut() operation.
3133 +
3134 +          Before processing queue, it should be scanned and "mergeable"
3135 +          operations merged.
3136 +        */
3137 +       result = 0;
3138 +       for_all_ops(doing, op, tmp_op) {
3139 +               carry_opcode opcode;
3140 +
3141 +               assert("nikita-1041", op != NULL);
3142 +               opcode = op->op;        /* NOTE(review): not read again below */
3143 +               assert("nikita-1042", op->op < COP_LAST_OP);
3144 +               f = op_dispatch_table[op->op].handler;
3145 +               result = f(op, doing, todo);
3146 +               /* locking can fail with -E_REPEAT. Any different error is fatal
3147 +                  and will be handled by fatal_carry_error() sledgehammer.
3148 +                */
3149 +               if (result != 0)
3150 +                       break;
3151 +       }
3152 +       if (result == 0) {
3153 +               carry_plugin_info info;
3154 +               carry_node *scan;
3155 +               carry_node *tmp_scan;
3156 +
3157 +               info.doing = doing;
3158 +               info.todo = todo;
3159 +
3160 +               assert("nikita-3002",
3161 +                      carry_level_invariant(doing, CARRY_DOING));
3162 +               for_all_nodes(doing, scan, tmp_scan) {
3163 +                       znode *node;
3164 +
3165 +                       node = carry_real(scan);
3166 +                       assert("nikita-2547", node != NULL);
3167 +                       if (node_is_empty(node)) {      /* emptied by the ops above */
3168 +                               result =
3169 +                                   node_plugin_by_node(node)->
3170 +                                   prepare_removal(node, &info);
3171 +                               if (result != 0)
3172 +                                       break;
3173 +                       }
3174 +               }
3175 +       }
3176 +       return result;
3177 +}
3178 +
3179 +/* post carry operation
3180 +
3181 +   This is main function used by external carry clients: node layout plugins
3182 +   and tree operations to create new carry operation to be performed on some
3183 +   level.
3184 +
3185 +   New operation will be included in the @level queue. To actually perform it,
3186 +   call carry( level, ... ). @node is asserted to be write locked on entry.
3187 +   Carry manages all its locks by itself, don't worry about this.
3188 +
3189 +   This function adds operation and node at the end of the queue. It is up to
3190 +   caller to guarantee proper ordering of node queue.
3191 +
3192 +*/
3193 +carry_op *post_carry(carry_level * level       /* queue where new operation is to
3194 +                                                * be posted at */ ,
3195 +                    carry_opcode op /* opcode of operation */ ,
3196 +                    znode * node       /* node on which this operation
3197 +                                        * will operate */ ,
3198 +                    int apply_to_parent_p      /* whether operation will operate
3199 +                                                * directly on @node or on its
3200 +                                                * parent. */ )
3201 +{
3202 +       carry_op *result;
3203 +       carry_node *child;
3204 +
3205 +       assert("nikita-1046", level != NULL);
3206 +       assert("nikita-1788", znode_is_write_locked(node));
3207 +
3208 +       result = add_op(level, POOLO_LAST, NULL);
3209 +       if (IS_ERR(result))
3210 +               return result;
3211 +       child = add_carry(level, POOLO_LAST, NULL);
3212 +       if (IS_ERR(child)) {    /* NOTE(review): level->ops_num is not decremented here */
3213 +               reiser4_pool_free(&level->pool->op_pool, &result->header);
3214 +               return (carry_op *) child;
3215 +       }
3216 +       result->node = child;
3217 +       result->op = op;
3218 +       child->parent = apply_to_parent_p;
3219 +       if (ZF_ISSET(node, JNODE_ORPHAN))
3220 +               child->left_before = 1;
3221 +       child->node = node;
3222 +       return result;
3223 +}
3224 +
3225 +/* initialize carry queue: zero @level, attach @pool, empty both lists */
3226 +void init_carry_level(carry_level * level /* level to initialize */ ,
3227 +                     carry_pool * pool /* pool @level will allocate objects
3228 +                                        * from */ )
3229 +{
3230 +       assert("nikita-1045", level != NULL);
3231 +       assert("nikita-967", pool != NULL);
3232 +
3233 +       memset(level, 0, sizeof *level);
3234 +       level->pool = pool;
3235 +
3236 +       INIT_LIST_HEAD(&level->nodes);
3237 +       INIT_LIST_HEAD(&level->ops);
3238 +}
3239 +
3240 +/* allocate @size bytes for carry pool and initialize its op and node pools */
3241 +carry_pool *init_carry_pool(int size)
3242 +{
3243 +       carry_pool *pool;
3244 +
3245 +       assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
3246 +       pool = kmalloc(size, get_gfp_mask());
3247 +       if (pool == NULL)
3248 +               return ERR_PTR(RETERR(-ENOMEM));        /* never returns NULL */
3249 +
3250 +       reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
3251 +                         (char *)pool->op);
3252 +       reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
3253 +                         NODES_LOCKED_POOL_SIZE, (char *)pool->node);
3254 +       return pool;
3255 +}
3256 +
3257 +/* finish with queue pools and kfree() @pool itself */
3258 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
3259 +{
3260 +       reiser4_done_pool(&pool->op_pool);
3261 +       reiser4_done_pool(&pool->node_pool);
3262 +       kfree(pool);
3263 +}
3264 +
3265 +/* add new carry node to the @level.
3266 +
3267 +   Returns pointer to the new carry node allocated from pool.  It's up to
3268 +   callers to maintain proper order in the @level. Assumption is that if carry
3269 +   nodes on one level are already sorted and modifications are performed from
3270 +   left to right, carry nodes added on the parent level will be ordered
3271 +   automatically. To control ordering use @order and @reference parameters.
3272 +
3273 +*/
3274 +carry_node *add_carry_skip(carry_level * level /* &carry_level to add node
3275 +                                                * to */ ,
3276 +                          pool_ordering order  /* where to insert: at the
3277 +                                                * beginning of @level,
3278 +                                                * before @reference, after
3279 +                                                * @reference, at the end
3280 +                                                * of @level */ ,
3281 +                          carry_node * reference       /* reference node for
3282 +                                                        * insertion */ )
3283 +{
3284 +       ON_DEBUG(carry_node * orig_ref = reference);
3285 +
3286 +       if (order == POOLO_BEFORE) {
3287 +               reference = find_left_carry(reference, level);
3288 +               if (reference == NULL)
3289 +                       reference = list_entry(level->nodes.next, carry_node,
3290 +                                              header.level_linkage);
3291 +               else
3292 +                       reference = list_entry(reference->header.level_linkage.next,
3293 +                                              carry_node, header.level_linkage);
3294 +       } else if (order == POOLO_AFTER) {
3295 +               reference = find_right_carry(reference, level);
3296 +               if (reference == NULL)
3297 +                       reference = list_entry(level->nodes.prev, carry_node,
3298 +                                              header.level_linkage);
3299 +               else
3300 +                       reference = list_entry(reference->header.level_linkage.prev,
3301 +                                              carry_node, header.level_linkage);
3302 +       }
3303 +       assert("nikita-2209",
3304 +              ergo(orig_ref != NULL,
3305 +                   carry_real(reference) == carry_real(orig_ref)));
3306 +       return add_carry(level, order, reference);
3307 +}
3308 +
3309 +carry_node *add_carry(carry_level * level      /* &carry_level to add node
3310 +                                                * to */ ,
3311 +                     pool_ordering order       /* where to insert: at the
3312 +                                                * beginning of @level, before
3313 +                                                * @reference, after @reference,
3314 +                                                * at the end of @level */ ,
3315 +                     carry_node * reference    /* reference node for
3316 +                                                * insertion; post_carry()
3317 +                                                * passes NULL */ )
3318 +{
3319 +       carry_node *result;
3320 +
3321 +       result =
3322 +           (carry_node *) add_obj(&level->pool->node_pool, &level->nodes,
3323 +                                  order, &reference->header);
3324 +       if (!IS_ERR(result) && (result != NULL))        /* count only real nodes */
3325 +               ++level->nodes_num;
3326 +       return result;
3327 +}
3327 +
3328 +/* add new carry operation to the @level.
3329 +
3330 +   Returns pointer to the new carry operation allocated from pool. It's up to
3331 +   callers to maintain proper order in the @level. To control ordering use
3332 +   @order and @reference parameters.
3333 +
3334 +*/
3335 +static carry_op *add_op(carry_level * level /* &carry_level to add op to */ ,
3336 +                       pool_ordering order     /* where to insert: at the beginning of
3337 +                                                * @level, before @reference, after
3338 +                                                * @reference, at the end of @level */ ,
3339 +                       carry_op *
3340 +                       reference /* reference op for insertion */ )
3341 +{
3342 +       carry_op *result;
3343 +
3344 +       result =
3345 +           (carry_op *) add_obj(&level->pool->op_pool, &level->ops, order,
3346 +                                &reference->header);
3347 +       if (!IS_ERR(result) && (result != NULL))
3348 +               ++level->ops_num;
3349 +       return result;
3350 +}
3351 +
3352 +/* Return node on the right of which @node was created.
3353 +
3354 +   Each node is created on the right of some existing node (or it is new root,
3355 +   which is special case not handled here).
3356 +
3357 +   @node is new node created on some level, but not yet inserted into its
3358 +   parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
3359 +
3360 +*/
3361 +static carry_node *find_begetting_brother(carry_node * node    /* node to start search
3362 +                                                                * from */ ,
3363 +                                         carry_level * kin UNUSED_ARG  /* level to
3364 +                                                                        * scan */ )
3365 +{
3366 +       carry_node *scan;
3367 +
3368 +       assert("nikita-1614", node != NULL);
3369 +       assert("nikita-1615", kin != NULL);
3370 +       assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3371 +       assert("nikita-1619", ergo(carry_real(node) != NULL,
3372 +                                  ZF_ISSET(carry_real(node), JNODE_ORPHAN)));
3373 +
3374 +       for (scan = node;;      /* follow ->prev links until a non-orphan */
3375 +            scan = list_entry(scan->header.level_linkage.prev, carry_node,
3376 +                              header.level_linkage)) {
3377 +               assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3378 +               if ((scan->node != node->node) &&
3379 +                   !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3380 +                       assert("nikita-1618", carry_real(scan) != NULL);
3381 +                       break;
3382 +               }
3383 +       }
3384 +       return scan;
3385 +}
3386 +
3387 +static cmp_t
3388 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3389 +{
3390 +       assert("nikita-2199", n1 != NULL);
3391 +       assert("nikita-2200", n2 != NULL);
3392 +
3393 +       if (n1 == n2)
3394 +               return EQUAL_TO;
3395 +       while (1) {     /* scan forward from @n1 looking for @n2 */
3396 +               n1 = carry_node_next(n1);
3397 +               if (carry_node_end(level, n1))
3398 +                       return GREATER_THAN;    /* @n2 not found after @n1 */
3399 +               if (n1 == n2)
3400 +                       return LESS_THAN;       /* @n2 found after @n1 */
3401 +       }
3402 +       impossible("nikita-2201", "End of level reached");      /* not reached */
3403 +}
3404 +
3405 +carry_node *find_carry_node(carry_level * level, const znode * node)
3406 +{
3407 +       carry_node *scan;
3408 +       carry_node *tmp_scan;
3409 +
3410 +       assert("nikita-2202", level != NULL);
3411 +       assert("nikita-2203", node != NULL);
3412 +
3413 +       for_all_nodes(level, scan, tmp_scan) {
3414 +               if (carry_real(scan) == node)   /* compare znode in lock handle */
3415 +                       return scan;
3416 +       }
3417 +       return NULL;    /* no carry node in @level holds @node */
3418 +}
3419 +
3420 +znode *carry_real(const carry_node * node)
3421 +{
3422 +       assert("nikita-3061", node != NULL);
3423 +
3424 +       return node->lock_handle.node;  /* znode recorded in @node's lock handle */
3425 +}
3426 +
3427 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3428 +                             const znode * node)
3429 +{
3430 +       carry_node *base;
3431 +       carry_node *scan;
3432 +       carry_node *tmp_scan;
3433 +       carry_node *proj;
3434 +
3435 +       base = find_carry_node(doing, node);    /* carry node of @node in @doing */
3436 +       assert("nikita-2204", base != NULL);
3437 +
3438 +       for_all_nodes(todo, scan, tmp_scan) {
3439 +               proj = find_carry_node(doing, scan->node);
3440 +               assert("nikita-2205", proj != NULL);
3441 +               if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3442 +                       break;  /* @proj does not precede @base in @doing */
3443 +       }
3444 +       return scan;
3445 +}
3446 +
3447 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3448 +                                    znode * node)
3449 +{
3450 +       carry_node *reference;
3451 +
3452 +       assert("nikita-2994", doing != NULL);
3453 +       assert("nikita-2995", todo != NULL);
3454 +       assert("nikita-2996", node != NULL);
3455 +
3456 +       reference = insert_carry_node(doing, todo, node);
3457 +       assert("nikita-2997", reference != NULL);
3458 +
3459 +       return add_carry(todo, POOLO_BEFORE, reference);        /* new node goes before it */
3460 +}
3461 +
3462 +/* like post_carry(), but designed to be called from node plugin methods.
3463 +   This function is different from post_carry() in that it finds proper place
3464 +   to insert node in the queue. */
3465 +carry_op *node_post_carry(carry_plugin_info * info     /* carry parameters
3466 +                                                        * passed down to node
3467 +                                                        * plugin */ ,
3468 +                         carry_opcode op /* opcode of operation */ ,
3469 +                         znode * node  /* node on which this
3470 +                                        * operation will operate */ ,
3471 +                         int apply_to_parent_p /* whether operation will
3472 +                                                * operate directly on @node
3473 +                                                * or on its parent. */ )
3474 +{
3475 +       carry_op *result;
3476 +       carry_node *child;
3477 +
3478 +       assert("nikita-2207", info != NULL);
3479 +       assert("nikita-2208", info->todo != NULL);
3480 +
3481 +       if (info->doing == NULL)        /* no @doing queue to order against */
3482 +               return post_carry(info->todo, op, node, apply_to_parent_p);
3483 +
3484 +       result = add_op(info->todo, POOLO_LAST, NULL);
3485 +       if (IS_ERR(result))
3486 +               return result;
3487 +       child = add_carry_atplace(info->doing, info->todo, node);
3488 +       if (IS_ERR(child)) {
3489 +               reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3490 +               return (carry_op *) child;
3491 +       }
3492 +       result->node = child;
3493 +       result->op = op;
3494 +       child->parent = apply_to_parent_p;
3495 +       if (ZF_ISSET(node, JNODE_ORPHAN))
3496 +               child->left_before = 1;
3497 +       child->node = node;
3498 +       return result;
3499 +}
3500 +
3501 +/* lock all carry nodes in @level; stops at the first failure and returns it */
3502 +static int lock_carry_level(carry_level * level /* level to lock */ )
3503 +{
3504 +       int result;
3505 +       carry_node *node;
3506 +       carry_node *tmp_node;
3507 +
3508 +       assert("nikita-881", level != NULL);
3509 +       assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3510 +
3511 +       /* lock nodes from left to right */
3512 +       result = 0;
3513 +       for_all_nodes(level, node, tmp_node) {
3514 +               result = lock_carry_node(level, node);
3515 +               if (result != 0)
3516 +                       break;
3517 +       }
3518 +       return result;
3519 +}
3520 +
3521 +/* Synchronize delimiting keys between @node and its left neighbor.
3522 +
3523 +   To reduce contention on dk key and simplify carry code, we synchronize
3524 +   delimiting keys only when carry ultimately leaves tree level (carrying
3525 +   changes upward) and unlocks nodes at this level.
3526 +
3527 +   This function first finds left neighbor of @node and then updates left
3528 +   neighbor's right delimiting key to coincide with least key in @node.
3529 +
3530 +*/
3531 +
3532 +ON_DEBUG(extern atomic_t delim_key_version;
3533 +    )
3534 +
3535 +static void sync_dkeys(znode * spot /* node to update */ )
3536 +{
3537 +       reiser4_key pivot;
3538 +       reiser4_tree *tree;
3539 +
3540 +       assert("nikita-1610", spot != NULL);
3541 +       assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3542 +
3543 +       tree = znode_get_tree(spot);
3544 +       read_lock_tree(tree);
3545 +       write_lock_dk(tree);
3546 +
3547 +       assert("nikita-2192", znode_is_loaded(spot));
3548 +
3549 +       /* sync left delimiting key of @spot with key in its leftmost item;
3550 +          an empty @spot uses its own right delimiting key instead */
3551 +       if (node_is_empty(spot))
3552 +               pivot = *znode_get_rd_key(spot);
3553 +       else
3554 +               leftmost_key_in_node(spot, &pivot);
3555 +
3556 +       znode_set_ld_key(spot, &pivot);
3557 +
3558 +       /* there can be sequence of empty nodes pending removal on the left of
3559 +          @spot. Scan them and update their left and right delimiting keys to
3560 +          match left delimiting key of @spot. Also, update right delimiting
3561 +          key of first non-empty left neighbor.
3562 +        */
3563 +       while (1) {
3564 +               if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3565 +                       break;
3566 +
3567 +               spot = spot->left;
3568 +               if (spot == NULL)
3569 +                       break;
3570 +
3571 +               znode_set_rd_key(spot, &pivot);
3572 +               /* don't sink into the domain of another balancing */
3573 +               if (!znode_is_write_locked(spot))
3574 +                       break;
3575 +               if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3576 +                       znode_set_ld_key(spot, &pivot);
3577 +               else
3578 +                       break;
3579 +       }
3580 +
3581 +       write_unlock_dk(tree);
3582 +       read_unlock_tree(tree);
3583 +}
3583 +
3584 +/* unlock all carry nodes in @level */
3585 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3586 +                              int failure      /* true if unlocking owing to
3587 +                                                * failure */ )
3588 +{
3589 +       carry_node *node;
3590 +       carry_node *tmp_node;
3591 +
3592 +       assert("nikita-889", level != NULL);
3593 +
3594 +       if (!failure) {
3595 +               znode *spot;
3596 +
3597 +               spot = NULL;
3598 +               /* update delimiting keys; skip repeats of the same real znode */
3599 +               for_all_nodes(level, node, tmp_node) {
3600 +                       if (carry_real(node) != spot) {
3601 +                               spot = carry_real(node);
3602 +                               sync_dkeys(spot);
3603 +                       }
3604 +               }
3605 +       }
3606 +
3607 +       /* nodes can be unlocked in arbitrary order.  In preemptible
3608 +          environment it's better to unlock in reverse order of locking,
3609 +          though.
3610 +        */
3611 +       for_all_nodes_back(level, node, tmp_node) {
3612 +               /* all allocated nodes should be already linked to their
3613 +                  parents at this moment. */
3614 +               assert("nikita-1631", ergo(!failure, !ZF_ISSET(carry_real(node),
3615 +                                                              JNODE_ORPHAN)));
3616 +               ON_DEBUG(check_dkeys(carry_real(node)));
3617 +               unlock_carry_node(level, node, failure);
3618 +       }
3619 +       level->new_root = NULL;
3620 +}
3621 +
3622 +/* finish with @level
3623 +
3624 +   Unlock nodes, then return carry nodes and ops of @level to their pools */
3625 +static void done_carry_level(carry_level * level /* level to finish */ )
3626 +{
3627 +       carry_node *node;
3628 +       carry_node *tmp_node;
3629 +       carry_op *op;
3630 +       carry_op *tmp_op;
3631 +
3632 +       assert("nikita-1076", level != NULL);
3633 +
3634 +       unlock_carry_level(level, 0);   /* 0: not a failure path */
3635 +       for_all_nodes(level, node, tmp_node) {
3636 +               assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3637 +               assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3638 +               reiser4_pool_free(&level->pool->node_pool, &node->header);
3639 +       }
3640 +       for_all_ops(level, op, tmp_op)
3641 +           reiser4_pool_free(&level->pool->op_pool, &op->header);
3642 +}
3643 +
3644 +/* helper function to complete locking of carry node
3645 +
3646 +   Finish locking of carry node. There are several ways in which new carry
3647 +   node can be added into carry level and locked. Normal is through
3648 +   lock_carry_node(), but also from find_{left|right}_neighbor(). This
3649 +   function factors out common final part of all locking scenarios. It
3650 +   supposes that @node -> lock_handle is lock handle for lock just taken,
3651 +   sets @node -> unlock and zload()s the znode behind that handle.
3652 +
3653 +*/
3654 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3655 +{
3656 +       assert("nikita-1052", node != NULL);
3657 +       assert("nikita-1187", carry_real(node) != NULL);
3658 +       assert("nikita-1188", !node->unlock);
3659 +
3660 +       node->unlock = 1;
3661 +       /* Load node content into memory and install node plugin by
3662 +          looking at the node header.
3663 +
3664 +          Most of the time this call is cheap because the node is
3665 +          already in memory.
3666 +
3667 +          Corresponding zrelse() is in unlock_carry_node()
3668 +        */
3669 +       return zload(carry_real(node));
3670 +}
3671 +
3672 +/* lock carry node
3673 +
3674 +   "Resolve" node to real znode, lock it and mark as locked.
3675 +   This requires recursive locking of znodes.
3676 +
3677 +   When operation is posted to the parent level, node it will be applied to is
3678 +   not yet known. For example, when shifting data between two nodes,
3679 +   delimiting keys have to be updated in parent or parents of nodes involved.
3680 +   But their parents are not yet locked and, moreover, said nodes can be
3681 +   reparented by concurrent balancing.
3682 +
3683 +   To work around this, carry operation is applied to special "carry node"
3684 +   rather than to the znode itself. Carry node consists of some "base" or
3685 +   "reference" znode and flags indicating how to get to the target of carry
3686 +   operation (->real_node field of carry_node) from base.
3687 +
3688 +*/
3689 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3690 +                   carry_node * node /* node to lock */ )
3691 +{
3692 +       int result;
3693 +       znode *reference_point;
3694 +       lock_handle lh;
3695 +       lock_handle tmp_lh;
3696 +       reiser4_tree *tree;
3697 +
3698 +       assert("nikita-887", level != NULL);
3699 +       assert("nikita-882", node != NULL);
3700 +
3701 +       result = 0;
3702 +       reference_point = node->node;
3703 +       init_lh(&lh);
3704 +       init_lh(&tmp_lh);
3705 +       if (node->left_before) {
3706 +               /* handling of new nodes, allocated on the previous level:
3707 +
3708 +                  some carry ops were probably posted from the new node, but
3709 +                  this node neither has parent pointer set, nor is
3710 +                  connected. This will be done in ->create_hook() for
3711 +                  internal item.
3712 +
3713 +                  Nonetheless, parent of new node has to be locked. To do
3714 +                  this, first go to the "left" in the carry order. This
3715 +                  depends on the decision to always allocate new node on the
3716 +                  right of existing one.
3717 +
3718 +                  Loop handles case when multiple nodes, all orphans, were
3719 +                  inserted.
3720 +
3721 +                  Strictly speaking, taking tree lock is not necessary here,
3722 +                  because all nodes scanned by loop in
3723 +                  find_begetting_brother() are write-locked by this thread,
3724 +                  and thus, their sibling linkage cannot change.
3725 +
3726 +                */
3727 +               tree = znode_get_tree(reference_point);
3728 +               read_lock_tree(tree);
3729 +               reference_point = find_begetting_brother(node, level)->node;
3730 +               read_unlock_tree(tree);
3731 +               assert("nikita-1186", reference_point != NULL);
3732 +       }
3733 +       if (node->parent && (result == 0)) {
3734 +               result =
3735 +                   reiser4_get_parent(&tmp_lh, reference_point,
3736 +                                      ZNODE_WRITE_LOCK);
3737 +               if (result != 0) {
3738 +                       ;       /* nothing */
3739 +               } else if (znode_get_level(tmp_lh.node) == 0) {
3740 +                       assert("nikita-1347", znode_above_root(tmp_lh.node));
3741 +                       result = add_new_root(level, node, tmp_lh.node);
3742 +                       if (result == 0) {
3743 +                               reference_point = level->new_root;
3744 +                               move_lh(&lh, &node->lock_handle);
3745 +                       }
3746 +               } else if ((level->new_root != NULL)
3747 +                          && (level->new_root !=
3748 +                              znode_parent_nolock(reference_point))) {
3749 +                       /* parent of node exists, but this level already
3750 +                          created different new root, so */
3751 +                       warning("nikita-1109",
3752 +                               /* it should be "radicis", but tradition is
3753 +                                  tradition.  do banshees read latin? */
3754 +                               "hodie natus est radici frater");
3755 +                       result = -EIO;
3756 +               } else {
3757 +                       move_lh(&lh, &tmp_lh);
3758 +                       reference_point = lh.node;
3759 +               }
3760 +       }
3761 +       if (node->left && (result == 0)) {
3762 +               assert("nikita-1183", node->parent);
3763 +               assert("nikita-883", reference_point != NULL);
3764 +               result =
3765 +                   reiser4_get_left_neighbor(&tmp_lh, reference_point,
3766 +                                             ZNODE_WRITE_LOCK,
3767 +                                             GN_CAN_USE_UPPER_LEVELS);
3768 +               if (result == 0) {
3769 +                       done_lh(&lh);
3770 +                       move_lh(&lh, &tmp_lh);
3771 +                       reference_point = lh.node;
3772 +               }
3773 +       }
3774 +       if (!node->parent && !node->left && !node->left_before) {
3775 +               result =
3776 +                   longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3777 +                                       ZNODE_LOCK_HIPRI);
3778 +       }
3779 +       if (result == 0) {
3780 +               move_lh(&node->lock_handle, &lh);
3781 +               result = lock_carry_node_tail(node);
3782 +       }
3783 +       done_lh(&tmp_lh);
3784 +       done_lh(&lh);
3785 +       return result;
3786 +}
3787 +
3788 +/* release a lock on &carry_node.
3789 +
3790 +   Release if necessary lock on @node. This operation is the pair of
3791 +   lock_carry_node() and is idempotent: you can call it more than once on the
3792 +   same node.
3793 +   On @failure a ->deallocate node is flagged JNODE_HEARD_BANSHEE and a ->free carry node goes back to the pool.
3794 +*/
3795 +static void
3796 +unlock_carry_node(carry_level * level,
3797 +                 carry_node * node /* node to be released */ ,
3798 +                 int failure   /* non-zero if node is unlocked
3799 +                                * due to some error */ )
3800 +{
3801 +       znode *real_node;
3802 +
3803 +       assert("nikita-884", node != NULL);
3804 +
3805 +       real_node = carry_real(node);
3806 +       /* pair to zload() in lock_carry_node_tail(); NOTE(review): runs before the NULL checks below */
3807 +       zrelse(real_node);
3808 +       if (node->unlock && (real_node != NULL)) {
3809 +               assert("nikita-899", real_node == node->lock_handle.node);
3810 +               longterm_unlock_znode(&node->lock_handle);
3811 +       }
3812 +       if (failure) {
3813 +               if (node->deallocate && (real_node != NULL)) {
3814 +                       /* free node in bitmap
3815 +
3816 +                          Prepare node for removal. Last zput() will finish
3817 +                          with it.
3818 +                        */
3819 +                       ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3820 +               }
3821 +               if (node->free) {
3822 +                       assert("nikita-2177",
3823 +                              list_empty_careful(&node->lock_handle.locks_link));
3824 +                       assert("nikita-2112",
3825 +                              list_empty_careful(&node->lock_handle.owners_link));
3826 +                       reiser4_pool_free(&level->pool->node_pool,
3827 +                                         &node->header);
3828 +               }
3829 +       }
3830 +}
3831 +
3832 +/* fatal_carry_error() - all-catching error handling function
3833 +
3834 +   It is possible that carry faces unrecoverable error, like inability to
3835 +   insert pointer at the internal level. Our simple solution is just panic in
3836 +   this situation. More sophisticated things like attempt to remount
3837 +   file-system as read-only can be implemented without much difficulty.
3838 +
3839 +   It is believed, that:
3840 +
3841 +   1. instead of panicking, all current transactions can be aborted rolling
3842 +   system back to the consistent state.
3843 +
3844 +Umm, if you simply panic without doing anything more at all, then all current
3845 +transactions are aborted and the system is rolled back to a consistent state,
3846 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3847 +precise.  If an internal node is corrupted on disk due to hardware failure,
3848 +then there may be no consistent state that can be rolled back to, so instead
3849 +we should say that it will rollback the transactions, which barring other
3850 +factors means rolling back to a consistent state.
3851 +
3852 +# Nikita: there is a subtle difference between panic and aborting
3853 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3854 +# not using reiser4 (not that we care about such processes), or using other
3855 +# reiser4 mounts (about them we do care) will simply continue to run. With
3856 +# some luck, even application using aborted file system can survive: it will
3857 +# get some error, like EBADF, from each file descriptor on failed file system,
3858 +# but applications that do care about tolerance will cope with this (squid
3859 +# will).
3860 +
3861 +It would be a nice feature though to support rollback without rebooting
3862 +followed by remount, but this can wait for later versions.
3863 +
3864 +   2. once isolated transactions will be implemented it will be possible to
3865 +   roll back offending transaction.
3866 +
3867 +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
3868 +it more before deciding if it should be done.  -Hans
3869 +
3870 +*/
3871 +static void fatal_carry_error(carry_level * doing UNUSED_ARG   /* carry level
3872 +                                                                * where
3873 +                                                                * unrecoverable
3874 +                                                                * error
3875 +                                                                * occurred */ ,
3876 +                             int ecode /* negative error code */ )
3877 +{
3878 +       assert("nikita-1230", doing != NULL);
3879 +       assert("nikita-1231", ecode < 0);
3880 +
3881 +       reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3882 +}
3883 +
3884 +/* add new root to the tree
3885 +
3886 +   This function itself only manages changes in carry structures and delegates
3887 +   all hard work (allocation of znode for new root, changes of parent and
3888 +   sibling pointers) to add_tree_root().
3889 +
3890 +   Locking: old tree root is locked by carry at this point. Fake znode is also
3891 +   locked.
3892 +   Returns 0 on success or an error code; if add_tree_root() fails, level->new_root is reset to NULL.
3893 +*/
3894 +static int add_new_root(carry_level * level    /* carry level in context of which
3895 +                                                * operation is performed */ ,
3896 +                       carry_node * node /* carry node for existing root */ ,
3897 +                       znode * fake    /* "fake" znode already locked by
3898 +                                        * us */ )
3899 +{
3900 +       int result;
3901 +
3902 +       assert("nikita-1104", level != NULL);
3903 +       assert("nikita-1105", node != NULL);
3904 +
3905 +       assert("nikita-1403", znode_is_write_locked(node->node));
3906 +       assert("nikita-1404", znode_is_write_locked(fake));
3907 +
3908 +       /* trying to create new root. */
3909 +       /* @node is root and it's already locked by us. This
3910 +          means that nobody else can be trying to add/remove
3911 +          tree root right now.
3912 +        */
3913 +       if (level->new_root == NULL)    /* reuse a root already created on this level */
3914 +               level->new_root = add_tree_root(node->node, fake);
3915 +       if (!IS_ERR(level->new_root)) {
3916 +               assert("nikita-1210", znode_is_root(level->new_root));
3917 +               node->deallocate = 1;
3918 +               result =
3919 +                   longterm_lock_znode(&node->lock_handle, level->new_root,
3920 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3921 +               if (result == 0)
3922 +                       zput(level->new_root);  /* NOTE(review): presumably drops add_tree_root()'s reference -- confirm */
3923 +       } else {
3924 +               result = PTR_ERR(level->new_root);
3925 +               level->new_root = NULL;
3926 +       }
3927 +       return result;
3928 +}
3929 +
3930 +/* allocate new znode and add the operation that inserts the
3931 +   pointer to it into the parent node into the todo level
3932 +
3933 +   Allocate new znode, add it into carry queue and post into @todo queue
3934 +   request to add pointer to new node into its parent.
3935 +
3936 +   This is carry related routine that calls new_node() to allocate new
3937 +   node. Returns the new carry node, or an ERR_PTR-style error pointer.
3938 +*/
3939 +carry_node *add_new_znode(znode * brother      /* existing left neighbor of new
3940 +                                                * node */ ,
3941 +                         carry_node * ref      /* carry node after which new
3942 +                                                * carry node is to be inserted
3943 +                                                * into queue. This affects
3944 +                                                * locking. */ ,
3945 +                         carry_level * doing   /* carry queue where new node is
3946 +                                                * to be added */ ,
3947 +                         carry_level * todo    /* carry queue where COP_INSERT
3948 +                                                * operation to add pointer to
3949 +                                                * new node will be added */ )
3950 +{
3951 +       carry_node *fresh;
3952 +       znode *new_znode;
3953 +       carry_op *add_pointer;
3954 +       carry_plugin_info info;
3955 +
3956 +       assert("nikita-1048", brother != NULL);
3957 +       assert("nikita-1049", todo != NULL);
3958 +
3959 +       /* There are many possible variations here: to what parent
3960 +          new node will be attached and where. For simplicity, always
3961 +          do the following:
3962 +
3963 +          (1) new node and @brother will have the same parent.
3964 +
3965 +          (2) new node is added on the right of @brother
3966 +
3967 +        */
3968 +
3969 +       fresh = add_carry_skip(doing, ref ? POOLO_AFTER : POOLO_LAST, ref);
3970 +       if (IS_ERR(fresh))
3971 +               return fresh;
3972 +
3973 +       fresh->deallocate = 1;
3974 +       fresh->free = 1;
3975 +
3976 +       new_znode = new_node(brother, znode_get_level(brother));
3977 +       if (IS_ERR(new_znode))
3978 +               /* @fresh will be deallocated automatically by error
3979 +                  handling code in the caller. */
3980 +               return (carry_node *) new_znode;
3981 +
3982 +       /* new_node() returned znode with x_count 1. Caller has to decrease
3983 +          it. make_space() does. */
3984 +
3985 +       ZF_SET(new_znode, JNODE_ORPHAN);
3986 +       fresh->node = new_znode;
3987 +       /* NOTE(review): @ref may be NULL per the add_carry_skip() call above, yet reaches carry_real() here -- confirm */
3988 +       while (ZF_ISSET(carry_real(ref), JNODE_ORPHAN)) {
3989 +               ref = carry_node_prev(ref);
3990 +               assert("nikita-1606", !carry_node_end(doing, ref));
3991 +       }
3992 +
3993 +       info.todo = todo;
3994 +       info.doing = doing;
3995 +       add_pointer = node_post_carry(&info, COP_INSERT, carry_real(ref), 1);
3996 +       if (IS_ERR(add_pointer)) {
3997 +               /* no need to deallocate @new_znode here: it will be
3998 +                  deallocated during carry error handling. */
3999 +               return (carry_node *) add_pointer;
4000 +       }
4001 +
4002 +       add_pointer->u.insert.type = COPT_CHILD;
4003 +       add_pointer->u.insert.child = fresh;
4004 +       add_pointer->u.insert.brother = brother;
4005 +       /* initially new node spans empty key range: both delimiting keys equal @brother's right key */
4006 +       write_lock_dk(znode_get_tree(brother));
4007 +       znode_set_ld_key(new_znode,
4008 +                        znode_set_rd_key(new_znode,
4009 +                                         znode_get_rd_key(brother)));
4010 +       write_unlock_dk(znode_get_tree(brother));
4011 +       return fresh;
4012 +}
4013 +
4014 +/* DEBUGGING FUNCTIONS.
4015 +
4016 +   Probably we also should leave them on even when
4017 +   debugging is turned off to print dumps at errors.
4018 +*/
4019 +#if REISER4_DEBUG
4020 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
4021 +{
4022 +       carry_node *node;
4023 +       carry_node *tmp_node;
4024 +       /* Returns 1 when @level passes every check below, 0 otherwise. */
4025 +       if (level == NULL)
4026 +               return 0;
4027 +       /* track type must be 0 or one of the carry_track_type values */
4028 +       if (level->track_type != 0 &&
4029 +           level->track_type != CARRY_TRACK_NODE &&
4030 +           level->track_type != CARRY_TRACK_CHANGE)
4031 +               return 0;
4032 +
4033 +       /* check that adjacent nodes are in ascending order of their leftmost keys */
4034 +       for_all_nodes(level, node, tmp_node) {
4035 +               znode *left;
4036 +               znode *right;
4037 +
4038 +               reiser4_key lkey;
4039 +               reiser4_key rkey;
4040 +
4041 +               if (node != carry_node_front(level)) {
4042 +                       if (state == CARRY_TODO) {      /* todo queue: compare base ->node */
4043 +                               right = node->node;
4044 +                               left = carry_node_prev(node)->node;
4045 +                       } else {        /* otherwise compare the carry_real() nodes */
4046 +                               right = carry_real(node);
4047 +                               left = carry_real(carry_node_prev(node));
4048 +                       }
4049 +                       if (right == NULL || left == NULL)
4050 +                               continue;       /* nothing to compare */
4051 +                       if (node_is_empty(right) || node_is_empty(left))
4052 +                               continue;       /* empty nodes are skipped */
4053 +                       if (!keyle(leftmost_key_in_node(left, &lkey),
4054 +                                  leftmost_key_in_node(right, &rkey))) {
4055 +                               warning("", "wrong key order");
4056 +                               return 0;
4057 +                       }
4058 +               }
4059 +       }
4060 +       return 1;       /* all checks passed */
4061 +}
4062 +#endif
4063 +
4064 +/* one-letter name ("t" or "f") for a truth value; used by the print_*() dumpers below */
4065 +static const char *tf(int boolean /* truth value */ )
4066 +{
4067 +       return boolean ? "t" : "f";
4068 +}
4069 +
4070 +/* symbolic name for carry operation; an unrecognized opcode is formatted into a static buffer */
4071 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
4072 +{
4073 +       switch (op) {
4074 +       case COP_INSERT:
4075 +               return "COP_INSERT";
4076 +       case COP_DELETE:
4077 +               return "COP_DELETE";
4078 +       case COP_CUT:
4079 +               return "COP_CUT";
4080 +       case COP_PASTE:
4081 +               return "COP_PASTE";
4082 +       case COP_UPDATE:
4083 +               return "COP_UPDATE";
4084 +       case COP_EXTENT:
4085 +               return "COP_EXTENT";
4086 +       case COP_INSERT_FLOW:
4087 +               return "COP_INSERT_FLOW";
4088 +       default:{
4089 +                       /* not mt safe, but who cares? */
4090 +                       static char buf[32];
4091 +                       /* "unknown op: " (12) + up to 8 hex digits + NUL needs 21 bytes; keep the write bounded */
4092 +                       snprintf(buf, sizeof(buf), "unknown op: %x", (unsigned)op);
4093 +                       return buf;
4094 +               }
4095 +       }
4096 +}
4097 +
4098 +/* dump address and flag bits of carry node via printk(); a NULL @node prints "null" */
4099 +static void print_carry(const char *prefix /* prefix to print */ ,
4100 +                       carry_node * node /* node to print */ )
4101 +{
4102 +       if (node == NULL) {
4103 +               printk("%s: null\n", prefix);
4104 +               return;
4105 +       }
4106 +       printk
4107 +           ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
4108 +            prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
4109 +            tf(node->free), tf(node->deallocate));
4110 +}
4111 +
4112 +/* dump carry operation: opcode, target node and the opcode-specific payload; NULL @op prints "null" */
4113 +static void print_op(const char *prefix /* prefix to print */ ,
4114 +                    carry_op * op /* operation to print */ )
4115 +{
4116 +       if (op == NULL) {
4117 +               printk("%s: null\n", prefix);
4118 +               return;
4119 +       }
4120 +       printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
4121 +       print_carry("\tnode", op->node);
4122 +       switch (op->op) {
4123 +       case COP_INSERT:
4124 +       case COP_PASTE:
4125 +               print_coord("\tcoord",
4126 +                           op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
4127 +               print_key("\tkey", op->u.insert.d ? op->u.insert.d->key : NULL);
4128 +               print_carry("\tchild", op->u.insert.child);
4129 +               break;
4130 +       case COP_DELETE:
4131 +               print_carry("\tchild", op->u.delete.child);
4132 +               break;
4133 +       case COP_CUT:
4134 +               if (op->u.cut_or_kill.is_cut) { /* ->is_cut selects the ->u.cut member of the union */
4135 +                       print_coord("\tfrom",
4136 +                                   op->u.cut_or_kill.u.cut->params.from, 0);
4137 +                       print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
4138 +                                   0);
4139 +               } else {        /* otherwise this is a kill and ->u.kill is the valid member */
4140 +                       print_coord("\tfrom",
4141 +                                   op->u.cut_or_kill.u.kill->params.from, 0);
4142 +                       print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
4143 +                                   0);
4144 +               }
4145 +               break;
4146 +       case COP_UPDATE:
4147 +               print_carry("\tleft", op->u.update.left);
4148 +               break;
4149 +       default:
4150 +               /* other opcodes carry no payload that is printed here */
4151 +               break;
4152 +       }
4153 +}
4154 +
4155 +/* dump all nodes, then all operations, of a @level via printk(); NULL @level prints "null" */
4156 +static void print_level(const char *prefix /* prefix to print */ ,
4157 +                       carry_level * level /* level to print */ )
4158 +{
4159 +       carry_node *node;
4160 +       carry_node *tmp_node;
4161 +       carry_op *op;
4162 +       carry_op *tmp_op;
4163 +
4164 +       if (level == NULL) {
4165 +               printk("%s: null\n", prefix);
4166 +               return;
4167 +       }
4168 +       printk("%s: %p, restartable: %s\n",
4169 +              prefix, level, tf(level->restartable));
4170 +
4171 +       for_all_nodes(level, node, tmp_node)
4172 +           print_carry("\tcarry node", node);
4173 +       for_all_ops(level, op, tmp_op)
4174 +           print_op("\tcarry op", op);
4175 +}
4176 +
4177 +/* Make Linus happy.
4178 +   Local variables:
4179 +   c-indentation-style: "K&R"
4180 +   mode-name: "LC"
4181 +   c-basic-offset: 8
4182 +   tab-width: 8
4183 +   fill-column: 120
4184 +   scroll-step: 1
4185 +   End:
4186 +*/
4187 diff --git a/fs/reiser4/carry.h b/fs/reiser4/carry.h
4188 new file mode 100644
4189 index 0000000..0cc56f4
4190 --- /dev/null
4191 +++ b/fs/reiser4/carry.h
4192 @@ -0,0 +1,442 @@
4193 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4194 +
4195 +/* Functions and data types to "carry" tree modification(s) upward.
4196 +   See fs/reiser4/carry.c for details. */
4197 +
4198 +#if !defined( __FS_REISER4_CARRY_H__ )
4199 +#define __FS_REISER4_CARRY_H__
4200 +
4201 +#include "forward.h"
4202 +#include "debug.h"
4203 +#include "pool.h"
4204 +#include "znode.h"
4205 +
4206 +#include <linux/types.h>
4207 +
4208 +/* &carry_node - "location" of carry node.
4209 +
4210 +   "location" of node that is involved or going to be involved in
4211 +   carry process. Node where operation will be carried to on the
4212 +   parent level cannot be recorded explicitly. Operation will be carried
4213 +   usually to the parent of some node (where changes are performed at
4214 +   the current level) or, to the left neighbor of its parent. But while
4215 +   modifications are performed at the current level, parent may
4216 +   change. So, we have to allow some indirection (or, positively,
4217 +   flexibility) in locating carry nodes.
4218 +
4219 +*/
4220 +typedef struct carry_node {
4221 +       /* pool linkage */
4222 +       reiser4_pool_header header;
4223 +
4224 +       /* base node from which real_node is calculated. See
4225 +          fs/reiser4/carry.c:lock_carry_node(). */
4226 +       znode *node;
4227 +
4228 +       /* how to get ->real_node */
4229 +       /* to get ->real_node obtain parent of ->node */
4230 +       __u32 parent:1;
4231 +       /* to get ->real_node obtain left neighbor of parent of
4232 +          ->node */
4233 +       __u32 left:1;
4234 +       __u32 left_before:1;    /* lock_carry_node() first steps to the begetting brother on the left */
4235 +
4236 +       /* locking */
4237 +
4238 +       /* this node was locked by carry process and should be
4239 +          unlocked when carry leaves a level */
4240 +       __u32 unlock:1;
4241 +
4242 +       /* disk block for this node was allocated by carry process and
4243 +          should be deallocated when carry leaves a level */
4244 +       __u32 deallocate:1;
4245 +       /* this carry node was allocated by carry process and should be
4246 +          freed when carry leaves a level */
4247 +       __u32 free:1;
4248 +
4249 +       /* lock handle for this node, filled in by lock_carry_node() */
4250 +       lock_handle lock_handle;
4251 +} carry_node;
4252 +
4253 +/* &carry_opcode - elementary operations that can be carried upward
4254 +
4255 +   Operations that carry() can handle. This list is supposed to be
4256 +   expanded.
4257 +
4258 +   Each carry operation (cop) is handled by appropriate function defined
4259 +   in fs/reiser4/carry.c. For example COP_INSERT is handled by
4260 +   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
4261 +   call plugins of nodes affected by operation to modify nodes' content
4262 +   and to gather operations to be performed on the next level.
4263 +   NOTE(review): the handlers may actually live in fs/reiser4/carry_ops.c -- confirm.
4264 +*/
4265 +typedef enum {
4266 +       /* insert new item into node. */
4267 +       COP_INSERT,
4268 +       /* delete pointer from parent node */
4269 +       COP_DELETE,
4270 +       /* remove part of or whole node (cut or kill, see ->cut_or_kill in &carry_op). */
4271 +       COP_CUT,
4272 +       /* increase size of item. */
4273 +       COP_PASTE,
4274 +       /* insert extent (that is sequence of unformatted nodes). */
4275 +       COP_EXTENT,
4276 +       /* update delimiting key in least common ancestor of two
4277 +          nodes. This is performed when items are moved between two
4278 +          nodes.
4279 +        */
4280 +       COP_UPDATE,
4281 +       /* insert flow */
4282 +       COP_INSERT_FLOW,
4283 +       COP_LAST_OP,    /* not an operation: one past the last valid opcode */
4284 +} carry_opcode;
4285 +
4286 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
4287 +
4288 +/* mode (or subtype) of COP_{INSERT|PASTE} operation, stored in the ->type field of
4289 +   &carry_op's insert data. Specifies how target item is determined. */
4290 +typedef enum {
4291 +       /* target item is one containing pointer to the ->child node */
4292 +       COPT_CHILD,
4293 +       /* target item is given explicitly by @coord */
4294 +       COPT_ITEM_DATA,
4295 +       /* target item is given by key */
4296 +       COPT_KEY,
4297 +       /* see insert_paste_common() for more comments on this. */
4298 +       COPT_PASTE_RESTARTED,
4299 +} cop_insert_pos_type;
4300 +
4301 +/* flags to cut and delete (bit mask values) */
4302 +typedef enum {
4303 +       /* don't kill node even if it became completely empty as a result of
4304 +        * cut. This is needed for eottl handling. See carry_extent() for
4305 +        * details. */
4306 +       DELETE_RETAIN_EMPTY = (1 << 0)
4307 +} cop_delete_flag;
4308 +
4309 +/*
4310 + * carry() implements "lock handle tracking" feature.
4311 + *
4312 + * Callers supply carry with node where to perform initial operation and lock
4313 + * handle on this node. Trying to optimize node utilization carry may actually
4314 + * move insertion point to different node. Callers expect that lock handle
4315 + * will be transferred to the new node also.
4316 + * carry_level_invariant() also accepts a track type of 0, presumably meaning "no tracking".
4317 + */
4318 +typedef enum {
4319 +       /* transfer lock handle along with insertion point */
4320 +       CARRY_TRACK_CHANGE = 1,
4321 +       /* acquire new lock handle to the node where insertion point is. This
4322 +        * is used when carry() client doesn't initially possess lock handle
4323 +        * on the insertion point node, for example, by extent insertion
4324 +        * code. See carry_extent(). */
4325 +       CARRY_TRACK_NODE = 2
4326 +} carry_track_type;
4327 +
4328 +/* data supplied to COP_{INSERT|PASTE} by callers; referenced as ->d from &carry_op */
4329 +typedef struct carry_insert_data {
4330 +       /* position where new item is to be inserted */
4331 +       coord_t *coord;
4332 +       /* new item description */
4333 +       reiser4_item_data *data;
4334 +       /* key of new item */
4335 +       const reiser4_key *key;
4336 +} carry_insert_data;
4337 +
4338 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
4339 +struct cut_kill_params {
4340 +       /* coord where cut starts (inclusive) */
4341 +       coord_t *from;
4342 +       /* coord where cut stops (inclusive, this item/unit will also be
4343 +        * cut) */
4344 +       coord_t *to;
4345 +       /* starting key. This is necessary when item and unit pos don't
4346 +        * uniquely identify what portion of tree to remove. For example, this
4347 +        * indicates what portion of extent unit will be affected. */
4348 +       const reiser4_key *from_key;
4349 +       /* exclusive stop key */
4350 +       const reiser4_key *to_key;
4351 +       /* if this is not NULL, smallest actually removed key is stored
4352 +        * here. */
4353 +       reiser4_key *smallest_removed;
4354 +       /* set when kill_node_content() is called for file truncate */
4355 +       int truncate;
4356 +};
4357 +
4358 +struct carry_cut_data {
4359 +       struct cut_kill_params params;  /* cut carries only the shared cut/kill parameters */
4360 +};
4361 +
4362 +struct carry_kill_data {
4363 +       struct cut_kill_params params;  /* shared cut/kill parameters; first member, as in carry_cut_data */
4364 +       /* parameter to be passed to the ->kill_hook() method of item
4365 +        * plugin */
4366 +       /*void *iplug_params; *//* FIXME: unused currently */
4367 +       /* if not NULL---inode whose items are being removed. This is needed
4368 +        * for ->kill_hook() of extent item to update VM structures when
4369 +        * removing pages. */
4370 +       struct inode *inode;
4371 +       /* sibling list maintenance is complicated by existence of eottl. When
4372 +        * eottl whose left and right neighbors are formatted leaves is
4373 +        * removed, one has to connect said leaves in the sibling list. This
4374 +        * cannot be done when extent removal is just started as locking rules
4375 +        * require sibling list update to happen atomically with removal of
4376 +        * extent item. Therefore: 1. pointers to left and right neighbors
4377 +        * have to be passed down to the ->kill_hook() of extent item, and
4378 +        * 2. said neighbors have to be locked. */
4379 +       lock_handle *left;
4380 +       lock_handle *right;
4381 +       /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
4382 +       unsigned flags;
4383 +       char *buf;      /* NOTE(review): undocumented; its use is not visible from this header */
4384 +};
4385 +
4386 +/* &carry_op - operation to "carry" upward.
4387 +
4388 +   Description of an operation we want to "carry" to the upper level of
4389 +   a tree: e.g, when we insert something and there is not enough space
4390 +   we allocate a new node and "carry" the operation of inserting a
4391 +   pointer to the new node to the upper level, on removal of empty node,
4392 +   we carry up operation of removing appropriate entry from parent.
4393 +
4394 +   There are two types of carry ops: when adding or deleting a node, the
4395 +   node at the parent level where appropriate modification has to be
4396 +   performed is known in advance. When shifting items between nodes
4397 +   (split, merge), delimiting key should be changed in the least common
4398 +   parent of the nodes involved that is not known in advance.
4399 +
4400 +   For the operations of the first type we store in &carry_op pointer to
4401 +   the &carry_node at the parent level. For the operation of the second
4402 +   type we store &carry_node of parents of the left and right nodes
4403 +   modified and keep track of them upward until they coincide.
4404 +
4405 +*/
4406 +typedef struct carry_op {
4407 +       /* pool linkage */
4408 +       reiser4_pool_header header;
4409 +       carry_opcode op;
4410 +       /* node on which operation is to be performed:
4411 +
4412 +          for insert, paste: node where new item is to be inserted
4413 +
4414 +          for delete: node where pointer is to be deleted
4415 +
4416 +          for cut: node to cut from
4417 +
4418 +          for update: node where delimiting key is to be modified
4419 +
4420 +          for modify: parent of modified node
4421 +
4422 +        */
4423 +       carry_node *node;
4424 +       union {
4425 +               struct {
4426 +                       /* (sub-)type of insertion/paste. Taken from
4427 +                          cop_insert_pos_type. */
4428 +                       __u8 type;
4429 +                       /* various operation flags. Taken from
4430 +                          cop_insert_flag. */
4431 +                       __u8 flags;
4432 +                       carry_insert_data *d;
4433 +                       carry_node *child;
4434 +                       znode *brother;
4435 +               } insert, paste, extent;        /* three names for the same member layout */
4436 +
4437 +               struct {
4438 +                       int is_cut;     /* non-zero: ->u.cut is valid; zero: ->u.kill is valid */
4439 +                       union {
4440 +                               carry_kill_data *kill;
4441 +                               carry_cut_data *cut;
4442 +                       } u;
4443 +               } cut_or_kill;
4444 +
4445 +               struct {
4446 +                       carry_node *left;
4447 +               } update;
4448 +               struct {
4449 +                       /* changed child */
4450 +                       carry_node *child;
4451 +                       /* bitmask of changes. See &cop_modify_flag */
4452 +                       __u32 flag;
4453 +               } modify;
4454 +               struct {
4455 +                       /* flags to deletion operation. Are taken from
4456 +                          cop_delete_flag */
4457 +                       __u32 flags;
4458 +                       /* child to delete from parent. If this is
4459 +                          NULL, delete op->node.  */
4460 +                       carry_node *child;
4461 +               } delete;
4462 +               struct {
4463 +                       /* various operation flags. Taken from
4464 +                          cop_insert_flag. */
4465 +                       __u32 flags;
4466 +                       flow_t *flow;
4467 +                       coord_t *insert_point;
4468 +                       reiser4_item_data *data;
4469 +                       /* flow insertion is limited by number of new blocks
4470 +                          added in that operation which do not get any data
4471 +                          but part of flow. This limit is set by macro
4472 +                          CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
4473 +                          of nodes added already during one carry_flow */
4474 +                       int new_nodes;
4475 +               } insert_flow;
4476 +       } u;
4477 +} carry_op;
4478 +
4479 +/* &carry_pool - preallocated pool of carry operations and carry nodes */
4480 +typedef struct carry_pool {
4481 +       carry_op op[CARRIES_POOL_SIZE]; /* preallocated carry operations */
4482 +       reiser4_pool op_pool; /* presumably the pool handing out entries of ->op -- TODO confirm */
4483 +       carry_node node[NODES_LOCKED_POOL_SIZE]; /* preallocated carry nodes */
4484 +       reiser4_pool node_pool; /* pool that carry nodes are returned to with reiser4_pool_free() */
4485 +} carry_pool;
4486 +
4487 +/* &carry_level - carry process on given level
4488 +
4489 +   Description of balancing process on the given level.
4490 +
4491 +   No need for locking here, as carry_level is essentially per
4492 +   thread thing (for now).
4493 +
4494 +*/
4495 +struct carry_level {
4496 +       /* this level may be restarted */
4497 +       __u32 restartable:1;
4498 +       /* list of carry nodes on this level, ordered by key order */
4499 +       struct list_head nodes;
4500 +       struct list_head ops; /* list of carry operations on this level (walked by for_all_ops()) */
4501 +       /* pool where new objects are allocated from */
4502 +       carry_pool *pool;
4503 +       int ops_num; /* presumably number of entries on ->ops -- TODO confirm */
4504 +       int nodes_num; /* presumably number of entries on ->nodes -- TODO confirm */
4505 +       /* new root created on this level, if any */
4506 +       znode *new_root;
4507 +       /* This is set by caller (insert_by_key(), resize_item(), etc.) when
4508 +          they want ->tracked to automagically wander to the node where
4509 +          insertion point moved after insert or paste.
4510 +        */
4511 +       carry_track_type track_type;
4512 +       /* lock handle supplied by user that we are tracking. See
4513 +          above. */
4514 +       lock_handle *tracked;
4515 +};
4516 +
4517 +/* information carry passes to plugin methods that may add new operations to
4518 +   the @todo queue  */
4519 +struct carry_plugin_info {
4520 +       carry_level *doing;
4521 +       carry_level *todo;
4522 +};
4523 +
4524 +int carry(carry_level * doing, carry_level * done);
4525 +
4526 +carry_node *add_carry(carry_level * level, pool_ordering order,
4527 +                     carry_node * reference);
4528 +carry_node *add_carry_skip(carry_level * level, pool_ordering order,
4529 +                          carry_node * reference);
4530 +
4531 +extern carry_node *insert_carry_node(carry_level * doing,
4532 +                                    carry_level * todo, const znode * node);
4533 +
4534 +extern carry_pool *init_carry_pool(int);
4535 +extern void done_carry_pool(carry_pool * pool);
4536 +
4537 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4538 +
4539 +extern carry_op *post_carry(carry_level * level, carry_opcode op, znode * node,
4540 +                           int apply_to_parent);
4541 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4542 +                                znode * node, int apply_to_parent_p);
4543 +
4544 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4545 +                         carry_level * doing, carry_level * todo);
4546 +
4547 +carry_node *find_carry_node(carry_level * level, const znode * node);
4548 +
4549 +extern znode *carry_real(const carry_node * node);
4550 +
4551 +/* helper macros to iterate over carry queues */
4552 +
4553 +#define carry_node_next( node )                                        \
4554 +       list_entry((node)->header.level_linkage.next, carry_node,       \
4555 +                  header.level_linkage)
4556 +
4557 +#define carry_node_prev( node )                                        \
4558 +       list_entry((node)->header.level_linkage.prev, carry_node,       \
4559 +                  header.level_linkage)
4560 +
4561 +#define carry_node_front( level )                                              \
4562 +       list_entry((level)->nodes.next, carry_node, header.level_linkage)
4563 +
4564 +#define carry_node_back( level )                                               \
4565 +       list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4566 +
4567 +#define carry_node_end( level, node )                          \
4568 +       (&(level)->nodes == &(node)->header.level_linkage)
4569 +
4570 +/* macro to iterate over all operations in a @level; @tmp always holds the successor of @op, so @op may be removed inside the loop body. Arguments are evaluated more than once. */
4571 +#define for_all_ops( level /* carry level (of type carry_level *) */,                  \
4572 +                    op    /* pointer to carry operation, modified by loop (of          \
4573 +                           * type carry_op *) */,                                      \
4574 +                    tmp   /* pointer to carry operation (of type carry_op *),          \
4575 +                           * used to make iterator stable in the face of               \
4576 +                           * deletions from the level */ )                             \
4577 +for ((op) = list_entry((level)->ops.next, carry_op, header.level_linkage),             \
4578 +     (tmp) = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage); \
4579 +     &(op)->header.level_linkage != &(level)->ops;                                     \
4580 +     (op) = (tmp),                                                                     \
4581 +     (tmp) = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage))
4582 +
4583 +#if 0
4584 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ),               \
4585 +     tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ;              \
4586 +     ! pool_level_list_end( &level -> ops, &op -> header ) ;                   \
4587 +     op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4588 +#endif
4589 +
4590 +/* macro to iterate over all nodes in a @level */                                              \
4591 +#define for_all_nodes( level /* carry level (of type carry_level *) */,                                \
4592 +                      node  /* pointer to carry node, modified by loop (of                     \
4593 +                             * type carry_node *) */,                                          \
4594 +                      tmp   /* pointer to carry node (of type carry_node *),                   \
4595 +                             * used to make iterator stable in the face of *                   \
4596 +                             * deletions from the level */ )                                   \
4597 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage),                   \
4598 +     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);      \
4599 +     &node->header.level_linkage != &level->nodes;                                             \
4600 +     node = tmp,                                                                               \
4601 +     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4602 +
4603 +#if 0
4604 +for( node = carry_node_front( level ),                                         \
4605 +     tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ;         \
4606 +     node = tmp, tmp = carry_node_next( node ) )
4607 +#endif
4608 +
4609 +/* macro to iterate over all nodes in a @level in reverse order
4610 +
4611 +   This is used, because nodes are unlocked in reversed order of locking */
4612 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */,   \
4613 +                           node  /* pointer to carry node, modified by loop    \
4614 +                                  * (of type carry_node *) */,                 \
4615 +                           tmp   /* pointer to carry node (of type carry_node  \
4616 +                                  * *), used to make iterator stable in the    \
4617 +                                  * face of deletions from the level */ )      \
4618 +for( node = carry_node_back( level ),          \
4619 +     tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ;         \
4620 +     node = tmp, tmp = carry_node_prev( node ) )
4621 +
4622 +/* __FS_REISER4_CARRY_H__ */
4623 +#endif
4624 +
4625 +/* Make Linus happy.
4626 +   Local variables:
4627 +   c-indentation-style: "K&R"
4628 +   mode-name: "LC"
4629 +   c-basic-offset: 8
4630 +   tab-width: 8
4631 +   fill-column: 120
4632 +   scroll-step: 1
4633 +   End:
4634 +*/
4635 diff --git a/fs/reiser4/carry_ops.c b/fs/reiser4/carry_ops.c
4636 new file mode 100644
4637 index 0000000..bfc1ee7
4638 --- /dev/null
4639 +++ b/fs/reiser4/carry_ops.c
4640 @@ -0,0 +1,2103 @@
4641 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4642 +
4643 +/* implementation of carry operations */
4644 +
4645 +#include "forward.h"
4646 +#include "debug.h"
4647 +#include "key.h"
4648 +#include "coord.h"
4649 +#include "plugin/item/item.h"
4650 +#include "plugin/node/node.h"
4651 +#include "jnode.h"
4652 +#include "znode.h"
4653 +#include "block_alloc.h"
4654 +#include "tree_walk.h"
4655 +#include "pool.h"
4656 +#include "tree_mod.h"
4657 +#include "carry.h"
4658 +#include "carry_ops.h"
4659 +#include "tree.h"
4660 +#include "super.h"
4661 +#include "reiser4.h"
4662 +
4663 +#include <linux/types.h>
4664 +#include <linux/err.h>
4665 +
4666 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4667 +                           carry_level * doing, carry_level * todo,
4668 +                           unsigned int including_insert_coord_p);
4669 +
4670 +extern int lock_carry_node(carry_level * level, carry_node * node);
4671 +extern int lock_carry_node_tail(carry_node * node);
4672 +
4673 +/* find left neighbor of the carry node @op->node on the @doing level
4674 +
4675 +   Returns the carry node of the left neighbor (already queued, or newly added
4676 +   to @doing before @op->node and write-locked), NULL when there is no usable
4677 +   left neighbor, ERR_PTR(-E_REPEAT) when a restartable level has to be
4678 +   restarted, or ERR_PTR() of an add_carry_skip()/lock_carry_node_tail() failure. */
4679 +static carry_node *find_left_neighbor(carry_op * op    /* node to find left
4680 +                                                        * neighbor of */ ,
4681 +                                     carry_level * doing /* level to scan */ )
4682 +{
4683 +       int result;
4684 +       carry_node *node;
4685 +       carry_node *left;
4686 +       int flags;
4687 +       reiser4_tree *tree;
4688 +
4689 +       node = op->node;
4690 +
4691 +       tree = current_tree;
4692 +       read_lock_tree(tree);
4693 +       /* first, check whether left neighbor is already in a @doing queue */
4694 +       if (carry_real(node)->left != NULL) {
4695 +               /* NOTE: there is locking subtlety here. Look into
4696 +                * find_right_neighbor() for more info */
4697 +               if (find_carry_node(doing, carry_real(node)->left) != NULL) {
4698 +                       read_unlock_tree(tree);
4699 +                       left = node;
4700 +                       do {
4701 +                               left = list_entry(left->header.level_linkage.prev,
4702 +                                                 carry_node, header.level_linkage);
4703 +                               assert("nikita-3408", !carry_node_end(doing,
4704 +                                                                     left));
4705 +                       } while (carry_real(left) == carry_real(node));
4706 +                       return left;
4707 +               }
4708 +       }
4709 +       read_unlock_tree(tree);
4710 +
4711 +       left = add_carry_skip(doing, POOLO_BEFORE, node);
4712 +       if (IS_ERR(left))
4713 +               return left;
4714 +
4715 +       left->node = node->node;
4716 +       left->free = 1;
4717 +
4718 +       flags = GN_TRY_LOCK;
4719 +       if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4720 +               flags |= GN_NO_ALLOC;
4721 +
4722 +       /* then, feeling lucky, peek left neighbor in the cache. */
4723 +       result = reiser4_get_left_neighbor(&left->lock_handle, carry_real(node),
4724 +                                          ZNODE_WRITE_LOCK, flags);
4725 +       if (result == 0) {
4726 +               /* ok, node found and locked. */
4727 +               result = lock_carry_node_tail(left);
4728 +               if (result != 0)
4729 +                       left = ERR_PTR(result);
4730 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4731 +               /* node is leftmost node in a tree, or neighbor wasn't in
4732 +                  cache, or there is an extent on the left. */
4733 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4734 +               left = NULL;
4735 +       } else if (doing->restartable) {
4736 +               /* if left neighbor is locked, and level is restartable, add
4737 +                  new node to @doing and restart. */
4738 +               assert("nikita-913", node->parent != 0);
4739 +               assert("nikita-914", node->node != NULL);
4740 +               left->left = 1;
4741 +               left->free = 0;
4742 +               left = ERR_PTR(-E_REPEAT);
4743 +       } else {
4744 +               /* left neighbor is locked, level cannot be restarted. Just
4745 +                  ignore left neighbor. */
4746 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4747 +               left = NULL;
4748 +       }
4749 +       return left;
4750 +}
4751 +
4752 +/* find right neighbor of the carry node @op->node on the @doing level
4753 +
4754 +   Returns the carry node of the right neighbor (already queued, or newly
4755 +   added to @doing after @op->node and write-locked), NULL when there is no
4756 +   usable right neighbor (-E_NO_NEIGHBOR or -ENOENT), or ERR_PTR() of any
4757 +   other failure. See comments in the body. */
4758 +static carry_node *find_right_neighbor(carry_op * op   /* node to find right
4759 +                                                        * neighbor of */ ,
4760 +                                      carry_level * doing /* level to scan */ )
4761 +{
4762 +       int result;
4763 +       carry_node *node;
4764 +       carry_node *right;
4765 +       lock_handle lh;
4766 +       int flags;
4767 +       reiser4_tree *tree;
4768 +
4769 +       init_lh(&lh);
4770 +
4771 +       node = op->node;
4772 +
4773 +       tree = current_tree;
4774 +       read_lock_tree(tree);
4775 +       /* first, check whether right neighbor is already in a @doing queue */
4776 +       if (carry_real(node)->right != NULL) {
4777 +               /*
4778 +                * Tree lock is taken here anyway, because, even if _outcome_
4779 +                * of (find_carry_node() != NULL) doesn't depends on
4780 +                * concurrent updates to ->right, find_carry_node() cannot
4781 +                * work with second argument NULL. Hence, following comment is
4782 +                * of historic importance only.
4783 +                *
4784 +                * Subtle:
4785 +                *
4786 +                * Q: why don't we need tree lock here, looking for the right
4787 +                * neighbor?
4788 +                *
4789 +                * A: even if value of node->real_node->right were changed
4790 +                * during find_carry_node() execution, outcome of execution
4791 +                * wouldn't change, because (in short) other thread cannot add
4792 +                * elements to the @doing, and if node->real_node->right
4793 +                * already was in @doing, value of node->real_node->right
4794 +                * couldn't change, because node cannot be inserted between
4795 +                * locked neighbors.
4796 +                */
4797 +               if (find_carry_node(doing, carry_real(node)->right) != NULL) {
4798 +                       read_unlock_tree(tree);
4799 +                       /*
4800 +                        * What we are doing here (this is also applicable to
4801 +                        * the find_left_neighbor()).
4802 +                        *
4803 +                        * tree_walk.c code requires that insertion of a
4804 +                        * pointer to a child, modification of parent pointer
4805 +                        * in the child, and insertion of the child into
4806 +                        * sibling list are atomic (see
4807 +                        * plugin/item/internal.c:create_hook_internal()).
4808 +                        *
4809 +                        * carry allocates new node long before pointer to it
4810 +                        * is inserted into parent and, actually, long before
4811 +                        * parent is even known. Such allocated-but-orphaned
4812 +                        * nodes are only trackable through carry level lists.
4813 +                        *
4814 +                        * Situation that is handled here is following: @node
4815 +                        * has valid ->right pointer, but there is
4816 +                        * allocated-but-orphaned node in the carry queue that
4817 +                        * is logically between @node and @node->right. Here
4818 +                        * we are searching for it. Critical point is that
4819 +                        * this is only possible if @node->right is also in
4820 +                        * the carry queue (this is checked above), because
4821 +                        * this is the only way new orphaned node could be
4822 +                        * inserted between them (before inserting new node,
4823 +                        * make_space() first tries to shift to the right, so,
4824 +                        * right neighbor will be locked and queued).
4825 +                        *
4826 +                        */
4827 +                       right = node;
4828 +                       do {
4829 +                               right = list_entry(right->header.level_linkage.next,
4830 +                                                  carry_node, header.level_linkage);
4831 +                               assert("nikita-3408", !carry_node_end(doing,
4832 +                                                                     right));
4833 +                       } while (carry_real(right) == carry_real(node));
4834 +                       return right;
4835 +               }
4836 +       }
4837 +       read_unlock_tree(tree);
4838 +
4839 +       flags = GN_CAN_USE_UPPER_LEVELS;
4840 +       if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4841 +               flags = GN_NO_ALLOC;
4842 +
4843 +       /* then, try to lock right neighbor */
4844 +       init_lh(&lh);
4845 +       result = reiser4_get_right_neighbor(&lh, carry_real(node),
4846 +                                           ZNODE_WRITE_LOCK, flags);
4847 +       if (result == 0) {
4848 +               /* ok, node found and locked. */
4849 +               right = add_carry_skip(doing, POOLO_AFTER, node);
4850 +               if (!IS_ERR(right)) {
4851 +                       right->node = lh.node;
4852 +                       move_lh(&right->lock_handle, &lh);
4853 +                       right->free = 1;
4854 +                       result = lock_carry_node_tail(right);
4855 +                       if (result != 0)
4856 +                               right = ERR_PTR(result);
4857 +               }
4858 +       } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4859 +               /* node is rightmost node in a tree, or neighbor wasn't in
4860 +                  cache, or there is an extent on the right. */
4861 +               right = NULL;
4862 +       } else
4863 +               right = ERR_PTR(result);
4864 +       done_lh(&lh);
4865 +       return right;
4866 +}
4867 +
4868 +/* how much free space in a @node is needed for @op
4869 +
4870 +   How much space in @node is required for completion of @op, where @op is
4871 +   insert (COP_INSERT) or paste (COP_PASTE) operation; delegates to space_needed().
4872 +*/
4873 +static unsigned int space_needed_for_op(znode * node   /* znode data are
4874 +                                                        * inserted or
4875 +                                                        * pasted in */ ,
4876 +                                       carry_op * op   /* carry
4877 +                                                          operation */ )
4878 +{
4879 +       assert("nikita-919", op != NULL);
4880 +
4881 +       switch (op->op) {
4882 +       default:
4883 +               impossible("nikita-1701", "Wrong opcode"); /* NOTE(review): falls through to COP_INSERT if impossible() returns -- confirm intended */
4884 +       case COP_INSERT:
4885 +               return space_needed(node, NULL, op->u.insert.d->data, 1); /* insertion: no coord is passed */
4886 +       case COP_PASTE:
4887 +               return space_needed(node, op->u.insert.d->coord,
4888 +                                   op->u.insert.d->data, 0);
4889 +       }
4890 +}
4891 +
4892 +/* how much space in @node is required to insert or paste @data at @coord:
4893 +   the item plugin estimate (or @data->length), plus node item overhead on insertion. */
4894 +unsigned int space_needed(const znode * node   /* node data are inserted or
4895 +                                                * pasted in */ ,
4896 +                         const coord_t * coord /* coord where data are
4897 +                                                * inserted or pasted
4898 +                                                * at */ ,
4899 +                         const reiser4_item_data * data        /* data to insert or
4900 +                                                                * paste */ ,
4901 +                         int insertion /* non-0 is inserting, 0---paste */ )
4902 +{
4903 +       int result;
4904 +       item_plugin *iplug;
4905 +
4906 +       assert("nikita-917", node != NULL);
4907 +       assert("nikita-918", node_plugin_by_node(node) != NULL);
4908 +       assert("vs-230", !insertion || (coord == NULL)); /* insertion must be called without a coord */
4909 +
4910 +       result = 0;
4911 +       iplug = data->iplug;
4912 +       if (iplug->b.estimate != NULL) {
4913 +               /* ask item plugin how much space is needed to insert this
4914 +                  item */
4915 +               result += iplug->b.estimate(insertion ? NULL : coord, data);
4916 +       } else {
4917 +               /* reasonable default */
4918 +               result += data->length;
4919 +       }
4920 +       if (insertion) {
4921 +               node_plugin *nplug;
4922 +
4923 +               nplug = node->nplug;
4924 +               /* and add node overhead */
4925 +               if (nplug->item_overhead != NULL) {
4926 +                       result += nplug->item_overhead(node, NULL);
4927 +               }
4928 +       }
4929 +       return result; /* int accumulator is converted to the unsigned return type here */
4930 +}
4931 +
4932 +/* find &coord in parent where pointer to new child is to be stored. Returns the result of find_new_child_ptr(). */
4933 +static int find_new_child_coord(carry_op * op  /* COP_INSERT carry operation to
4934 +                                                * insert pointer to new
4935 +                                                * child */ )
4936 +{
4937 +       int result;
4938 +       znode *node;
4939 +       znode *child;
4940 +
4941 +       assert("nikita-941", op != NULL);
4942 +       assert("nikita-942", op->op == COP_INSERT);
4943 +
4944 +       node = carry_real(op->node);
4945 +       assert("nikita-943", node != NULL);
4946 +       assert("nikita-944", node_plugin_by_node(node) != NULL);
4947 +
4948 +       child = carry_real(op->u.insert.child);
4949 +       result =
4950 +           find_new_child_ptr(node, child, op->u.insert.brother,
4951 +                              op->u.insert.d->coord); /* the coord is written into op->u.insert.d->coord */
4952 +
4953 +       build_child_ptr_data(child, op->u.insert.d->data); /* called regardless of @result */
4954 +       return result;
4955 +}
4956 +
4957 +/* additional amount of free space in @node required to complete @op; a value <= 0 means no more space has to be made (see make_space()) */
4958 +static int free_space_shortage(znode * node /* node to check */ ,
4959 +                              carry_op * op /* operation being performed */ )
4960 +{
4961 +       assert("nikita-1061", node != NULL);
4962 +       assert("nikita-1062", op != NULL);
4963 +
4964 +       switch (op->op) {
4965 +       default:
4966 +               impossible("nikita-1702", "Wrong opcode"); /* NOTE(review): falls through to the insert/paste case if impossible() returns -- confirm intended */
4967 +       case COP_INSERT:
4968 +       case COP_PASTE:
4969 +               return space_needed_for_op(node, op) - znode_free_space(node); /* negative when the node has room to spare */
4970 +       case COP_EXTENT:
4971 +               /* when inserting extent shift data around until insertion
4972 +                  point is utmost in the node. */
4973 +               if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4974 +                       return +1; /* only the sign matters here: "keep shifting" */
4975 +               else
4976 +                       return -1; /* only the sign matters here: "done" */
4977 +       }
4978 +}
4979 +
4980 +/* helper function: update node pointer in operation after insertion
4981 +   point was probably shifted into @target. Returns the znode the insertion coord now points into. */
4982 +static znode *sync_op(carry_op * op, carry_node * target)
4983 +{
4984 +       znode *insertion_node;
4985 +
4986 +       /* reget node from coord: shift might move insertion coord to
4987 +          the neighbor */
4988 +       insertion_node = op->u.insert.d->coord->node;
4989 +       /* if insertion point was actually moved into new node,
4990 +          update carry node pointer in operation. */
4991 +       if (insertion_node != carry_real(op->node)) {
4992 +               op->node = target;
4993 +               assert("nikita-2540", carry_real(target) == insertion_node); /* the coord may only have moved into @target */
4994 +       }
4995 +       assert("nikita-2541",
4996 +              carry_real(op->node) == op->u.insert.d->coord->node);
4997 +       return insertion_node;
4998 +}
5000 +/*
5001 + * complete make_space() call: update tracked lock handle if necessary. See
5002 + * comments for fs/reiser4/carry.h:carry_track_type. Returns 0 when nothing is re-locked, else the result of longterm_lock_znode().
5003 + */
5004 +static int
5005 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
5006 +{
5007 +       int result;
5008 +       carry_track_type tracking;
5009 +       znode *node;
5010 +
5011 +       tracking = doing->track_type;
5012 +       node = op->u.insert.d->coord->node; /* node the insertion coord points into now */
5013 +
5014 +       if (tracking == CARRY_TRACK_NODE ||
5015 +           (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
5016 +               /* inserting or pasting into node different from
5017 +                  original. Update lock handle supplied by caller. */
5018 +               assert("nikita-1417", doing->tracked != NULL);
5019 +               done_lh(doing->tracked); /* release the caller's handle before re-taking it on @node */
5020 +               init_lh(doing->tracked);
5021 +               result = longterm_lock_znode(doing->tracked, node,
5022 +                                            ZNODE_WRITE_LOCK,
5023 +                                            ZNODE_LOCK_HIPRI);
5024 +       } else
5025 +               result = 0;
5026 +       return result;
5027 +}
5028 +
5029 +/* This is insertion policy function. It shifts data to the left and right
5030 +   neighbors of insertion coord and allocates new nodes until there is enough
5031 +   free space to complete @op.
5032 +
5033 +   See comments in the body.
5034 +
5035 +   Assumes that the node format favors insertions at the right end of the node
5036 +   as node40 does.
5037 +
5038 +   See carry_flow() on detail about flow insertion
5039 +*/
5040 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
5041 +                     carry_level * doing /* current carry queue */ ,
5042 +                     carry_level * todo /* carry queue on the parent level */ )
5043 +{
5044 +       znode *node;
5045 +       int result;
5046 +       int not_enough_space;
5047 +       int blk_alloc;
5048 +       znode *orig_node;
5049 +       __u32 flags;
5050 +
5051 +       coord_t *coord;
5052 +
5053 +       assert("nikita-890", op != NULL);
5054 +       assert("nikita-891", todo != NULL);
5055 +       assert("nikita-892",
5056 +              op->op == COP_INSERT ||
5057 +              op->op == COP_PASTE || op->op == COP_EXTENT);
5058 +       assert("nikita-1607",
5059 +              carry_real(op->node) == op->u.insert.d->coord->node);
5060 +
5061 +       flags = op->u.insert.flags;
5062 +
5063 +       /* NOTE check that new node can only be allocated after checking left
5064 +        * and right neighbors. This is necessary for proper work of
5065 +        * find_{left,right}_neighbor(). */
5066 +       assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
5067 +                                  flags & COPI_DONT_SHIFT_LEFT));
5068 +       assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
5069 +                                  flags & COPI_DONT_SHIFT_RIGHT));
5070 +
5071 +       coord = op->u.insert.d->coord;
5072 +       orig_node = node = coord->node;
5073 +
5074 +       assert("nikita-908", node != NULL);
5075 +       assert("nikita-909", node_plugin_by_node(node) != NULL);
5076 +
5077 +       result = 0;
5078 +       /* If there is not enough space in a node, try to shift something to
5079 +          the left neighbor. This is a bit tricky, as locking to the left is
5080 +          low priority. This is handled by restart logic in carry().
5081 +        */
5082 +       not_enough_space = free_space_shortage(node, op);
5083 +       if (not_enough_space <= 0)
5084 +               /* it is possible that carry was called when there actually
5085 +                  was enough space in the node. For example, when inserting
5086 +                  leftmost item so that delimiting keys have to be updated.
5087 +                */
5088 +               return make_space_tail(op, doing, orig_node);
5089 +       if (!(flags & COPI_DONT_SHIFT_LEFT)) {
5090 +               carry_node *left;
5091 +               /* make note in statistics of an attempt to move
5092 +                  something into the left neighbor */
5093 +               left = find_left_neighbor(op, doing);
5094 +               if (unlikely(IS_ERR(left))) {
5095 +                       if (PTR_ERR(left) == -E_REPEAT)
5096 +                               return -E_REPEAT;
5097 +                       else {
5098 +                               /* some error other than restart request
5099 +                                  occurred. This shouldn't happen. Issue a
5100 +                                  warning and continue as if left neighbor
5101 +                                  weren't existing.
5102 +                                */
5103 +                               warning("nikita-924",
5104 +                                       "Error accessing left neighbor: %li",
5105 +                                       PTR_ERR(left));
5106 +                       }
5107 +               } else if (left != NULL) {
5108 +
5109 +                       /* shift everything possible on the left of and
5110 +                          including insertion coord into the left neighbor */
5111 +                       result = carry_shift_data(LEFT_SIDE, coord,
5112 +                                                 carry_real(left), doing, todo,
5113 +                                                 flags & COPI_GO_LEFT);
5114 +
5115 +                       /* reget node from coord: shift_left() might move
5116 +                          insertion coord to the left neighbor */
5117 +                       node = sync_op(op, left);
5118 +
5119 +                       not_enough_space = free_space_shortage(node, op);
5120 +                       /* There is not enough free space in @node, but
5121 +                          may be, there is enough free space in
5122 +                          @left. Various balancing decisions are valid here.
5123 +                          The same for the shifiting to the right.
5124 +                        */
5125 +               }
5126 +       }
5127 +       /* If there still is not enough space, shift to the right */
5128 +       if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
5129 +               carry_node *right;
5130 +
5131 +               right = find_right_neighbor(op, doing);
5132 +               if (IS_ERR(right)) {
5133 +                       warning("nikita-1065",
5134 +                               "Error accessing right neighbor: %li",
5135 +                               PTR_ERR(right));
5136 +               } else if (right != NULL) {
5137 +                       /* node containing insertion point, and its right
5138 +                          neighbor node are write locked by now.
5139 +
5140 +                          shift everything possible on the right of but
5141 +                          excluding insertion coord into the right neighbor
5142 +                        */
5143 +                       result = carry_shift_data(RIGHT_SIDE, coord,
5144 +                                                 carry_real(right),
5145 +                                                 doing, todo,
5146 +                                                 flags & COPI_GO_RIGHT);
5147 +                       /* reget node from coord: shift_right() might move
5148 +                          insertion coord to the right neighbor */
5149 +                       node = sync_op(op, right);
5150 +                       not_enough_space = free_space_shortage(node, op);
5151 +               }
5152 +       }
5153 +       /* If there is still not enough space, allocate new node(s).
5154 +
5155 +          We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
5156 +          the carry operation flags (currently this is needed during flush
5157 +          only).
5158 +        */
5159 +       for (blk_alloc = 0;
5160 +            not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
5161 +            !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
5162 +               carry_node *fresh;      /* new node we are allocating */
5163 +               coord_t coord_shadow;   /* remembered insertion point before
5164 +                                        * shifting data into new node */
5165 +               carry_node *node_shadow;        /* remembered insertion node before
5166 +                                                * shifting */
5167 +               unsigned int gointo;    /* whether insertion point should move
5168 +                                        * into newly allocated node */
5169 +
5170 +               /* allocate new node on the right of @node. Znode and disk
5171 +                  fake block number for new node are allocated.
5172 +
5173 +                  add_new_znode() posts carry operation COP_INSERT with
5174 +                  COPT_CHILD option to the parent level to add
5175 +                  pointer to newly created node to its parent.
5176 +
5177 +                  Subtle point: if several new nodes are required to complete
5178 +                  insertion operation at this level, they will be inserted
5179 +                  into their parents in the order of creation, which means
5180 +                  that @node will be valid "cookie" at the time of insertion.
5181 +
5182 +                */
5183 +               fresh = add_new_znode(node, op->node, doing, todo);
5184 +               if (IS_ERR(fresh))
5185 +                       return PTR_ERR(fresh);
5186 +
5187 +               /* Try to shift into new node. */
5188 +               result = lock_carry_node(doing, fresh);
5189 +               zput(carry_real(fresh));
5190 +               if (result != 0) {
5191 +                       warning("nikita-947",
5192 +                               "Cannot lock new node: %i", result);
5193 +                       return result;
5194 +               }
5195 +
5196 +               /* both nodes are write locked by now.
5197 +
5198 +                  shift everything possible on the right of and
5199 +                  including insertion coord into the right neighbor.
5200 +                */
5201 +               coord_dup(&coord_shadow, op->u.insert.d->coord);
5202 +               node_shadow = op->node;
5203 +               /* move insertion point into newly created node if:
5204 +
5205 +                  . insertion point is rightmost in the source node, or
5206 +                  . this is not the first node we are allocating in a row.
5207 +                */
5208 +               gointo =
5209 +                   (blk_alloc > 0) ||
5210 +                   coord_is_after_rightmost(op->u.insert.d->coord);
5211 +
5212 +               result = carry_shift_data(RIGHT_SIDE, coord, carry_real(fresh),
5213 +                                         doing, todo, gointo);
5214 +               /* if insertion point was actually moved into new node,
5215 +                  update carry node pointer in operation. */
5216 +               node = sync_op(op, fresh);
5217 +               not_enough_space = free_space_shortage(node, op);
5218 +               if ((not_enough_space > 0) && (node != coord_shadow.node)) {
5219 +                       /* there is not enough free space in new node. Shift
5220 +                          insertion point back to the @node_shadow so that
5221 +                          next new node would be inserted between
5222 +                          @node_shadow and @fresh.
5223 +                        */
5224 +                       coord_normalize(&coord_shadow);
5225 +                       coord_dup(coord, &coord_shadow);
5226 +                       node = coord->node;
5227 +                       op->node = node_shadow;
5228 +                       if (1 || (flags & COPI_STEP_BACK)) {
5229 +                               /* still not enough space?! Maybe there is
5230 +                                  enough space in the source node (i.e., node
5231 +                                  data are moved from) now.
5232 +                                */
5233 +                               not_enough_space =
5234 +                                   free_space_shortage(node, op);
5235 +                       }
5236 +               }
5237 +       }
5238 +       if (not_enough_space > 0) {
5239 +               if (!(flags & COPI_DONT_ALLOCATE))
5240 +                       warning("nikita-948", "Cannot insert new item");
5241 +               result = -E_NODE_FULL;
5242 +       }
5243 +       assert("nikita-1622", ergo(result == 0,
5244 +                                  carry_real(op->node) == coord->node));
5245 +       assert("nikita-2616", coord == op->u.insert.d->coord);
5246 +       if (result == 0)
5247 +               result = make_space_tail(op, doing, orig_node);
5248 +       return result;
5249 +}
5250 +
5251 +/* insert_paste_common() - common part of insert and paste operations
5252 +
5253 +   This function performs common part of COP_INSERT, COP_PASTE and COP_EXTENT.
5254 +
5255 +   There are three ways in which insertion/paste can be requested:
5256 +
5257 +    . by directly supplying reiser4_item_data. In this case, op ->
5258 +    u.insert.type is set to COPT_ITEM_DATA.
5259 +
5260 +    . by supplying child node, pointer to which is to be inserted into parent.
5261 +    In this case op -> u.insert.type == COPT_CHILD.
5262 +
5263 +    . by supplying key of new item/unit (op -> u.insert.type == COPT_KEY).
5264 +    This is currently only used during extent insertion
5265 +
5266 +   COPT_CHILD is needed, because when new node is allocated we don't know at what
5267 +   position pointer to it is to be stored in the parent. Actually, we don't
5268 +   even know what its parent will be, because parent can be re-balanced
5269 +   concurrently and new node re-parented, and because parent can be full and
5270 +   pointer to the new node will go into some other node.
5271 +
5272 +   insert_paste_common() resolves pointer to child node into position in the
5273 +   parent by calling find_new_child_coord(), that fills
5274 +   reiser4_item_data. After this, insertion/paste proceeds uniformly.
5275 +
5276 +   Another complication is with finding free space during pasting. It may
5277 +   happen that while shifting items to the neighbors and newly allocated
5278 +   nodes, insertion coord can no longer be in the item we wanted to paste
5279 +   into. At this point, paste becomes (morphs) into insert. Moreover free
5280 +   space analysis has to be repeated, because amount of space required for
5281 +   insertion is different from that of paste (item header overhead, etc).
5282 +
5283 +   This function "unifies" different insertion modes (by resolving child
5284 +   pointer or key into insertion coord), and then calls make_space() to free
5285 +   enough space in the node by shifting data to the left and right and by
5286 +   allocating new nodes if necessary. Carry operation knows amount of space
5287 +   required for its completion. After enough free space is obtained, caller of
5288 +   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
5289 +   by calling item plugin method.
5290 +   Returns result of make_space(), or code of an earlier failure.
5291 +*/
5292 +static int insert_paste_common(carry_op * op   /* carry operation being
5293 +                                                * performed */ ,
5294 +                              carry_level * doing /* current carry level */ ,
5295 +                              carry_level * todo /* next carry level */ ,
5296 +                              carry_insert_data * cdata        /* scratch for
5297 +                                                                * COPT_CHILD */ ,
5298 +                              coord_t * coord /* insertion/paste coord */ ,
5299 +                              reiser4_item_data * data /* data to be
5300 +                                                        * inserted/pasted */ )
5301 +{
5302 +       assert("nikita-981", op != NULL);
5303 +       assert("nikita-980", todo != NULL);
5304 +       assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
5305 +              || (op->op == COP_EXTENT));
5306 +
5307 +       if (op->u.insert.type == COPT_PASTE_RESTARTED) {
5308 +               /* nothing to do. Fall through to make_space(). */
5309 +               ;
5310 +       } else if (op->u.insert.type == COPT_KEY) {
5311 +               node_search_result intra_node;
5312 +               znode *node;
5313 +               /* Problem with doing batching at the lowest level, is that
5314 +                  operations here are given by coords where modification is
5315 +                  to be performed, and one modification can invalidate coords
5316 +                  of all following operations.
5317 +
5318 +                  So, we are implementing yet another type for operation that
5319 +                  will use (the only) "locator" stable across shifting of
5320 +                  data between nodes, etc.: key (COPT_KEY).
5321 +
5322 +                  This clause resolves key to the coord in the node.
5323 +
5324 +                  But node can change also. Probably some pieces have to be
5325 +                  added to the lock_carry_node(), to lock node by its key.
5326 +
5327 +                */
5328 +               /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5329 +                  if you need something else. */
5330 +               op->u.insert.d->coord = coord;
5331 +               node = carry_real(op->node);
5332 +               intra_node = node_plugin_by_node(node)->lookup
5333 +                   (node, op->u.insert.d->key, FIND_EXACT,
5334 +                    op->u.insert.d->coord);
5335 +               if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5336 +                       warning("nikita-1715", "Intra node lookup failure: %i",
5337 +                               intra_node);
5338 +                       return intra_node;
5339 +               }
5340 +       } else if (op->u.insert.type == COPT_CHILD) {
5341 +               /* if we are asked to insert pointer to the child into
5342 +                  internal node, first convert pointer to the child into
5343 +                  coord within parent node.
5344 +                */
5345 +               znode *child;
5346 +               int result;
5347 +
5348 +               op->u.insert.d = cdata;
5349 +               op->u.insert.d->coord = coord;
5350 +               op->u.insert.d->data = data;
5351 +               op->u.insert.d->coord->node = carry_real(op->node);
5352 +               result = find_new_child_coord(op);
5353 +               child = carry_real(op->u.insert.child);
5354 +               if (result != NS_NOT_FOUND) {
5355 +                       warning("nikita-993",
5356 +                               "Cannot find a place for child pointer: %i",
5357 +                               result);
5358 +                       return result;
5359 +               }
5360 +               /* This only happens when we did multiple insertions at
5361 +                  the previous level, trying to insert single item and
5362 +                  it so happened, that insertion of pointers to all new
5363 +                  nodes before this one already caused parent node to
5364 +                  split (may be several times).
5365 +
5366 +                  I am going to come up with better solution.
5367 +
5368 +                  You are not expected to understand this.
5369 +                  -- v6root/usr/sys/ken/slp.c
5370 +
5371 +                  Basically, what happens here is the following: carry came
5372 +                  to the parent level and is about to insert internal item
5373 +                  pointing to the child node that it just inserted in the
5374 +                  level below. Position where internal item is to be inserted
5375 +                  was found by find_new_child_coord() above, but node of the
5376 +                  current carry operation (that is, parent node of child
5377 +                  inserted on the previous level), was determined earlier in
5378 +                  the lock_carry_level/lock_carry_node. It could so happen
5379 +                  that other carry operations already performed on the parent
5380 +                  level already split parent node, so that insertion point
5381 +                  moved into another node. Handle this by creating new carry
5382 +                  node for insertion point if necessary.
5383 +                */
5384 +               if (carry_real(op->node) != op->u.insert.d->coord->node) {
5385 +                       pool_ordering direction;
5386 +                       znode *z1;
5387 +                       znode *z2;
5388 +                       reiser4_key k1;
5389 +                       reiser4_key k2;
5390 +
5391 +                       /*
5392 +                        * determine in what direction insertion point
5393 +                        * moved. Do this by comparing delimiting keys.
5394 +                        */
5395 +                       z1 = op->u.insert.d->coord->node;
5396 +                       z2 = carry_real(op->node);
5397 +                       if (keyle(leftmost_key_in_node(z1, &k1),
5398 +                                 leftmost_key_in_node(z2, &k2)))
5399 +                               /* insertion point moved to the left */
5400 +                               direction = POOLO_BEFORE;
5401 +                       else
5402 +                               /* insertion point moved to the right */
5403 +                               direction = POOLO_AFTER;
5404 +
5405 +                       op->node = add_carry_skip(doing, direction, op->node);
5406 +                       if (IS_ERR(op->node))
5407 +                               return PTR_ERR(op->node);
5408 +                       op->node->node = op->u.insert.d->coord->node;
5409 +                       op->node->free = 1;
5410 +                       result = lock_carry_node(doing, op->node);
5411 +                       if (result != 0)
5412 +                               return result;
5413 +               }
5414 +
5415 +               /*
5416 +                * set up key of an item being inserted: we are inserting
5417 +                * internal item and its key (by the very definition of
5418 +                * search tree) is the leftmost key in the child node.
5419 +                */
5420 +               write_lock_dk(znode_get_tree(child));
5421 +               op->u.insert.d->key = leftmost_key_in_node(child,
5422 +                                                          znode_get_ld_key(child));
5423 +               write_unlock_dk(znode_get_tree(child));
5424 +               op->u.insert.d->data->arg = op->u.insert.brother;
5425 +       } else {
5426 +               assert("vs-243", op->u.insert.d->coord != NULL);
5427 +               op->u.insert.d->coord->node = carry_real(op->node);
5428 +       }
5429 +
5430 +       /* find free space. */
5431 +       return make_space(op, doing, todo);
5432 +}
5433 +
5434 +/* handle carry COP_INSERT operation.
5435 +
5436 +   Insert new item into node. New item can be given in one of two ways:
5437 +
5438 +   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5439 +   only applicable at the leaf/twig level.
5440 +
5441 +   - by passing a child node, pointer to which is to be inserted by this
5442 +   operation.
5443 +   Returns insert_paste_common() failure code or result of ->create_item().
5444 +*/
5445 +static int carry_insert(carry_op * op /* operation to perform */ ,
5446 +                       carry_level * doing     /* queue of operations @op
5447 +                                                * is part of */ ,
5448 +                       carry_level * todo      /* queue where new operations
5449 +                                                * are accumulated */ )
5450 +{
5451 +       znode *node;
5452 +       carry_insert_data cdata;
5453 +       coord_t coord;
5454 +       reiser4_item_data data;
5455 +       carry_plugin_info info;
5456 +       int result;
5457 +
5458 +       assert("nikita-1036", op != NULL);
5459 +       assert("nikita-1037", todo != NULL);
5460 +       assert("nikita-1038", op->op == COP_INSERT);
5461 +
5462 +       coord_init_zero(&coord);
5463 +
5464 +       /* resolve insertion coord and make enough free space in the node. */
5465 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5466 +       if (result != 0)
5467 +               return result;
5468 +
5469 +       node = op->u.insert.d->coord->node;
5470 +       assert("nikita-1039", node != NULL);
5471 +       assert("nikita-1040", node_plugin_by_node(node) != NULL);
5472 +
5473 +       assert("nikita-949",
5474 +              space_needed_for_op(node, op) <= znode_free_space(node));
5475 +
5476 +       /* ask node layout to create new item. */
5477 +       info.doing = doing;
5478 +       info.todo = todo;
5479 +       result = node_plugin_by_node(node)->create_item
5480 +           (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5481 +            &info);
5482 +       doing->restartable = 0;
5483 +       znode_make_dirty(node);
5484 +
5485 +       return result;
5486 +}
5487 +
5488 +/*
5489 + * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5490 + * supplied with a "flow" (that is, a stream of data) and inserts it into the
5491 + * tree by slicing it into multiple items.
5492 + */
5493 +/* shorthands for fields of the insert_flow part of a carry operation */
5494 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5495 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5496 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5497 +
5498 +static size_t item_data_overhead(carry_op * op)
5499 +{
5500 +       reiser4_item_data *idata = flow_insert_data(op);
5501 +
5502 +       /* item plugin without ->estimate() adds nothing on top of payload */
5503 +       return (idata->iplug->b.estimate == NULL) ? 0 :
5504 +           idata->iplug->b.estimate(NULL /* estimate insertion */ , idata) - idata->length;
5505 +}
5506 +
5507 +/* FIXME-VS: flow_insertion_overhead() is called several times during one
5508 +   make_space_for_flow_insertion and is said to always return the same result.
5509 +   The value could be calculated once at the beginning and passed around, at
5510 +   the cost of some flexibility in future changes
5511 +*/
5512 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5513 +static size_t flow_insertion_overhead(carry_op * op)
5514 +{
5515 +       coord_t *point = flow_insert_point(op);
5516 +       znode *target = point->node;
5517 +
5518 +       /* node plugin does not charge any per-item overhead */
5519 +       if (!target->nplug->item_overhead)
5520 +               return 0;
5521 +       /* flow can be pasted into item at insert point: nothing extra */
5522 +       if (can_paste(point, &flow_insert_flow(op)->key, flow_insert_data(op)))
5523 +               return 0;
5524 +       /* new item: node-level item overhead plus item plugin overhead */
5525 +       return target->nplug->item_overhead(target, NULL) +
5526 +           item_data_overhead(op);
5527 +}
5528 +
5529 +/* number of flow bytes that fit into insert point node right now */
5530 +static int what_can_fit_into_node(carry_op * op)
5531 +{
5532 +       size_t reserved = flow_insertion_overhead(op);
5533 +       size_t room = znode_free_space(flow_insert_point(op)->node);
5534 +
5535 +       /* insertion overhead alone takes all of the free space */
5536 +       if (room <= reserved)
5537 +               return 0;
5538 +       room -= reserved;
5539 +       /* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */
5540 +       if (room >= op->u.insert_flow.flow->length)
5541 +               return (int)op->u.insert_flow.flow->length;
5542 +       return room;
5543 +}
5544 +
5545 +/* make_space_for_flow_insertion needs two checks: that whole flow fits into a
5546 +   node (this function), or that minimal fraction of flow fits into a node */
5547 +static int enough_space_for_whole_flow(carry_op * op)
5548 +{
5549 +       unsigned fitting = (unsigned)what_can_fit_into_node(op);
5550 +       return fitting == op->u.insert_flow.flow->length;
5551 +}
5552 +
5553 +#define MIN_FLOW_FRACTION 1
5554 +static int enough_space_for_min_flow_fraction(carry_op * op)
5555 +{
5556 +       /* only asked when insert point is at the very end of its node */
5557 +       assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5558 +       return !(what_can_fit_into_node(op) < MIN_FLOW_FRACTION);
5559 +}
5560 +
5561 +/* this returns 0 if left neighbor was obtained successfully, insertion point
5562 +   was shifted into it, and left neighbor still has some free space to put
5563 +   minimal fraction of flow into it. Returns 1 in all other cases */
5564 +static int
5565 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5566 +{
5567 +       carry_node *left;
5568 +       znode *orig;
5569 +
5570 +       left = find_left_neighbor(op, doing);
5571 +       if (unlikely(IS_ERR(left))) {
5572 +               warning("vs-899",
5573 +                       "make_space_by_shift_left: "
5574 +                       "error accessing left neighbor: %li", PTR_ERR(left));
5575 +               return 1;
5576 +       }
5577 +       if (left == NULL)
5578 +               /* left neighbor either does not exist or is unformatted
5579 +                  node */
5580 +               return 1;
5581 +
5582 +       orig = flow_insert_point(op)->node;
5583 +       /* try to shift content of node @orig from its head up to insert point
5584 +          including insertion point into the left neighbor */
5585 +       carry_shift_data(LEFT_SIDE, flow_insert_point(op), carry_real(left), doing, todo, 1     /* including insert
5586 +                                                                                                * point */ );
5587 +       if (carry_real(left) != flow_insert_point(op)->node) {
5588 +               /* insertion point did not move */
5589 +               return 1;
5590 +       }
5591 +
5592 +       /* insertion point is set after last item in the node */
5593 +       assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5594 +
5595 +       if (!enough_space_for_min_flow_fraction(op)) {
5596 +               /* insertion point node does not have enough free space to put
5597 +                  even minimal portion of flow into it, therefore, move
5598 +                  insertion point back to orig node (before first item) */
5599 +               coord_init_before_first_item(flow_insert_point(op), orig);
5600 +               return 1;
5601 +       }
5602 +
5603 +       /* part of flow is to be written to the end of node */
5604 +       op->node = left;
5605 +       return 0;
5606 +}
5607 +
5608 +/* this returns 0 if, after shifting whatever was possible to the right neighbor
5609 +   (if there is one), insertion point is at the end of its node and that node
5610 +   has enough free space for minimal fraction of flow. Returns 1 otherwise */
5611 +static int
5612 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5613 +                         carry_level * todo)
5614 +{
5615 +       carry_node *right;
5616 +
5617 +       right = find_right_neighbor(op, doing);
5618 +       if (unlikely(IS_ERR(right))) {
5619 +               warning("nikita-1065", "shift_right_excluding_insert_point: "
5620 +                       "error accessing right neighbor: %li", PTR_ERR(right));
5621 +               return 1;
5622 +       }
5623 +       if (right) {
5624 +               /* shift everything possible on the right of but excluding
5625 +                  insertion coord into the right neighbor */
5626 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(right), doing, todo, 0   /* not
5627 +                                                                                                        * including
5628 +                                                                                                        * insert
5629 +                                                                                                        * point */ );
5630 +       } else {
5631 +               /* right neighbor either does not exist or is unformatted
5632 +                  node */
5633 +               ;
5634 +       }
5635 +       if (coord_is_after_rightmost(flow_insert_point(op))) {
5636 +               if (enough_space_for_min_flow_fraction(op)) {
5637 +                       /* part of flow is to be written to the end of node */
5638 +                       return 0;
5639 +               }
5640 +       }
5641 +
5642 +       /* insert point is not at the node end, or there is no room even for
5643 +          minimal fraction of flow: caller has to find space elsewhere */
5644 +       return 1;
5645 +}
5646 +
5647 +/* this returns 0 when insert coord is set either at the node end, where fraction
5648 +   of flow fits, or at the start of a freshly added node; error code otherwise */
5649 +static int
5650 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5651 +{
5652 +       int result;
5653 +       znode *node;
5654 +       carry_node *new;
5655 +
5656 +       node = flow_insert_point(op)->node;
5657 +
5658 +       if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5659 +               return RETERR(-E_NODE_FULL);
5660 +       /* add new node after insert point node */
5661 +       new = add_new_znode(node, op->node, doing, todo);
5662 +       if (unlikely(IS_ERR(new))) {
5663 +               return PTR_ERR(new);
5664 +       }
5665 +       result = lock_carry_node(doing, new);
5666 +       zput(carry_real(new));
5667 +       if (unlikely(result)) {
5668 +               return result;
5669 +       }
5670 +       op->u.insert_flow.new_nodes++;
5671 +       if (!coord_is_after_rightmost(flow_insert_point(op))) {
5672 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op), carry_real(new), doing, todo, 0     /* not
5673 +                                                                                                        * including
5674 +                                                                                                        * insert
5675 +                                                                                                        * point */ );
5676 +               /* everything to the right of insert point is expected to be gone now */
5677 +               assert("vs-901",
5678 +                      coord_is_after_rightmost(flow_insert_point(op)));
5679 +
5680 +               if (enough_space_for_min_flow_fraction(op)) {
5681 +                       return 0;
5682 +               }
5683 +               if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5684 +                       return RETERR(-E_NODE_FULL);
5685 +
5686 +               /* add one more new node */
5687 +               new = add_new_znode(node, op->node, doing, todo);
5688 +               if (unlikely(IS_ERR(new))) {
5689 +                       return PTR_ERR(new);
5690 +               }
5691 +               result = lock_carry_node(doing, new);
5692 +               zput(carry_real(new));
5693 +               if (unlikely(result)) {
5694 +                       return result;
5695 +               }
5696 +               op->u.insert_flow.new_nodes++;
5697 +       }
5698 +
5699 +       /* move insertion point to new node */
5700 +       coord_init_before_first_item(flow_insert_point(op), carry_real(new));
5701 +       op->node = new;
5702 +       return 0;
5703 +}
5704 +
5705 +static int
5706 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5707 +                             carry_level * todo)
5708 +{
5709 +       const __u32 copi = op->u.insert_flow.flags;
5710 +       const int try_left = !(copi & COPI_DONT_SHIFT_LEFT);
5711 +       const int try_right = !(copi & COPI_DONT_SHIFT_RIGHT);
5712 +
5713 +       /*
5714 +        * Remedies are tried in fixed order: shift left, shift right, new nodes.
5715 +        * After each shift, re-check whether whole flow fits into insert point node.
5716 +        */
5717 +       /* 1. nothing to do if whole flow already fits */
5718 +       if (enough_space_for_whole_flow(op))
5719 +               return 0;
5720 +
5721 +       /*
5722 +        * 2. shift to the left. On success insert point is moved to the left
5723 +        * neighbor of original insert point node, is set after last unit in
5724 +        * it, and there is room for at least minimal fraction of flow.
5725 +        */
5726 +       if (try_left && make_space_by_shift_left(op, doing, todo) == 0)
5727 +               return 0;
5728 +       if (enough_space_for_whole_flow(op))
5729 +               return 0;
5730 +
5731 +       /*
5732 +        * 3. shift to the right. On success insert point is still set to the
5733 +        * same node, but there is nothing to the right of insert point.
5734 +        */
5735 +       if (try_right && make_space_by_shift_right(op, doing, todo) == 0)
5736 +               return 0;
5737 +       if (enough_space_for_whole_flow(op))
5738 +               return 0;
5739 +
5740 +       /* 4. last resort: add new node(s) */
5741 +       return make_space_by_new_nodes(op, doing, todo);
5742 +}
5743 +
5744 +/* implements COP_INSERT_FLOW operation: writes flow into the tree portion by portion */
5745 +static int
5746 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5747 +{
5748 +       int result;
5749 +       flow_t *f;
5750 +       coord_t *insert_point;
5751 +       node_plugin *nplug;
5752 +       carry_plugin_info info;
5753 +       znode *orig_node;
5754 +       lock_handle *orig_lh;
5755 +
5756 +       f = op->u.insert_flow.flow;
5757 +       result = 0;
5758 +
5759 +       /* carry system needs this to work */
5760 +       info.doing = doing;
5761 +       info.todo = todo;
5762 +
5763 +       orig_node = flow_insert_point(op)->node;
5764 +       orig_lh = doing->tracked;
5765 +       /* each iteration writes the portion of flow that fits into one node */
5766 +       while (f->length) {
5767 +               result = make_space_for_flow_insertion(op, doing, todo);
5768 +               if (result)
5769 +                       break;
5770 +
5771 +               insert_point = flow_insert_point(op);
5772 +               nplug = node_plugin_by_node(insert_point->node);
5773 +
5774 +               /* compose item data for insertion/pasting */
5775 +               flow_insert_data(op)->data = f->data;
5776 +               flow_insert_data(op)->length = what_can_fit_into_node(op);
5777 +
5778 +               if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5779 +                       /* insert point is set to an item of the file we are writing to: append to that item */
5780 +                       assert("vs-903", insert_point->between == AFTER_UNIT);
5781 +                       nplug->change_item_size(insert_point,
5782 +                                               flow_insert_data(op)->length);
5783 +                       flow_insert_data(op)->iplug->b.paste(insert_point,
5784 +                                                            flow_insert_data
5785 +                                                            (op), &info);
5786 +               } else {
5787 +                       /* new item must be inserted */
5788 +                       pos_in_node_t new_pos;
5789 +                       flow_insert_data(op)->length += item_data_overhead(op);
5790 +
5791 +                       /* FIXME-VS: this is because node40_create_item changes
5792 +                          insert_point for obscure reasons */
5793 +                       switch (insert_point->between) {
5794 +                       case AFTER_ITEM:
5795 +                               new_pos = insert_point->item_pos + 1;
5796 +                               break;
5797 +                       case EMPTY_NODE:
5798 +                               new_pos = 0;
5799 +                               break;
5800 +                       case BEFORE_ITEM:
5801 +                               assert("vs-905", insert_point->item_pos == 0);
5802 +                               new_pos = 0;
5803 +                               break;
5804 +                       default:
5805 +                               impossible("vs-906",
5806 +                                          "carry_insert_flow: invalid coord");
5807 +                               new_pos = 0;
5808 +                               break;
5809 +                       }
5810 +
5811 +                       nplug->create_item(insert_point, &f->key,
5812 +                                          flow_insert_data(op), &info);
5813 +                       coord_set_item_pos(insert_point, new_pos);
5814 +               }
5815 +               coord_init_after_item_end(insert_point);
5816 +               doing->restartable = 0;
5817 +               znode_make_dirty(insert_point->node);
5818 +
5819 +               move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5820 +       }
5821 +       /* NOTE(review): re-locking below overwrites @result left by the loop - confirm intended */
5822 +       if (orig_node != flow_insert_point(op)->node) {
5823 +               /* move lock to new insert point */
5824 +               done_lh(orig_lh);
5825 +               init_lh(orig_lh);
5826 +               result =
5827 +                   longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5828 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5829 +       }
5830 +
5831 +       return result;
5832 +}
5833 +
5834 +/* implements COP_DELETE operation
5835 +
5836 +   Remove pointer to @op -> u.delete.child from its parent.
5837 +
5838 +   This function also handles killing of a tree root if last pointer from it
5839 +   was removed. This is complicated by our handling of "twig" level: root on
5840 +   twig level is never killed.
5841 +
5842 +   Returns 0 on success, find_child_ptr() result or negative error otherwise.
5843 +*/
5843 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5844 +                       carry_level * doing UNUSED_ARG  /* current carry
5845 +                                                        * level */ ,
5846 +                       carry_level * todo /* next carry level */ )
5847 +{
5848 +       int result;
5849 +       coord_t coord;
5850 +       coord_t coord2;
5851 +       znode *parent;
5852 +       znode *child;
5853 +       carry_plugin_info info;
5854 +       reiser4_tree *tree;
5855 +
5856 +       /*
5857 +        * This operation is called to delete internal item pointing to the
5858 +        * child node that was removed by carry from the tree on the previous
5859 +        * tree level.
5860 +        */
5861 +
5862 +       assert("nikita-893", op != NULL);
5863 +       assert("nikita-894", todo != NULL);
5864 +       assert("nikita-895", op->op == COP_DELETE);
5865 +
5866 +       coord_init_zero(&coord);
5867 +       coord_init_zero(&coord2);
5868 +
5869 +       parent = carry_real(op->node);
5870 +       child = op->u.delete.child ?
5871 +           carry_real(op->u.delete.child) : op->node->node;
5872 +       tree = znode_get_tree(child);
5873 +       read_lock_tree(tree);
5874 +
5875 +       /*
5876 +        * @parent was determined when carry entered parent level
5877 +        * (lock_carry_level/lock_carry_node). Since then, actual parent of
5878 +        * @child node could change due to other carry operations performed on
5879 +        * the parent level. Check for this.
5880 +        */
5881 +
5882 +       if (znode_parent(child) != parent) {
5883 +               /* NOTE-NIKITA add stat counter for this. */
5884 +               parent = znode_parent(child);
5885 +               assert("nikita-2581", find_carry_node(doing, parent));
5886 +       }
5887 +       read_unlock_tree(tree);
5888 +
5889 +       assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5890 +
5891 +       /* Twig level horrors: tree should be of height at least 2. So, last
5892 +          pointer from the root at twig level is preserved even if child is
5893 +          empty. This is ugly, but so it was architectured.
5894 +        */
5895 +
5896 +       if (znode_is_root(parent) &&
5897 +           znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5898 +           node_num_items(parent) == 1) {
5899 +               /* Delimiting key manipulations. */
5900 +               write_lock_dk(tree);
5901 +               znode_set_ld_key(child, znode_set_ld_key(parent, min_key()));
5902 +               znode_set_rd_key(child, znode_set_rd_key(parent, max_key()));
5903 +               ZF_SET(child, JNODE_DKSET);
5904 +               write_unlock_dk(tree);
5905 +
5906 +               /* @child escaped imminent death! */
5907 +               ZF_CLR(child, JNODE_HEARD_BANSHEE);
5908 +               return 0;
5909 +       }
5910 +
5911 +       /* convert child pointer to the coord_t */
5912 +       result = find_child_ptr(parent, child, &coord);
5913 +       if (result != NS_FOUND) {
5914 +               warning("nikita-994", "Cannot find child pointer: %i", result);
5915 +               print_coord_content("coord", &coord);
5916 +               return result;
5917 +       }
5918 +
5919 +       coord_dup(&coord2, &coord);
5920 +       info.doing = doing;
5921 +       info.todo = todo;
5922 +       {
5923 +               /*
5924 +                * Actually kill internal item: prepare structure with
5925 +                * arguments for ->cut_and_kill() method...
5926 +                */
5927 +
5928 +               struct carry_kill_data kdata;
5929 +               kdata.params.from = &coord;
5930 +               kdata.params.to = &coord2;
5931 +               kdata.params.from_key = NULL;
5932 +               kdata.params.to_key = NULL;
5933 +               kdata.params.smallest_removed = NULL;
5934 +               kdata.params.truncate = 1;
5935 +               kdata.flags = op->u.delete.flags;
5936 +               kdata.inode = NULL;
5937 +               kdata.left = NULL;
5938 +               kdata.right = NULL;
5939 +               kdata.buf = NULL;
5940 +               /* ... and call it. */
5941 +               result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5942 +                                                                  &info);
5943 +       }
5944 +       doing->restartable = 0;
5945 +
5946 +       /* check whether root should be killed violently */
5947 +       if (znode_is_root(parent) &&
5948 +           /* don't kill roots at and lower than twig level */
5949 +           znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5950 +           node_num_items(parent) == 1) {
5951 +               result = kill_tree_root(coord.node);
5952 +       }
5953 +       /* propagate negative error codes, map everything else to success */
5954 +       return result < 0 ? result : 0;
5955 +}
5956 +
5957 +/* implements COP_CUT operation
5958 +
5959 +   Cuts part or whole content of node.
5960 +   Returns 0 on success, negative error code of the node plugin otherwise.
5961 +*/
5962 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5963 +                    carry_level * doing /* current carry level */ ,
5964 +                    carry_level * todo /* next carry level */ )
5965 +{
5966 +       int result;
5967 +       carry_plugin_info info;
5968 +       node_plugin *nplug;
5969 +
5970 +       assert("nikita-896", op != NULL);
5971 +       assert("nikita-897", todo != NULL);
5972 +       assert("nikita-898", op->op == COP_CUT);
5973 +
5974 +       info.doing = doing;
5975 +       info.todo = todo;
5976 +
5977 +       nplug = node_plugin_by_node(carry_real(op->node));
5978 +       if (op->u.cut_or_kill.is_cut)
5979 +               result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5980 +       else
5981 +               result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5982 +
5983 +       doing->restartable = 0;
5984 +       return result < 0 ? result : 0;
5985 +}
5986 +
5987 +/* helper for carry_paste() and carry_insert_flow(): returns true if @data can
5988 +   be pasted into the item at or adjacent to @icoord; may reposition @icoord */
5989 +static int
5990 +can_paste(coord_t * icoord, const reiser4_key * key,
5991 +         const reiser4_item_data * data)
5992 +{
5993 +       coord_t circa;
5994 +       item_plugin *new_iplug;
5995 +       item_plugin *old_iplug;
5996 +       int result = 0;         /* to keep gcc shut */
5997 +
5998 +       assert("", icoord->between != AT_UNIT);
5999 +
6000 +       /* obviously, one cannot paste when node is empty---there is nothing
6001 +          to paste into. */
6002 +       if (node_is_empty(icoord->node))
6003 +               return 0;
6004 +       /* if insertion point is at the middle of the item, then paste */
6005 +       if (!coord_is_between_items(icoord))
6006 +               return 1;
6007 +       coord_dup(&circa, icoord);
6008 +       circa.between = AT_UNIT;
6009 +
6010 +       old_iplug = item_plugin_by_coord(&circa);
6011 +       new_iplug = data->iplug;
6012 +
6013 +       /* check whether we can paste to the item @icoord is "at" when we
6014 +          ignore ->between field */
6015 +       if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
6016 +               result = 1;
6017 +       } else if (icoord->between == BEFORE_UNIT
6018 +                  || icoord->between == BEFORE_ITEM) {
6019 +               /* otherwise, try to glue to the item at the left, if any */
6020 +               coord_dup(&circa, icoord);
6021 +               if (coord_set_to_left(&circa)) {
6022 +                       result = 0;
6023 +                       coord_init_before_item(icoord);
6024 +               } else {
6025 +                       old_iplug = item_plugin_by_coord(&circa);
6026 +                       result = (old_iplug == new_iplug)
6027 +                           && item_can_contain_key(icoord, key, data); /* NOTE(review): tests @icoord, while the right-hand branch tests &circa -- confirm */
6028 +                       if (result) {
6029 +                               coord_dup(icoord, &circa);
6030 +                               icoord->between = AFTER_UNIT;
6031 +                       }
6032 +               }
6033 +       } else if (icoord->between == AFTER_UNIT
6034 +                  || icoord->between == AFTER_ITEM) {
6035 +               coord_dup(&circa, icoord);
6036 +               /* otherwise, try to glue to the item at the right, if any */
6037 +               if (coord_set_to_right(&circa)) {
6038 +                       result = 0;
6039 +                       coord_init_after_item(icoord);
6040 +               } else {
6041 +                       int (*cck) (const coord_t *, const reiser4_key *,
6042 +                                   const reiser4_item_data *);
6043 +
6044 +                       old_iplug = item_plugin_by_coord(&circa);
6045 +
6046 +                       cck = old_iplug->b.can_contain_key;
6047 +                       if (cck == NULL)
6048 +                               /* item doesn't define ->can_contain_key
6049 +                                  method? So it is not expandable. */
6050 +                               result = 0;
6051 +                       else {
6052 +                               result = (old_iplug == new_iplug)
6053 +                                   && cck(&circa /*icoord */ , key, data);
6054 +                               if (result) {
6055 +                                       coord_dup(icoord, &circa);
6056 +                                       icoord->between = BEFORE_UNIT;
6057 +                               }
6058 +                       }
6059 +               }
6060 +       } else
6061 +               impossible("nikita-2513", "Nothing works");
6062 +       if (result) {
6063 +               if (icoord->between == BEFORE_ITEM) {
6064 +                       assert("vs-912", icoord->unit_pos == 0);
6065 +                       icoord->between = BEFORE_UNIT;
6066 +               } else if (icoord->between == AFTER_ITEM) {
6067 +                       coord_init_after_item_end(icoord);
6068 +               }
6069 +       }
6070 +       return result;
6071 +}
6072 +
6073 +/* implements COP_PASTE operation
6074 +
6075 +   Paste data into existing item. This is complicated by the fact that after
6076 +   we shifted something to the left or right neighbors trying to free some
6077 +   space, item we were supposed to paste into can be in different node than
6078 +   insertion coord. If so, we are no longer doing paste, but insert. See
6079 +   comments in insert_paste_common().
6080 +
6081 +*/
6082 +static int carry_paste(carry_op * op /* operation to be performed */ ,
6083 +                      carry_level * doing UNUSED_ARG   /* current carry
6084 +                                                        * level */ ,
6085 +                      carry_level * todo /* next carry level */ )
6086 +{
6087 +       znode *node;
6088 +       carry_insert_data cdata;
6089 +       coord_t dcoord;
6090 +       reiser4_item_data data;
6091 +       int result;
6092 +       int real_size;
6093 +       item_plugin *iplug;
6094 +       carry_plugin_info info;
6095 +       coord_t *coord;
6096 +
6097 +       assert("nikita-982", op != NULL);
6098 +       assert("nikita-983", todo != NULL);
6099 +       assert("nikita-984", op->op == COP_PASTE);
6100 +
6101 +       coord_init_zero(&dcoord);
6102 +
6103 +       result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
6104 +       if (result != 0)
6105 +               return result;
6106 +
6107 +       coord = op->u.insert.d->coord;
6108 +
6109 +       /* handle case when op -> u.insert.d -> coord doesn't point to the
6110 +          item of required type: restart the operation as insert. */
6111 +       if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
6112 +               op->op = COP_INSERT;
6113 +               op->u.insert.type = COPT_PASTE_RESTARTED;
6114 +               result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
6115 +
6116 +               return result;
6117 +       }
6118 +
6119 +       node = coord->node;
6120 +       iplug = item_plugin_by_coord(coord);
6121 +       assert("nikita-992", iplug != NULL);
6122 +
6123 +       assert("nikita-985", node != NULL);
6124 +       assert("nikita-986", node_plugin_by_node(node) != NULL);
6125 +
6126 +       assert("nikita-987",
6127 +              space_needed_for_op(node, op) <= znode_free_space(node));
6128 +
6129 +       assert("nikita-1286", coord_is_existing_item(coord));
6130 +
6131 +       /*
6132 +        * if item is expanded as a result of this operation, we should first
6133 +        * change item size, then call ->b.paste item method. If item is
6134 +        * shrunk, it should be done other way around: first call ->b.paste
6135 +        * method, then reduce item size.
6136 +        */
6137 +
6138 +       real_size = space_needed_for_op(node, op);
6139 +       if (real_size > 0)
6140 +               node->nplug->change_item_size(coord, real_size);
6141 +
6142 +       doing->restartable = 0;
6143 +       info.doing = doing;
6144 +       info.todo = todo;
6145 +
6146 +       result = iplug->b.paste(coord, op->u.insert.d->data, &info);
6147 +
6148 +       if (real_size < 0)
6149 +               node->nplug->change_item_size(coord, real_size);
6150 +
6151 +       /* if we pasted at the beginning of the item, update item's key. */
6152 +       if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
6153 +               node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
6154 +
6155 +       znode_make_dirty(node);
6156 +       return result;
6157 +}
6158 +
6159 +/* handle carry COP_EXTENT operation. */
6160 +static int carry_extent(carry_op * op /* operation to perform */ ,
6161 +                       carry_level * doing     /* queue of operations @op
6162 +                                                * is part of */ ,
6163 +                       carry_level * todo      /* queue where new operations
6164 +                                                * are accumulated */ )
6165 +{
6166 +       znode *node;
6167 +       carry_insert_data cdata;
6168 +       coord_t coord;
6169 +       reiser4_item_data data;
6170 +       carry_op *delete_dummy;
6171 +       carry_op *insert_extent;
6172 +       int result;
6173 +       carry_plugin_info info;
6174 +
6175 +       assert("nikita-1751", op != NULL);
6176 +       assert("nikita-1752", todo != NULL);
6177 +       assert("nikita-1753", op->op == COP_EXTENT);
6178 +
6179 +       /* extent insertion overview:
6180 +
6181 +          extents live on the TWIG LEVEL, which is level one above the leaf
6182 +          one. This complicates extent insertion logic somewhat: it may
6183 +          happen (and is going to happen all the time) that in logical key
6184 +          ordering extent has to be placed between items I1 and I2, located
6185 +          at the leaf level, but I1 and I2 are in the same formatted leaf
6186 +          node N1. To insert extent one has to
6187 +
6188 +          (1) reach node N1 and shift data between N1, its neighbors and
6189 +          possibly newly allocated nodes until I1 and I2 fall into different
6190 +          nodes. Since I1 and I2 are still neighboring items in logical key
6191 +          order, they will necessarily be the outermost items in their
6192 +          respective nodes.
6193 +
6194 +          (2) After this new extent item is inserted into node on the twig
6195 +          level.
6196 +
6197 +          Fortunately this process can reuse almost all code from standard
6198 +          insertion procedure (viz. make_space() and insert_paste_common()),
6199 +          due to the following observation: make_space() only shifts data up
6200 +          to and excluding or including insertion point. It never
6201 +          "over-moves" through insertion point. Thus, one can use
6202 +          make_space() to perform step (1). All required for this is just to
6203 +          instruct free_space_shortage() to keep make_space() shifting data
6204 +          until insertion point is at the node border.
6205 +
6206 +        */
6207 +
6208 +       /* perform common functionality of insert and paste. */
6209 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
6210 +       if (result != 0)
6211 +               return result;
6212 +
6213 +       node = op->u.extent.d->coord->node;
6214 +       assert("nikita-1754", node != NULL);
6215 +       assert("nikita-1755", node_plugin_by_node(node) != NULL);
6216 +       assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
6217 +
6218 +       /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
6219 +          extent fits between items. */
6220 +
6221 +       info.doing = doing;
6222 +       info.todo = todo;
6223 +
6224 +       /* there is another complication due to placement of extents on the
6225 +          twig level: extents are "rigid" in the sense that key-range
6226 +          occupied by extent cannot grow indefinitely to the right as it is
6227 +          for the formatted leaf nodes. Because of this when search finds two
6228 +          adjacent extents on the twig level, it has to "drill" to the leaf
6229 +          level, creating new node. Here we are removing this node.
6230 +        */
6231 +       if (node_is_empty(node)) {
6232 +               delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
6233 +               if (IS_ERR(delete_dummy))
6234 +                       return PTR_ERR(delete_dummy);
6235 +               delete_dummy->u.delete.child = NULL;
6236 +               delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
6237 +               ZF_SET(node, JNODE_HEARD_BANSHEE);
6238 +       }
6239 +
6240 +       /* proceed with inserting extent item into parent. We are definitely
6241 +          inserting rather than pasting if we get that far. */
6242 +       insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
6243 +       if (IS_ERR(insert_extent))
6244 +               /* @delete_dummy will be automatically destroyed on the level
6245 +                  exiting  */
6246 +               return PTR_ERR(insert_extent);
6247 +       /* NOTE-NIKITA insertion by key is simplest option here. Another
6248 +          possibility is to insert on the left or right of already existing
6249 +          item.
6250 +        */
6251 +       insert_extent->u.insert.type = COPT_KEY;
6252 +       insert_extent->u.insert.d = op->u.extent.d;
6253 +       assert("nikita-1719", op->u.extent.d->key != NULL);
6254 +       insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
6255 +       insert_extent->u.insert.flags =
6256 +           znode_get_tree(node)->carry.new_extent_flags;
6257 +
6258 +       /*
6259 +        * if carry was asked to track lock handle we should actually track
6260 +        * lock handle on the twig node rather than on the leaf where
6261 +        * operation was started from. Transfer tracked lock handle.
6262 +        */
6263 +       if (doing->track_type) {
6264 +               assert("nikita-3242", doing->tracked != NULL);
6265 +               assert("nikita-3244", todo->tracked == NULL);
6266 +               todo->tracked = doing->tracked;
6267 +               todo->track_type = CARRY_TRACK_NODE;
6268 +               doing->tracked = NULL;
6269 +               doing->track_type = 0;
6270 +       }
6271 +
6272 +       return 0;
6273 +}
6274 +
6275 +/* update key in @parent between pointers to @left and @right.
6276 +
6277 +   Find coords of @left and @right and update delimiting key between them.
6278 +   This is helper function called by carry_update(). Finds position of
6279 +   internal item involved. Updates item key. Updates delimiting keys of child
6280 +   nodes involved.
6281 +*/
6282 +static int update_delimiting_key(znode * parent        /* node key is updated
6283 +                                                * in */ ,
6284 +                                znode * left /* child of @parent */ ,
6285 +                                znode * right /* child of @parent */ ,
6286 +                                carry_level * doing    /* current carry
6287 +                                                        * level */ ,
6288 +                                carry_level * todo     /* parent carry
6289 +                                                        * level */ ,
6290 +                                const char **error_msg /* place to
6291 +                                                        * store error
6292 +                                                        * message */ )
6293 +{
6294 +       coord_t left_pos;
6295 +       coord_t right_pos;
6296 +       int result;
6297 +       reiser4_key ldkey;
6298 +       carry_plugin_info info;
6299 +
6300 +       assert("nikita-1177", right != NULL);
6301 +       /* find position of the right child in a parent */
6302 +       result = find_child_ptr(parent, right, &right_pos);
6303 +       if (result != NS_FOUND) {
6304 +               *error_msg = "Cannot find position of right child";
6305 +               return result;
6306 +       }
6307 +
6308 +       if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
6309 +               /* find position of the left child in a parent */
6310 +               result = find_child_ptr(parent, left, &left_pos);
6311 +               if (result != NS_FOUND) {
6312 +                       *error_msg = "Cannot find position of left child";
6313 +                       return result;
6314 +               }
6315 +               assert("nikita-1355", left_pos.node != NULL);
6316 +       } else
6317 +               left_pos.node = NULL;
6318 +
6319 +       /* check that they are separated by exactly one key and are basically
6320 +          sane */
6321 +       if (REISER4_DEBUG) {
6322 +               if ((left_pos.node != NULL)
6323 +                   && !coord_is_existing_unit(&left_pos)) {
6324 +                       *error_msg = "Left child is bastard";
6325 +                       return RETERR(-EIO);
6326 +               }
6327 +               if (!coord_is_existing_unit(&right_pos)) {
6328 +                       *error_msg = "Right child is bastard";
6329 +                       return RETERR(-EIO);
6330 +               }
6331 +               if (left_pos.node != NULL &&
6332 +                   !coord_are_neighbors(&left_pos, &right_pos)) {
6333 +                       *error_msg = "Children are not direct siblings";
6334 +                       return RETERR(-EIO);
6335 +               }
6336 +       }
6337 +       *error_msg = NULL;
6338 +
6339 +       info.doing = doing;
6340 +       info.todo = todo;
6341 +
6342 +       /*
6343 +        * If child node is not empty, new key of internal item is a key of
6344 +        * leftmost item in the child node. If the child is empty, take its
6345 +        * right delimiting key as a new key of the internal item. Precise key
6346 +        * in the latter case is not important per se, because the child (and
6347 +        * the internal item) are going to be killed shortly anyway, but we
6348 +        * have to preserve correct order of keys in the parent node.
6349 +        */
6350 +
6351 +       if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6352 +               leftmost_key_in_node(right, &ldkey);
6353 +       else {
6354 +               read_lock_dk(znode_get_tree(parent));
6355 +               ldkey = *znode_get_rd_key(right);
6356 +               read_unlock_dk(znode_get_tree(parent));
6357 +       }
6358 +       node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6359 +       doing->restartable = 0;
6360 +       znode_make_dirty(parent);
6361 +       return 0;
6362 +}
6363 +
6364 +/* implements COP_UPDATE operation
6365 +
6366 +   Update delimiting keys.
6367 +   Returns update_delimiting_key() result, or RETERR(-EIO) if parent is NULL.
6368 +*/
6369 +static int carry_update(carry_op * op /* operation to be performed */ ,
6370 +                       carry_level * doing /* current carry level */ ,
6371 +                       carry_level * todo /* next carry level */ )
6372 +{
6373 +       int result;
6374 +       carry_node *missing UNUSED_ARG;
6375 +       znode *left;
6376 +       znode *right;
6377 +       carry_node *lchild;
6378 +       carry_node *rchild;
6379 +       const char *error_msg;
6380 +       reiser4_tree *tree;
6381 +
6382 +       /*
6383 +        * This operation is called to update key of internal item. This is
6384 +        * necessary when carry shifted or cut data on the child
6385 +        * level. Arguments of this operation are:
6386 +        *
6387 +        *     @right --- child node. Operation should update key of internal
6388 +        *     item pointing to @right.
6389 +        *
6390 +        *     @left --- left neighbor of @right. This parameter is optional.
6391 +        */
6392 +
6393 +       assert("nikita-902", op != NULL);
6394 +       assert("nikita-903", todo != NULL);
6395 +       assert("nikita-904", op->op == COP_UPDATE);
6396 +
6397 +       lchild = op->u.update.left;
6398 +       rchild = op->node;
6399 +
6400 +       if (lchild != NULL) {
6401 +               assert("nikita-1001", lchild->parent);
6402 +               assert("nikita-1003", !lchild->left);
6403 +               left = carry_real(lchild);
6404 +       } else
6405 +               left = NULL;
6406 +
6407 +       tree = znode_get_tree(rchild->node);
6408 +       read_lock_tree(tree);
6409 +       right = znode_parent(rchild->node); /* NB: despite its name, local @right holds the parent of rchild->node */
6410 +       read_unlock_tree(tree);
6411 +
6412 +       if (right != NULL) {
6413 +               result = update_delimiting_key(right,
6414 +                                              lchild ? lchild->node : NULL,
6415 +                                              rchild->node,
6416 +                                              doing, todo, &error_msg);
6417 +       } else {
6418 +               error_msg = "Cannot find node to update key in";
6419 +               result = RETERR(-EIO);
6420 +       }
6421 +       /* operation will be reposted to the next level by the
6422 +          ->update_item_key() method of node plugin, if necessary. */
6423 +
6424 +       if (result != 0) {
6425 +               warning("nikita-999", "Error updating delimiting key: %s (%i)",
6426 +                       error_msg ? : "", result);
6427 +       }
6428 +       return result;
6429 +}
6430 +
6431 +/* move items between @insert_coord->node and @node during carry */
6432 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
6433 +                           coord_t * insert_coord      /* coord where new item
6434 +                                                        * is to be inserted */ ,
6435 +                           znode * node /* node passed to ->shift() as target */ ,
6436 +                           carry_level * doing /* active carry queue */ ,
6437 +                           carry_level * todo  /* carry queue where new
6438 +                                                * operations are to be put
6439 +                                                * in */ ,
6440 +                           unsigned int including_insert_coord_p       /* true if
6441 +                                                                        * @insert_coord
6442 +                                                                        * can be moved */ )
6443 +{
6444 +       int result;
6445 +       znode *source;
6446 +       carry_plugin_info info;
6447 +       node_plugin *nplug;
6448 +
6449 +       source = insert_coord->node;
6450 +
6451 +       info.doing = doing;
6452 +       info.todo = todo;
6453 +
6454 +       nplug = node_plugin_by_node(node);
6455 +       result = nplug->shift(insert_coord, node,
6456 +                             (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6457 +                             (int)including_insert_coord_p, &info);
6458 +       /* the only error ->shift() method of node plugin can return is
6459 +          -ENOMEM due to carry node/operation allocation. */
6460 +       assert("nikita-915", result >= 0 || result == -ENOMEM);
6461 +       if (result > 0) {
6462 +               /*
6463 +                * if some number of bytes was actually shifted, mark nodes
6464 +                * dirty, and carry level as non-restartable.
6465 +                */
6466 +               doing->restartable = 0;
6467 +               znode_make_dirty(source);
6468 +               znode_make_dirty(node);
6469 +       }
6470 +
6471 +       assert("nikita-2077", coord_check(insert_coord));
6472 +       return 0; /* NOTE(review): @result, including -ENOMEM, is not propagated -- confirm intended */
6473 +}
6474 +
6475 +typedef carry_node *(*carry_iterator) (carry_node * node); /* returns the carry node adjacent to @node */
6476 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6477 +                                 carry_iterator iterator);
6478 +/* carry_iterator: entry preceding @node on the level list; no end check */
6479 +static carry_node *pool_level_list_prev(carry_node *node)
6480 +{
6481 +       return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6482 +}
6483 +
6484 +/* look for the left neighbor of given carry node in a carry queue.
6485 +
6486 +   This is used by find_left_neighbor(), but I am not sure that this
6487 +   really gives any advantage. More statistics required.
6488 +   Returns NULL when no such neighbor exists on @level.
6489 +*/
6490 +carry_node *find_left_carry(carry_node * node  /* node to find left neighbor
6491 +                                                * of */ ,
6492 +                           carry_level * level /* level to scan */ )
6493 +{
6494 +       return find_dir_carry(node, level,
6495 +                             (carry_iterator) pool_level_list_prev);
6496 +}
6497 +/* carry_iterator: entry following @node on the level list; no end check */
6498 +static carry_node *pool_level_list_next(carry_node *node)
6499 +{
6500 +       return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6501 +}
6502 +
6503 +/* look for the right neighbor of given carry node in a
6504 +   carry queue.
6505 +
6506 +   This is used by find_right_neighbor(), but I am not sure that this
6507 +   really gives any advantage. More statistics required.
6508 +   Returns NULL when no such neighbor exists on @level.
6509 +*/
6510 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6511 +                                                * of */ ,
6512 +                            carry_level * level /* level to scan */ )
6513 +{
6514 +       return find_dir_carry(node, level,
6515 +                             (carry_iterator) pool_level_list_next);
6516 +}
6517 +
6518 +/* look for the left or right neighbor of given carry node in a carry
6519 +   queue. Returns NULL if the end of @level is reached first.
6520 +
6521 +   Helper function used by find_{left|right}_carry().
6522 +*/
6523 +static carry_node *find_dir_carry(carry_node * node    /* node to start scanning
6524 +                                                        * from */ ,
6525 +                                 carry_level * level /* level to scan */ ,
6526 +                                 carry_iterator iterator       /* operation to
6527 +                                                                * move to the next
6528 +                                                                * node */ )
6529 +{
6530 +       carry_node *neighbor;
6531 +
6532 +       assert("nikita-1059", node != NULL);
6533 +       assert("nikita-1060", level != NULL);
6534 +
6535 +       /* scan list of carry nodes on this level dir-ward, skipping all
6536 +          carry nodes referencing the same znode. */
6537 +       neighbor = node;
6538 +       while (1) {
6539 +               neighbor = iterator(neighbor);
6540 +               if (carry_node_end(level, neighbor))
6541 +                       /* list head is reached */
6542 +                       return NULL;
6543 +               if (carry_real(neighbor) != carry_real(node))
6544 +                       return neighbor;
6545 +       }
6546 +}
6547 +
6548 +/*
6549 + * Memory reservation estimation.
6550 + *
6551 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6552 + * takes tree in consistent state (e.g., that search tree invariants hold),
6553 + * and leaves tree consistent after it finishes. This means that when some
6554 + * error occurs carry cannot simply return if there are pending carry
6555 + * operations. Generic solution for this problem is carry-undo either as
6556 + * transaction manager feature (requiring checkpoints and isolation), or
6557 + * through some carry specific mechanism.
6558 + *
6559 + * Our current approach is to panic if carry hits an error while tree is
6560 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6561 + * this "memory reservation" mechanism was added.
6562 + *
6563 + * Memory reservation is implemented by perthread-pages.diff patch from
6564 + * core-patches. Its API is defined in <linux/gfp.h>
6565 + *
6566 + *     int  perthread_pages_reserve(int nrpages, gfp_t gfp);
6567 + *     void perthread_pages_release(int nrpages);
6568 + *     int  perthread_pages_count(void);
6569 + *
6570 + * carry estimates its worst case memory requirements at the entry, reserves
6571 + * enough memory, and releases unused pages before returning.
6572 + *
6573 + * Code below estimates worst case memory requirements for a given carry
6574 + * queue. This is done by summing worst case memory requirements for each
6575 + * operation in the queue.
6576 + *
6577 + */
6578 +
6579 +/*
6580 + * Memory requirements of many operations depend on the tree
6581 + * height. For example, item insertion requires new node to be inserted at
6582 + * each tree level in the worst case. What tree height should be used for
6583 + * estimation? Current tree height is wrong, because tree height can change
6584 + * between the time when estimation was done and the time when operation is
6585 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6586 + * is also not desirable, because it would lead to a huge over-estimation
6587 + * all the time. Plausible solution is "capped tree height": if current tree
6588 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6589 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6590 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6591 + * to be increased even more during short interval of time.
6592 + */
6593 +#define TREE_HEIGHT_CAP (5)
6594 +
6595 +/* return capped tree height for @tree: max of tree->height and TREE_HEIGHT_CAP. See comment above. */
6596 +static int cap_tree_height(reiser4_tree * tree)
6597 +{
6598 +       return max_t(int, tree->height, TREE_HEIGHT_CAP);
6599 +}
6600 +
6601 +/* return capped tree height for the current tree. */
6602 +static int capped_height(void)
6603 +{
6604 +       return cap_tree_height(current_tree);
6605 +}
6606 +
6607 +/* return number of PAGE_CACHE_SIZE pages required to store @bytes bytes, rounding up */
6608 +static int bytes_to_pages(int bytes)
6609 +{
6610 +       return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6611 +}
6612 +
6613 +/* how many pages are required to allocate znodes during item insertion. */
6614 +static int carry_estimate_znodes(void)
6615 +{
6616 +       /*
6617 +        * Note that we have a problem here: there is no way to
6618 +        * reserve pages specifically for the given slab. This means that
6619 +        * these pages can be hijacked for some other end.
6620 +        */
6621 +
6622 +       /* in the worst case we need 3 new znodes on each tree level */
6623 +       return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6624 +}
6625 +
6626 +/*
6627 + * how many pages are required to load bitmaps (one bitmap per level); 0 if REISER4_DONT_LOAD_BITMAP is not set.
6628 + */
6629 +static int carry_estimate_bitmaps(void)
6630 +{
6631 +       if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6632 +               int bytes;
6633 +
6634 +               bytes = capped_height() * (0 +  /* bnode should be added, but it is private to
6635 +                                                * bitmap.c, skip for now. */
6636 +                                          2 * sizeof(jnode));  /* working and commit jnodes */
6637 +               return bytes_to_pages(bytes) + 2;       /* and their contents */
6638 +       } else
6639 +               /* bitmaps were pre-loaded during mount */
6640 +               return 0;
6641 +}
6642 +
6643 +/* worst case item insertion memory requirements */
6644 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6645 +{
6646 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6647 +           capped_height() +   /* new block on each level */
6648 +           1 +                 /* and possibly extra new block at the leaf level */
6649 +           3;                  /* loading of leaves into memory */
6650 +}
6651 +
6652 +/* worst case item deletion memory requirements */
6653 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6654 +{
6655 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6656 +           3;                  /* loading of leaves into memory */
6657 +}
6658 +
6659 +/* worst case tree cut memory requirements */
6660 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6661 +{
6662 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6663 +           3;                  /* loading of leaves into memory */
6664 +}
6665 +
6666 +/* worst case memory requirements of pasting into item */
6667 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6668 +{
6669 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6670 +           capped_height() +   /* new block on each level */
6671 +           1 +                 /* and possibly extra new block at the leaf level */
6672 +           3;                  /* loading of leaves into memory */
6673 +}
6674 +
6675 +/* worst case memory requirements of extent insertion */
6676 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6677 +{
6678 +       return carry_estimate_insert(op, level) +       /* insert extent */
6679 +           carry_estimate_delete(op, level);   /* kill leaf */
6680 +}
6681 +
6682 +/* worst case memory requirements of key update: always 0, no pages are reserved for it */
6683 +static int carry_estimate_update(carry_op * op, carry_level * level)
6684 +{
6685 +       return 0;
6686 +}
6687 +
6688 +/* worst case memory requirements of flow insertion */
6689 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6690 +{
6691 +       int newnodes;
6692 +
6693 +       newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6694 +                      CARRY_FLOW_NEW_NODES_LIMIT);
6695 +       /*
6696 +        * roughly estimate insert_flow as a sequence of at most CARRY_FLOW_NEW_NODES_LIMIT insertions.
6697 +        */
6698 +       return newnodes * carry_estimate_insert(op, level);
6699 +}
6700 +
6701 +/* This is dispatch table for carry operations. It can be trivially
6702 +   abstracted into useful plugin: tunable balancing policy is a good
6703 +   thing. */
6704 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6705 +       [COP_INSERT] = {
6706 +                       .handler = carry_insert,
6707 +                       .estimate = carry_estimate_insert}
6708 +       ,
6709 +       [COP_DELETE] = {
6710 +                       .handler = carry_delete,
6711 +                       .estimate = carry_estimate_delete}
6712 +       ,
6713 +       [COP_CUT] = {
6714 +                    .handler = carry_cut,
6715 +                    .estimate = carry_estimate_cut}
6716 +       ,
6717 +       [COP_PASTE] = {
6718 +                      .handler = carry_paste,
6719 +                      .estimate = carry_estimate_paste}
6720 +       ,
6721 +       [COP_EXTENT] = {
6722 +                       .handler = carry_extent,
6723 +                       .estimate = carry_estimate_extent}
6724 +       ,
6725 +       [COP_UPDATE] = {
6726 +                       .handler = carry_update,
6727 +                       .estimate = carry_estimate_update}
6728 +       ,
6729 +       [COP_INSERT_FLOW] = {
6730 +                            .handler = carry_insert_flow,
6731 +                            .estimate = carry_estimate_insert_flow}
6732 +};
6733 +
6734 +/* Make Linus happy.
6735 +   Local variables:
6736 +   c-indentation-style: "K&R"
6737 +   mode-name: "LC"
6738 +   c-basic-offset: 8
6739 +   tab-width: 8
6740 +   fill-column: 120
6741 +   scroll-step: 1
6742 +   End:
6743 +*/
6744 diff --git a/fs/reiser4/carry_ops.h b/fs/reiser4/carry_ops.h
6745 new file mode 100644
6746 index 0000000..688ca8f
6747 --- /dev/null
6748 +++ b/fs/reiser4/carry_ops.h
6749 @@ -0,0 +1,42 @@
6750 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6751 +
6752 +/* implementation of carry operations. See carry_ops.c for details. */
6753 +
6754 +#if !defined( __CARRY_OPS_H__ )
6755 +#define __CARRY_OPS_H__
6756 +
6757 +#include "forward.h"
6758 +#include "znode.h"
6759 +#include "carry.h"
6760 +
6761 +/* carry operation handlers */
6762 +typedef struct carry_op_handler {
6763 +       /* perform operation */
6764 +       int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6765 +       /* estimate memory requirements for @op */
6766 +       int (*estimate) (carry_op * op, carry_level * level);
6767 +} carry_op_handler;
6768 +
6769 +/* This is dispatch table for carry operations. It can be trivially
6770 +   abstracted into useful plugin: tunable balancing policy is a good
6771 +   thing. */
6772 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6773 +
6774 +unsigned int space_needed(const znode * node, const coord_t * coord,
6775 +                         const reiser4_item_data * data, int inserting);
6776 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6777 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6778 +
6779 +/* __CARRY_OPS_H__ */
6780 +#endif
6781 +
6782 +/* Make Linus happy.
6783 +   Local variables:
6784 +   c-indentation-style: "K&R"
6785 +   mode-name: "LC"
6786 +   c-basic-offset: 8
6787 +   tab-width: 8
6788 +   fill-column: 120
6789 +   scroll-step: 1
6790 +   End:
6791 +*/
6792 diff --git a/fs/reiser4/context.c b/fs/reiser4/context.c
6793 new file mode 100644
6794 index 0000000..c2f6392
6795 --- /dev/null
6796 +++ b/fs/reiser4/context.c
6797 @@ -0,0 +1,278 @@
6798 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6799 +
6800 +/* Manipulation of reiser4_context */
6801 +
6802 +/*
6803 + * global context used during system call. Variable of this type is allocated
6804 + * on the stack at the beginning of the reiser4 part of the system call and
6805 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6806 + * passing pointer to current transaction and current lockstack (both in
6807 + * one-to-one mapping with threads) all over the call chain.
6808 + *
6809 + * It's kind of like those global variables the prof used to tell you not to
6810 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6811 + *
6812 + * In some situations it is desirable to have ability to enter reiser4_context
6813 + * more than once for the same thread (nested contexts). For example, there
6814 + * are some functions that can be called either directly from VFS/VM or from
6815 + * already active reiser4 context (->writepage, for example).
6816 + *
6817 + * In such situations "child" context acts like dummy: all activity is
6818 + * actually performed in the top level context, and get_current_context()
6819 + * always returns top level context. Of course, init_context()/done_context()
6820 + * have to be properly nested any way.
6821 + *
6822 + * Note that there is an important difference between the way reiser4 uses
6823 + * ->fs_context and the way other file systems use it. Other file systems
6824 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6825 + * (this is why ->fs_context was initially called ->journal_info). This means,
6826 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6827 + * to the file system, they assume that some transaction is already underway,
6828 + * and usually bail out, because starting nested transaction would most likely
6829 + * lead to the deadlock. This gives false positives with reiser4, because we
6830 + * set ->fs_context before starting transaction.
6831 + */
6832 +
6833 +#include "debug.h"
6834 +#include "super.h"
6835 +#include "context.h"
6836 +
6837 +#include <linux/writeback.h>   /* balance_dirty_pages() */
6838 +#include <linux/hardirq.h>
6839 +
6840 +
6841 +static void _init_context(reiser4_context * context, struct super_block *super)
6842 +{
6843 +       memset(context, 0, sizeof(*context));
6844 +
6845 +       context->super = super;
6846 +       context->magic = context_magic;
6847 +       context->outer = current->journal_info;
6848 +       current->journal_info = (void *)context;
6849 +       context->nr_children = 0;
6850 +       context->gfp_mask = GFP_KERNEL;
6851 +
6852 +       init_lock_stack(&context->stack);
6853 +
6854 +       txn_begin(context);
6855 +
6856 +       /* initialize head of tap list */
6857 +       INIT_LIST_HEAD(&context->taps);
6858 +#if REISER4_DEBUG
6859 +       context->task = current;
6860 +#endif
6861 +       grab_space_enable();
6862 +}
6863 +
6864 +/* initialize context and bind it to the current thread
6865 +
6866 +   This function should be called at the beginning of reiser4 part of
6867 +   syscall. Returns the context, or ERR_PTR(RETERR(-ENOMEM)) if kmalloc() fails.
6868 +*/
6869 +reiser4_context *init_context(struct super_block *super        /* super block we are going to
6870 +                                                        * work with */ )
6871 +{
6872 +       reiser4_context *context;
6873 +
6874 +       assert("nikita-2662", !in_interrupt() && !in_irq());
6875 +       assert("nikita-3357", super != NULL);
6876 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6877 +
6878 +       context = get_current_context_check();
6879 +       if (context && context->super == super) {
6880 +               context = (reiser4_context *) current->journal_info;
6881 +               context->nr_children++;
6882 +               return context;
6883 +       }
6884 +
6885 +       context = kmalloc(sizeof(*context), GFP_KERNEL);
6886 +       if (context == NULL)
6887 +               return ERR_PTR(RETERR(-ENOMEM));
6888 +
6889 +       _init_context(context, super);
6890 +       return context;
6891 +}
6892 +
6893 +/* this is used in scan_mgr which is called with spinlock held and in
6894 +   reiser4_fill_super magic */
6895 +void init_stack_context(reiser4_context *context, struct super_block *super)
6896 +{
6897 +       assert("nikita-2662", !in_interrupt() && !in_irq());
6898 +       assert("nikita-3357", super != NULL);
6899 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6900 +       assert("vs-12", !is_in_reiser4_context());
6901 +
6902 +       _init_context(context, super);
6903 +       context->on_stack = 1;
6904 +       return;
6905 +}
6906 +
6907 +/* cast lock stack embedded into reiser4 context up to its container */
6908 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6909 +{
6910 +       return container_of(owner, reiser4_context, stack);
6911 +}
6912 +
6913 +/* true if there is already _any_ reiser4 context for the current thread */
6914 +int is_in_reiser4_context(void)
6915 +{
6916 +       reiser4_context *ctx;
6917 +
6918 +       ctx = current->journal_info;
6919 +       return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6920 +}
6921 +
6922 +/*
6923 + * call balance dirty pages for the current context.
6924 + *
6925 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6926 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6927 + * write---this covers vast majority of all dirty traffic), but we cannot do
6928 + * this immediately when formatted node is dirtied, because long term lock is
6929 + * usually held at that time. To work around this, dirtying of formatted node
6930 + * simply increases ->nr_marked_dirty counter in the current reiser4
6931 + * context. When we are about to leave this context,
6932 + * balance_dirty_pages_ratelimited() is called, if necessary.
6933 + *
6934 + * This introduces another problem: sometimes we do not want to run
6935 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6936 + * because some important lock (like ->i_mutex on the parent directory) is
6937 + * held. To achieve this, ->nobalance flag can be set in the current context.
6938 + */
6939 +static void balance_dirty_pages_at(reiser4_context *context)
6940 +{
6941 +       reiser4_super_info_data *sbinfo = get_super_private(context->super);
6942 +
6943 +       /*
6944 +        * call balance_dirty_pages_ratelimited() to process formatted nodes
6945 +        * dirtied during this system call. Do that only if we are not in mount
6946 +        * and there were nodes dirtied in this context and we are not in
6947 +        * writepage (to avoid deadlock) and not in pdflush
6948 +        */
6949 +       if (sbinfo != NULL && sbinfo->fake != NULL &&
6950 +           context->nr_marked_dirty != 0 &&
6951 +           !(current->flags & PF_MEMALLOC) &&
6952 +           !current_is_pdflush())
6953 +               balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6954 +}
6955 +
6956 +/* release resources associated with context.
6957 +
6958 +   This function should be called at the end of "session" with reiser4,
6959 +   typically just before leaving reiser4 driver back to VFS.
6960 +
6961 +   This is a good place to put some debugging consistency checks, like that
6962 +   thread released all locks and closed transcrash etc.
6963 +
6964 +*/
6965 +static void done_context(reiser4_context * context /* context being released */ )
6966 +{
6967 +       assert("nikita-860", context != NULL);
6968 +       assert("nikita-859", context->magic == context_magic);
6969 +       assert("vs-646", (reiser4_context *) current->journal_info == context);
6970 +       assert("zam-686", !in_interrupt() && !in_irq());
6971 +
6972 +       /* only do anything when leaving top-level reiser4 context. All nested
6973 +        * contexts are just dummies. */
6974 +       if (context->nr_children == 0) {
6975 +               assert("jmacd-673", context->trans == NULL);
6976 +               assert("jmacd-1002", lock_stack_isclean(&context->stack));
6977 +               assert("nikita-1936", no_counters_are_held());
6978 +               assert("nikita-2626", list_empty_careful(taps_list()));
6979 +               assert("zam-1004", ergo(get_super_private(context->super),
6980 +                                       get_super_private(context->super)->delete_sema_owner !=
6981 +                                       current));
6982 +
6983 +               /* release all grabbed but as yet unused blocks */
6984 +               if (context->grabbed_blocks != 0)
6985 +                       all_grabbed2free();
6986 +
6987 +               /*
6988 +                * synchronize against longterm_unlock_znode():
6989 +                * wake_up_requestor() wakes up requestors without holding
6990 +                * zlock (otherwise they will immediately bump into that lock
6991 +                * after wake up on another CPU). To work around (rare)
6992 +                * situation where requestor has been woken up asynchronously
6993 +                * and managed to run until completion (and destroy its
6994 +                * context and lock stack) before wake_up_requestor() called
6995 +                * wake_up() on it, wake_up_requestor() synchronizes on lock
6996 +                * stack spin lock. It has actually been observed that spin
6997 +                * lock _was_ locked at this point, because
6998 +                * wake_up_requestor() took interrupt.
6999 +                */
7000 +               spin_lock_stack(&context->stack);
7001 +               spin_unlock_stack(&context->stack);
7002 +
7003 +               assert("zam-684", context->nr_children == 0);
7004 +               /* restore original ->journal_info (a.k.a. ->fs_context) value */
7005 +               current->journal_info = context->outer;
7006 +               if (context->on_stack == 0)
7007 +                       kfree(context);
7008 +       } else {
7009 +               context->nr_children--;
7010 +#if REISER4_DEBUG
7011 +               assert("zam-685", context->nr_children >= 0);
7012 +#endif
7013 +       }
7014 +}
7015 +
7016 +/*
7017 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
7018 + * transaction. Call done_context() to do context related book-keeping.
7019 + */
7020 +void reiser4_exit_context(reiser4_context * context)
7021 +{
7022 +       assert("nikita-3021", schedulable());
7023 +
7024 +       if (context->nr_children == 0) {
7025 +               if (!context->nobalance) {
7026 +                       txn_restart(context);
7027 +                       balance_dirty_pages_at(context);
7028 +               }
7029 +
7030 +               /* if filesystem is mounted with -o sync or -o dirsync - commit
7031 +                  transaction.  FIXME: TXNH_DONT_COMMIT is used to avoid
7032 +                  committing on exit_context when inode semaphore is held and
7033 +                  to have ktxnmgrd to do commit instead to get better
7034 +                  concurrent filesystem accesses. But, when one mounts with -o
7035 +                  sync, he cares more about reliability than about
7036 +                  performance. So, for now we have this simple mount -o sync
7037 +                  support. */
7038 +               if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
7039 +                       txn_atom *atom;
7040 +
7041 +                       atom = get_current_atom_locked_nocheck();
7042 +                       if (atom) {
7043 +                               atom->flags |= ATOM_FORCE_COMMIT;
7044 +                               context->trans->flags &= ~TXNH_DONT_COMMIT;
7045 +                               spin_unlock_atom(atom);
7046 +                       }
7047 +               }
7048 +               txn_end(context);
7049 +       }
7050 +       done_context(context);
7051 +}
7052 +
7053 +void set_gfp_mask(void)
7054 +{
7055 +       reiser4_context *ctx;
7056 +
7057 +       ctx = get_current_context();
7058 +       if (ctx->entd == 0 &&
7059 +           list_empty(&ctx->stack.locks) &&
7060 +           ctx->trans->atom == NULL)
7061 +               ctx->gfp_mask = GFP_KERNEL;
7062 +       else
7063 +               ctx->gfp_mask = GFP_NOFS;
7064 +}
7065 +
7066 +/*
7067 + * Local variables:
7068 + * c-indentation-style: "K&R"
7069 + * mode-name: "LC"
7070 + * c-basic-offset: 8
7071 + * tab-width: 8
7072 + * fill-column: 120
7073 + * scroll-step: 1
7074 + * End:
7075 + */
7076 diff --git a/fs/reiser4/context.h b/fs/reiser4/context.h
7077 new file mode 100644
7078 index 0000000..a68e45e
7079 --- /dev/null
7080 +++ b/fs/reiser4/context.h
7081 @@ -0,0 +1,228 @@
7082 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
7083 + * reiser4/README */
7084 +
7085 +/* Reiser4 context. See context.c for details. */
7086 +
7087 +#if !defined( __REISER4_CONTEXT_H__ )
7088 +#define __REISER4_CONTEXT_H__
7089 +
7090 +#include "forward.h"
7091 +#include "debug.h"
7092 +#include "dformat.h"
7093 +#include "tap.h"
7094 +#include "lock.h"
7095 +
7096 +#include <linux/types.h>       /* for __u??  */
7097 +#include <linux/fs.h>          /* for struct super_block  */
7098 +#include <linux/spinlock.h>
7099 +#include <linux/sched.h>       /* for struct task_struct */
7100 +
7101 +
7102 +/* reiser4 per-thread context */
7103 +struct reiser4_context {
7104 +       /* magic constant. For identification of reiser4 contexts. */
7105 +       __u32 magic;
7106 +
7107 +       /* current lock stack. See lock.[ch]. This is where list of all
7108 +          locks taken by current thread is kept. This is also used in
7109 +          deadlock detection. */
7110 +       lock_stack stack;
7111 +
7112 +       /* current transcrash. */
7113 +       txn_handle *trans;
7114 +       /* transaction handle embedded into reiser4_context. ->trans points
7115 +        * here by default. */
7116 +       txn_handle trans_in_ctx;
7117 +
7118 +       /* super block we are working with.  To get the current tree
7119 +          use &get_super_private (reiser4_get_current_sb ())->tree. */
7120 +       struct super_block *super;
7121 +
7122 +       /* parent fs activation */
7123 +       struct fs_activation *outer;
7124 +
7125 +       /* per-thread grabbed (for further allocation) blocks counter */
7126 +       reiser4_block_nr grabbed_blocks;
7127 +
7128 +       /* list of taps currently monitored. See tap.c */
7129 +       struct list_head taps;
7130 +
7131 +       /* grabbing space is enabled */
7132 +       unsigned int grab_enabled:1;
7133 +       /* should be set when we are writing dirty nodes to disk in jnode_flush or
7134 +        * reiser4_write_logs() */
7135 +       unsigned int writeout_mode:1;
7136 +       /* true, if current thread is an ent thread */
7137 +       unsigned int entd:1;
7138 +       /* true, if balance_dirty_pages() should not be run when leaving this
7139 +        * context. This is used to avoid lengthy balance_dirty_pages()
7140 +        * operation when holding some important resource, like directory
7141 +        * ->i_mutex */
7142 +       unsigned int nobalance:1;
7143 +
7144 +       /* this bit is used in done_context to decide whether context is
7145 +          kmalloc-ed and has to be kfree-ed */
7146 +       unsigned int on_stack:1;
7147 +
7148 +       /* count non-trivial jnode_set_dirty() calls */
7149 +       unsigned long nr_marked_dirty;
7150 +
7151 +       /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
7152 +        * reiser4_writepages for each of dirty inodes. Reiser4_writepages
7153 +        * captures pages. When number of pages captured in one
7154 +        * reiser4_sync_inodes reaches some threshold - some atoms get
7155 +        * flushed */
7156 +       int nr_captured;
7157 +       int nr_children;        /* number of child contexts */
7158 +#if REISER4_DEBUG
7159 +       /* debugging information about reiser4 locks held by the current
7160 +        * thread */
7161 +       lock_counters_info locks;
7162 +       struct task_struct *task;       /* so we can easily find owner of the stack */
7163 +
7164 +       /*
7165 +        * disk space grabbing debugging support
7166 +        */
7167 +       /* how many disk blocks were grabbed by the first call to
7168 +        * reiser4_grab_space() in this context */
7169 +       reiser4_block_nr grabbed_initially;
7170 +
7171 +       /* list of all threads doing flush currently */
7172 +       struct list_head flushers_link;
7173 +       /* information about last error encountered by reiser4 */
7174 +       err_site err;
7175 +#endif
7176 +       void *vp;
7177 +       gfp_t gfp_mask;
7178 +};
7179 +
7180 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
7181 +
7182 +/* Debugging helps. */
7183 +#if REISER4_DEBUG
7184 +extern void print_contexts(void);
7185 +#endif
7186 +
7187 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
7188 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
7189 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
7190 +
7191 +extern reiser4_context *init_context(struct super_block *);
7192 +extern void init_stack_context(reiser4_context *, struct super_block *);
7193 +extern void reiser4_exit_context(reiser4_context *);
7194 +
7195 +/* magic constant we store in reiser4_context allocated at the stack. Used to
7196 +   catch accesses to stale or uninitialized contexts. */
7197 +#define context_magic ((__u32) 0x4b1b5d0b)
7198 +
7199 +extern int is_in_reiser4_context(void);
7200 +
7201 +/*
7202 + * return reiser4_context for the thread @tsk
7203 + */
7204 +static inline reiser4_context *get_context(const struct task_struct *tsk)
7205 +{
7206 +       assert("vs-1682",
7207 +              ((reiser4_context *) tsk->journal_info)->magic == context_magic);
7208 +       return (reiser4_context *) tsk->journal_info;
7209 +}
7210 +
7211 +/*
7212 + * return reiser4 context of the current thread, or NULL if there is none.
7213 + */
7214 +static inline reiser4_context *get_current_context_check(void)
7215 +{
7216 +       if (is_in_reiser4_context())
7217 +               return get_context(current);
7218 +       else
7219 +               return NULL;
7220 +}
7221 +
7222 +static inline reiser4_context *get_current_context(void);      /* __attribute__((const)); */
7223 +
7224 +/* return context associated with current thread */
7225 +static inline reiser4_context *get_current_context(void)
7226 +{
7227 +       return get_context(current);
7228 +}
7229 +
7230 +static inline gfp_t get_gfp_mask(void)
7231 +{
7232 +       reiser4_context *ctx;
7233 +
7234 +       ctx = get_current_context_check();
7235 +       return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
7236 +}
7237 +
7238 +void set_gfp_mask(void);
7239 +
7240 +/*
7241 + * true if current thread is in the write-out mode. Thread enters write-out
7242 + * mode during jnode_flush and reiser4_write_logs().
7243 + */
7244 +static inline int is_writeout_mode(void)
7245 +{
7246 +       return get_current_context()->writeout_mode;
7247 +}
7248 +
7249 +/*
7250 + * enter write-out mode
7251 + */
7252 +static inline void writeout_mode_enable(void)
7253 +{
7254 +       assert("zam-941", !get_current_context()->writeout_mode);
7255 +       get_current_context()->writeout_mode = 1;
7256 +}
7257 +
7258 +/*
7259 + * leave write-out mode
7260 + */
7261 +static inline void writeout_mode_disable(void)
7262 +{
7263 +       assert("zam-942", get_current_context()->writeout_mode);
7264 +       get_current_context()->writeout_mode = 0;
7265 +}
7266 +
7267 +static inline void grab_space_enable(void)
7268 +{
7269 +       get_current_context()->grab_enabled = 1;
7270 +}
7271 +
7272 +static inline void grab_space_disable(void)
7273 +{
7274 +       get_current_context()->grab_enabled = 0;
7275 +}
7276 +
7277 +static inline void grab_space_set_enabled(int enabled)
7278 +{
7279 +       get_current_context()->grab_enabled = enabled;
7280 +}
7281 +
7282 +static inline int is_grab_enabled(reiser4_context * ctx)
7283 +{
7284 +       return ctx->grab_enabled;
7285 +}
7286 +
7287 +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
7288 + * flush would be performed when it is closed. This is necessary when handle
7289 + * has to be closed under some coarse semaphore, like i_mutex of
7290 + * directory. Commit will be performed by ktxnmgrd. */
7291 +static inline void context_set_commit_async(reiser4_context * context)
7292 +{
7293 +       context->nobalance = 1;
7294 +       context->trans->flags |= TXNH_DONT_COMMIT;
7295 +}
7296 +
7297 +/* __REISER4_CONTEXT_H__ */
7298 +#endif
7299 +
7300 +/* Make Linus happy.
7301 +   Local variables:
7302 +   c-indentation-style: "K&R"
7303 +   mode-name: "LC"
7304 +   c-basic-offset: 8
7305 +   tab-width: 8
7306 +   fill-column: 120
7307 +   scroll-step: 1
7308 +   End:
7309 +*/
7310 diff --git a/fs/reiser4/coord.c b/fs/reiser4/coord.c
7311 new file mode 100644
7312 index 0000000..4a1781b
7313 --- /dev/null
7314 +++ b/fs/reiser4/coord.c
7315 @@ -0,0 +1,937 @@
7316 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7317 +
7318 +#include "forward.h"
7319 +#include "debug.h"
7320 +#include "dformat.h"
7321 +#include "tree.h"
7322 +#include "plugin/item/item.h"
7323 +#include "znode.h"
7324 +#include "coord.h"
7325 +
7326 +/* Internal constructor. */
7327 +static inline void
7328 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
7329 +                 pos_in_node_t unit_pos, between_enum between)
7330 +{
7331 +       coord->node = (znode *) node;
7332 +       coord_set_item_pos(coord, item_pos);
7333 +       coord->unit_pos = unit_pos;
7334 +       coord->between = between;
7335 +       ON_DEBUG(coord->plug_v = 0);
7336 +       ON_DEBUG(coord->body_v = 0);
7337 +
7338 +       /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
7339 +}
7340 +
7341 +/* After node content was shifted a once-valid coord may be one past the last item or unit: pull it
7342 +   back.  item_pos is range-checked before coord_num_units(), which requires an existing item. */
7343 +void coord_normalize(coord_t * coord)
7344 +{
7345 +       znode *node;
7346 +
7347 +       node = coord->node;
7348 +       assert("vs-683", node);
7349 +
7350 +       coord_clear_iplug(coord);
7351 +
7352 +       if (node_is_empty(node)) {
7353 +               coord_init_first_unit(coord, node);
7354 +       } else if ((coord->between == AFTER_ITEM)
7355 +                  || (coord->between == AFTER_UNIT)) {
7356 +               return;
7357 +       } else if (coord->item_pos == coord_num_items(coord)
7358 +                  && coord->between == BEFORE_ITEM) {
7359 +               coord_dec_item_pos(coord);
7360 +               coord->between = AFTER_ITEM;
7361 +       } else if (coord->item_pos == coord_num_items(coord)
7362 +                  && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
7363 +               coord_dec_item_pos(coord);
7364 +               coord->unit_pos = 0;
7365 +               coord->between = AFTER_ITEM;
7366 +       } else if (coord->between == BEFORE_UNIT
7367 +                  && coord->unit_pos == coord_num_units(coord)) {
7368 +               coord->unit_pos--;
7369 +               coord->between = AFTER_UNIT;
7370 +       }
7371 +}
7372 +
7373 +/* Copy a coordinate. */
7374 +void coord_dup(coord_t * coord, const coord_t * old_coord)
7375 +{
7376 +       assert("jmacd-9800", coord_check(old_coord));
7377 +       coord_dup_nocheck(coord, old_coord);
7378 +}
7379 +
7380 +/* Copy a coordinate without check. Useful when old_coord->node is not
7381 +   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
7382 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
7383 +{
7384 +       coord->node = old_coord->node;
7385 +       coord_set_item_pos(coord, old_coord->item_pos);
7386 +       coord->unit_pos = old_coord->unit_pos;
7387 +       coord->between = old_coord->between;
7388 +       coord->iplugid = old_coord->iplugid;
7389 +       ON_DEBUG(coord->plug_v = old_coord->plug_v);
7390 +       ON_DEBUG(coord->body_v = old_coord->body_v);
7391 +}
7392 +
7393 +/* Initialize an invalid coordinate. */
7394 +void coord_init_invalid(coord_t * coord, const znode * node)
7395 +{
7396 +       coord_init_values(coord, node, 0, 0, INVALID_COORD);
7397 +}
7398 +
7399 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
7400 +{
7401 +       coord_init_values(coord, node, 0, 0, AT_UNIT);
7402 +}
7403 +
7404 +/* Initialize a coordinate to point at the first unit of the first item.  If the node is
7405 +   empty, it is positioned at the EMPTY_NODE. */
7406 +void coord_init_first_unit(coord_t * coord, const znode * node)
7407 +{
7408 +       int is_empty = node_is_empty(node);
7409 +
7410 +       coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
7411 +
7412 +       assert("jmacd-9801", coord_check(coord));
7413 +}
7414 +
7415 +/* Initialize a coordinate to point at the last unit of the last item.  If the node is
7416 +   empty, it is positioned at the EMPTY_NODE. */
7417 +void coord_init_last_unit(coord_t * coord, const znode * node)
7418 +{
7419 +       int is_empty = node_is_empty(node);
7420 +
7421 +       coord_init_values(coord, node,
7422 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
7423 +                         (is_empty ? EMPTY_NODE : AT_UNIT));
7424 +       if (!is_empty)
7425 +               coord->unit_pos = coord_last_unit_pos(coord);
7426 +       assert("jmacd-9802", coord_check(coord));
7427 +}
7428 +
7429 +/* Initialize a coordinate to before the first item: item 0, unit 0, BEFORE_UNIT (not
7430 +   BEFORE_ITEM).  If the node is empty, it is positioned at the EMPTY_NODE. */
7431 +void coord_init_before_first_item(coord_t * coord, const znode * node)
7432 +{
7433 +       int is_empty = node_is_empty(node);
7434 +
7435 +       coord_init_values(coord, node, 0, 0,
7436 +                         (is_empty ? EMPTY_NODE : BEFORE_UNIT));
7437 +
7438 +       assert("jmacd-9803", coord_check(coord));
7439 +}
7440 +
7441 +/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
7442 +   at the EMPTY_NODE. */
7443 +void coord_init_after_last_item(coord_t * coord, const znode * node)
7444 +{
7445 +       int is_empty = node_is_empty(node);
7446 +
7447 +       coord_init_values(coord, node,
7448 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
7449 +                         (is_empty ? EMPTY_NODE : AFTER_ITEM));
7450 +
7451 +       assert("jmacd-9804", coord_check(coord));
7452 +}
7453 +
7454 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7455 +   already to existing item */
7456 +void coord_init_after_item_end(coord_t * coord)
7457 +{
7458 +       coord->between = AFTER_UNIT;
7459 +       coord->unit_pos = coord_last_unit_pos(coord);
7460 +}
7461 +
7462 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7463 +void coord_init_before_item(coord_t * coord)
7464 +{
7465 +       coord->unit_pos = 0;
7466 +       coord->between = BEFORE_ITEM;
7467 +}
7468 +
7469 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7470 +void coord_init_after_item(coord_t * coord)
7471 +{
7472 +       coord->unit_pos = 0;
7473 +       coord->between = AFTER_ITEM;
7474 +}
7475 +
7476 +/* Initialize a coordinate with zeros. Used in places where init_coord was used
7477 +   and it was not clear how the coord was actually supposed to be initialized. */
7478 +void coord_init_zero(coord_t * coord)
7479 +{
7480 +       memset(coord, 0, sizeof(*coord));
7481 +}
7482 +
7483 +/* Return the number of units at the present item.  Asserts coord_is_existing_item(). */
7484 +unsigned coord_num_units(const coord_t * coord)
7485 +{
7486 +       assert("jmacd-9806", coord_is_existing_item(coord));
7487 +
7488 +       return item_plugin_by_coord(coord)->b.nr_units(coord);
7489 +}
7490 +
7491 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7492 +/* Audited by: green(2002.06.15) */
7493 +int coord_is_invalid(const coord_t * coord)
7494 +{
7495 +       return coord->between == INVALID_COORD;
7496 +}
7497 +
7498 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7499 +   an item.  It may be placed at, before, or after any unit within the item, whether
7500 +   existing or not. */
7501 +int coord_is_existing_item(const coord_t * coord)
7502 +{
7503 +       switch (coord->between) {
7504 +       case EMPTY_NODE:
7505 +       case BEFORE_ITEM:
7506 +       case AFTER_ITEM:
7507 +       case INVALID_COORD:
7508 +               return 0;
7509 +
7510 +       case BEFORE_UNIT:
7511 +       case AT_UNIT:
7512 +       case AFTER_UNIT:
7513 +               return coord->item_pos < coord_num_items(coord);
7514 +       }
7515 +
7516 +       impossible("jmacd-9900", "unreachable coord: %p", coord);
7517 +       return 0;
7518 +}
7519 +
7520 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7521 +   unit. */
7522 +/* Audited by: green(2002.06.15) */
7523 +int coord_is_existing_unit(const coord_t * coord)
7524 +{
7525 +       switch (coord->between) {
7526 +       case EMPTY_NODE:
7527 +       case BEFORE_UNIT:
7528 +       case AFTER_UNIT:
7529 +       case BEFORE_ITEM:
7530 +       case AFTER_ITEM:
7531 +       case INVALID_COORD:
7532 +               return 0;
7533 +
7534 +       case AT_UNIT:
7535 +               return (coord->item_pos < coord_num_items(coord)
7536 +                       && coord->unit_pos < coord_num_units(coord));
7537 +       }
7538 +
7539 +       impossible("jmacd-9902", "unreachable");
7540 +       return 0;
7541 +}
7542 +
7543 +/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
7544 +   true for empty nodes nor coordinates positioned before the first item. */
7545 +/* Audited by: green(2002.06.15) */
7546 +int coord_is_leftmost_unit(const coord_t * coord)
7547 +{
7548 +       return (coord->between == AT_UNIT && coord->item_pos == 0
7549 +               && coord->unit_pos == 0);
7550 +}
7551 +
7552 +#if REISER4_DEBUG
7553 +/* For assertions only: returns 1 if @coord looks valid, 0 if it does not. */
7554 +int coord_check(const coord_t * coord)
7555 +{
7556 +       if (coord->node == NULL) {
7557 +               return 0;
7558 +       }
7559 +       if (znode_above_root(coord->node))
7560 +               return 1;
7561 +
7562 +       switch (coord->between) {
7563 +       default:
7564 +       case INVALID_COORD:
7565 +               return 0;
7566 +       case EMPTY_NODE:
7567 +               if (!node_is_empty(coord->node)) {
7568 +                       return 0;
7569 +               }
7570 +               return coord->item_pos == 0 && coord->unit_pos == 0;
7571 +
7572 +       case BEFORE_UNIT:
7573 +       case AFTER_UNIT:
7574 +               if (node_is_empty(coord->node) && (coord->item_pos == 0)
7575 +                   && (coord->unit_pos == 0))
7576 +                       return 1;
7577 +       case AT_UNIT:   /* BEFORE_UNIT and AFTER_UNIT fall through to here */
7578 +               break;
7579 +       case AFTER_ITEM:
7580 +       case BEFORE_ITEM:
7581 +               /* before/after item should not set unit_pos. */
7582 +               if (coord->unit_pos != 0) {
7583 +                       return 0;
7584 +               }
7585 +               break;
7586 +       }
7587 +
7588 +       if (coord->item_pos >= node_num_items(coord->node)) {
7589 +               return 0;
7590 +       }
7591 +
7592 +       /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7593 +          between is set to either AFTER_ITEM or BEFORE_ITEM, so stop here. */
7594 +       if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7595 +               return 1;
7596 +
7597 +       if (coord_is_iplug_set(coord) &&
7598 +           coord->unit_pos >
7599 +           item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7600 +               return 0;
7601 +       }
7602 +       return 1;
7603 +}
7604 +#endif
7605 +
7606 +/* Adjust coordinate boundaries based on the number of items (@items) prior to coord_next/prev;
7607 +   @is_next is non-zero when moving right.  Returns 1 if the new position does not exist. */
7608 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7609 +{
7610 +       /* If the coord is invalid, leave it. */
7611 +       if (coord->between == INVALID_COORD) {
7612 +               return 1;
7613 +       }
7614 +
7615 +       /* If the node is empty, reset the coord to EMPTY_NODE at position 0/0. */
7616 +       if (items == 0) {
7617 +               coord->between = EMPTY_NODE;
7618 +               coord_set_item_pos(coord, 0);
7619 +               coord->unit_pos = 0;
7620 +               return 1;
7621 +       }
7622 +
7623 +       /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7624 +       if (coord->between == EMPTY_NODE) {
7625 +               coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7626 +               coord_set_item_pos(coord, 0);
7627 +               coord->unit_pos = 0;
7628 +               return 0;
7629 +       }
7630 +
7631 +       /* If the item_pos is out-of-range, clamp it to after the last item. */
7632 +       if (coord->item_pos >= items) {
7633 +               coord->between = AFTER_ITEM;
7634 +               coord_set_item_pos(coord, items - 1);
7635 +               coord->unit_pos = 0;
7636 +               /* If is_next, return 1 (can't go any further). */
7637 +               return is_next;
7638 +       }
7639 +
7640 +       return 0;
7641 +}
7642 +
7643 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
7644 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is an
7645 +   existing unit. */
7646 +int coord_next_unit(coord_t * coord)
7647 +{
7648 +       unsigned items = coord_num_items(coord);
7649 +
7650 +       if (coord_adjust_items(coord, items, 1) == 1) {
7651 +               return 1;
7652 +       }
7653 +
7654 +       switch (coord->between) {
7655 +       case BEFORE_UNIT:
7656 +               /* Now it is positioned at the same unit. */
7657 +               coord->between = AT_UNIT;
7658 +               return 0;
7659 +
7660 +       case AFTER_UNIT:
7661 +       case AT_UNIT:
7662 +               /* If it was at or after a unit and there are more units in this item,
7663 +                  advance to the next one. */
7664 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
7665 +                       coord->unit_pos += 1;
7666 +                       coord->between = AT_UNIT;
7667 +                       return 0;
7668 +               }
7669 +
7670 +               /* Otherwise, it is crossing an item boundary and treated as if it was
7671 +                  after the current item. */
7672 +               coord->between = AFTER_ITEM;
7673 +               coord->unit_pos = 0;
7674 +               /* FALLTHROUGH */
7675 +
7676 +       case AFTER_ITEM:
7677 +               /* Check for end-of-node. */
7678 +               if (coord->item_pos == items - 1) {
7679 +                       return 1;
7680 +               }
7681 +
7682 +               coord_inc_item_pos(coord);
7683 +               coord->unit_pos = 0;
7684 +               coord->between = AT_UNIT;
7685 +               return 0;
7686 +
7687 +       case BEFORE_ITEM:
7688 +               /* The adjust_items checks ensure that we are valid here. */
7689 +               coord->unit_pos = 0;
7690 +               coord->between = AT_UNIT;
7691 +               return 0;
7692 +
7693 +       case INVALID_COORD:
7694 +       case EMPTY_NODE:
7695 +               /* Handled in coord_adjust_items(). */
7696 +               break;
7697 +       }
7698 +
7699 +       impossible("jmacd-9902", "unreachable");
7700 +       return 0;
7701 +}
7702 +
7703 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
7704 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
7705 +   an existing item. */
7706 +int coord_next_item(coord_t * coord)
7707 +{
7708 +       unsigned items = coord_num_items(coord);
7709 +
7710 +       if (coord_adjust_items(coord, items, 1) == 1) {
7711 +               return 1;
7712 +       }
7713 +
7714 +       switch (coord->between) {
7715 +       case AFTER_UNIT:
7716 +       case AT_UNIT:
7717 +       case BEFORE_UNIT:
7718 +       case AFTER_ITEM:
7719 +               /* Check for end-of-node. */
7720 +               if (coord->item_pos == items - 1) {
7721 +                       coord->between = AFTER_ITEM;
7722 +                       coord->unit_pos = 0;
7723 +                       coord_clear_iplug(coord);
7724 +                       return 1;
7725 +               }
7726 +
7727 +               /* Anywhere in an item, go to the next one. */
7728 +               coord->between = AT_UNIT;
7729 +               coord_inc_item_pos(coord);
7730 +               coord->unit_pos = 0;
7731 +               return 0;
7732 +
7733 +       case BEFORE_ITEM:
7734 +               /* The out-of-range check ensures that we are valid here. */
7735 +               coord->unit_pos = 0;
7736 +               coord->between = AT_UNIT;
7737 +               return 0;
7738 +       case INVALID_COORD:
7739 +       case EMPTY_NODE:
7740 +               /* Handled in coord_adjust_items(). */
7741 +               break;
7742 +       }
7743 +
7744 +       impossible("jmacd-9903", "unreachable");
7745 +       return 0;
7746 +}
7747 +
7748 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
7749 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
7750 +   is an existing unit. */
7751 +int coord_prev_unit(coord_t * coord)
7752 +{
7753 +       unsigned items = coord_num_items(coord);
7754 +
7755 +       if (coord_adjust_items(coord, items, 0) == 1) {
7756 +               return 1;
7757 +       }
7758 +
7759 +       switch (coord->between) {
7760 +       case AT_UNIT:
7761 +       case BEFORE_UNIT:
7762 +               if (coord->unit_pos > 0) {
7763 +                       coord->unit_pos -= 1;
7764 +                       coord->between = AT_UNIT;
7765 +                       return 0;
7766 +               }
7767 +
7768 +               if (coord->item_pos == 0) {
7769 +                       coord->between = BEFORE_ITEM;
7770 +                       return 1;
7771 +               }
7772 +
7773 +               coord_dec_item_pos(coord);
7774 +               coord->unit_pos = coord_last_unit_pos(coord);
7775 +               coord->between = AT_UNIT;
7776 +               return 0;
7777 +
7778 +       case AFTER_UNIT:
7779 +               /* What if unit_pos is out-of-range? */
7780 +               assert("jmacd-5442",
7781 +                      coord->unit_pos <= coord_last_unit_pos(coord));
7782 +               coord->between = AT_UNIT;
7783 +               return 0;
7784 +
7785 +       case BEFORE_ITEM:
7786 +               if (coord->item_pos == 0) {
7787 +                       return 1;
7788 +               }
7789 +
7790 +               coord_dec_item_pos(coord);
7791 +               /* FALLTHROUGH */
7792 +
7793 +       case AFTER_ITEM:
7794 +               coord->between = AT_UNIT;
7795 +               coord->unit_pos = coord_last_unit_pos(coord);
7796 +               return 0;
7797 +
7798 +       case INVALID_COORD:
7799 +       case EMPTY_NODE:
7800 +               break;
7801 +       }
7802 +
7803 +       impossible("jmacd-9904", "unreachable");
7804 +       return 0;
7805 +}
7806 +
7807 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
7808 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
7809 +   is an existing item. */
7810 +int coord_prev_item(coord_t * coord)
7811 +{
7812 +       unsigned items = coord_num_items(coord);
7813 +
7814 +       if (coord_adjust_items(coord, items, 0) == 1) {
7815 +               return 1;
7816 +       }
7817 +
7818 +       switch (coord->between) {
7819 +       case AT_UNIT:
7820 +       case AFTER_UNIT:
7821 +       case BEFORE_UNIT:
7822 +       case BEFORE_ITEM:
7823 +
7824 +               if (coord->item_pos == 0) {
7825 +                       coord->between = BEFORE_ITEM;
7826 +                       coord->unit_pos = 0;
7827 +                       return 1;
7828 +               }
7829 +
7830 +               coord_dec_item_pos(coord);
7831 +               coord->unit_pos = 0;
7832 +               coord->between = AT_UNIT;
7833 +               return 0;
7834 +
7835 +       case AFTER_ITEM:
7836 +               coord->between = AT_UNIT;
7837 +               coord->unit_pos = 0;
7838 +               return 0;
7839 +
7840 +       case INVALID_COORD:
7841 +       case EMPTY_NODE:
7842 +               break;
7843 +       }
7844 +
7845 +       impossible("jmacd-9905", "unreachable");
7846 +       return 0;
7847 +}
7848 +
7849 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7850 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7851 +{
7852 +       assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7853 +       if (dir == LEFT_SIDE) {
7854 +               coord_init_first_unit(coord, node);
7855 +       } else {
7856 +               coord_init_last_unit(coord, node);
7857 +       }
7858 +}
7859 +
7860 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7861 +   argument. */
7862 +/* Audited by: green(2002.06.15) */
7863 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7864 +{
7865 +       assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7866 +       if (dir == LEFT_SIDE) {
7867 +               return coord_is_before_leftmost(coord);
7868 +       } else {
7869 +               return coord_is_after_rightmost(coord);
7870 +       }
7871 +}
7872 +
7873 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7874 +/* Audited by: green(2002.06.15) */
7875 +int coord_sideof_unit(coord_t * coord, sideof dir)
7876 +{
7877 +       assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7878 +       if (dir == LEFT_SIDE) {
7879 +               return coord_prev_unit(coord);
7880 +       } else {
7881 +               return coord_next_unit(coord);
7882 +       }
7883 +}
7884 +
7885 +#if REISER4_DEBUG
7886 +#define DEBUG_COORD_FIELDS (sizeof(c1->plug_v) + sizeof(c1->body_v))
7887 +#else
7888 +#define DEBUG_COORD_FIELDS (0)
7889 +#endif
7890 +
7891 +int coords_equal(const coord_t * c1, const coord_t * c2)
7892 +{
7893 +       assert("nikita-2840", c1 != NULL);
7894 +       assert("nikita-2841", c2 != NULL);
7895 +
7896 +       return
7897 +           c1->node == c2->node &&
7898 +           c1->item_pos == c2->item_pos &&
7899 +           c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7900 +}
7901 +
7902 +/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, else if coord_is_after_rightmost
7903 +   return COORD_ON_THE_RIGHT, otherwise return COORD_INSIDE.  The left check is made first. */
7904 +/* Audited by: green(2002.06.15) */
7905 +coord_wrt_node coord_wrt(const coord_t * coord)
7906 +{
7907 +       if (coord_is_before_leftmost(coord)) {
7908 +               return COORD_ON_THE_LEFT;
7909 +       }
7910 +
7911 +       if (coord_is_after_rightmost(coord)) {
7912 +               return COORD_ON_THE_RIGHT;
7913 +       }
7914 +
7915 +       return COORD_INSIDE;
7916 +}
7917 +
7918 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7919 +   of the last item or it is an empty node. */
7920 +/* Audited by: green(2002.06.15) */
7921 +int coord_is_after_rightmost(const coord_t * coord)
7922 +{
7923 +       assert("jmacd-7313", coord_check(coord));
7924 +
7925 +       switch (coord->between) {
7926 +       case INVALID_COORD:
7927 +       case AT_UNIT:
7928 +       case BEFORE_UNIT:
7929 +       case BEFORE_ITEM:
7930 +               return 0;
7931 +
7932 +       case EMPTY_NODE:
7933 +               return 1;
7934 +
7935 +       case AFTER_ITEM:
7936 +               return (coord->item_pos == node_num_items(coord->node) - 1);
7937 +
7938 +       case AFTER_UNIT:
7939 +               return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7940 +                       coord->unit_pos == coord_last_unit_pos(coord));
7941 +       }
7942 +
7943 +       impossible("jmacd-9908", "unreachable");
7944 +       return 0;
7945 +}
7946 +
7947 +/* Returns true if the coordinate is positioned before the first item or it is an empty
7948 +   node. */
7949 +int coord_is_before_leftmost(const coord_t * coord)
7950 +{
7951 +       /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7952 +          necessary to check if coord is set before leftmost
7953 +          assert ("jmacd-7313", coord_check (coord)); */
7954 +       switch (coord->between) {
7955 +       case INVALID_COORD:
7956 +       case AT_UNIT:
7957 +       case AFTER_ITEM:
7958 +       case AFTER_UNIT:
7959 +               return 0;
7960 +
7961 +       case EMPTY_NODE:
7962 +               return 1;
7963 +
7964 +       case BEFORE_ITEM:
7965 +       case BEFORE_UNIT:
7966 +               return (coord->item_pos == 0) && (coord->unit_pos == 0);
7967 +       }
7968 +
7969 +       impossible("jmacd-9908", "unreachable");
7970 +       return 0;
7971 +}
7972 +
7973 +/* Returns true if the coordinate is positioned after an item, before an item, after the
7974 +   last unit of an item, before the first unit of an item, or at an empty node. */
7975 +/* Audited by: green(2002.06.15) */
7976 +int coord_is_between_items(const coord_t * coord)
7977 +{
7978 +       assert("jmacd-7313", coord_check(coord));
7979 +
7980 +       switch (coord->between) {
7981 +       case INVALID_COORD:
7982 +       case AT_UNIT:
7983 +               return 0;
7984 +
7985 +       case AFTER_ITEM:
7986 +       case BEFORE_ITEM:
7987 +       case EMPTY_NODE:
7988 +               return 1;
7989 +
7990 +       case BEFORE_UNIT:
7991 +               return coord->unit_pos == 0;
7992 +
7993 +       case AFTER_UNIT:
7994 +               return coord->unit_pos == coord_last_unit_pos(coord);
7995 +       }
7996 +
7997 +       impossible("jmacd-9908", "unreachable");
7998 +       return 0;
7999 +}
8000 +
8001 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
8002 +   before-after or item boundaries. */
8003 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
8004 +{
8005 +       coord_t *left;
8006 +       coord_t *right;
8007 +
8008 +       assert("nikita-1241", c1 != NULL);
8009 +       assert("nikita-1242", c2 != NULL);
8010 +       assert("nikita-1243", c1->node == c2->node);
8011 +       assert("nikita-1244", coord_is_existing_unit(c1));
8012 +       assert("nikita-1245", coord_is_existing_unit(c2));
8013 +
8014 +       left = right = NULL;
8015 +       switch (coord_compare(c1, c2)) {
8016 +       case COORD_CMP_ON_LEFT:
8017 +               left = c1;
8018 +               right = c2;
8019 +               break;
8020 +       case COORD_CMP_ON_RIGHT:
8021 +               left = c2;
8022 +               right = c1;
8023 +               break;
8024 +       case COORD_CMP_SAME:
8025 +               return 0;
8026 +       default:
8027 +               wrong_return_value("nikita-1246", "compare_coords()");
8028 +       }
8029 +       assert("vs-731", left && right);
8030 +       if (left->item_pos == right->item_pos) {
8031 +               return left->unit_pos + 1 == right->unit_pos;
8032 +       } else if (left->item_pos + 1 == right->item_pos) {
8033 +               return (left->unit_pos == coord_last_unit_pos(left))
8034 +                   && (right->unit_pos == 0);
8035 +       } else {
8036 +               return 0;
8037 +       }
8038 +}
8039 +
8040 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8041 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
8042 +/* Audited by: green(2002.06.15) */
8043 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
8044 +{
8045 +       assert("vs-209", c1->node == c2->node);
8046 +       assert("vs-194", coord_is_existing_unit(c1)
8047 +              && coord_is_existing_unit(c2));
8048 +
8049 +       if (c1->item_pos > c2->item_pos)
8050 +               return COORD_CMP_ON_RIGHT;
8051 +       if (c1->item_pos < c2->item_pos)
8052 +               return COORD_CMP_ON_LEFT;
8053 +       if (c1->unit_pos > c2->unit_pos)
8054 +               return COORD_CMP_ON_RIGHT;
8055 +       if (c1->unit_pos < c2->unit_pos)
8056 +               return COORD_CMP_ON_LEFT;
8057 +       return COORD_CMP_SAME;
8058 +}
8059 +
8060 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
8061 +   non-zero if there is no position to the right. */
8062 +int coord_set_to_right(coord_t * coord)
8063 +{
8064 +       unsigned items = coord_num_items(coord);
8065 +
8066 +       if (coord_adjust_items(coord, items, 1) == 1) {
8067 +               return 1;
8068 +       }
8069 +
8070 +       switch (coord->between) {
8071 +       case AT_UNIT:
8072 +               return 0;
8073 +
8074 +       case BEFORE_ITEM:
8075 +       case BEFORE_UNIT:
8076 +               coord->between = AT_UNIT;
8077 +               return 0;
8078 +
8079 +       case AFTER_UNIT:
8080 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
8081 +                       coord->unit_pos += 1;
8082 +                       coord->between = AT_UNIT;
8083 +                       return 0;
8084 +               } else {
8085 +
8086 +                       coord->unit_pos = 0;
8087 +
8088 +                       if (coord->item_pos == items - 1) {
8089 +                               coord->between = AFTER_ITEM;
8090 +                               return 1;
8091 +                       }
8092 +
8093 +                       coord_inc_item_pos(coord);
8094 +                       coord->between = AT_UNIT;
8095 +                       return 0;
8096 +               }
8097 +
8098 +       case AFTER_ITEM:
8099 +               if (coord->item_pos == items - 1) {
8100 +                       return 1;
8101 +               }
8102 +
8103 +               coord_inc_item_pos(coord);
8104 +               coord->unit_pos = 0;
8105 +               coord->between = AT_UNIT;
8106 +               return 0;
8107 +
8108 +       case EMPTY_NODE:
8109 +               return 1;
8110 +
8111 +       case INVALID_COORD:
8112 +               break;
8113 +       }
8114 +
8115 +       impossible("jmacd-9920", "unreachable");
8116 +       return 0;
8117 +}
8118 +
8119 +/* If the coordinate is between units or items, shifts it to the existing unit on its left (an
8120 +   AT_UNIT coord is left as is).  Returns 0 on success, non-zero if there is no position to the left. */
8121 +int coord_set_to_left(coord_t * coord)
8122 +{
8123 +       unsigned items = coord_num_items(coord);
8124 +
8125 +       if (coord_adjust_items(coord, items, 0) == 1) {
8126 +               return 1;
8127 +       }
8128 +
8129 +       switch (coord->between) {
8130 +       case AT_UNIT:
8131 +               return 0;
8132 +
8133 +       case AFTER_UNIT:
8134 +               coord->between = AT_UNIT;
8135 +               return 0;
8136 +
8137 +       case AFTER_ITEM:
8138 +               coord->between = AT_UNIT;
8139 +               coord->unit_pos = coord_last_unit_pos(coord);
8140 +               return 0;
8141 +
8142 +       case BEFORE_UNIT:
8143 +               if (coord->unit_pos > 0) {
8144 +                       coord->unit_pos -= 1;
8145 +                       coord->between = AT_UNIT;
8146 +                       return 0;
8147 +               } else {
8148 +                       /* at unit 0: step to the previous item, then take ITS last unit */
8149 +                       if (coord->item_pos == 0) {
8150 +                               coord->between = BEFORE_ITEM;
8151 +                               return 1;
8152 +                       }
8153 +
8154 +                       coord_dec_item_pos(coord);
8155 +                       coord->unit_pos = coord_last_unit_pos(coord);
8156 +                       coord->between = AT_UNIT;
8157 +                       return 0;
8158 +               }
8159 +
8160 +       case BEFORE_ITEM:
8161 +               if (coord->item_pos == 0) {
8162 +                       return 1;
8163 +               }
8164 +
8165 +               coord_dec_item_pos(coord);
8166 +               coord->unit_pos = coord_last_unit_pos(coord);
8167 +               coord->between = AT_UNIT;
8168 +               return 0;
8169 +
8170 +       case EMPTY_NODE:
8171 +               return 1;
8172 +
8173 +       case INVALID_COORD:
8174 +               break;
8175 +       }
8176 +
8177 +       impossible("jmacd-9920", "unreachable");
8178 +       return 0;
8179 +}
8180 +
8181 +static const char *coord_tween_tostring(between_enum n)
8182 +{
8183 +       /* printable names of the between_enum values, indexed by value */
8184 +       static const char *const names[] = {
8185 +               [BEFORE_UNIT] = "before unit",
8186 +               [BEFORE_ITEM] = "before item",
8187 +               [AT_UNIT] = "at unit",
8188 +               [AFTER_UNIT] = "after unit",
8189 +               [AFTER_ITEM] = "after item",
8190 +               [EMPTY_NODE] = "empty node",
8191 +               [INVALID_COORD] = "invalid",
8192 +       };
8193 +       /* scratch space for values that are not in the table; it is static
8194 +          because the returned pointer is used after this function returns,
8195 +          so the text is overwritten by the next unknown value */
8196 +       static char buf[30];
8197 +       const unsigned nr_names = sizeof(names) / sizeof(names[0]);
8198 +
8199 +       if ((unsigned)n < nr_names) {
8200 +               return names[n];
8201 +       }
8202 +
8203 +       /* not a between_enum value: report the raw number */
8204 +       sprintf(buf, "unknown: %i", n);
8205 +       return buf;
8206 +}
8207 +
8208 +void print_coord(const char *mes, const coord_t * coord, int node)
8209 +{
8210 +       /* @node is not looked at here; only @mes and the coord are printed */
8211 +       if (coord != NULL)
8212 +               printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
8213 +                      mes, coord->item_pos, coord->unit_pos,
8214 +                      coord_tween_tostring(coord->between), coord->iplugid);
8215 +       else
8216 +               printk("%s: null\n", mes);
8217 +}
8218 +
8219 +int
8220 +item_utmost_child_real_block(const coord_t * coord, sideof side,
8221 +                            reiser4_block_nr * blk)
8222 +{
8223 +       return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
8224 +                                                                     side,
8225 +                                                                     blk);
8226 +}
8227 +
8228 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
8229 +{
8230 +       return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
8231 +}
8232 +
8233 +/* Account for @count bytes of flow @f having been written: f->length shrinks
8234 +   by @count, the offset of f->key and f->data (if not NULL) advance by @count. */
8235 +void move_flow_forward(flow_t * f, unsigned count)
8236 +{
8237 +       set_key_offset(&f->key, get_key_offset(&f->key) + count);
8238 +       f->length -= count;
8239 +       if (f->data)
8240 +               f->data += count;
8241 +}
8242 +
8243 +/*
8244 +   Local variables:
8245 +   c-indentation-style: "K&R"
8246 +   mode-name: "LC"
8247 +   c-basic-offset: 8
8248 +   tab-width: 8
8249 +   fill-column: 120
8250 +   scroll-step: 1
8251 +   End:
8252 +*/
8253 diff --git a/fs/reiser4/coord.h b/fs/reiser4/coord.h
8254 new file mode 100644
8255 index 0000000..313e615
8256 --- /dev/null
8257 +++ b/fs/reiser4/coord.h
8258 @@ -0,0 +1,389 @@
8259 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8260 +
8261 +/* Coords */
8262 +
8263 +#if !defined( __REISER4_COORD_H__ )
8264 +#define __REISER4_COORD_H__
8265 +
8266 +#include "forward.h"
8267 +#include "debug.h"
8268 +#include "dformat.h"
8269 +#include "key.h"
8270 +
8271 +/* insertions happen between coords in the tree, so we need some means
8272 +   of specifying the sense of betweenness. */
8273 +typedef enum {
8274 +       BEFORE_UNIT,            /* Note: we/init_coord depends on this value being zero. */
8275 +       AT_UNIT,
8276 +       AFTER_UNIT,
8277 +       BEFORE_ITEM,
8278 +       AFTER_ITEM,
8279 +       INVALID_COORD,
8280 +       EMPTY_NODE,
8281 +} between_enum;
8282 +
8283 +/* location of coord w.r.t. its node */
8284 +typedef enum {
8285 +       COORD_ON_THE_LEFT = -1,
8286 +       COORD_ON_THE_RIGHT = +1,
8287 +       COORD_INSIDE = 0
8288 +} coord_wrt_node;
8289 +
8290 +typedef enum {
8291 +       COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
8292 +} coord_cmp;
8293 +
8294 +struct coord {
8295 +       /* node in a tree */
8296 +       /*  0 */ znode *node;
8297 +
8298 +       /* position of item within node */
8299 +       /*  4 */ pos_in_node_t item_pos;
8300 +       /* position of unit within item */
8301 +       /*  6 */ pos_in_node_t unit_pos;
8302 +       /* optimization: plugin of item is stored in coord_t. Until this was
8303 +          implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
8304 +          is invalidated (set to 0xff) on each modification of ->item_pos,
8305 +          and all such modifications are funneled through coord_*_item_pos()
8306 +          functions below.
8307 +        */
8308 +       /*  8 */ char iplugid;
8309 +       /* position of coord w.r.t. to neighboring items and/or units.
8310 +          Values are taken from &between_enum above.
8311 +        */
8312 +       /*  9 */ char between;
8313 +       /* padding. It will be added by the compiler anyway to conform to the
8314 +        * C language alignment requirements. We keep it here to be on the
8315 +        * safe side and to have a clear picture of the memory layout of this
8316 +        * structure. */
8317 +       /* 10 */ __u16 pad;
8318 +       /* 12 */ int offset;
8319 +#if REISER4_DEBUG
8320 +       unsigned long plug_v;
8321 +       unsigned long body_v;
8322 +#endif
8323 +};
8324 +
8325 +#define INVALID_PLUGID  ((char)((1 << 8) - 1))
8326 +#define INVALID_OFFSET -1
8327 +
8328 +static inline void coord_clear_iplug(coord_t * coord)
8329 +{
8330 +       assert("nikita-2835", coord != NULL);
8331 +       coord->iplugid = INVALID_PLUGID;
8332 +       coord->offset = INVALID_OFFSET;
8333 +}
8334 +
8335 +static inline int coord_is_iplug_set(const coord_t * coord)
8336 +{
8337 +       assert("nikita-2836", coord != NULL);
8338 +       return coord->iplugid != INVALID_PLUGID;
8339 +}
8340 +
8341 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
8342 +{
8343 +       assert("nikita-2478", coord != NULL);
8344 +       coord->item_pos = pos;
8345 +       coord_clear_iplug(coord);
8346 +}
8347 +
8348 +static inline void coord_dec_item_pos(coord_t * coord)
8349 +{
8350 +       assert("nikita-2480", coord != NULL);
8351 +       --coord->item_pos;
8352 +       coord_clear_iplug(coord);
8353 +}
8354 +
8355 +static inline void coord_inc_item_pos(coord_t * coord)
8356 +{
8357 +       assert("nikita-2481", coord != NULL);
8358 +       ++coord->item_pos;
8359 +       coord_clear_iplug(coord);
8360 +}
8361 +
8362 +static inline void coord_add_item_pos(coord_t * coord, int delta)
8363 +{
8364 +       assert("nikita-2482", coord != NULL);
8365 +       coord->item_pos += delta;
8366 +       coord_clear_iplug(coord);
8367 +}
8368 +
8369 +static inline void coord_invalid_item_pos(coord_t * coord)
8370 +{
8371 +       assert("nikita-2832", coord != NULL);
8372 +       coord->item_pos = (unsigned short)~0;
8373 +       coord_clear_iplug(coord);
8374 +}
8375 +
8376 +/* Reverse a direction. */
8377 +static inline sideof sideof_reverse(sideof side)
8378 +{
8379 +       return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
8380 +}
8381 +
8382 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
8383 +
8384 +   "first" and "last"
8385 +   "next" and "prev"
8386 +   "before" and "after"
8387 +   "leftmost" and "rightmost"
8388 +
8389 +   But I think the chosen names are decent the way they are.
8390 +*/
8391 +
8392 +/* COORD INITIALIZERS */
8393 +
8394 +/* Initialize an invalid coordinate. */
8395 +extern void coord_init_invalid(coord_t * coord, const znode * node);
8396 +
8397 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
8398 +
8399 +/* Initialize a coordinate to point at the first unit of the first item.  If the node is
8400 +   empty, it is positioned at the EMPTY_NODE. */
8401 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
8402 +
8403 +/* Initialize a coordinate to point at the last unit of the last item.  If the node is
8404 +   empty, it is positioned at the EMPTY_NODE. */
8405 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
8406 +
8407 +/* Initialize a coordinate to before the first item.  If the node is empty, it is
8408 +   positioned at the EMPTY_NODE. */
8409 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
8410 +
8411 +/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
8412 +   at the EMPTY_NODE. */
8413 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
8414 +
8415 +/* Initialize a coordinate to after last unit in the item. Coord must be set
8416 +   already to existing item */
8417 +void coord_init_after_item_end(coord_t * coord);
8418 +
8419 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
8420 +void coord_init_before_item(coord_t *);
8421 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
8422 +void coord_init_after_item(coord_t *);
8423 +
8424 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
8425 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
8426 +                                  sideof dir);
8427 +
8428 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
8429 +   it was not clear how actually
8430 +   FIXME-VS: added by vs (2002, june, 8) */
8431 +extern void coord_init_zero(coord_t * coord);
8432 +
8433 +/* COORD METHODS */
8434 +
8435 +/* after shifting of node content, coord previously set properly may become
8436 +   invalid, try to "normalize" it. */
8437 +void coord_normalize(coord_t * coord);
8438 +
8439 +/* Copy a coordinate. */
8440 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
8441 +
8442 +/* Copy a coordinate without check. */
8443 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
8444 +
8445 +unsigned coord_num_units(const coord_t * coord);
8446 +
8447 +/* Return the last valid unit number at the present item (i.e.,
8448 +   coord_num_units() - 1). */
8449 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
8450 +{
8451 +       return coord_num_units(coord) - 1;
8452 +}
8453 +
8454 +#if REISER4_DEBUG
8455 +/* For assertions only, checks for a valid coordinate. */
8456 +extern int coord_check(const coord_t * coord);
8457 +
8458 +extern unsigned long znode_times_locked(const znode * z);
8459 +
8460 +static inline void coord_update_v(coord_t * coord)
8461 +{
8462 +       coord->plug_v = coord->body_v = znode_times_locked(coord->node);
8463 +}
8464 +#endif
8465 +
8466 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
8467 +
8468 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8469 +
8470 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
8471 +   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
8472 +extern coord_wrt_node coord_wrt(const coord_t * coord);
8473 +
8474 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
8475 +   before-after or item boundaries. */
8476 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8477 +
8478 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8479 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
8480 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8481 +
8482 +/* COORD PREDICATES */
8483 +
8484 +/* Returns true if the coord was initialized by coord_init_invalid (). */
8485 +extern int coord_is_invalid(const coord_t * coord);
8486 +
8487 +/* Returns true if the coordinate is positioned at an existing item, not before or after
8488 +   an item.  It may be placed at, before, or after any unit within the item, whether
8489 +   existing or not.  If this is true you can call methods of the item plugin.  */
8490 +extern int coord_is_existing_item(const coord_t * coord);
8491 +
8492 +/* Returns true if the coordinate is positioned after an item, before an item, after the
8493 +   last unit of an item, before the first unit of an item, or at an empty node. */
8494 +extern int coord_is_between_items(const coord_t * coord);
8495 +
8496 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8497 +   unit. */
8498 +extern int coord_is_existing_unit(const coord_t * coord);
8499 +
8500 +/* Returns true if the coordinate is positioned at an empty node. */
8501 +extern int coord_is_empty(const coord_t * coord);
8502 +
8503 +/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
8504 +   true for empty nodes nor coordinates positioned before the first item. */
8505 +extern int coord_is_leftmost_unit(const coord_t * coord);
8506 +
8507 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8508 +   of the last item or it is an empty node. */
8509 +extern int coord_is_after_rightmost(const coord_t * coord);
8510 +
8511 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8512 +   node. */
8513 +extern int coord_is_before_leftmost(const coord_t * coord);
8514 +
8515 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8516 +   argument. */
8517 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8518 +
8519 +/* COORD MODIFIERS */
8520 +
8521 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
8522 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
8523 +   an existing unit. */
8524 +extern int coord_next_unit(coord_t * coord);
8525 +
8526 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
8527 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
8528 +   an existing item. */
8529 +extern int coord_next_item(coord_t * coord);
8530 +
8531 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
8532 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
8533 +   is an existing unit. */
8534 +extern int coord_prev_unit(coord_t * coord);
8535 +
8536 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
8537 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
8538 +   is an existing item. */
8539 +extern int coord_prev_item(coord_t * coord);
8540 +
8541 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
8542 +   non-zero if there is no position to the right. */
8543 +extern int coord_set_to_right(coord_t * coord);
8544 +
8545 +/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
8546 +   non-zero if there is no position to the left. */
8547 +extern int coord_set_to_left(coord_t * coord);
8548 +
8549 +/* If the coordinate is at an existing unit, set to after that unit.  Returns 0 on success
8550 +   and non-zero if the unit did not exist. */
8551 +extern int coord_set_after_unit(coord_t * coord);
8552 +
8553 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8554 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
8555 +
8556 +/* iterate over all units in @node */
8557 +#define for_all_units( coord, node )                                   \
8558 +       for( coord_init_before_first_item( ( coord ), ( node ) ) ;      \
8559 +            coord_next_unit( coord ) == 0 ; )
8560 +
8561 +/* iterate over all items in @node */
8562 +#define for_all_items( coord, node )                                   \
8563 +       for( coord_init_before_first_item( ( coord ), ( node ) ) ;      \
8564 +            coord_next_item( coord ) == 0 ; )
8565 +
8566 +/* COORD/ITEM METHODS */
8567 +
8568 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8569 +                                       reiser4_block_nr * blk);
8570 +extern int item_utmost_child(const coord_t * coord, sideof side,
8571 +                            jnode ** child);
8572 +
8573 +/* a flow is a sequence of bytes being written to or read from the tree.  The
8574 +   tree will slice the flow into items while storing it into nodes, but all of
8575 +   that is hidden from anything outside the tree.  */
8576 +
8577 +struct flow {
8578 +       reiser4_key key;        /* key of start of flow's sequence of bytes */
8579 +       loff_t length;          /* length of flow's sequence of bytes */
8580 +       char *data;             /* start of flow's sequence of bytes */
8581 +       int user;               /* if 1 data is user space, 0 - kernel space */
8582 +       rw_op op;               /* NIKITA-FIXME-HANS: comment is where?  */
8583 +};
8584 +
8585 +void move_flow_forward(flow_t * f, unsigned count);
8586 +
8587 +/* &reiser4_item_data - description of data to be inserted or pasted
8588 +
8589 +   Q: articulate the reasons for the difference between this and flow.
8590 +
8591 +   A: Besides flow we insert into tree other things: stat data, directory
8592 +   entry, etc.  To insert them into tree one has to provide this structure. If
8593 +   one is going to insert flow - he can use insert_flow, where this structure
8594 +   does not have to be created
8595 +*/
8596 +struct reiser4_item_data {
8597 +       /* actual data to be inserted. If NULL, ->create_item() will not
8598 +          do xmemcpy itself, leaving this up to the caller. This can
8599 +          save some amount of unnecessary memory copying, for example,
8600 +          during insertion of stat data.
8601 +
8602 +        */
8603 +       char *data;
8604 +       /* 1 if 'char * data' contains pointer to user space and 0 if it is
8605 +          kernel space */
8606 +       int user;
8607 +       /* amount of data we are going to insert or paste */
8608 +       int length;
8609 +       /* "Arg" is opaque data that is passed down to the
8610 +          ->create_item() method of node layout, which in turn
8611 +          hands it to the ->create_hook() of item being created. This
8612 +          arg is currently used by:
8613 +
8614 +          .  ->create_hook() of internal item
8615 +          (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8616 +          . ->paste() method of directory item.
8617 +          . ->create_hook() of extent item
8618 +
8619 +          For internal item, this is left "brother" of new node being
8620 +          inserted and it is used to add new node into sibling list
8621 +          after parent to it was just inserted into parent.
8622 +
8623 +          While ->arg does look like a somewhat unnecessary complication,
8624 +          it actually saves a lot of headache in many places, because
8625 +          all data necessary to insert or paste new data into tree are
8626 +          collected in one place, and this eliminates a lot of extra
8627 +          argument passing and storing everywhere.
8628 +
8629 +        */
8630 +       void *arg;
8631 +       /* plugin of item we are inserting */
8632 +       item_plugin *iplug;
8633 +};
8634 +
8635 +/* __REISER4_COORD_H__ */
8636 +#endif
8637 +
8638 +/* Make Linus happy.
8639 +   Local variables:
8640 +   c-indentation-style: "K&R"
8641 +   mode-name: "LC"
8642 +   c-basic-offset: 8
8643 +   tab-width: 8
8644 +   fill-column: 120
8645 +   scroll-step: 1
8646 +   End:
8647 +*/
8648 diff --git a/fs/reiser4/debug.c b/fs/reiser4/debug.c
8649 new file mode 100644
8650 index 0000000..a619470
8651 --- /dev/null
8652 +++ b/fs/reiser4/debug.c
8653 @@ -0,0 +1,300 @@
8654 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8655 + * reiser4/README */
8656 +
8657 +/* Debugging facilities. */
8658 +
8659 +/*
8660 + * This file contains generic debugging functions used by reiser4. Roughly
8661 + * following:
8662 + *
8663 + *     panicking: reiser4_do_panic(), reiser4_print_prefix().
8664 + *
8665 + *     locking: schedulable(), lock_counters(), print_lock_counters(),
8666 + *     no_counters_are_held(), commit_check_locks()
8667 + *
8668 + *     error code monitoring (see comment before RETERR macro): return_err(),
8669 + *     report_err().
8670 + *
8671 + *     stack back-tracing: fill_backtrace()
8672 + *
8673 + *     miscellaneous: preempt_point(), call_on_each_assert(), debugtrap().
8674 + *
8675 + */
8676 +
8677 +#include "reiser4.h"
8678 +#include "context.h"
8679 +#include "super.h"
8680 +#include "txnmgr.h"
8681 +#include "znode.h"
8682 +
8683 +#include <linux/sysfs.h>
8684 +#include <linux/slab.h>
8685 +#include <linux/types.h>
8686 +#include <linux/fs.h>
8687 +#include <linux/spinlock.h>
8688 +#include <linux/kallsyms.h>
8689 +#include <linux/vmalloc.h>
8690 +#include <linux/ctype.h>
8691 +#include <linux/sysctl.h>
8692 +#include <linux/hardirq.h>
8693 +
8694 +#if REISER4_DEBUG
8695 +static void report_err(void);
8696 +#else
8697 +#define report_err() noop
8698 +#endif
8699 +
8700 +/*
8701 + * global buffer where message given to reiser4_panic is formatted.
8702 + */
8703 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8704 +
8705 +/*
8706 + * lock protecting consistency of panic_buf under concurrent panics
8707 + */
8708 +static DEFINE_SPINLOCK(panic_guard);
8709 +
8710 +/* Your best friend. Call it on each occasion.  This is called by
8711 +    fs/reiser4/debug.h:reiser4_panic(). */
8712 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8713 +{
8714 +       static int in_panic = 0;
8715 +       va_list args;
8716 +
8717 +       /*
8718 +        * check for recursive panic.
8719 +        */
8720 +       if (in_panic == 0) {
8721 +               in_panic = 1;
8722 +
8723 +               spin_lock(&panic_guard);
8724 +               va_start(args, format);
8725 +               vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8726 +               va_end(args);
8727 +               printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8728 +               spin_unlock(&panic_guard);
8729 +
8730 +               /*
8731 +                * if kernel debugger is configured---drop in. Early dropping
8732 +                * into kgdb is not always convenient, because panic message
8733 +                * is not yet printed most of the times. But:
8734 +                *
8735 +                *     (1) message can be extracted from printk_buf[]
8736 +                *     (declared static inside of printk()), and
8737 +                *
8738 +                *     (2) sometimes serial/kgdb combo dies while printing
8739 +                *     long panic message, so it's more prudent to break into
8740 +                *     debugger earlier.
8741 +                *
8742 +                */
8743 +               DEBUGON(1);
8744 +       }
8745 +       /* to make gcc happy about noreturn attribute */
8746 +       panic("%s", panic_buf);
8747 +}
8748 +
8749 +void
8750 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8751 +                    const char *function, const char *file, int lineno)
8752 +{
8753 +       /* what gets printed when called from interrupt context */
8754 +       const char *task_name = "interrupt";
8755 +       int task_pid = 0;
8756 +
8757 +       if (!unlikely(in_interrupt() || in_irq())) {
8758 +               task_name = current->comm;
8759 +               task_pid = current->pid;
8760 +       }
8761 +
8762 +       printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8763 +              level, task_name, task_pid, function, file, lineno, mid);
8764 +       /* on request, follow the prefix with report_err() output */
8765 +       if (reperr)
8766 +               report_err();
8767 +}
8768 +
8769 +/* Preemption point: this should be called periodically during long running
8770 +   operations (carry, allocate, and squeeze are best examples) */
8771 +int preempt_point(void)
8772 +{
8773 +       assert("nikita-3008", schedulable());
8774 +       cond_resched();
8775 +       return signal_pending(current);
8776 +}
8777 +
8778 +#if REISER4_DEBUG
8779 +/* Debugging aid: return struct where information about locks taken by current
8780 +   thread is accumulated. This can be used to formulate lock ordering
8781 +   constraints and various assertions.
8782 +
8783 +*/
8784 +lock_counters_info *lock_counters(void)
8785 +{
8786 +       reiser4_context *ctx = get_current_context();
8787 +       assert("jmacd-1123", ctx != NULL);
8788 +       return &ctx->locks;
8789 +}
8790 +
8791 +/*
8792 + * print, prefixed by @prefix, the lock counters accumulated in @info.
8793 + */
8794 +static void print_lock_counters(const char *prefix,
8795 +                               const lock_counters_info * info)
8796 +{
8797 +       printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8798 +              "jload: %i, "
8799 +              "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8800 +              "ktxnmgrd: %i, fq: %i\n"
8801 +              "inode: %i, "
8802 +              "cbk_cache: %i (r:%i,w:%i), "
8803 +              "eflush: %i, "
8804 +              "zlock: %i,\n"
8805 +              "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8806 +              "d: %i, x: %i, t: %i\n", prefix,
8807 +              info->spin_locked_jnode,
8808 +              info->rw_locked_tree, info->read_locked_tree,
8809 +              info->write_locked_tree,
8810 +              info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8811 +              info->spin_locked_jload,
8812 +              info->spin_locked_txnh,
8813 +              info->spin_locked_atom, info->spin_locked_stack,
8814 +              info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8815 +              info->spin_locked_fq,
8816 +              info->spin_locked_inode,
8817 +              info->rw_locked_cbk_cache,
8818 +              info->read_locked_cbk_cache,
8819 +              info->write_locked_cbk_cache,
8820 +              info->spin_locked_super_eflush,
8821 +              info->spin_locked_zlock,
8822 +              info->spin_locked,
8823 +              info->long_term_locked_znode,
8824 +              info->inode_sem_r, info->inode_sem_w,
8825 +              info->d_refs, info->x_refs, info->t_refs);
8826 +}
8827 +
8828 +/* check that no spinlocks are held: 0 (after printing the counters) if some are, else 1 */
8829 +int schedulable(void)
8830 +{
8831 +       /* lock_counters() asserts that a context exists, so look at them only then */
8832 +       if (get_current_context_check() != NULL && !LOCK_CNT_NIL(spin_locked)) {
8833 +               print_lock_counters("in atomic", lock_counters());
8834 +               return 0;
8835 +       }
8836 +       /* reached with or without a reiser4 context */
8837 +       might_sleep();
8838 +       return 1;
8839 +}
8840 +/*
8841 + * return true, iff no locks are held.
8842 + */
8843 +int no_counters_are_held(void)
8844 +{
8845 +       lock_counters_info *counters;
8846 +
8847 +       counters = lock_counters();
8848 +       return
8849 +           (counters->spin_locked_zlock == 0) &&
8850 +           (counters->spin_locked_jnode == 0) &&
8851 +           (counters->rw_locked_tree == 0) &&
8852 +           (counters->read_locked_tree == 0) &&
8853 +           (counters->write_locked_tree == 0) &&
8854 +           (counters->rw_locked_dk == 0) &&
8855 +           (counters->read_locked_dk == 0) &&
8856 +           (counters->write_locked_dk == 0) &&
8857 +           (counters->spin_locked_txnh == 0) &&
8858 +           (counters->spin_locked_atom == 0) &&
8859 +           (counters->spin_locked_stack == 0) &&
8860 +           (counters->spin_locked_txnmgr == 0) &&
8861 +           (counters->spin_locked_inode == 0) &&
8862 +           (counters->spin_locked == 0) &&
8863 +           (counters->long_term_locked_znode == 0) &&
8864 +           (counters->inode_sem_r == 0) &&
8865 +           (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8866 +}
8867 +
8868 +/*
8869 + * return true, iff transaction commit can be done under locks held by the
8870 + * current thread.
8871 + */
8872 +int commit_check_locks(void)
8873 +{
8874 +       lock_counters_info *info = lock_counters();
8875 +       const int saved_r = info->inode_sem_r;
8876 +       const int saved_w = info->inode_sem_w;
8877 +       int nothing_else_held;
8878 +
8879 +       /*
8880 +        * The inode read/write semaphore is the only reiser4 lock that may
8881 +        * be held during commit, so hide its counters from the check below
8882 +        * and put them back afterwards.
8883 +        */
8884 +       info->inode_sem_r = 0;
8885 +       info->inode_sem_w = 0;
8886 +
8887 +       nothing_else_held = no_counters_are_held();
8888 +
8889 +       info->inode_sem_r = saved_r;
8890 +       info->inode_sem_w = saved_w;
8891 +
8892 +       return nothing_else_held;
8893 +}
8894 +
8895 +/*
8896 + * remember @code, @file and @line as the "error site" of the current reiser4
8897 + * context. Only negative codes are stored. See comment before RETERR macro.
8898 + */
8899 +void return_err(int code, const char *file, int line)
8900 +{
8901 +       reiser4_context *ctx;
8902 +
8903 +       ctx = (code < 0 && is_in_reiser4_context()) ?
8904 +           get_current_context() : NULL;
8905 +       if (ctx != NULL) {
8906 +               ctx->err.code = code;
8907 +               ctx->err.file = file;
8908 +               ctx->err.line = line;
8909 +       }
8910 +}
8911 +
8912 +/*
8913 + * print the error site recorded by return_err(), if there is one.
8914 + */
8915 +static void report_err(void)
8916 +{
8917 +       reiser4_context *ctx = get_current_context_check();
8918 +
8919 +       /* no context, or no error code stored in it: stay silent */
8920 +       if (ctx == NULL || ctx->err.code == 0)
8921 +               return;
8922 +
8923 +       printk("code: %i at %s:%i\n",
8924 +              ctx->err.code, ctx->err.file, ctx->err.line);
8925 +}
8926 +
8927 +#endif                         /* REISER4_DEBUG */
8928 +
8929 +#if KERNEL_DEBUGGER
8930 +
8931 +/*
8932 + * this function just drops into kernel debugger. It is a convenient place to
8933 + * put breakpoint in.
8934 + */
8935 +void debugtrap(void)
8936 +{
8937 +       /* do nothing. Put break point here. */
8938 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8939 +       extern void breakpoint(void);
8940 +       breakpoint();
8941 +#endif
8942 +}
8943 +#endif
8944 +
8945 +/* Make Linus happy.
8946 +   Local variables:
8947 +   c-indentation-style: "K&R"
8948 +   mode-name: "LC"
8949 +   c-basic-offset: 8
8950 +   tab-width: 8
8951 +   fill-column: 120
8952 +   End:
8953 +*/
8954 diff --git a/fs/reiser4/debug.h b/fs/reiser4/debug.h
8955 new file mode 100644
8956 index 0000000..bc3249e
8957 --- /dev/null
8958 +++ b/fs/reiser4/debug.h
8959 @@ -0,0 +1,350 @@
8960 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8961 +
8962 +/* Declarations of debug macros. */
8963 +
8964 +#if !defined( __FS_REISER4_DEBUG_H__ )
8965 +#define __FS_REISER4_DEBUG_H__
8966 +
8967 +#include "forward.h"
8968 +#include "reiser4.h"
8969 +
8970 +/* generic function to produce formatted output, decorating it with
8971 +   whatever standard prefixes/postfixes we want. "Fun" is a function
8972 +   that will be actually called, can be printk, panic etc.
8973 +   This is for use by other debugging macros, not by users. */
8974 +#define DCALL(lev, fun, reperr, label, format, ...)                    \
8975 +({                                                                     \
8976 +       fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" ,   \
8977 +           current->comm, current->pid, __FUNCTION__,                  \
8978 +           __FILE__, __LINE__, label, ## __VA_ARGS__);                 \
8979 +})
8980 +
8981 +/*
8982 + * cause kernel to crash
8983 + */
8984 +#define reiser4_panic(mid, format, ...)                                \
8985 +       DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8986 +
8987 +/* print message with indication of current process, file, line and
8988 +   function */
8989 +#define reiser4_log(label, format, ...)                                \
8990 +       DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8991 +
8992 +/* Assertion checked during compilation.
8993 +    If "cond" is false (0) we get duplicate case label in switch.
8994 +    Use this to check something like famous
8995 +       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8996 +    in 3.x journal.c. If cassertion fails you get compiler error,
8997 +    so no "maintainer-id".
8998 +*/
8999 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
9000 +
9001 +#define noop   do {;} while(0)
9002 +
9003 +#if REISER4_DEBUG
9004 +/* version of info that only actually prints anything when _d_ebugging
9005 +    is on */
9006 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
9007 +/* macro to catch logical errors. Put it into `default' clause of
9008 +    switch() statement. */
9009 +#define impossible(label, format, ...)                         \
9010 +         reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
9011 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
9012 +   called. Use this for checking logical consistency and _never_ call
9013 +   this to check correctness of external data: disk blocks and user-input . */
9014 +#define assert(label, cond)                                                    \
9015 +({                                                                             \
9016 +       /* call_on_each_assert(); */                                            \
9017 +       if (cond) {                                                             \
9018 +               /* put negated check to avoid using !(cond) that would lose     \
9019 +                * warnings for things like assert(a = b); */                   \
9020 +               ;                                                               \
9021 +       } else {                                                                \
9022 +               DEBUGON(1);                                                     \
9023 +               reiser4_panic(label, "assertion failed: %s", #cond);            \
9024 +       }                                                                       \
9025 +})
9026 +
9027 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
9028 +#define check_me( label, expr )        assert( label, ( expr ) )
9029 +
9030 +#define ON_DEBUG( exp ) exp
9031 +
9032 +extern int schedulable(void);
9033 +extern void call_on_each_assert(void);
9034 +
9035 +#else
9036 +
9037 +#define dinfo( format, args... ) noop
9038 +#define impossible( label, format, args... ) noop
9039 +#define assert( label, cond ) noop
9040 +#define check_me( label, expr )        ( ( void ) ( expr ) )
9041 +#define ON_DEBUG( exp )
9042 +#define schedulable() might_sleep()
9043 +
9044 +/* REISER4_DEBUG */
9045 +#endif
9046 +
9047 +#if REISER4_DEBUG
9048 +/* per-thread information about lock acquired by this thread. Used by lock
9049 + * ordering checking in spin_macros.h */
9050 +typedef struct lock_counters_info {
9051 +       int rw_locked_tree;
9052 +       int read_locked_tree;
9053 +       int write_locked_tree;
9054 +
9055 +       int rw_locked_dk;
9056 +       int read_locked_dk;
9057 +       int write_locked_dk;
9058 +
9059 +       int rw_locked_cbk_cache;
9060 +       int read_locked_cbk_cache;
9061 +       int write_locked_cbk_cache;
9062 +
9063 +       int spin_locked_zlock;
9064 +       int spin_locked_jnode;
9065 +       int spin_locked_jload;
9066 +       int spin_locked_txnh;
9067 +       int spin_locked_atom;
9068 +       int spin_locked_stack;
9069 +       int spin_locked_txnmgr;
9070 +       int spin_locked_ktxnmgrd;
9071 +       int spin_locked_fq;
9072 +       int spin_locked_inode;
9073 +       int spin_locked_super_eflush;
9074 +       int spin_locked;
9075 +       int long_term_locked_znode;
9076 +
9077 +       int inode_sem_r;
9078 +       int inode_sem_w;
9079 +
9080 +       int d_refs;
9081 +       int x_refs;
9082 +       int t_refs;
9083 +} lock_counters_info;
9084 +
9085 +extern lock_counters_info *lock_counters(void);
9086 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
9087 +
9088 +/* increment lock-counter @counter, if present */
9089 +#define LOCK_CNT_INC(counter) IN_CONTEXT(++(lock_counters()->counter), 0)
9090 +
9091 +/* decrement lock-counter @counter, if present */
9092 +#define LOCK_CNT_DEC(counter) IN_CONTEXT(--(lock_counters()->counter), 0)
9093 +
9094 +/* check that lock-counter is zero. This is for use in assertions */
9095 +#define LOCK_CNT_NIL(counter) IN_CONTEXT(lock_counters()->counter == 0, 1)
9096 +
9097 +/* check that lock-counter is greater than zero (GTZ) or less than @n (LT).
9098 + * For use in assertions; both evaluate to 1 outside of reiser4 context */
9099 +#define LOCK_CNT_GTZ(counter) IN_CONTEXT(lock_counters()->counter > 0, 1)
9100 +#define LOCK_CNT_LT(counter,n) IN_CONTEXT(lock_counters()->counter < (n), 1)
9101 +
9102 +#else                          /* REISER4_DEBUG */
9103 +
9104 +/* no-op versions of the above */
9105 +
9106 +typedef struct lock_counters_info {
9107 +} lock_counters_info;
9108 +
9109 +#define lock_counters() ((lock_counters_info *)NULL)
9110 +#define LOCK_CNT_INC(counter) noop
9111 +#define LOCK_CNT_DEC(counter) noop
9112 +#define LOCK_CNT_NIL(counter) (1)
9113 +#define LOCK_CNT_GTZ(counter) (1)
9114 +#define LOCK_CNT_LT(counter,n) (1)
9115 +
9116 +#endif                         /* REISER4_DEBUG */
9117 +
9118 +#define assert_spin_not_locked(lock) BUG_ON(0)
9119 +#define assert_rw_write_locked(lock) BUG_ON(0)
9120 +#define assert_rw_read_locked(lock) BUG_ON(0)
9121 +#define assert_rw_locked(lock) BUG_ON(0)
9122 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
9123 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
9124 +#define assert_rw_not_locked(lock) BUG_ON(0)
9125 +
9126 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
9127 +   option. */
9128 +typedef enum {
9129 +       /* print a lot of information during panic. When this is on all jnodes
9130 +        * are listed. This can be *very* large output. Usually you don't want
9131 +        * this. Especially over serial line. */
9132 +       REISER4_VERBOSE_PANIC = 0x00000001,
9133 +       /* print a lot of information during umount */
9134 +       REISER4_VERBOSE_UMOUNT = 0x00000002,
9135 +       /* print gathered statistics on umount */
9136 +       REISER4_STATS_ON_UMOUNT = 0x00000004,
9137 +       /* check node consistency */
9138 +       REISER4_CHECK_NODE = 0x00000008
9139 +} reiser4_debug_flags;
9140 +
9141 +extern int is_in_reiser4_context(void);
9142 +
9143 +/*
9144 + * evaluate expression @e only if within reiser4 context
9145 + */
9146 +#define ON_CONTEXT(e)  do {                    \
9147 +       if(is_in_reiser4_context()) {           \
9148 +               e;                              \
9149 +       } } while(0)
9150 +
9151 +/*
9152 + * evaluate expression @e only when within reiser4_context and debugging is
9153 + * on.
9154 + */
9155 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
9156 +
9157 +/*
9158 + * complain about unexpected function result and crash. Used in "default"
9159 + * branches of switch statements and alike to assert that invalid results are
9160 + * not silently ignored.
9161 + */
9162 +#define wrong_return_value( label, function )                          \
9163 +       impossible( label, "wrong return value from " function )
9164 +
9165 +/* Issue different types of reiser4 messages to the console */
9166 +#define warning( label, format, ... )                                  \
9167 +       DCALL( KERN_WARNING,                                            \
9168 +              printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
9169 +#define notice( label, format, ... )                                   \
9170 +       DCALL( KERN_NOTICE,                                             \
9171 +              printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
9172 +
9173 +/* mark not yet implemented functionality */
9174 +#define not_yet( label, format, ... )                          \
9175 +       reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
9176 +
9177 +extern void reiser4_do_panic(const char *format, ...)
9178 +    __attribute__ ((noreturn, format(printf, 1, 2)));
9179 +
9180 +extern void reiser4_print_prefix(const char *level, int reperr, const char *mid,
9181 +                                const char *function,
9182 +                                const char *file, int lineno);
9183 +
9184 +extern int preempt_point(void);
9185 +extern void reiser4_print_stats(void);
9186 +
9187 +
9188 +#if REISER4_DEBUG
9189 +extern int no_counters_are_held(void);
9190 +extern int commit_check_locks(void);
9191 +#else
9192 +#define no_counters_are_held() (1)
9193 +#define commit_check_locks() (1)
9194 +#endif
9195 +
9196 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
9197 +#define IS_POW(i)                              \
9198 +({                                             \
9199 +       typeof(i) __i;                          \
9200 +                                               \
9201 +       __i = (i);                              \
9202 +       !(__i & (__i - 1));                     \
9203 +})
9204 +
9205 +#define KERNEL_DEBUGGER (1)
9206 +
9207 +#if KERNEL_DEBUGGER
9208 +
9209 +extern void debugtrap(void);
9210 +
9211 +/*
9212 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
9213 + * kgdb is not compiled in, do nothing.
9214 + */
9215 +#define DEBUGON(cond)                          \
9216 +({                                             \
9217 +       if (unlikely(cond))                     \
9218 +               debugtrap();                    \
9219 +})
9220 +#else
9221 +#define DEBUGON(cond) noop
9222 +#endif
9223 +
9224 +/*
9225 + * Error code tracing facility. (Idea is borrowed from XFS code.)
9226 + *
9227 + * Suppose some strange and/or unexpected code is returned from some function
9228 + * (for example, write(2) returns -EEXIST). It is possible to place a
9229 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
9230 + * in what particular place -EEXIST was generated first?
9231 + *
9232 + * In reiser4 all places where actual error codes are produced (that is,
9233 + * statements of the form
9234 + *
9235 + *     return -EFOO;        // (1), or
9236 + *
9237 + *     result = -EFOO;      // (2)
9238 + *
9239 + * are replaced with
9240 + *
9241 + *     return RETERR(-EFOO);        // (1a), and
9242 + *
9243 + *     result = RETERR(-EFOO);      // (2a) respectively
9244 + *
9245 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
9246 + * printed in error and warning messages. Moreover, it's possible to put a
9247 + * conditional breakpoint in return_err (low-level function called by RETERR()
9248 + * to do the actual work) to break into debugger immediately when particular
9249 + * error happens.
9250 + *
9251 + */
9252 +
9253 +#if REISER4_DEBUG
9254 +
9255 +/*
9256 + * data-type to store information about where error happened ("error site").
9257 + */
9258 +typedef struct err_site {
9259 +       int code;               /* error code */
9260 +       const char *file;       /* source file, filled by __FILE__ */
9261 +       int line;               /* source file line, filled by __LINE__ */
9262 +} err_site;
9263 +
9264 +extern void return_err(int code, const char *file, int line);
9265 +
9266 +/*
9267 + * fill &get_current_context()->err_site with error information.
9268 + */
9269 +#define RETERR(code)                           \
9270 +({                                             \
9271 +       typeof(code) __code;                    \
9272 +                                               \
9273 +       __code = (code);                        \
9274 +       return_err(__code, __FILE__, __LINE__); \
9275 +       __code;                                 \
9276 +})
9277 +
9278 +#else
9279 +
9280 +/*
9281 + * no-op versions of the above: empty err_site, RETERR() yields its argument
9282 + */
9283 +
9284 +typedef struct err_site {
9285 +} err_site;
9286 +#define RETERR(code) (code)
9287 +#endif
9288 +
9289 +#if REISER4_LARGE_KEY
9290 +/*
9291 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
9292 + */
9293 +#define ON_LARGE_KEY(...) __VA_ARGS__
9294 +#else
9295 +#define ON_LARGE_KEY(...)
9296 +#endif
9297 +
9298 +/* __FS_REISER4_DEBUG_H__ */
9299 +#endif
9300 +
9301 +/* Make Linus happy.
9302 +   Local variables:
9303 +   c-indentation-style: "K&R"
9304 +   mode-name: "LC"
9305 +   c-basic-offset: 8
9306 +   tab-width: 8
9307 +   fill-column: 120
9308 +   End:
9309 +*/
9310 diff --git a/fs/reiser4/dformat.h b/fs/reiser4/dformat.h
9311 new file mode 100644
9312 index 0000000..b0f3160
9313 --- /dev/null
9314 +++ b/fs/reiser4/dformat.h
9315 @@ -0,0 +1,71 @@
9316 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9317 +
9318 +/* Formats of on-disk data and conversion functions. */
9319 +
9320 +/* put all item formats in the files describing the particular items,
9321 +   our model is, everything you need to do to add an item to reiser4,
9322 +   (excepting the changes to the plugin that uses the item which go
9323 +   into the file defining that plugin), you put into one file. */
9324 +/* Data on disk are stored in little-endian format.
9325 +   To declare fields of on-disk structures, use d8, d16, d32 and d64.
9326 +   d??tocpu() and cputod??() to convert. */
9327 +
9328 +#if !defined( __FS_REISER4_DFORMAT_H__ )
9329 +#define __FS_REISER4_DFORMAT_H__
9330 +
9331 +#include <asm/byteorder.h>
9332 +#include <asm/unaligned.h>
9333 +#include <linux/types.h>
9334 +
9335 +
9336 +typedef __u8 d8;
9337 +typedef __le16 d16;
9338 +typedef __le32 d32;
9339 +typedef __le64 d64;
9340 +
9341 +#define PACKED __attribute__((packed))
9342 +
9343 +/* data-type for block number */
9344 +typedef __u64 reiser4_block_nr;
9345 +
9346 +/* data-type for block number on disk, disk format */
9347 +typedef __le64 reiser4_dblock_nr;
9348 +
9349 +/**
9350 + * disk_addr_eq - test two disk addresses for equality
9351 + * @b1: pointer to the first block number to compare
9352 + * @b2: pointer to the second block number to compare
9353 + *
9354 + * Returns non-zero if both block numbers are byte-wise identical, 0 otherwise.
9355 + */
9356 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
9357 +                              const reiser4_block_nr *b2)
9358 +{
9359 +       assert("nikita-1033", b1 != NULL);
9360 +       assert("nikita-1266", b2 != NULL);
9361 +
9362 +       return memcmp(b1, b2, sizeof(*b1)) == 0;
9363 +}
9364 +
9365 +/* structure of master reiser4 super block; integer fields are little-endian */
9366 +typedef struct reiser4_master_sb {
9367 +       char magic[16];         /* "ReIsEr4" */
9368 +       __le16 disk_plugin_id;  /* id of disk layout plugin */
9369 +       __le16 blocksize;       /* presumably block size in bytes; TODO confirm */
9370 +       char uuid[16];          /* unique id */
9371 +       char label[16];         /* filesystem label */
9372 +       __le64 diskmap;         /* location of the diskmap. 0 if not present */
9373 +} reiser4_master_sb;
9374 +
9375 +/* __FS_REISER4_DFORMAT_H__ */
9376 +#endif
9377 +
9378 +/*
9379 + * Local variables:
9380 + * c-indentation-style: "K&R"
9381 + * mode-name: "LC"
9382 + * c-basic-offset: 8
9383 + * tab-width: 8
9384 + * fill-column: 79
9385 + * End:
9386 + */
9387 diff --git a/fs/reiser4/dscale.c b/fs/reiser4/dscale.c
9388 new file mode 100644
9389 index 0000000..a9bc224
9390 --- /dev/null
9391 +++ b/fs/reiser4/dscale.c
9392 @@ -0,0 +1,174 @@
9393 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9394 + * reiser4/README */
9395 +
9396 +/* Scalable on-disk integers */
9397 +
9398 +/*
9399 + * Various on-disk structures contain integer-like structures. Stat-data
9400 + * contain [yes, "data" is plural, check the dictionary] file size, link
9401 + * count; extent unit contains extent width etc. To accommodate for general
9402 + * case enough space is reserved to keep largest possible value. 64 bits in
9403 + * all cases above. But in overwhelming majority of cases numbers actually
9404 + * stored in these fields will be comparatively small and reserving 8 bytes is
9405 + * a waste of precious disk bandwidth.
9406 + *
9407 + * Scalable integers are one way to solve this problem. dscale_write()
9408 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
9409 + * depending on the magnitude of the value supplied. dscale_read() reads value
9410 + * previously stored by dscale_write().
9411 + *
9412 + * dscale_write() produces format not completely unlike of UTF: two highest
9413 + * bits of the first byte are used to store "tag". One of 4 possible tag
9414 + * values is chosen depending on the number being encoded:
9415 + *
9416 + *           0 ... 0x3f               => 0           [table 1]
9417 + *        0x40 ... 0x3fff             => 1
9418 + *      0x4000 ... 0x3fffffff         => 2
9419 + *  0x40000000 ... 0xffffffffffffffff => 3
9420 + *
9421 + * (see dscale_range() function)
9422 + *
9423 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
9424 + * to be stored, so in this case there is no place in the first byte to store
9425 + * tag. For such values tag is stored in an extra 9th byte.
9426 + *
9427 + * As _highest_ bits are used for the test (which is natural) scaled integers
9428 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
9429 + * uses LITTLE-ENDIAN.
9430 + *
9431 + */
9432 +
9433 +#include "debug.h"
9434 +#include "dscale.h"
9435 +
9436 +/* extract the tag of the scaled integer whose first byte is at @address */
9437 +static int gettag(const unsigned char *address)
9438 +{
9439 +       /* the tag occupies the two most significant bits of that byte */
9440 +       return address[0] >> 6;
9441 +}
9442 +
9443 +/* clear the tag bits embedded into @value, leaving only the stored number. */
9444 +static void cleartag(__u64 * value, int tag)
9445 +{
9446 +       /*
9447 +        * W-w-what ?!
9448 +        *
9449 +        * Actually, this is rather simple: @value passed here was read by
9450 +        * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
9451 +        * zeroes. Tag is still stored in the highest (arithmetically)
9452 +        * non-zero bits of @value, but relative position of tag within __u64
9453 +        * depends on @tag.
9454 +        *
9455 +        * For example if @tag is 0, it's stored in 2 highest bits of lowest
9456 +        * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
9457 +        *
9458 +        * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
9459 +        * and its offset is (2 * 8) - 2 == 14 bits.
9460 +        *
9461 +        * See table 1 above for details. The mask is computed in __u64,
9462 +        * because shifting int 3 left by 30 bits (@tag == 2) overflows.
9463 +        * All these cases are captured by the formula:
9464 +        */
9465 +       *value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
9466 +       /*
9467 +        * That is, clear two (3 == 0b11) bits at the offset
9468 +        *
9469 +        *                  8 * (2 ^ tag) - 2,
9470 +        *
9471 +        * that is, two highest bits of (2 ^ tag)-th byte of @value.
9472 +        */
9473 +}
9474 +
9475 +/* map @value to its tag (0..3) according to table 1 above. */
9476 +static int dscale_range(__u64 value)
9477 +{
9478 +       if (value <= 0x3f)
9479 +               return 0;
9480 +       if (value <= 0x3fff)
9481 +               return 1;
9482 +       if (value <= 0x3fffffff)
9483 +               return 2;
9484 +       return 3;
9485 +}
9486 +
9487 +/* restore into @value the number stored at @address by dscale_write() and
9488 + * return number of bytes consumed (1, 2, 4 or 9), or the RETERR(-EIO) code */
9489 +int dscale_read(unsigned char *address, __u64 * value)
9490 +{
9491 +       int tag;
9492 +
9493 +       /* read tag from two highest bits of the first byte */
9494 +       tag = gettag(address);
9495 +       switch (tag) {
9496 +       case 3:
9497 +               /* In this case tag is stored in an extra byte, skip this byte
9498 +                * and decode value stored in the next 8 bytes. */
9499 +               *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9500 +               /* worst case: 8 bytes for value itself plus one byte for
9501 +                * tag. No tag bits to clear here, hence the early return. */
9502 +               return 9;
9503 +       case 0:
9504 +               *value = get_unaligned(address);
9505 +               break;
9506 +       case 1:
9507 +               *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9508 +               break;
9509 +       case 2:
9510 +               *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9511 +               break;
9512 +       default:        /* defensive: a two-bit tag is always one of 0..3 */
9513 +               return RETERR(-EIO);
9514 +       }
9515 +       /* clear tag embedded into @value */
9516 +       cleartag(value, tag);
9517 +       /* number of bytes consumed is (2 ^ tag)---see table 1. */
9518 +       return 1 << tag;
9519 +}
9520 +
9521 +/* store @value at @address in scaled format; return number of bytes written */
9522 +int dscale_write(unsigned char *address, __u64 value)
9523 +{
9524 +       int tag, shift;
9525 +       __be64 v;
9526 +       unsigned char *valarr;
9527 +
9528 +       tag = dscale_range(value);
9529 +       v = __cpu_to_be64(value);
9530 +       valarr = (unsigned char *)&v;
9531 +       shift = (tag == 3) ? 1 : 0;
9532 +       memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9533 +       /* tag 3 has its own byte, which memcpy() skipped: drop its stale bits */
9534 +       *address = (shift ? 0 : *address) | (tag << 6);
9535 +       return shift + (1 << tag);
9536 +}
9537 +
9538 +/* how many bytes dscale_write() needs to store @value: 1, 2, 4 or 9 */
9539 +int dscale_bytes(__u64 value)
9540 +{
9541 +       int tag = dscale_range(value);
9542 +
9543 +       /* the widest form carries its tag in one extra byte */
9544 +       if (tag == 3)
9545 +               return 8 + 1;
9546 +       return 1 << tag;
9547 +}
9548 +
9549 +/* returns true if @value and @other require the same number of bytes to be
9550 + * stored. Used to detect when data structure (like stat-data) has to be
9551 + * expanded or contracted. */
9552 +int dscale_fit(__u64 value, __u64 other)
9553 +{
9554 +       return dscale_bytes(value) == dscale_bytes(other);
9555 +}
9556 +
9557 +/* Make Linus happy.
9558 +   Local variables:
9559 +   c-indentation-style: "K&R"
9560 +   mode-name: "LC"
9561 +   c-basic-offset: 8
9562 +   tab-width: 8
9563 +   fill-column: 120
9564 +   scroll-step: 1
9565 +   End:
9566 +*/
9567 diff --git a/fs/reiser4/dscale.h b/fs/reiser4/dscale.h
9568 new file mode 100644
9569 index 0000000..545e111
9570 --- /dev/null
9571 +++ b/fs/reiser4/dscale.h
9572 @@ -0,0 +1,27 @@
9573 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9574 + * reiser4/README */
9575 +
9576 +/* Scalable on-disk integers. See dscale.c for details. */
9577 +
9578 +#if !defined( __FS_REISER4_DSCALE_H__ )
9579 +#define __FS_REISER4_DSCALE_H__
9580 +
9581 +#include "dformat.h"
9582 +
9583 +extern int dscale_read(unsigned char *address, __u64 * value);
9584 +extern int dscale_write(unsigned char *address, __u64 value);
9585 +extern int dscale_bytes(__u64 value);
9586 +extern int dscale_fit(__u64 value, __u64 other);
9587 +
9588 +/* __FS_REISER4_DSCALE_H__ */
9589 +#endif
9590 +
9591 +/* Make Linus happy.
9592 +   Local variables:
9593 +   c-indentation-style: "K&R"
9594 +   mode-name: "LC"
9595 +   c-basic-offset: 8
9596 +   tab-width: 8
9597 +   fill-column: 120
9598 +   End:
9599 +*/
9600 diff --git a/fs/reiser4/entd.c b/fs/reiser4/entd.c
9601 new file mode 100644
9602 index 0000000..719e6a3
9603 --- /dev/null
9604 +++ b/fs/reiser4/entd.c
9605 @@ -0,0 +1,354 @@
9606 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9607 + * reiser4/README */
9608 +
9609 +/* Ent daemon. */
9610 +
9611 +#include "debug.h"
9612 +#include "txnmgr.h"
9613 +#include "tree.h"
9614 +#include "entd.h"
9615 +#include "super.h"
9616 +#include "context.h"
9617 +#include "reiser4.h"
9618 +#include "vfs_ops.h"
9619 +#include "page_cache.h"
9620 +#include "inode.h"
9621 +
9622 +#include <linux/sched.h>       /* struct task_struct */
9623 +#include <linux/suspend.h>
9624 +#include <linux/kernel.h>
9625 +#include <linux/writeback.h>
9626 +#include <linux/time.h>                /* INITIAL_JIFFIES */
9627 +#include <linux/backing-dev.h> /* bdi_write_congested */
9628 +#include <linux/wait.h>
9629 +#include <linux/kthread.h>
9630 +
9631 +#define DEF_PRIORITY 12
9632 +#define MAX_ENTD_ITERS 10
9633 +
9634 +static void entd_flush(struct super_block *, struct wbq *);
9635 +static int entd(void *arg);
9636 +
9637 +/*
9638 + * set ->comm field of ent thread to make its state visible to the user level
9639 + */
9640 +#define entd_set_comm(state)                                   \
9641 +       snprintf(current->comm, sizeof(current->comm),  \
9642 +                "ent:%s%s", super->s_id, (state))
9643 +
9644 +/**
9645 + * init_entd - initialize entd context and start kernel daemon
9646 + * @super: super block to start ent thread for
9647 + *
9648 + * Zeroes and initializes the entd context of @super, then starts the ent
9649 + * kernel thread. Returns 0, or the error encoded in kthread_run() result.
9650 + */
9651 +int init_entd(struct super_block *super)
9652 +{
9653 +       entd_context *ctx;
9654 +
9655 +       assert("nikita-3104", super != NULL);
9656 +
9657 +       ctx = get_entd_context(super);
9658 +
9659 +       memset(ctx, 0, sizeof *ctx);
9660 +       spin_lock_init(&ctx->guard);
9661 +       init_waitqueue_head(&ctx->wait);
9662 +#if REISER4_DEBUG
9663 +       INIT_LIST_HEAD(&ctx->flushers_list);
9664 +#endif
9665 +       /* lists of writepage requests */
9666 +       INIT_LIST_HEAD(&ctx->todo_list);
9667 +       INIT_LIST_HEAD(&ctx->done_list);
9668 +       /* start entd; on failure ctx->tsk is left holding the error pointer */
9669 +       ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9670 +       if (IS_ERR(ctx->tsk))
9671 +               return PTR_ERR(ctx->tsk);
9672 +       return 0;
9673 +}
9674 +
9675 +static void __put_wbq(entd_context *ent, struct wbq *rq)
9676 +{
9677 +       up(&rq->sem);   /* release the semaphore of @rq; @ent is not used */
9678 +}
9679 +
9680 +/* unlink and return head of ent->todo_list, NULL if empty. ent should be locked */
9681 +static struct wbq *__get_wbq(entd_context * ent)
9682 +{
9683 +       struct wbq *wbq;
9684 +
9685 +       if (list_empty_careful(&ent->todo_list))
9686 +               return NULL;
9687 +
9688 +       ent->nr_todo_reqs --;
9689 +       wbq = list_entry(ent->todo_list.next, struct wbq, link);
9690 +       list_del_init(&wbq->link);
9691 +       return wbq;
9692 +}
9693 +
9694 +static void wakeup_all_wbq(entd_context * ent)
9695 +{
9696 +       struct wbq *rq;
9697 +
9698 +       spin_lock(&ent->guard);
9699 +       while ((rq = __get_wbq(ent)) != NULL)   /* drain the whole todo list */
9700 +               __put_wbq(ent, rq);             /* up() semaphore of request */
9701 +       spin_unlock(&ent->guard);
9702 +}
9703 +
9704 +/* ent thread function: serves queued requests until kthread_should_stop() */
9705 +static int entd(void *arg)
9706 +{
9707 +       struct super_block *super;
9708 +       entd_context *ent;
9709 +       int done = 0;
9710 +
9711 +       super = arg;
9712 +       /* do_fork() just copies task_struct into the new
9713 +          thread. ->fs_context shouldn't be copied of course. This shouldn't
9714 +          be a problem for the rest of the code though.
9715 +        */
9716 +       current->journal_info = NULL;
9717 +
9718 +       ent = get_entd_context(super);
9719 +
9720 +       while (!done) {
9721 +               try_to_freeze();
9722 +
9723 +               spin_lock(&ent->guard);
9724 +               while (ent->nr_todo_reqs != 0) {
9725 +                       struct wbq *rq, *next;
9726 +
9727 +                       assert("", list_empty_careful(&ent->done_list));
9728 +
9729 +                       /* take request from the queue head */
9730 +                       rq = __get_wbq(ent);
9731 +                       assert("", rq != NULL);
9732 +                       ent->cur_request = rq;
9733 +                       spin_unlock(&ent->guard);
9734 +
9735 +                       entd_set_comm("!");
9736 +                       entd_flush(super, rq);
9737 +
9738 +                       iput(rq->mapping->host);
9739 +                       up(&(rq->sem));
9740 +
9741 +                       /*
9742 +                        * wake up requestors on done_list and iput their inodes; guard is dropped around that
9743 +                        */
9744 +                       spin_lock(&ent->guard);
9745 +                       list_for_each_entry_safe(rq, next, &ent->done_list, link) {
9746 +                               list_del_init(&(rq->link));
9747 +                               ent->nr_done_reqs --;
9748 +                               spin_unlock(&ent->guard);
9749 +
9750 +                               assert("", rq->written == 1);
9751 +                               iput(rq->mapping->host);
9752 +                               up(&(rq->sem));
9753 +                               spin_lock(&ent->guard);
9754 +                       }
9755 +               }
9756 +               spin_unlock(&ent->guard);
9757 +
9758 +               entd_set_comm(".");
9759 +
9760 +               {
9761 +                       DEFINE_WAIT(__wait);
9762 +
9763 +                       do {
9764 +                               prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9765 +                               if (kthread_should_stop()) {
9766 +                                       done = 1;
9767 +                                       break;
9768 +                               }
9769 +                               if (ent->nr_todo_reqs != 0)
9770 +                                       break;
9771 +                               schedule();
9772 +                       } while (0);    /* single pass; the outer loop re-checks */
9773 +                       finish_wait(&ent->wait, &__wait);
9774 +               }
9775 +       }
9776 +       spin_lock(&ent->guard);
9777 +       BUG_ON(ent->nr_todo_reqs != 0);
9778 +       spin_unlock(&ent->guard);
9779 +       wakeup_all_wbq(ent);
9780 +       return 0;
9781 +}
9782 +
9783 +/**
9784 + * done_entd - stop entd kernel thread
9785 + * @super: super block to stop ent thread for
9786 + *
9787 + * It is called on umount. Asks the ent thread of @super to stop by calling
9788 + * kthread_stop() on it; the thread must have been started (->tsk != NULL).
9789 + */
9790 +void done_entd(struct super_block *super)
9791 +{
9792 +       entd_context *ent;
9793 +
9794 +       assert("nikita-3103", super != NULL);
9795 +
9796 +       ent = get_entd_context(super);
9797 +       assert("zam-1055", ent->tsk != NULL);
9798 +       kthread_stop(ent->tsk);
9799 +}
9800 +
9801 +/* called at the beginning of jnode_flush to register flusher thread with ent
9802 + * daemon: bumps ent->flushers (and, in debug mode, links the context) */
9803 +void enter_flush(struct super_block *super)
9804 +{
9805 +       entd_context *ent;
9806 +
9807 +       assert("zam-1029", super != NULL);
9808 +       ent = get_entd_context(super);
9809 +
9810 +       assert("zam-1030", ent != NULL);
9811 +
9812 +       spin_lock(&ent->guard);
9813 +       ent->flushers++;
9814 +#if REISER4_DEBUG
9815 +       list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9816 +#endif
9817 +       spin_unlock(&ent->guard);
9818 +}
9819 +
9820 +/* called at the end of jnode_flush; wakes entd if the last flusher left work */
9821 +void leave_flush(struct super_block *super)
9822 +{
9823 +       entd_context *ent;
9824 +       int wake_up_ent;
9825 +
9826 +       assert("zam-1027", super != NULL);
9827 +       ent = get_entd_context(super);
9828 +
9829 +       assert("zam-1028", ent != NULL);
9830 +
9831 +       spin_lock(&ent->guard);
9832 +       ent->flushers--;
9833 +       wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9834 +#if REISER4_DEBUG
9835 +       list_del_init(&get_current_context()->flushers_link);
9836 +#endif
9837 +       spin_unlock(&ent->guard);
9838 +       if (wake_up_ent)        /* decided under the guard, woken outside it */
9839 +               wake_up(&ent->wait);
9840 +}
9841 +
9842 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9843 +
9844 +/* Serve request @rq in entd context: write a burst of pages starting at the
9845 + * requested one, then sync other inodes if wbc->nr_to_write is not used up. */
9846 +static void entd_flush(struct super_block *super, struct wbq *rq)
9847 +{
9848 +       reiser4_context ctx;
9849 +
9850 +       init_stack_context(&ctx, super);
9851 +       ctx.entd = 1;
9852 +       ctx.gfp_mask = GFP_NOFS;
9853 +       /* widen first: pgoff_t index << shift overflows on 32-bit */
9854 +       rq->wbc->range_start = (loff_t)rq->page->index << PAGE_CACHE_SHIFT;
9855 +       rq->wbc->range_end = rq->wbc->range_start +
9856 +               ((loff_t)ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9857 +       rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9858 +       if (rq->wbc->nr_to_write > 0) {
9859 +               rq->wbc->range_start = 0;
9860 +               rq->wbc->range_end = 0;
9861 +               generic_sync_sb_inodes(super, rq->wbc);
9862 +       }
9863 +       rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9864 +       writeout(super, rq->wbc);
9865 +
9866 +       context_set_commit_async(&ctx);
9867 +       reiser4_exit_context(&ctx);
9868 +}
9869 +
9870 +/**
9871 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9872 + * @page: locked page to be written; it is re-dirtied and unlocked here
9873 + * @wbc: writeback control passed to reiser4_writepage
9874 + *
9875 + * Creates a request on the stack, puts it on entd list of requests, wakes entd
9876 + * up if the list was empty, waits until entd is done with the request.
9877 + */
9878 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9879 +{
9880 +       struct super_block *sb;
9881 +       struct inode *inode;
9882 +       entd_context *ent;
9883 +       struct wbq rq;
9884 +
9885 +       assert("", PageLocked(page));
9886 +       assert("", page->mapping != NULL);
9887 +
9888 +       sb = page->mapping->host->i_sb;
9889 +       ent = get_entd_context(sb);
9890 +       assert("", ent && ent->done == 0);
9891 +
9892 +       /*
9893 +        * we are going to unlock page and ask ent thread to write the
9894 +        * page. Re-dirty page before unlocking so that if ent thread fails to
9895 +        * write it - it will remain dirty
9896 +        */
9897 +       set_page_dirty_internal(page);
9898 +
9899 +       /*
9900 +        * pin inode in memory, unlock page; ent thread is expected to iput. We
9901 +        * can not iput here because delete_inode must not be called here
9902 +        */
9903 +       inode = igrab(page->mapping->host);
9904 +       unlock_page(page);
9905 +       if (inode == NULL)
9906 +               /* inode is getting freed */
9907 +               return 0;
9908 +
9909 +       /* init on-stack wbq; sem starts at 0 so that down() below blocks */
9910 +       INIT_LIST_HEAD(&rq.link);
9911 +       rq.magic = WBQ_MAGIC;
9912 +       rq.wbc = wbc;
9913 +       rq.page = page;
9914 +       rq.mapping = inode->i_mapping;
9915 +       rq.node = NULL;
9916 +       rq.written = 0;
9917 +       sema_init(&rq.sem, 0);
9918 +
9919 +       /* queue request; wake entd only if the todo list was empty before */
9920 +       spin_lock(&ent->guard);
9921 +       ent->nr_todo_reqs++;
9922 +       list_add_tail(&rq.link, &ent->todo_list);
9923 +       if (ent->nr_todo_reqs == 1)
9924 +               wake_up(&ent->wait);
9925 +
9926 +       spin_unlock(&ent->guard);
9927 +
9928 +       /* wait until entd finishes */
9929 +       down(&rq.sem);
9930 +
9931 +       /*
9932 +        * rq is on our stack: spin until entd thread which did up(&rq.sem)
9933 +        * does not need rq anymore
9934 +        */
9935 +       spin_lock(&ent->guard);
9936 +       spin_unlock(&ent->guard);
9937 +
9938 +       if (rq.written)
9939 +               /* entd wrote the page. NOTE(review): same value as below */
9940 +               return 0;
9941 +       return 0;
9942 +}
9943 +
9944 +int wbq_available(void)
9945 +{
9946 +       entd_context *ent = get_entd_context(reiser4_get_current_sb());
9947 +
9948 +       return ent->nr_todo_reqs;
9949 +}
9950 +
9951 +/*
9952 + * Local variables:
9953 + * c-indentation-style: "K&R"
9954 + * mode-name: "LC"
9955 + * c-basic-offset: 8
9956 + * tab-width: 8
9957 + * fill-column: 79
9958 + * End:
9959 + */
9960 diff --git a/fs/reiser4/entd.h b/fs/reiser4/entd.h
9961 new file mode 100644
9962 index 0000000..99aaa02
9963 --- /dev/null
9964 +++ b/fs/reiser4/entd.h
9965 @@ -0,0 +1,90 @@
9966 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9967 +
9968 +/* Ent daemon. */
9969 +
9970 +#ifndef __ENTD_H__
9971 +#define __ENTD_H__
9972 +
9973 +#include "context.h"
9974 +
9975 +#include <linux/fs.h>
9976 +#include <linux/completion.h>
9977 +#include <linux/wait.h>
9978 +#include <linux/spinlock.h>
9979 +#include <linux/sched.h>       /* for struct task_struct */
9980 +
9981 +#define WBQ_MAGIC 0x7876dc76
9982 +
9983 +/* write-back request, filled in by write_page_by_ent() and served by entd. */
9984 +struct wbq {
9985 +       int magic; /* set to WBQ_MAGIC when request is initialized */
9986 +       struct list_head link; /* list head of this list is in entd context */
9987 +       struct writeback_control *wbc; /* requestor's writeback control */
9988 +       struct page *page; /* page the requestor asked to write */
9989 +       struct address_space *mapping; /* i_mapping of the pinned inode */
9990 +       struct semaphore sem; /* requestor sleeps here until entd is done */
9991 +       jnode *node; /* set if ent thread captured requested page */
9992 +       int written; /* set if ent thread wrote requested page */
9993 +};
9994 +
9995 +/* ent-thread context: per super block state shared by the ent thread,
9996 + * flushers and write_page_by_ent() requestors. */
9997 +typedef struct entd_context {
9998 +        /* wait queue that ent thread waits on for more work. It's
9999 +         * signaled by write_page_by_ent() and leave_flush(). */
10000 +       wait_queue_head_t wait;
10001 +       /* spinlock protecting other fields */
10002 +       spinlock_t guard;
10003 +       /* ent thread, stopped by done_entd() via kthread_stop() */
10004 +       struct task_struct *tsk;
10005 +       /* set to indicate that ent thread should leave. */
10006 +       int done;
10007 +       /* counter of active flushers, see enter_flush()/leave_flush() */
10008 +       int flushers;
10009 +       /*
10010 +        * when reiser4_writepage asks entd to write a page - it adds struct
10011 +        * wbq to this list
10012 +        */
10013 +       struct list_head todo_list;
10014 +       /* number of elements on the above list */
10015 +       int nr_todo_reqs;
10016 +       /* NOTE(review): presumably the request entd is serving - confirm */
10017 +       struct wbq *cur_request;
10018 +       /*
10019 +        * when entd writes a page it moves write-back request from todo_list
10020 +        * to done_list. This list is used at the end of entd iteration to
10021 +        * wakeup requestors and iput inodes.
10022 +        */
10023 +       struct list_head done_list;
10024 +       /* number of elements on the above list */
10025 +       int nr_done_reqs;
10026 +
10027 +#if REISER4_DEBUG
10028 +       /* list of all active flushers (linked by context's flushers_link) */
10029 +       struct list_head flushers_list;
10030 +#endif
10031 +} entd_context;
10032 +
10033 +extern int  init_entd(struct super_block *);
10034 +extern void done_entd(struct super_block *);
10035 +
10036 +extern void enter_flush(struct super_block *);
10037 +extern void leave_flush(struct super_block *);
10038 +
10039 +extern int write_page_by_ent(struct page *, struct writeback_control *);
10040 +extern int wbq_available(void);
10041 +extern void ent_writes_page(struct super_block *, struct page *);
10042 +
10043 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
10044 +/* __ENTD_H__ */
10045 +#endif
10046 +
10047 +/* Make Linus happy.
10048 +   Local variables:
10049 +   c-indentation-style: "K&R"
10050 +   mode-name: "LC"
10051 +   c-basic-offset: 8
10052 +   tab-width: 8
10053 +   fill-column: 120
10054 +   End:
10055 +*/
10056 diff --git a/fs/reiser4/eottl.c b/fs/reiser4/eottl.c
10057 new file mode 100644
10058 index 0000000..0adf66b
10059 --- /dev/null
10060 +++ b/fs/reiser4/eottl.c
10061 @@ -0,0 +1,510 @@
10062 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10063 +
10064 +#include "forward.h"
10065 +#include "debug.h"
10066 +#include "key.h"
10067 +#include "coord.h"
10068 +#include "plugin/item/item.h"
10069 +#include "plugin/node/node.h"
10070 +#include "znode.h"
10071 +#include "block_alloc.h"
10072 +#include "tree_walk.h"
10073 +#include "tree_mod.h"
10074 +#include "carry.h"
10075 +#include "tree.h"
10076 +#include "super.h"
10077 +
10078 +#include <linux/types.h>       /* for __u??  */
10079 +
10080 +/*
10081 + * Extents on the twig level (EOTTL) handling.
10082 + *
10083 + * EOTTL poses some problems to the tree traversal, that are better explained
10084 + * by example.
10085 + *
10086 + * Suppose we have block B1 on the twig level with the following items:
10087 + *
10088 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
10089 + * offset)
10090 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
10091 + * 2. internal item I2 with key (10:0:0:0)
10092 + *
10093 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
10094 + * then intra-node lookup is done. This lookup finished on the E1, because the
10095 + * key we are looking for is larger than the key of E1 and is smaller than the
10096 + * key of I2.
10097 + *
10098 + * Here search is stuck.
10099 + *
10100 + * After some thought it is clear what is wrong here: extents on the twig level
10101 + * break some basic property of the *search* tree (on the pretext, that they
10102 + * restore property of balanced tree).
10103 + *
10104 + * Said property is the following: if in the internal node of the search tree
10105 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
10106 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
10107 + * through the Pointer.
10108 + *
10109 + * This is not true, when Pointer is Extent-Pointer, simply because extent
10110 + * cannot expand indefinitely to the right to include any item with
10111 + *
10112 + *   Key1 <= Key <= Key2.
10113 + *
10114 + * For example, our E1 extent is only responsible for the data with keys
10115 + *
10116 + *   (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
10117 + *
10118 + * so, key range
10119 + *
10120 + *   ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
10121 + *
10122 + * is orphaned: there is no way to get there from the tree root.
10123 + *
10124 + * In other words, extent pointers are different than normal child pointers as
10125 + * far as search tree is concerned, and this creates such problems.
10126 + *
10127 + * Possible solution for this problem is to insert our item into node pointed
10128 + * to by I2. There are some problems though:
10129 + *
10130 + * (1) I2 can be in a different node.
10131 + * (2) E1 can be immediately followed by another extent E2.
10132 + *
10133 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
10134 + * for locks/coords as necessary.
10135 + *
10136 + * (2) is more complex. Solution here is to insert new empty leaf node and
10137 + * insert internal item between E1 and E2 pointing to said leaf node. This is
10138 + * further complicated by possibility that E2 is in a different node, etc.
10139 + *
10140 + * Problems:
10141 + *
10142 + * (1) if there was internal item I2 immediately on the right of an extent E1
10143 + * and we decided to insert new item S1 into node N2 pointed to by I2, then
10144 + * key of S1 will be less than smallest key in the N2. Normally, search
10145 + * checks that key we are looking for is in the range of keys covered by the
10146 + * node key is being looked in. To work around this situation, while
10147 + * preserving useful consistency check, new flag CBK_TRUST_DK was added to the
10148 + * cbk flags bitmask. This flag is automatically set on entrance to the
10149 + * coord_by_key() and is only cleared when we are about to enter situation
10150 + * described above.
10151 + *
10152 + * (2) If extent E1 is immediately followed by another extent E2 and we are
10153 + * searching for the key that is between E1 and E2 we only have to insert new
10154 + * empty leaf node when coord_by_key was called for insertion, rather than just
10155 + * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
10156 + * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
10157 + * performed by insert_by_key() and friends.
10158 + *
10159 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
10160 + * case it requires modification of node content which is only possible under
10161 + * write lock. It may well happen that we only have read lock on the node where
10162 + * new internal pointer is to be inserted (common case: lookup of non-existent
10163 + * stat-data that falls between two extents). If only read lock is held, tree
10164 + * traversal is restarted with lock_level modified so that next time we hit
10165 + * this problem, write lock will be held. Once we have write lock, balancing
10166 + * will be performed.
10167 + */
10168 +
10169 +/**
10170 + * is_next_item_internal - check whether next item is internal
10171 + * @coord: coordinate of extent item in twig node
10172 + * @key: search key
10173 + * @lh: twig node lock handle
10174 + *
10175 + * Looks at the unit next to @coord. Returns 1 if it is an internal one: @coord
10176 + * is set to that unit and, if the unit is in the right neighbor, @lh is moved
10177 + * to that node. Returns 0 if next item is not internal or does not exist;
10178 + * @coord and @lh are left unchanged. Returns 2 if search has to be restarted,
10179 + * and a negative error code if locking or loading the right neighbor fails.
10180 + */
10181 +static int
10182 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
10183 +                     lock_handle *lh)
10184 +{
10185 +       coord_t next;
10186 +       lock_handle rn;
10187 +       int result;
10188 +
10189 +       coord_dup(&next, coord);
10190 +       if (coord_next_unit(&next) == 0) {
10191 +               /* next unit is in this node */
10192 +               if (item_is_internal(&next)) {
10193 +                       coord_dup(coord, &next);
10194 +                       return 1;
10195 +               }
10196 +               assert("vs-3", item_is_extent(&next));
10197 +               return 0;
10198 +       }
10199 +
10200 +       /*
10201 +        * next unit either does not exist or is in right neighbor. If it is in
10202 +        * right neighbor we have to check right delimiting key because
10203 +        * concurrent thread could get there first and insert item with a key
10204 +        * smaller than @key
10205 +        */
10206 +       read_lock_dk(current_tree);
10207 +       result = keycmp(key, znode_get_rd_key(coord->node));
10208 +       read_unlock_dk(current_tree);
10209 +       assert("vs-6", result != EQUAL_TO);
10210 +       if (result == GREATER_THAN)
10211 +               return 2;
10212 +
10213 +       /* lock right neighbor in the same mode as the current node */
10214 +       init_lh(&rn);
10215 +       result = reiser4_get_right_neighbor(&rn, coord->node,
10216 +                                           znode_is_wlocked(coord->node) ?
10217 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
10218 +                                           GN_CAN_USE_UPPER_LEVELS);
10219 +       if (result == -E_NO_NEIGHBOR) {
10220 +               /* we are on the rightmost edge of the tree */
10221 +               done_lh(&rn);
10222 +               return 0;
10223 +       }
10224 +
10225 +       if (result) {
10226 +               assert("vs-4", result < 0);
10227 +               done_lh(&rn);
10228 +               return result;
10229 +       }
10230 +
10231 +       /*
10232 +        * check whether concurrent thread managed to insert item with a key
10233 +        * smaller than @key
10234 +        */
10235 +       read_lock_dk(current_tree);
10236 +       result = keycmp(key, znode_get_ld_key(rn.node));
10237 +       read_unlock_dk(current_tree);
10238 +       assert("vs-6", result != EQUAL_TO);
10239 +       if (result == GREATER_THAN) {
10240 +               done_lh(&rn);
10241 +               return 2;
10242 +       }
10243 +
10244 +       result = zload(rn.node);
10245 +       if (result) {
10246 +               assert("vs-5", result < 0);
10247 +               done_lh(&rn);
10248 +               return result;
10249 +       }
10250 +
10251 +       coord_init_first_unit(&next, rn.node);
10252 +       if (item_is_internal(&next)) {
10253 +               /*
10254 +                * next unit is in right neighbor and it is an unit of internal
10255 +                * item. Unlock coord->node. Move @lh to right neighbor. @coord
10256 +                * is set to the first unit of right neighbor.
10257 +                */
10258 +               coord_dup(coord, &next);
10259 +               zrelse(rn.node);
10260 +               done_lh(lh);
10261 +               move_lh(lh, &rn);
10262 +               return 1;
10263 +       }
10264 +
10265 +       /*
10266 +        * next unit is unit of extent item. Return without changing @lh and
10267 +        * @coord.
10268 +        */
10269 +       assert("vs-6", item_is_extent(&next));
10270 +       zrelse(rn.node);
10271 +       done_lh(&rn);
10272 +       return 0;
10273 +}
10274 +
10275 +/**
10276 + * rd_key - key of the item to the right of a between-items position
10277 + * @coord: position in a node, between items or after the last one
10278 + * @key: storage the result is written to
10279 + *
10280 + * Stores in @key either the key of the unit right of @coord or, when there is
10281 + * no such unit in the node, the node's right delimiting key. Returns @key.
10282 + */
10283 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
10284 +{
10285 +       coord_t right;
10286 +
10287 +       assert("nikita-2281", coord_is_between_items(coord));
10288 +       coord_dup(&right, coord);
10289 +
10290 +       if (coord_set_to_right(&right) != 0) {
10291 +               /*
10292 +                * nothing to the right within this node: fall back to
10293 +                * znode's right delimiting key.
10294 +                */
10295 +               read_lock_dk(current_tree);
10296 +               *key = *znode_get_rd_key(coord->node);
10297 +               read_unlock_dk(current_tree);
10298 +               return key;
10299 +       }
10300 +       /* next item is in this node. Return its key. */
10301 +       unit_key_by_coord(&right, key);
10302 +       return key;
10303 +}
10304 +
10305 +/**
10306 + * add_empty_leaf - insert empty leaf between two extents
10307 + * @insert_coord: position in twig node between two extents
10308 + * @lh: twig node lock handle
10309 + * @key: left delimiting key of new node
10310 + * @rdkey: right delimiting key of new node
10311 + *
10312 + * Inserts empty leaf node between two extent items. It is necessary when we
10313 + * have to insert an item on leaf level between two extents (items on the twig
10314 + * level). Returns 0 on success, error code otherwise.
10315 + */
10316 +static int
10317 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
10318 +              const reiser4_key *key, const reiser4_key *rdkey)
10319 +{
10320 +       int result;
10321 +       carry_pool *pool;
10322 +       carry_level *todo;
10323 +       reiser4_item_data *item;
10324 +       carry_insert_data *cdata;
10325 +       carry_op *op;
10326 +       znode *node;
10327 +       reiser4_tree *tree;
10328 +
10329 +       assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
10330 +       tree = znode_get_tree(insert_coord->node);
10331 +       node = new_node(insert_coord->node, LEAF_LEVEL);
10332 +       if (IS_ERR(node))
10333 +               return PTR_ERR(node);
10334 +
10335 +       /* setup delimiting keys for node being inserted */
10336 +       write_lock_dk(tree);
10337 +       znode_set_ld_key(node, key);
10338 +       znode_set_rd_key(node, rdkey);
10339 +       ON_DEBUG(node->creator = current);
10340 +       ON_DEBUG(node->first_key = *key);
10341 +       write_unlock_dk(tree);
10342 +
10343 +       ZF_SET(node, JNODE_ORPHAN);
10344 +
10345 +       /* allocate carry_pool, 3 carry_level-s, reiser4_item_data and
10346 +          carry_insert_data; on failure drop our reference to @node */
10347 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
10348 +                              sizeof(*item) + sizeof(*cdata));
10349 +       if (IS_ERR(pool)) {
10350 +               zput(node);
10351 +               return PTR_ERR(pool);
10352 +       }
10353 +       todo = (carry_level *) (pool + 1);
10354 +       init_carry_level(todo, pool);
10355 +
10356 +       item = (reiser4_item_data *) (todo + 3);
10357 +       cdata = (carry_insert_data *) (item + 1);
10358 +
10359 +       op = post_carry(todo, COP_INSERT, insert_coord->node, 0);
10360 +       if (!IS_ERR(op)) {
10361 +               cdata->coord = insert_coord;
10362 +               cdata->key = key;
10363 +               cdata->data = item;
10364 +               op->u.insert.d = cdata;
10365 +               op->u.insert.type = COPT_ITEM_DATA;
10366 +               build_child_ptr_data(node, item);
10367 +               item->arg = NULL;
10368 +               /* have @insert_coord to be set at inserted item after
10369 +                  insertion is done */
10370 +               todo->track_type = CARRY_TRACK_CHANGE;
10371 +               todo->tracked = lh;
10372 +
10373 +               result = carry(todo, NULL);
10374 +               if (result == 0) {
10375 +                       /*
10376 +                        * pin node in memory. This is necessary for
10377 +                        * znode_make_dirty() below.
10378 +                        */
10379 +                       result = zload(node);
10380 +                       if (result == 0) {
10381 +                               lock_handle local_lh;
10382 +
10383 +                               /*
10384 +                                * if we inserted new child into tree we have
10385 +                                * to mark it dirty so that flush will be able
10386 +                                * to process it.
10387 +                                */
10388 +                               init_lh(&local_lh);
10389 +                               result = longterm_lock_znode(&local_lh, node,
10390 +                                                            ZNODE_WRITE_LOCK,
10391 +                                                            ZNODE_LOCK_LOPRI);
10392 +                               if (result == 0) {
10393 +                                       znode_make_dirty(node);
10394 +
10395 +                                       /*
10396 +                                        * when internal item pointing to @node
10397 +                                        * was inserted into twig node
10398 +                                        * create_hook_internal did not connect
10399 +                                        * it properly because its right
10400 +                                        * neighbor was not known. Do it
10401 +                                        * here
10402 +                                        */
10403 +                                       write_lock_tree(tree);
10404 +                                       assert("nikita-3312",
10405 +                                              znode_is_right_connected(node));
10406 +                                       assert("nikita-2984",
10407 +                                              node->right == NULL);
10408 +                                       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
10409 +                                       write_unlock_tree(tree);
10410 +                                       result =
10411 +                                           connect_znode(insert_coord, node);
10412 +                                       if (result == 0)
10413 +                                               ON_DEBUG(check_dkeys(node));
10414 +
10415 +                                       done_lh(lh);
10416 +                                       move_lh(lh, &local_lh);
10417 +                                       assert("vs-1676", node_is_empty(node));
10418 +                                       coord_init_first_unit(insert_coord,
10419 +                                                             node);
10420 +                               } else {
10421 +                                       warning("nikita-3136",
10422 +                                               "Cannot lock child");
10423 +                               }
10424 +                               done_lh(&local_lh);
10425 +                               zrelse(node);
10426 +                       }
10427 +               }
10428 +       } else
10429 +               result = PTR_ERR(op);
10430 +       zput(node);
10431 +       done_carry_pool(pool);
10432 +       return result;
10433 +}
10434 +
10435 +/**
10436 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
10437 + * @h: search handle
10438 + * @outcome: flag saying whether search has to restart or is done
10439 + *
10440 + * Handles search on twig level. Returns 0 if search has to go one level down.
10441 + * Returns 1 if this level ends the search: @outcome is LOOKUP_DONE with the
10442 + * result (or error code) saved in @h->result, or LOOKUP_REST if the search
10443 + * has to be restarted.
10444 + */
10445 +int handle_eottl(cbk_handle *h, int *outcome)
10446 +{
10447 +       int result;
10448 +       reiser4_key key;
10449 +       coord_t *coord;
10450 +
10451 +       coord = h->coord;
10452 +
10453 +       if (h->level != TWIG_LEVEL ||
10454 +           (coord_is_existing_item(coord) && item_is_internal(coord))) {
10455 +               /* Continue to traverse tree downward. */
10456 +               return 0;
10457 +       }
10458 +
10459 +       /*
10460 +        * make sure that @h->coord is set to twig node and that it is either
10461 +        * set to extent item or after extent item
10462 +        */
10463 +       assert("vs-356", h->level == TWIG_LEVEL);
10464 +       assert("vs-357", ( {
10465 +                         coord_t lcoord;
10466 +                         coord_dup(&lcoord, coord);
10467 +                         check_me("vs-733", coord_set_to_left(&lcoord) == 0);
10468 +                         item_is_extent(&lcoord);
10469 +                         }
10470 +              ));
10471 +
10472 +       if (*outcome == NS_FOUND) {
10473 +               /* we have found desired key on twig level in extent item */
10474 +               h->result = CBK_COORD_FOUND;
10475 +               *outcome = LOOKUP_DONE;
10476 +               return 1;
10477 +       }
10478 +
10479 +       if (!(h->flags & CBK_FOR_INSERT)) {
10480 +               /* tree traversal is not for insertion. Just return
10481 +                  CBK_COORD_NOTFOUND. */
10482 +               h->result = CBK_COORD_NOTFOUND;
10483 +               *outcome = LOOKUP_DONE;
10484 +               return 1;
10485 +       }
10486 +
10487 +       /* take a look at the item to the right of h -> coord */
10488 +       result = is_next_item_internal(coord, h->key, h->active_lh);
10489 +       if (unlikely(result < 0)) {
10490 +               h->error = "get_right_neighbor failed";
10491 +               h->result = result;
10492 +               *outcome = LOOKUP_DONE;
10493 +               return 1;
10494 +       }
10495 +       if (result == 0) {
10496 +               /*
10497 +                * item to the right is also an extent one, or there is no
10498 +                * item to the right at all. Allocate a new node and insert
10499 +                * pointer to it after item h -> coord.
10500 +                *
10501 +                * This is a result of extents being located at the twig
10502 +                * level. For explanation, see comment just above
10503 +                * is_next_item_internal().
10504 +                * The twig node has to be write locked for the insertion.
10505 +                */
10506 +               if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10507 +                       /*
10508 +                        * we got node read locked, restart coord_by_key to
10509 +                        * have write lock on twig level
10510 +                        */
10511 +                       h->lock_level = TWIG_LEVEL;
10512 +                       h->lock_mode = ZNODE_WRITE_LOCK;
10513 +                       *outcome = LOOKUP_REST;
10514 +                       return 1;
10515 +               }
10516 +
10517 +               /* new leaf is delimited by h->key and the key right of coord */
10518 +               result =
10519 +                   add_empty_leaf(coord, h->active_lh, h->key,
10520 +                                  rd_key(coord, &key));
10521 +               if (result) {
10522 +                       h->error = "could not add empty leaf";
10523 +                       h->result = result;
10524 +                       *outcome = LOOKUP_DONE;
10525 +                       return 1;
10526 +               }
10527 +               /* added empty leaf is locked (h->active_lh), its parent node
10528 +                  is unlocked, h->coord is set as EMPTY */
10529 +               assert("vs-13", coord->between == EMPTY_NODE);
10530 +               assert("vs-14", znode_is_write_locked(coord->node));
10531 +               assert("vs-15",
10532 +                      WITH_DATA(coord->node, node_is_empty(coord->node)));
10533 +               assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10534 +               assert("vs-17", coord->node == h->active_lh->node);
10535 +               *outcome = LOOKUP_DONE;
10536 +               h->result = CBK_COORD_NOTFOUND;
10537 +               return 1;
10538 +       } else if (result == 1) {
10539 +               /*
10540 +                * this is special case mentioned in the comment on
10541 +                * tree.h:cbk_flags. We have found internal item immediately on
10542 +                * the right of extent, and we are going to insert new item
10543 +                * there. Key of item we are going to insert is smaller than
10544 +                * leftmost key in the node pointed to by said internal item
10545 +                * (otherwise search wouldn't come to the extent in the first
10546 +                * place).
10547 +                *
10548 +                * This is a result of extents being located at the twig
10549 +                * level. For explanation, see comment just above
10550 +                * is_next_item_internal().
10551 +                */
10552 +               h->flags &= ~CBK_TRUST_DK;
10553 +       } else {
10554 +               assert("vs-8", result == 2);
10555 +               *outcome = LOOKUP_REST;
10556 +               return 1;
10557 +       }
10558 +       assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10559 +       return 0;
10560 +}
10561 +
10562 +/*
10563 + * Local variables:
10564 + * c-indentation-style: "K&R"
10565 + * mode-name: "LC"
10566 + * c-basic-offset: 8
10567 + * tab-width: 8
10568 + * fill-column: 120
10569 + * scroll-step: 1
10570 + * End:
10571 + */
10572 diff --git a/fs/reiser4/estimate.c b/fs/reiser4/estimate.c
10573 new file mode 100644
10574 index 0000000..39cd3b0
10575 --- /dev/null
10576 +++ b/fs/reiser4/estimate.c
10577 @@ -0,0 +1,111 @@
10578 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10579 +
10580 +#include "debug.h"
10581 +#include "dformat.h"
10582 +#include "tree.h"
10583 +#include "carry.h"
10584 +#include "inode.h"
10585 +#include "plugin/cluster.h"
10586 +#include "plugin/item/ctail.h"
10587 +
10588 +/* Upper bound on how many nodes may get dirty or be newly allocated when @childen nodes are dirtied.
10589 +
10590 +   Internal nodes that get dirty or allocated are estimated as about 10% of the children (103 / 1024) plus one
10591 +   balancing. One balancing is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes +
10592 +   the current (or 1 neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and 1 for a
10593 +   new root. So 5 for leaf level, 3 for twig level, 2 on upper + 1 for root.
10594 +
10595 +   The current node of the lowest level is not counted here - this is overhead only.
10596 +
10597 +   @childen is almost always 1 here. Exception is flow insertion
10598 +*/
10599 +static reiser4_block_nr
10600 +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
10601 +{
10602 +       /* With many balancings at a time the tree can grow by more than one level, so trees lower than 5 levels are
10603 +          accounted as if they were 5 levels high. */
10604 +       tree_level levels = tree_height < 5 ? 5 : tree_height;
10605 +       /* 103 / 1024: roughly one tenth of the dirtied children */
10606 +       reiser4_block_nr share = (103 * childen) >> 10;
10607 +
10608 +       return levels * 2 + (4 + share);
10609 +}
10610 +
10611 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10612 +   perform insertion of one item into the tree */
10613 +/* it is only called when tree height changes, or gets initialized */
10614 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10615 +{
10616 +       return 1 + max_balance_overhead(1, height);
10617 +}
10618 +
10619 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10620 +{
10621 +       return tree->estimate_one_insert;
10622 +}
10623 +
10624 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10625 +   perform insertion of one unit into an item in the tree */
10626 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10627 +{
10628 +       /* estimate insert into item just like item insertion */
10629 +       return tree->estimate_one_insert;
10630 +}
10631 +
10632 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10633 +{
10634 +       /* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the
10635 +          leaf level; the value returned is the same one used for a single item insertion */
10636 +       return tree->estimate_one_insert;
10637 +}
10638 +
10639 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10640 +   both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10641 +   levels */
10642 +reiser4_block_nr estimate_insert_flow(tree_level height)
10643 +{
10644 +       return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10645 +                                                                    CARRY_FLOW_NEW_NODES_LIMIT,
10646 +                                                                    height);
10647 +}
10648 +
10649 +/* returns the max number of nodes that can be occupied by a disk cluster */
10650 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10651 +{
10652 +       int per_cluster;
10653 +       per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10654 +       return 3 + per_cluster +
10655 +               max_balance_overhead(3 + per_cluster,
10656 +                                    REISER4_MAX_ZTREE_HEIGHT);
10657 +}
10658 +
10659 +/* how many nodes might get dirty and added
10660 +   during insertion of a disk cluster */
10661 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10662 +{
10663 +       return estimate_cluster(inode, 1); /* 24 */
10664 +}
10665 +
10666 +/* how many nodes might get dirty and added
10667 +   during update of a (prepped or unprepped) disk cluster */
10668 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10669 +{
10670 +       return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10671 +}
10672 +
10673 +/* how many nodes occupied by a disk cluster might get dirty */
10674 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10675 +{
10676 +       return 2 + cluster_nrpages(inode);
10677 +}
10678 +
10679 +/* Make Linus happy.
10680 +   Local variables:
10681 +   c-indentation-style: "K&R"
10682 +   mode-name: "LC"
10683 +   c-basic-offset: 8
10684 +   tab-width: 8
10685 +   fill-column: 120
10686 +   scroll-step: 1
10687 +   End:
10688 +*/
10689 diff --git a/fs/reiser4/export_ops.c b/fs/reiser4/export_ops.c
10690 new file mode 100644
10691 index 0000000..4fcc85b
10692 --- /dev/null
10693 +++ b/fs/reiser4/export_ops.c
10694 @@ -0,0 +1,296 @@
10695 +/* Copyright 2005 by Hans Reiser, licensing governed by
10696 + * reiser4/README */
10697 +
10698 +#include "inode.h"
10699 +#include "plugin/plugin.h"
10700 +
10701 +
10702 +/*
10703 + * Supported file-handle types
10704 + */
10705 +typedef enum {
10706 +       FH_WITH_PARENT = 0x10,  /* file handle with parent */
10707 +       FH_WITHOUT_PARENT = 0x11        /* file handle without parent */
10708 +} reiser4_fhtype;
10709 +
10710 +#define NFSERROR (255)
10711 +
10712 +/* initialize place-holder for object */
10713 +static void object_on_wire_init(reiser4_object_on_wire *o)
10714 +{
10715 +       o->plugin = NULL;
10716 +}
10717 +
10718 +/* finish with @o */
10719 +static void object_on_wire_done(reiser4_object_on_wire *o)
10720 +{
10721 +       if (o->plugin != NULL)
10722 +               o->plugin->wire.done(o);
10723 +}
10724 +
10725 +/*
10726 + * read serialized object identity from @addr and store information about
10727 + * object in @obj. This is dual to encode_inode().
10728 + */
10729 +static char *decode_inode(struct super_block *s, char *addr,
10730 +                         reiser4_object_on_wire * obj)
10731 +{
10732 +       file_plugin *fplug;
10733 +
10734 +       /* identifier of object plugin is stored in the first two bytes,
10735 +        * followed by... */
10736 +       fplug = file_plugin_by_disk_id(get_tree(s), (d16 *) addr);
10737 +       if (fplug != NULL) {
10738 +               addr += sizeof(d16);
10739 +               obj->plugin = fplug;
10740 +               assert("nikita-3520", fplug->wire.read != NULL);
10741 +               /* plugin specific encoding of object identity. */
10742 +               addr = fplug->wire.read(addr, obj);
10743 +       } else
10744 +               addr = ERR_PTR(RETERR(-EINVAL));
10745 +       return addr;
10746 +}
10747 +
10748 +/**
10749 + * reiser4_decode_fh - decode_fh of export operations
10750 + * @super: super block
10751 + * @fh: nfsd file handle
10752 + * @len: length of file handle
10753 + * @fhtype: type of file handle
10754 + * @acceptable: acceptability testing function
10755 + * @context: argument for @acceptable
10756 + *
10757 + * Returns dentry referring to the same file as @fh.
10758 + */
10759 +static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10760 +                                       int len, int fhtype,
10761 +                                       int (*acceptable) (void *context,
10762 +                                                          struct dentry *de),
10763 +                                       void *context)
10764 +{
10765 +       reiser4_context *ctx;
10766 +       reiser4_object_on_wire object;
10767 +       reiser4_object_on_wire parent;
10768 +       char *addr;
10769 +       int with_parent;
10770 +
10771 +       ctx = init_context(super);
10772 +       if (IS_ERR(ctx))
10773 +               return (struct dentry *)ctx;
10774 +
10775 +       assert("vs-1482",
10776 +              fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10777 +
10778 +       with_parent = (fhtype == FH_WITH_PARENT);
10779 +
10780 +       addr = (char *)fh;
10781 +
10782 +       object_on_wire_init(&object);
10783 +       object_on_wire_init(&parent);
10784 +
10785 +       addr = decode_inode(super, addr, &object);
10786 +       if (!IS_ERR(addr)) {
10787 +               if (with_parent)
10788 +                       addr = decode_inode(super, addr, &parent);
10789 +               if (!IS_ERR(addr)) {
10790 +                       struct dentry *d;
10791 +                       typeof(super->s_export_op->find_exported_dentry) fn;
10792 +
10793 +                       fn = super->s_export_op->find_exported_dentry;
10794 +                       assert("nikita-3521", fn != NULL);
10795 +                       d = fn(super, &object, with_parent ? &parent : NULL,
10796 +                              acceptable, context);
10797 +                       if (d != NULL && !IS_ERR(d))
10798 +                               /* FIXME check for -ENOMEM */
10799 +                               reiser4_get_dentry_fsdata(d)->stateless = 1;
10800 +                       addr = (char *)d;
10801 +               }
10802 +       }
10803 +
10804 +       object_on_wire_done(&object);
10805 +       object_on_wire_done(&parent);
10806 +
10807 +       reiser4_exit_context(ctx);
10808 +       return (void *)addr;
10809 +}
10810 +
10811 +/*
10812 + * Object serialization support.
10813 + *
10814 + * To support knfsd file system provides export_operations that are used to
10815 + * construct and interpret NFS file handles. As a generalization of this,
10816 + * reiser4 object plugins have serialization support: it provides methods to
10817 + * create on-wire representation of identity of reiser4 object, and
10818 + * re-create/locate object given its on-wire identity.
10819 + *
10820 + */
10821 +
10822 +/*
10823 + * return number of bytes that on-wire representation of @inode's identity
10824 + * consumes.
10825 + */
10826 +static int encode_inode_size(struct inode *inode)
10827 +{
10828 +       assert("nikita-3514", inode != NULL);
10829 +       assert("nikita-3515", inode_file_plugin(inode) != NULL);
10830 +       assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10831 +
10832 +       return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10833 +}
10834 +
10835 +/*
10836 + * store on-wire representation of @inode's identity at the area beginning at
10837 + * @start.
10838 + */
10839 +static char *encode_inode(struct inode *inode, char *start)
10840 +{
10841 +       assert("nikita-3517", inode != NULL);
10842 +       assert("nikita-3518", inode_file_plugin(inode) != NULL);
10843 +       assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10844 +
10845 +       /*
10846 +        * first, store two-byte identifier of object plugin, then
10847 +        */
10848 +       save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10849 +                      (d16 *) start);
10850 +       start += sizeof(d16);
10851 +       /*
10852 +        * call plugin to serialize object's identity
10853 +        */
10854 +       return inode_file_plugin(inode)->wire.write(inode, start);
10855 +}
10856 +
10857 +/* Returns the type of the file handle and stores in @lenp the number of 32 bit
10858 + * words it occupies. Every failure is reported as NFSERROR (255), not -errno */
10859 +/**
10860 + * reiser4_encode_fh - encode_fh of export operations
10861 + * @dentry: dentry of the object to be serialized
10862 + * @fh: buffer the file handle is written into
10863 + * @lenp: in: room in @fh, in 32 bit words; out: number of words used
10864 + * @need_parent: if not zero, identity of the parent is stored as well
10865 + *
10866 + */
10867 +static int
10868 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10869 +                 int need_parent)
10870 +{
10871 +       struct inode *inode;
10872 +       struct inode *parent;
10873 +       char *addr;
10874 +       int need;
10875 +       int delta;
10876 +       int result;
10877 +       reiser4_context *ctx;
10878 +
10879 +       /*
10880 +        * knfsd asks us to serialize object in @dentry, and, optionally its
10881 +        * parent (if need_parent != 0).
10882 +        *
10883 +        * encode_inode() and encode_inode_size() are used to build
10884 +        * representation of object and its parent. All hard work is done by
10885 +        * object plugins.
10886 +        */
10887 +       inode = dentry->d_inode;
10888 +       parent = dentry->d_parent->d_inode;
10889 +
10890 +       addr = (char *)fh;
10891 +
10892 +       need = encode_inode_size(inode);
10893 +       if (need < 0)
10894 +               return NFSERROR;
10895 +       if (need_parent) {
10896 +               delta = encode_inode_size(parent);
10897 +               if (delta < 0)
10898 +                       return NFSERROR;
10899 +               need += delta;
10900 +       }
10901 +
10902 +       ctx = init_context(dentry->d_inode->i_sb);
10903 +       if (IS_ERR(ctx))
10904 +               return NFSERROR; /* result is a handle type, so 255, not -errno */
10905 +
10906 +       if (need <= sizeof(__u32) * (*lenp)) {
10907 +               addr = encode_inode(inode, addr);
10908 +               if (need_parent)
10909 +                       addr = encode_inode(parent, addr);
10910 +
10911 +               /* store in lenp number of 32bit words required for file
10912 +                * handle. */
10913 +               *lenp = (need + sizeof(__u32) - 1) >> 2;
10914 +               result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10915 +       } else
10916 +               /* not enough space in file handle */
10917 +               result = NFSERROR;
10918 +       reiser4_exit_context(ctx);
10919 +       return result;
10920 +}
10921 +
10922 +/**
10923 + * reiser4_get_dentry_parent - get_parent of export operations
10924 + * @child: dentry to find the parent of; -ENOTDIR if its inode has no dir plugin
10925 + *
10926 + */
10927 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10928 +{
10929 +       struct inode *dir;
10930 +       dir_plugin *dplug;
10931 +
10932 +       assert("nikita-3527", child != NULL);
10933 +       /* see comment in reiser4_get_dentry() about following assertion */
10934 +       assert("nikita-3528", is_in_reiser4_context());
10935 +
10936 +       dir = child->d_inode;
10937 +       assert("nikita-3529", dir != NULL);
10938 +       dplug = inode_dir_plugin(dir);
10939 +       assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10940 +       if (dplug != NULL)
10941 +               return dplug->get_parent(dir);
10942 +       else
10943 +               return ERR_PTR(RETERR(-ENOTDIR));
10944 +}
10945 +
10946 +/**
10947 + * reiser4_get_dentry - get_dentry of export operations
10948 + * @super: super block, passed through to the plugin's wire.get method
10949 + * @data: on-wire object description, a reiser4_object_on_wire passed as void *
10950 + *
10951 + * Hands @data to the wire.get method of the plugin recorded in it.
10952 + */
10953 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10954 +{
10955 +       reiser4_object_on_wire *o;
10956 +
10957 +       assert("nikita-3522", super != NULL);
10958 +       assert("nikita-3523", data != NULL);
10959 +       /*
10960 +        * this is only supposed to be called by
10961 +        *
10962 +        *     reiser4_decode_fh->find_exported_dentry
10963 +        *
10964 +        * so, reiser4_context should be here already.
10965 +        */
10966 +       assert("nikita-3526", is_in_reiser4_context());
10967 +
10968 +       o = (reiser4_object_on_wire *)data;
10969 +       assert("nikita-3524", o->plugin != NULL);
10970 +       assert("nikita-3525", o->plugin->wire.get != NULL);
10971 +
10972 +       return o->plugin->wire.get(super, o);
10973 +}
10974 +
10975 +struct export_operations reiser4_export_operations = {
10976 +       .encode_fh = reiser4_encode_fh,
10977 +       .decode_fh = reiser4_decode_fh,
10978 +       .get_parent = reiser4_get_dentry_parent,
10979 +       .get_dentry = reiser4_get_dentry
10980 +};
10981 +
10982 +/*
10983 + * Local variables:
10984 + * c-indentation-style: "K&R"
10985 + * mode-name: "LC"
10986 + * c-basic-offset: 8
10987 + * tab-width: 8
10988 + * fill-column: 79
10989 + * End:
10990 + */
10991 diff --git a/fs/reiser4/flush.c b/fs/reiser4/flush.c
10992 new file mode 100644
10993 index 0000000..1fd8208
10994 --- /dev/null
10995 +++ b/fs/reiser4/flush.c
10996 @@ -0,0 +1,3626 @@
10997 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10998 +
10999 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
11000 +
11001 +#include "forward.h"
11002 +#include "debug.h"
11003 +#include "dformat.h"
11004 +#include "key.h"
11005 +#include "coord.h"
11006 +#include "plugin/item/item.h"
11007 +#include "plugin/plugin.h"
11008 +#include "plugin/object.h"
11009 +#include "txnmgr.h"
11010 +#include "jnode.h"
11011 +#include "znode.h"
11012 +#include "block_alloc.h"
11013 +#include "tree_walk.h"
11014 +#include "carry.h"
11015 +#include "tree.h"
11016 +#include "vfs_ops.h"
11017 +#include "inode.h"
11018 +#include "page_cache.h"
11019 +#include "wander.h"
11020 +#include "super.h"
11021 +#include "entd.h"
11022 +#include "reiser4.h"
11023 +#include "flush.h"
11024 +#include "writeout.h"
11025 +
11026 +#include <asm/atomic.h>
11027 +#include <linux/fs.h>          /* for struct super_block  */
11028 +#include <linux/mm.h>          /* for struct page */
11029 +#include <linux/bio.h>         /* for struct bio */
11030 +#include <linux/pagemap.h>
11031 +#include <linux/blkdev.h>
11032 +
11033 +/* IMPLEMENTATION NOTES */
11034 +
11035 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
11036 +   order to the nodes of the tree in which the parent is placed before its children, which
11037 +   are ordered (recursively) in left-to-right order.  When we speak of a "parent-first preceder", it
11038 +   describes the node that "came before in forward parent-first order".  When we speak of a
11039 +   "parent-first follower", it describes the node that "comes next in parent-first
11040 +   order" (alternatively the node that "came before in reverse parent-first order").
11041 +
11042 +   The following pseudo-code prints the nodes of a tree in forward parent-first order:
11043 +
11044 +   void parent_first (node)
11045 +   {
11046 +     print_node (node);
11047 +     if (node->level > leaf) {
11048 +       for (i = 0; i < num_children; i += 1) {
11049 +         parent_first (node->child[i]);
11050 +       }
11051 +     }
11052 +   }
11053 +*/
11054 +
11055 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block allocation so
11056 +   that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
11057 +   can be accomplished with sequential reads, which results in reading nodes in their
11058 +   parent-first order.  This is a read-optimization aspect of the flush algorithm, and
11059 +   there is also a write-optimization aspect, which is that we wish to make large
11060 +   sequential writes to the disk by allocating or reallocating blocks so that they can be
11061 +   written in sequence.  Sometimes the read-optimization and write-optimization goals
11062 +   conflict with each other, as we discuss in more detail below.
11063 +*/
11064 +
11065 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers.  Here are
11066 +   the relevant jnode->state bits and their relevance to flush:
11067 +
11068 +     JNODE_DIRTY: If a node is dirty, it must be flushed.  But in order to be written it
11069 +     must be allocated first.  In order to be considered allocated, the jnode must have
11070 +     exactly one of { JNODE_OVRWR, JNODE_RELOC } set.  These two bits are exclusive, and
11071 +     all dirtied jnodes eventually have one of these bits set during each transaction.
11072 +
11073 +     JNODE_CREATED: The node was freshly created in its transaction and has no previous
11074 +     block address, so it is unconditionally assigned to be relocated, although this is
11075 +     mainly for code-convenience.  It is not being 'relocated' from anything, but in
11076 +     almost every regard it is treated as part of the relocate set.  The JNODE_CREATED bit
11077 +     remains set even after JNODE_RELOC is set, so the actual relocate can be
11078 +     distinguished from the created-and-allocated set easily: relocate-set members
11079 +     (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
11080 +     have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
11081 +
11082 +     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
11083 +     decision to maintain the pre-existing location for this node and it will be written
11084 +     to the wandered-log.
11085 +
11086 +     JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
11087 +     not created, see note above).  A block with JNODE_RELOC set is eligible for
11088 +     early-flushing and may be submitted during flush_empty_queues.  When the JNODE_RELOC
11089 +     bit is set on a znode, the parent node's internal item is modified and the znode is
11090 +     rehashed.
11091 +
11092 +     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
11093 +     and calls plugin->f.squeeze() method for its items. By this technology we update disk
11094 +     clusters of cryptcompress objects. Also if leftmost point that was found by flush scan
11095 +     has this flag (races with write(), rare case) the flush algorithm makes the decision
11096 +     to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
11097 +     repeated allocation.
11098 +
11099 +     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
11100 +     flush queue.  This means the jnode is not on any clean or dirty list, instead it is
11101 +     moved to one of the flush queue (see flush_queue.h) object private list. This
11102 +     prevents multiple concurrent flushes from attempting to start flushing from the
11103 +     same node.
11104 +
11105 +     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
11106 +     squeeze-and-allocate on a node while its children are actively being squeezed and
11107 +     allocated.  This flag was created to avoid submitting a write request for a node
11108 +     while its children are still being allocated and squeezed. Then flush queue was
11109 +     re-implemented to allow unlimited number of nodes be queued. This flag support was
11110 +     commented out in source code because we decided that there was no reason to submit
11111 +     queued nodes before jnode_flush() finishes.  However, current code calls fq_write()
11112 +     during a slum traversal and may submit "busy nodes" to disk. Probably we can
11113 +     re-enable the JNODE_FLUSH_BUSY bit support in future.
11114 +
11115 +   With these state bits, we describe a test used frequently in the code below,
11116 +   jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).  The
11117 +   test for "flushprepped" returns true if any of the following are true:
11118 +
11119 +     - The node is not dirty
11120 +     - The node has JNODE_RELOC set
11121 +     - The node has JNODE_OVRWR set
11122 +
11123 +   If either the node is not dirty or it has already been processed by flush (and assigned
11124 +   JNODE_OVRWR or JNODE_RELOC), then it is prepped.  If jnode_is_flushprepped() returns
11125 +   false then flush has work to do on that node.
11126 +*/
11127 +
11128 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
11129 +   flushprepped twice (unless an explicit call to flush_unprep is made as described in
11130 +   detail below).  For example a node is dirtied, allocated, and then early-flushed to
11131 +   disk and set clean.  Before the transaction commits, the page is dirtied again and, due
11132 +   to memory pressure, the node is flushed again.  The flush algorithm will not relocate
11133 +   the node to a new disk location, it will simply write it to the same, previously
11134 +   relocated position again.
11135 +*/
11136 +
11137 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
11138 +   start at a leaf node and allocate in parent-first order by iterating to the right.  At
11139 +   each step of the iteration, we check for the right neighbor.  Before advancing to the
11140 +   right neighbor, we check if the current position and the right neighbor share the same
11141 +   parent.  If they do not share the same parent, the parent is allocated before the right
11142 +   neighbor.
11143 +
11144 +   This process goes recursively up the tree and squeeze nodes level by level as long as
11145 +   the right neighbor and the current position have different parents, then it allocates
11146 +   the right-neighbors-with-different-parents on the way back down.  This process is
11147 +   described in more detail in flush_squalloc_changed_ancestor and the recursive function
11148 +   squalloc_one_changed_ancestor.  But the purpose here is not to discuss the
11149 +   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
11150 +   approaches.
11151 +
11152 +   The top-down algorithm was implemented earlier (April-May 2002).  In the top-down
11153 +   approach, we find a starting point by scanning left along each level past dirty nodes,
11154 +   then going up and repeating the process until the left node and the parent node are
11155 +   clean.  We then perform a parent-first traversal from the starting point, which makes
11156 +   allocating in parent-first order trivial.  After one subtree has been allocated in this
11157 +   manner, we move to the right, try moving upward, then repeat the parent-first
11158 +   traversal.
11159 +
11160 +   Both approaches have problems that need to be addressed.  Both are approximately the
11161 +   same amount of code, but the bottom-up approach has advantages in the order it acquires
11162 +   locks which, at the very least, make it the better approach.  At first glance each one
11163 +   makes the other one look simpler, so it is important to remember a few of the problems
11164 +   with each one.
11165 +
11166 +   Main problem with the top-down approach: When you encounter a clean child during the
11167 +   parent-first traversal, what do you do?  You would like to avoid searching through a
11168 +   large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
11169 +   obvious solution.  One of the advantages of the top-down approach is that during the
11170 +   parent-first traversal you check every child of a parent to see if it is dirty.  In
11171 +   this way, the top-down approach easily handles the main problem of the bottom-up
11172 +   approach: unallocated children.
11173 +
11174 +   The unallocated children problem is that before writing a node to disk we must make
11175 +   sure that all of its children are allocated.  Otherwise, writing the node means
11176 +   extra I/O because the node will have to be written again when the child is finally
11177 +   allocated.
11178 +
11179 +   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM.  Except for bugs, this
11180 +   should not cause any file system corruption, it only degrades I/O performance because a
11181 +   node may be written when it is sure to be written at least one more time in the same
11182 +   transaction when the remaining children are allocated.  What follows is a description
11183 +   of how we will solve the problem.
11184 +*/
11185 +
11186 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
11187 +   proceeding in parent first order, allocate some of its left-children, then encounter a
11188 +   clean child in the middle of the parent.  We do not allocate the clean child, but there
11189 +   may remain unallocated (dirty) children to the right of the clean child.  If we were to
11190 +   stop flushing at this moment and write everything to disk, the parent might still
11191 +   contain unallocated children.
11192 +
11193 +   We could try to allocate all the descendents of every node that we allocate, but this
11194 +   is not necessary.  Doing so could result in allocating the entire tree: if the root
11195 +   node is allocated then every unallocated node would have to be allocated before
11196 +   flushing.  Actually, we do not have to write a node just because we allocate it.  It is
11197 +   possible to allocate but not write a node during flush, when it still has unallocated
11198 +   children.  However, this approach is probably not optimal for the following reason.
11199 +
11200 +   The flush algorithm is designed to allocate nodes in parent-first order in an attempt
11201 +   to optimize reads that occur in the same order.  Thus we are read-optimizing for a
11202 +   left-to-right scan through all the leaves in the system, and we are hoping to
11203 +   write-optimize at the same time because those nodes will be written together in batch.
11204 +   What happens, however, if we assign a block number to a node in its read-optimized
11205 +   order but then avoid writing it because it has unallocated children?  In that
11206 +   situation, we lose out on the write-optimization aspect because a node will have to be
11207 +   written again to its location on the device, later, which likely means seeking back
11208 +   to that location.
11209 +
11210 +   So there are tradeoffs. We can choose either:
11211 +
11212 +   A. Allocate all unallocated children to preserve both write-optimization and
11213 +   read-optimization, but this is not always desirable because it may mean having to
11214 +   allocate and flush very many nodes at once.
11215 +
11216 +   B. Defer writing nodes with unallocated children, keep their read-optimized locations,
11217 +   but sacrifice write-optimization because those nodes will be written again.
11218 +
11219 +   C. Defer writing nodes with unallocated children, but do not keep their read-optimized
11220 +   locations.  Instead, choose to write-optimize them later, when they are written.  To
11221 +   facilitate this, we "undo" the read-optimized allocation that was given to the node so
11222 +   that later it can be write-optimized, thus "unpreparing" the flush decision.  This is a
11223 +   case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above.  By a
11224 +   call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
11225 +   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
11226 +   location, and set the JNODE_CREATED bit, effectively setting the node back to an
11227 +   unallocated state.
11228 +
11229 +   We will take the following approach in v4.0: for twig nodes we will always finish
11230 +   allocating unallocated children (A).  For nodes with (level > TWIG) we will defer
11231 +   writing and choose write-optimization (C).
11232 +
11233 +   To summarize, there are several parts to a solution that avoids the problem with
11234 +   unallocated children:
11235 +
11236 +   FIXME-ZAM: No approach has been implemented yet to eliminate the "UNALLOCATED CHILDREN"
11237 +   problem, because an experiment showed that we have only 1-2 nodes with unallocated
11238 +   children per thousands of written nodes.  The experiment was as simple as
11239 +   copying / deleting the linux kernel sources.  However, the problem can arise in more
11240 +   complex tests.  I think we have jnode_io_hook to insert a check for unallocated
11241 +   children and see what kind of problem we have.
11242 +
11243 +   1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
11244 +   squeeze-and-allocate on any remaining unallocated children.  FIXME: Difficulty to
11245 +   implement: should be simple -- amounts to adding a while loop to jnode_flush, see
11246 +   comments in that function.
11247 +
11248 +   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
11249 +   have unallocated children.  If the twig level has unallocated children it is an
11250 +   assertion failure.  If a higher-level node has unallocated children, then it should be
11251 +   explicitly de-allocated by a call to flush_unprep().  FIXME: Difficulty to implement:
11252 +   should be simple.
11253 +
11254 +   3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
11255 +   CPU cycles than we would like, and it is possible (but medium complexity) to optimize
11256 +   this somewhat in the case where large sub-trees are flushed.  The following observation
11257 +   helps: if both the left- and right-neighbor of a node are processed by the flush
11258 +   algorithm then the node itself is guaranteed to have all of its children allocated.
11259 +   However, the cost of this check may not be so expensive after all: it is not needed for
11260 +   leaves and flush can guarantee this property for twigs.  That leaves only (level >
11261 +   TWIG) nodes that have to be checked, so this optimization only helps if at least three
11262 +   (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
11263 +   there are many more (level > TWIG) nodes.  But if there are many (level > TWIG) nodes
11264 +   then the number of blocks being written will be very large, so the savings may be
11265 +   insignificant.  That said, the idea is to maintain both the left and right edges of
11266 +   nodes that are processed in flush.  When flush_empty_queue() is called, a relatively
11267 +   simple test will tell whether the (level > TWIG) node is on the edge.  If it is on the
11268 +   edge, the slow check is necessary, but if it is in the interior then it can be assumed
11269 +   to have all of its children allocated.  FIXME: medium complexity to implement, but
11270 +   simple to verify given that we must have a slow check anyway.
11271 +
11272 +   4. (Optional) This part is optional, not for v4.0--flush should work independently of
11273 +   whether this option is used or not.  Called RAPID_SCAN, the idea is to amend the
11274 +   left-scan operation to take unallocated children into account.  Normally, the left-scan
11275 +   operation goes left as long as adjacent nodes are dirty up until some large maximum
11276 +   value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing.  But scan-left
11277 +   may stop at a position where there are unallocated children to the left with the same
11278 +   parent.  When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
11279 +   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
11280 +   with a rapid scan.  The rapid scan skips all the interior children of a node--if the
11281 +   leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
11282 +   twig to the left).  If the left neighbor of the leftmost child is also dirty, then
11283 +   continue the scan at the left twig and repeat.  This option will cause flush to
11284 +   allocate more twigs in a single pass, but it also has the potential to write many more
11285 +   nodes than would otherwise be written without the RAPID_SCAN option.  RAPID_SCAN
11286 +   was partially implemented, code removed August 12, 2002 by JMACD.
11287 +*/
11288 +
11289 +/* FLUSH CALLED ON NON-LEAF LEVEL.  Most of our design considerations assume that the
11290 +   starting point for flush is a leaf node, but actually the flush code cares very little
11291 +   about whether or not this is true.  It is possible that all the leaf nodes are flushed
11292 +   and dirty parent nodes still remain, in which case jnode_flush() is called on a
11293 +   non-leaf argument.  Flush doesn't care--it treats the argument node as if it were a
11294 +   leaf, even when it is not.  This is a simple approach, and there may be a more optimal
11295 +   policy but until a problem with this approach is discovered, simplest is probably best.
11296 +
11297 +   NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
11298 +   the leaves.  This is done as a matter of simplicity and there is only one (shaky)
11299 +   justification.  When an atom commits, it flushes all leaf level nodes first, followed
11300 +   by twigs, and so on.  With flushing done in this order, if flush is eventually called
11301 +   on a non-leaf node it means that (somehow) we reached a point where all leaves are
11302 +   clean and only internal nodes need to be flushed.  If that is the case, then it means
11303 +   there were no leaves that were the parent-first preceder/follower of the parent.  This
11304 +   is expected to be a rare case, which is why we do nothing special about it.  However,
11305 +   memory pressure may pass an internal node to flush when there are still dirty leaf
11306 +   nodes that need to be flushed, which could prove our original assumptions
11307 +   "inoperative".  If this needs to be fixed, then scan_left/right should have
11308 +   special checks for the non-leaf levels.  For example, instead of passing from a node to
11309 +   the left neighbor, it should pass from the node to the left neighbor's rightmost
11310 +   descendent (if dirty).
11311 +
11312 +*/
11313 +
11314 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING.  We walk the tree in 4MB-16MB chunks, dirtying everything and putting
11315 +   it into a transaction.  We tell the allocator to allocate the blocks as far as possible towards one end of the
11316 +   logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
11317 +   device if we are walking from right to left.  We then make passes in alternating directions, and as we do this the
11318 +   device becomes sorted such that tree order and block number order fully correlate.
11319 +
11320 +   Resizing is done by shifting everything either all the way to the left or all the way
11321 +   to the right, and then reporting the last block.
11322 +*/
11323 +
11324 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.  This
11325 +   describes the policy from the highest level:
11326 +
11327 +   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
11328 +   leaf level during flush-scan (right, left), then we unconditionally decide to relocate
11329 +   leaf nodes.
11330 +
11331 +   Otherwise, there are two contexts in which we make a decision to relocate:
11332 +
11333 +   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
11334 +   During the initial stages of flush, after scan-right completes, we want to ask the
11335 +   question: should we relocate this leaf node and thus dirty the parent node.  Then if
11336 +   the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
11337 +   the question at the next level up, and so on.  In these cases we are moving in the
11338 +   reverse-parent first direction.
11339 +
11340 +   There is another case which is considered the reverse direction, which comes at the end
11341 +   of a twig in reverse_relocate_end_of_twig().  As we finish processing a twig we may
11342 +   reach a point where there is a clean twig to the right with a dirty leftmost child.  In
11343 +   this case, we may wish to relocate the child by testing if it should be relocated
11344 +   relative to its parent.
11345 +
11346 +   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
11347 +   allocate_znode.  What distinguishes the forward parent-first case from the
11348 +   reverse-parent first case is that the preceder has already been allocated in the
11349 +   forward case, whereas in the reverse case we don't know what the preceder is until we
11350 +   finish "going in reverse".  That simplifies the forward case considerably, and there we
11351 +   actually use the block allocator to determine whether, e.g., a block closer to the
11352 +   preceder is available.
11353 +*/
11354 +
11355 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration.  The idea is, once we
11356 +   finish scan-left and find a starting point, if the parent's left neighbor is dirty then
11357 +   squeeze the parent's left neighbor and the parent.  This may change the
11358 +   flush-starting-node's parent.  Repeat until the child's parent is stable.  If the child
11359 +   is a leftmost child, repeat this left-edge squeezing operation at the next level up.
11360 +   Note that we cannot allocate extents during this or they will be out of parent-first
11361 +   order.  There are also some difficult coordinate maintenance issues.  We can't do a tree
11362 +   search to find coordinates again (because we hold locks), we have to determine them
11363 +   from the two nodes being squeezed.  Looks difficult, but has potential to increase
11364 +   space utilization. */
11365 +
11366 +/* Flush-scan helper functions. */
11367 +static void scan_init(flush_scan * scan);
11368 +static void scan_done(flush_scan * scan);
11369 +
11370 +/* Flush-scan algorithm. */
11371 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
11372 +                    unsigned limit);
11373 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
11374 +static int scan_common(flush_scan * scan, flush_scan * other);
11375 +static int scan_formatted(flush_scan * scan);
11376 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
11377 +static int scan_by_coord(flush_scan * scan);
11378 +
11379 +/* Initial flush-point ancestor allocation. */
11380 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
11381 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
11382 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
11383 +
11384 +/* Main flush algorithm.  Note on abbreviation: "squeeze and allocate" == "squalloc". */
11385 +static int squalloc(flush_pos_t * pos);
11386 +
11387 +/* Flush squeeze implementation. */
11388 +static int squeeze_right_non_twig(znode * left, znode * right);
11389 +static int shift_one_internal_unit(znode * left, znode * right);
11390 +
11391 +/* Flush reverse parent-first relocation routines. */
11392 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11393 +                                           const reiser4_block_nr * nblk);
11394 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11395 +                                flush_pos_t * pos);
11396 +static int reverse_relocate_check_dirty_parent(jnode * node,
11397 +                                              const coord_t * parent_coord,
11398 +                                              flush_pos_t * pos);
11399 +
11400 +/* Flush allocate write-queueing functions: */
11401 +static int allocate_znode(znode * node, const coord_t * parent_coord,
11402 +                         flush_pos_t * pos);
11403 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
11404 +                                flush_pos_t * pos);
11405 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11406 +
11407 +/* Flush helper functions: */
11408 +static int jnode_lock_parent_coord(jnode * node,
11409 +                                  coord_t * coord,
11410 +                                  lock_handle * parent_lh,
11411 +                                  load_count * parent_zh,
11412 +                                  znode_lock_mode mode, int try);
11413 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11414 +                           znode_lock_mode mode, int check_dirty);
11415 +static int znode_same_parents(znode * a, znode * b);
11416 +/* znode counterpart of jnode_check_flushprepped(): applies it to ZJNODE(@node). */
11417 +static int znode_check_flushprepped(znode * node)
11418 +{
11419 +       return jnode_check_flushprepped(ZJNODE(node));
11420 +}
11421 +
11422 +/* Flush position functions */
11423 +static void pos_init(flush_pos_t * pos);
11424 +static int pos_valid(flush_pos_t * pos);
11425 +static void pos_done(flush_pos_t * pos);
11426 +static int pos_stop(flush_pos_t * pos);
11427 +
11428 +/* Assert that, when scanning left over an unallocated extent on the twig level, the unformatted
11429 + * scan->node is the first jnode of the extent unit: all its jnodes are dirty and of the same atom. */
11430 +#define checkchild(scan)                                               \
11431 +assert("nikita-3435",                                                  \
11432 +       ergo(scan->direction == LEFT_SIDE &&                            \
11433 +            (scan->parent_coord.node->level == TWIG_LEVEL) &&           \
11434 +           jnode_is_unformatted(scan->node) &&                         \
11435 +           extent_is_unallocated(&scan->parent_coord),                 \
11436 +           extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
11437 +
11438 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
11439 +   useful for debugging.  It is initialized in txnmgr.c out of laziness (because flush has
11440 +   no static initializer function...) */
11441 +ON_DEBUG(atomic_t flush_cnt;
11442 +    )
11443 +
11444 +/* Tell whether the backing device of the current super block is write congested */
11445 +static int check_write_congestion(void)
11446 +{
11447 +       struct super_block *super = reiser4_get_current_sb();
11448 +       struct backing_dev_info *backing;
11449 +
11450 +       /* the device info hangs off the mapping of the inode from get_super_fake() */
11451 +       backing = get_super_fake(super)->i_mapping->backing_dev_info;
11452 +       return bdi_write_congested(backing);
11453 +}
11454 +
11455 +/* Write out the flush queue of @pos, unless writing is disabled or the device is congested */
11456 +static int write_prepped_nodes(flush_pos_t * pos)
11457 +{
11458 +       assert("zam-831", pos);
11459 +       assert("zam-832", pos->fq);
11460 +
11461 +       /* JNODE_FLUSH_WRITE_BLOCKS is not set in pos->flags: nothing is written */
11462 +       if ((pos->flags & JNODE_FLUSH_WRITE_BLOCKS) == 0)
11463 +               return 0;
11464 +
11465 +       /* skip the write while the backing device reports write congestion */
11466 +       if (check_write_congestion() != 0)
11467 +               return 0;
11468 +
11469 +       /* hand the queue over to write_fq() and pass its result to the caller */
11470 +       return write_fq(pos->fq, pos->nr_written,
11471 +                       WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11472 +}
11473 +
11474 +/* Move the flush position to a new locked and loaded node, properly releasing the
11475 +   resources held by @pos for its previous position */
11476 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
11477 +                          load_count * new_load, const coord_t * new_coord)
11478 +{
11479 +       assert("zam-857", new_lock->node == new_load->node);
11480 +
11481 +       if (new_coord == NULL) {
11482 +               coord_init_first_unit(&pos->coord, new_lock->node);
11483 +       } else {
11484 +               assert("zam-858", new_coord->node == new_lock->node);
11485 +               coord_dup(&pos->coord, new_coord);
11486 +       }
11487 +       /* drop the child reference held by @pos, if any */
11488 +       if (pos->child != NULL) {
11489 +               jput(pos->child);
11490 +               pos->child = NULL;
11491 +       }
11492 +       /* adopt the new load count and lock; the lock held so far is released */
11493 +       move_load_count(&pos->load, new_load);
11494 +       done_lh(&pos->lock);
11495 +       move_lh(&pos->lock, new_lock);
11496 +}
11497 +
11498 +/* Delete an empty, write-locked node whose link from the parent still exists. */
11499 +static int delete_empty_node(znode * node)
11500 +{
11501 +       reiser4_key removed_key;        /* passed to delete_node(), never read here */
11502 +
11503 +       assert("zam-1019", node != NULL);
11504 +       assert("zam-1020", node_is_empty(node));
11505 +       assert("zam-1023", znode_is_wlocked(node));
11506 +
11507 +       return delete_node(node, &removed_key, NULL, 1);
11508 +}
11509 +
11510 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc(): @pos is moved to
11511 +   @org if it is a znode, otherwise to the parent coord of @org (which must be an extent).
11512 +   Returns 0 on success, -EAGAIN if flush has to be repeated, or the error of a callee. */
11513 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11514 +{
11515 +       int ret;
11516 +       load_count load;
11517 +       lock_handle lock;
11518 +
11519 +       init_lh(&lock);
11520 +       init_load_count(&load);
11521 +
11522 +       if (jnode_is_znode(org)) {
11523 +               ret = longterm_lock_znode(&lock, JZNODE(org),
11524 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11525 +               if (ret)
11526 +                       return ret;
11527 +
11528 +               ret = incr_load_count_znode(&load, JZNODE(org));
11529 +               if (ret)
11530 +                       goto done;      /* @lock is held: release it on the way out */
11531 +
11532 +               pos->state = (jnode_get_level(org) == LEAF_LEVEL) ?
11533 +                   POS_ON_LEAF : POS_ON_INTERNAL;
11534 +               move_flush_pos(pos, &lock, &load, NULL);
11535 +       } else {
11536 +               coord_t parent_coord;
11537 +               ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11538 +                                             &load, ZNODE_WRITE_LOCK, 0);
11539 +               if (ret)
11540 +                       goto done;
11541 +               if (!item_is_extent(&parent_coord)) {
11542 +                       /* file was converted to tail, org became HB, we found internal item */
11543 +                       ret = -EAGAIN;
11544 +                       goto done;
11545 +               }
11546 +
11547 +               pos->state = POS_ON_EPOINT;
11548 +               move_flush_pos(pos, &lock, &load, &parent_coord);
11549 +               pos->child = jref(org);
11550 +               if (extent_is_unallocated(&parent_coord)
11551 +                   && extent_unit_index(&parent_coord) != index_jnode(org)) {
11552 +                       /* @org is not first child of its parent unit. This may happen
11553 +                          because longterm lock of its parent node was released between
11554 +                          scan_left and scan_right. For now work around this by having flush repeat */
11555 +                       ret = -EAGAIN;
11556 +               }
11557 +       }
11558 +
11559 +      done:
11560 +       done_load_count(&load);
11561 +       done_lh(&lock);
11562 +       return ret;
11563 +}
11564 +
11565 +/* TODO LIST (no particular order): */
11566 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11567 +   indicate which issue they relate to.  There are a few miscellaneous FIXMEs with
11568 +   specific names mentioned instead that need to be inspected/resolved. */
11569 +/* B. There is an issue described in reverse_relocate_test having to do with an
11570 +   imprecise is_preceder? check having to do with partially-dirty extents.  The code that
11571 +   sets preceder hints and computes the preceder is basically untested.  Careful testing
11572 +   needs to be done that preceder calculations are done correctly, since if it doesn't
11573 +   affect correctness we will not catch this stuff during regular testing. */
11574 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling.  It is unclear which of these are
11575 +   considered expected but unlikely conditions.  Flush currently returns 0 (i.e., success
11576 +   but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11577 +   Many of the calls that may produce one of these return values (i.e.,
11578 +   longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11579 +   values themselves and, for instance, stop flushing instead of resulting in a restart.
11580 +   If any of these results are true error conditions then flush will go into a busy-loop,
11581 +   as we noticed during testing when a corrupt tree caused find_child_ptr to return
11582 +   ENOENT.  It needs careful thought and testing of corner conditions.
11583 +*/
11584 +/* D. Atomicity of flush_prep against deletion and flush concurrency.  Suppose a created
11585 +   block is assigned a block number then early-flushed to disk.  It is dirtied again and
11586 +   flush is called again.  Concurrently, that block is deleted, and the de-allocation of
11587 +   its block number does not need to be deferred, since it is not part of the preserve set
11588 +   (i.e., it didn't exist before the transaction).  I think there may be a race condition
11589 +   where flush writes the dirty, created block after the non-deferred deallocated block
11590 +   number is re-allocated, making it possible to write deleted data on top of non-deleted
11591 +   data.  It's just a theory, but it needs to be thought out. */
11592 +/* F. bio_alloc() failure is not handled gracefully. */
11593 +/* G. Unallocated children. */
11594 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11595 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11596 +
11597 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11598 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11599 +   neighborhood is named "slum").  Jnode_flush() is called if reiser4 has to write dirty
11600 +   blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
11601 +   a part of transaction commit.
11602 +
11603 +   Our objective here is to prep and flush the slum the jnode belongs to. We want to
11604 +   squish the slum together, and allocate the nodes in it as we squish because allocation
11605 +   of children affects squishing of parents.
11606 +
11607 +   The "argument" @node tells flush where to start.  From there, flush finds the left edge
11608 +   of the slum, and calls squalloc (in which nodes are squeezed and allocated).  To find a
11609 +   "better place" to start squalloc first we perform a flush_scan.
11610 +
11611 +   Flush-scanning may be performed in both left and right directions, but for different
11612 +   purposes.  When scanning to the left, we are searching for a node that precedes a
11613 +   sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11614 +   During flush-scanning, we also take the opportunity to count the number of consecutive
11615 +   leaf nodes.  If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11616 +   make a decision to reallocate leaf nodes (thus favoring write-optimization).
11617 +
11618 +   Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11619 +   also be dirty nodes to the right of the argument.  If the scan-left operation does not
11620 +   count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11621 +   operation to see whether there are, in fact, enough nodes to meet the relocate
11622 +   threshold.  Each right- and left-scan operation uses a single flush_scan object.
11623 +
11624 +   After left-scan and possibly right-scan, we prepare a flush_position object with the
11625 +   starting flush point or parent coordinate, which was determined using scan-left.
11626 +
11627 +   Next we call the main flush routine, squalloc, which iterates along the
11628 +   leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11629 +
11630 +   After squalloc returns we take extra steps to ensure that all the children
11631 +   of the final twig node are allocated--this involves repeating squalloc
11632 +   until we finish at a twig with no unallocated children.
11633 +
11634 +   Finally, we call flush_empty_queue to submit write-requests to disk.  If we encounter
11635 +   any above-twig nodes during flush_empty_queue that still have unallocated children, we
11636 +   flush_unprep them.
11637 +
11638 +   Flush treats several "failure" cases as non-failures, essentially causing them to start
11639 +   over.  E_DEADLOCK is one example.  FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11640 +   probably be handled properly rather than restarting, but there are a bunch of cases to
11641 +   audit.
11642 +*/
11643 +
11644 +static int
11645 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11646 +           flush_queue_t * fq, int flags)
11647 +{
11648 +       long ret = 0;
11649 +       flush_scan *right_scan;
11650 +       flush_scan *left_scan;
11651 +       flush_pos_t *flush_pos;
11652 +       int todo;
11653 +       struct super_block *sb;
11654 +       reiser4_super_info_data *sbinfo;
11655 +       jnode *leftmost_in_slum = NULL;
11656 +
11657 +       assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11658 +       assert("nikita-3022", schedulable());
11659 +
11660 +       /* lock ordering: delete_sema and flush_sema are unordered */
11661 +       assert("nikita-3185",
11662 +              get_current_super_private()->delete_sema_owner != current);
11663 +
11664 +       /* allocate right_scan, left_scan and flush_pos */
11665 +       right_scan =
11666 +           kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos), get_gfp_mask());
11667 +       if (right_scan == NULL)
11668 +               return RETERR(-ENOMEM);
11669 +       left_scan = right_scan + 1;
11670 +       flush_pos = (flush_pos_t *) (left_scan + 1);
11671 +
11672 +       sb = reiser4_get_current_sb();
11673 +       sbinfo = get_super_private(sb);
11674 +       if (!reiser4_is_set(sb, REISER4_MTFLUSH)) {
11675 +               down(&sbinfo->flush_sema);
11676 +       }
11677 +
11678 +       /* Flush-concurrency debug code */
11679 +#if REISER4_DEBUG
11680 +       atomic_inc(&flush_cnt);
11681 +#endif
11682 +
11683 +       enter_flush(sb);
11684 +
11685 +       /* Initialize a flush position. */
11686 +       pos_init(flush_pos);
11687 +
11688 +       flush_pos->nr_written = nr_written;
11689 +       flush_pos->fq = fq;
11690 +       flush_pos->flags = flags;
11691 +       flush_pos->nr_to_write = nr_to_write;
11692 +
11693 +       scan_init(right_scan);
11694 +       scan_init(left_scan);
11695 +
11696 +       /* First scan left and remember the leftmost scan position.  If the leftmost
11697 +          position is unformatted we remember its parent_coord.  We scan until counting
11698 +          FLUSH_SCAN_MAXNODES.
11699 +
11700 +          If starting @node is unformatted, at the beginning of left scan its
11701 +          parent (twig level node, containing extent item) will be long term
11702 +          locked and lock handle will be stored in the
11703 +          @right_scan->parent_lock. This lock is used to start the rightward
11704 +          scan without redoing the tree traversal (necessary to find parent)
11705 +          and, hence, is kept during leftward scan. As a result, we have to
11706 +          use try-lock when taking long term locks during the leftward scan.
11707 +        */
11708 +       ret = scan_left(left_scan, right_scan,
11709 +                       node, sbinfo->flush.scan_maxnodes);
11710 +       if (ret != 0)
11711 +               goto failed;
11712 +
11713 +       leftmost_in_slum = jref(left_scan->node);
11714 +       scan_done(left_scan);
11715 +
11716 +       /* Then possibly go right to decide if we will use a policy of relocating leaves.
11717 +          This is only done if we did not scan past (and count) enough nodes during the
11718 +          leftward scan.  If we do scan right, we only care to go far enough to establish
11719 +          that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed.  The
11720 +          scan limit is the difference between left_scan.count and the threshold. */
11721 +
11722 +       todo = sbinfo->flush.relocate_threshold - left_scan->count;
11723 +       /* scan right is inherently deadlock prone, because we are
11724 +        * (potentially) holding a lock on the twig node at this moment.
11725 +        * FIXME: this is incorrect comment: lock is not held */
11726 +       if (todo > 0) {
11727 +               ret = scan_right(right_scan, node, (unsigned)todo);
11728 +               if (ret != 0)
11729 +                       goto failed;
11730 +       }
11731 +
11732 +       /* Only the right-scan count is needed, release any rightward locks right away. */
11733 +       scan_done(right_scan);
11734 +
11735 +       /* ... and the answer is: we should relocate leaf nodes if at least
11736 +          FLUSH_RELOCATE_THRESHOLD nodes were found. */
11737 +       flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11738 +           (left_scan->count + right_scan->count >=
11739 +            sbinfo->flush.relocate_threshold);
11740 +
11741 +       /* Funny business here.  We set the 'point' in the flush_position at prior to
11742 +          starting squalloc regardless of whether the first point is
11743 +          formatted or unformatted.  Without this there would be an invariant, in the
11744 +          rest of the code, that if the flush_position is unformatted then
11745 +          flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11746 +          and if the flush_position is formatted then flush_position->point is non-NULL
11747 +          and no parent info is set.
11748 +
11749 +          This seems lazy, but it makes the initial calls to reverse_relocate_test
11750 +          (which ask "is it the pos->point the leftmost child of its parent") much easier
11751 +          because we know the first child already.  Nothing is broken by this, but the
11752 +          reasoning is subtle.  Holding an extra reference on a jnode during flush can
11753 +          cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11754 +          removed from sibling lists until they have zero reference count.  Flush would
11755 +          never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11756 +          deleted to the right.  So if nothing is broken, why fix it?
11757 +
11758 +          NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11759 +          point and in any moment, because of the concurrent file system
11760 +          activity (for example, truncate). */
11761 +
11762 +       /* Check jnode state after flush_scan completed. Having a lock on this
11763 +          node or its parent (in case of unformatted) helps us in case of
11764 +          concurrent flushing. */
11765 +       if (jnode_check_flushprepped(leftmost_in_slum)
11766 +           && !jnode_convertible(leftmost_in_slum)) {
11767 +               ret = 0;
11768 +               goto failed;
11769 +       }
11770 +
11771 +       /* Now setup flush_pos using scan_left's endpoint. */
11772 +       ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11773 +       if (ret)
11774 +               goto failed;
11775 +
11776 +       if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11777 +           && node_is_empty(flush_pos->coord.node)) {
11778 +               znode *empty = flush_pos->coord.node;
11779 +
11780 +               assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11781 +               ret = delete_empty_node(empty);
11782 +               goto failed;
11783 +       }
11784 +
11785 +       if (jnode_check_flushprepped(leftmost_in_slum)
11786 +           && !jnode_convertible(leftmost_in_slum)) {
11787 +               ret = 0;
11788 +               goto failed;
11789 +       }
11790 +
11791 +       /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed  */
11792 +       ret = alloc_pos_and_ancestors(flush_pos);
11793 +       if (ret)
11794 +               goto failed;
11795 +
11796 +       /* Do the main rightward-bottom-up squeeze and allocate loop. */
11797 +       ret = squalloc(flush_pos);
11798 +       pos_stop(flush_pos);
11799 +       if (ret)
11800 +               goto failed;
11801 +
11802 +       /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11803 +          First, the pos_stop() and pos_valid() routines should be modified
11804 +          so that pos_stop() sets a flush_position->stop flag to 1 without
11805 +          releasing the current position immediately--instead release it in
11806 +          pos_done().  This is a better implementation than the current one anyway.
11807 +
11808 +          It is not clear that all fields of the flush_position should not be released,
11809 +          but at the very least the parent_lock, parent_coord, and parent_load should
11810 +          remain held because they are hold the last twig when pos_stop() is
11811 +          called.
11812 +
11813 +          When we reach this point in the code, if the parent_coord is set to after the
11814 +          last item then we know that flush reached the end of a twig (and according to
11815 +          the new flush queueing design, we will return now).  If parent_coord is not
11816 +          past the last item, we should check if the current twig has any unallocated
11817 +          children to the right (we are not concerned with unallocated children to the
11818 +          left--in that case the twig itself should not have been allocated).  If the
11819 +          twig has unallocated children to the right, set the parent_coord to that
11820 +          position and then repeat the call to squalloc.
11821 +
11822 +          Testing for unallocated children may be defined in two ways: if any internal
11823 +          item has a fake block number, it is unallocated; if any extent item is
11824 +          unallocated then all of its children are unallocated.  But there is a more
11825 +          aggressive approach: if there are any dirty children of the twig to the right
11826 +          of the current position, we may wish to relocate those nodes now.  Checking for
11827 +          potential relocation is more expensive as it requires knowing whether there are
11828 +          any dirty children that are not unallocated.  The extent_needs_allocation
11829 +          should be used after setting the correct preceder.
11830 +
11831 +          When we reach the end of a twig at this point in the code, if the flush can
11832 +          continue (when the queue is ready) it will need some information on the future
11833 +          starting point.  That should be stored away in the flush_handle using a seal, I
11834 +          believe.  Holding a jref() on the future starting point may break other code
11835 +          that deletes that node.
11836 +        */
11837 +
11838 +       /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11839 +          above the twig level.  If the VM calls flush above the twig level, do nothing
11840 +          and return (but figure out why this happens).  The txnmgr should be modified to
11841 +          only flush its leaf-level dirty list.  This will do all the necessary squeeze
11842 +          and allocate steps but leave unallocated branches and possibly unallocated
11843 +          twigs (when the twig's leftmost child is not dirty).  After flushing the leaf
11844 +          level, the remaining unallocated nodes should be given write-optimized
11845 +          locations.  (Possibly, the remaining unallocated twigs should be allocated just
11846 +          before their leftmost child.)
11847 +        */
11848 +
11849 +       /* Any failure reaches this point. */
11850 +      failed:
11851 +
11852 +       switch (ret) {
11853 +       case -E_REPEAT:
11854 +       case -EINVAL:
11855 +       case -E_DEADLOCK:
11856 +       case -E_NO_NEIGHBOR:
11857 +       case -ENOENT:
11858 +               /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11859 +                  in each case.  They already are handled in many cases. */
11860 +               /* Something bad happened, but difficult to avoid...  Try again! */
11861 +               ret = 0;
11862 +       }
11863 +
11864 +       if (leftmost_in_slum)
11865 +               jput(leftmost_in_slum);
11866 +
11867 +       pos_done(flush_pos);
11868 +       scan_done(left_scan);
11869 +       scan_done(right_scan);
11870 +       kfree(right_scan);
11871 +
11872 +       ON_DEBUG(atomic_dec(&flush_cnt));
11873 +
11874 +       leave_flush(sb);
11875 +
11876 +       if (!reiser4_is_set(sb, REISER4_MTFLUSH))
11877 +               up(&sbinfo->flush_sema);
11878 +
11879 +       return ret;
11880 +}
11881 +
11882 +/* The reiser4 flush subsystem can be switched into "rapid flush mode", in which
11883 + * the flusher submits all prepped nodes immediately instead of keeping them in
11884 + * flush queues for a long time.  The reason for rapid flush mode is to free
11885 + * memory as fast as possible. */
11886 +
11887 +#if REISER4_USE_RAPID_FLUSH
11888 +
11889 +/**
11890 + * Submit all prepped nodes through write_prepped_nodes() when wbq_available()
11891 + * is true; otherwise do nothing and return 0.
11892 + */
11893 +
11894 +static int rapid_flush(flush_pos_t * pos)
11895 +{
11896 +       if (!wbq_available())
11897 +               return 0;
11898 +
11899 +       return write_prepped_nodes(pos);
11900 +}
11901 +
11902 +#else
11903 +
11904 +#define rapid_flush(pos) (0)
11905 +
11906 +#endif                         /* REISER4_USE_RAPID_FLUSH */
11907 +
11908 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11909 +                                    flush_queue_t *fq, int *nr_queued,
11910 +                                    int flags)
11911 +{
11912 +       jnode * node;
11913 +       /* try @start first: if it is not flushprepped, enter the loop with it locked */
11914 +       if (start != NULL) {
11915 +               spin_lock_jnode(start);
11916 +               if (!jnode_is_flushprepped(start)) {
11917 +                       assert("zam-1056", start->atom == atom);
11918 +                       node = start;
11919 +                       goto enter;
11920 +               }
11921 +               spin_unlock_jnode(start);
11922 +       }
11923 +       /*
11924 +        * In this loop we process all nodes that are already prepped (RELOC or OVRWR)
11925 +        * but were dirtied again. The atom spin lock is not released until all dirty
11926 +        * nodes are processed or a not-yet-prepped node is found in the atom dirty lists.
11927 +        */
11928 +       while ((node = find_first_dirty_jnode(atom, flags))) {
11929 +               spin_lock_jnode(node);
11930 +       enter:
11931 +               assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11932 +               assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11933 +
11934 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
11935 +                       /* move node to the end of atom's writeback list */
11936 +                       list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11937 +
11938 +                       /*
11939 +                        * jnode is not necessarily on dirty list: if it was dirtied when
11940 +                        * it was on flush queue - it does not get moved to dirty list
11941 +                        */
11942 +                       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11943 +                                            WB_LIST, 1));
11944 +
11945 +               } else if (jnode_is_znode(node)
11946 +                          && znode_above_root(JZNODE(node))) {
11947 +                       /*
11948 +                        * A special case for znode-above-root.  The above-root (fake)
11949 +                        * znode is captured and dirtied when the tree height changes or
11950 +                        * when the root node is relocated.  This causes atoms to fuse so
11951 +                        * that changes at the root are serialized.  However, this node is
11952 +                        * never flushed.  This special case used to be in lock.c to
11953 +                        * prevent the above-root node from ever being captured, but now
11954 +                        * that it is captured we simply prevent it from flushing.  The
11955 +                        * log-writer code relies on this to properly log superblock
11956 +                        * modifications of the tree height.
11957 +                        */
11958 +                       jnode_make_wander_nolock(node);
11959 +               } else if (JF_ISSET(node, JNODE_RELOC)) {
11960 +                       queue_jnode(fq, node);
11961 +                       ++(*nr_queued);
11962 +               } else
11963 +                       break;
11964 +               /* skipped by the break above: the node returned in that case stays spin-locked */
11965 +               spin_unlock_jnode(node);
11966 +       }
11967 +       return node;
11968 +}
11969 +
11970 +
11971 +/* Flush some nodes of the current atom (usually a slum).  Returns -E_REPEAT if there may
11972 + * be more nodes to flush, 0 if the atom's dirty lists are empty (the current atom is then
11973 + * left locked), and other errors as they are. */
11974 +int
11975 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11976 +                  txn_atom ** atom, jnode *start)
11977 +{
11978 +       reiser4_super_info_data *sinfo = get_current_super_private();
11979 +       flush_queue_t *fq = NULL;
11980 +       jnode *node;
11981 +       int nr_queued;
11982 +       int ret;
11983 +
11984 +       assert("zam-889", atom != NULL && *atom != NULL);
11985 +       assert_spin_locked(&((*atom)->alock));
11986 +       assert("zam-892", get_current_context()->trans->atom == *atom);
11987 +       /* NOTE: the nr_to_write value passed in by the caller is overridden here */
11988 +       nr_to_write = LONG_MAX;
11989 +       while (1) {
11990 +               ret = fq_by_atom(*atom, &fq);
11991 +               if (ret != -E_REPEAT)
11992 +                       break;
11993 +               *atom = get_current_atom_locked();
11994 +       }
11995 +       if (ret)
11996 +               return ret;
11997 +
11998 +       assert_spin_locked(&((*atom)->alock));
11999 +
12000 +       /* parallel flushers limit */
12001 +       if (sinfo->tmgr.atom_max_flushers != 0) {
12002 +               while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
12003 +                       /* An atom_send_event() call is inside fq_put_nolock() which is
12004 +                          called when flush is finished and nr_flushers is
12005 +                          decremented. */
12006 +                       atom_wait_event(*atom);
12007 +                       *atom = get_current_atom_locked();
12008 +               }
12009 +       }
12010 +
12011 +       /* count ourself as a flusher */
12012 +       (*atom)->nr_flushers++;
12013 +
12014 +       writeout_mode_enable();
12015 +
12016 +       nr_queued = 0;
12017 +       node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
12018 +
12019 +       if (node == NULL) {
12020 +               if (nr_queued == 0) {
12021 +                       (*atom)->nr_flushers--;
12022 +                       fq_put_nolock(fq);
12023 +                       atom_send_event(*atom);
12024 +                       /* current atom remains locked */
12025 +                       writeout_mode_disable();
12026 +                       return 0;
12027 +               }
12028 +               spin_unlock_atom(*atom);
12029 +       } else {
12030 +               jref(node);
12031 +               BUG_ON((*atom)->super != node->tree->super);
12032 +               spin_unlock_atom(*atom);
12033 +               spin_unlock_jnode(node);
12034 +               BUG_ON(nr_to_write == 0);
12035 +               ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
12036 +               jput(node);
12037 +       }
12038 +       /* NOTE(review): the assignment below overwrites the jnode_flush() result - confirm intended */
12039 +       ret =
12040 +           write_fq(fq, nr_submitted,
12041 +                    WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
12042 +
12043 +       *atom = get_current_atom_locked();
12044 +       (*atom)->nr_flushers--;
12045 +       fq_put_nolock(fq);
12046 +       atom_send_event(*atom);
12047 +       spin_unlock_atom(*atom);
12048 +
12049 +       writeout_mode_disable();
12050 +
12051 +       if (ret == 0)
12052 +               ret = -E_REPEAT;
12053 +
12054 +       return ret;
12055 +}
12056 +
12057 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
12058 +
12059 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
12060 +   reverse parent-first relocate context.  Here all we know is the preceder and the block
12061 +   number.  Since we are going in reverse, the preceder may still be relocated as well, so
12062 +   we can't ask the block allocator "is there a closer block available to relocate?" here.
12063 +   In the _forward_ parent-first relocate context (not here) we actually call the block
12064 +   allocator to try and find a closer location.  Returns 1 to relocate, 0 otherwise. */
12065 +static int
12066 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
12067 +                                const reiser4_block_nr * nblk)
12068 +{
12069 +       reiser4_block_nr dist;
12070 +
12071 +       assert("jmacd-7710", *pblk != 0 && *nblk != 0);
12072 +       assert("jmacd-7711", !blocknr_is_fake(pblk));
12073 +       assert("jmacd-7712", !blocknr_is_fake(nblk));
12074 +
12075 +       /* Distance is the absolute value. */
12076 +       dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
12077 +
12078 +       /* If the block is no more than flush.relocate_distance blocks away from its
12079 +          preceder block, do not relocate. */
12080 +       if (dist <= get_current_super_private()->flush.relocate_distance) {
12081 +               return 0;
12082 +       }
12083 +
12084 +       return 1;
12085 +}
12086 +
12087 +/* This function is a predicate that tests for relocation.  Always called in the
12088 +   reverse-parent-first context, when we are asking whether the current node should be
12089 +   relocated in order to expand the flush by dirtying the parent level (and thus
12090 +   proceeding to flush that level).  When traversing in the forward parent-first direction
12091 +   (not here), relocation decisions are handled in two places: allocate_znode() and
12092 +   extent_needs_allocation().  Returns 1 when the node should be relocated, else 0. */
12093 +static int
12094 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
12095 +                     flush_pos_t * pos)
12096 +{
12097 +       reiser4_block_nr pblk = 0;
12098 +       reiser4_block_nr nblk = 0;
12099 +
12100 +       assert("jmacd-8989", !jnode_is_root(node));
12101 +
12102 +       /*
12103 +        * This function is called only from
12104 +        * reverse_relocate_check_dirty_parent() and only if the parent
12105 +        * node is clean. This implies that the parent has a real (i.e., not
12106 +        * fake) block number, and so does the child, because otherwise the
12107 +        * parent would be dirty.
12108 +        */
12109 +
12110 +       /* New nodes are treated as if they are being relocated. */
12111 +       if (JF_ISSET (node, JNODE_CREATED) ||
12112 +           (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
12113 +               return 1;
12114 +       }
12115 +
12116 +       /* Find the preceder.  FIXME(B): When the child is an unformatted, previously
12117 +          existing node, the coord may be leftmost even though the child is not the
12118 +          parent-first preceder of the parent.  If the first dirty node appears somewhere
12119 +          in the middle of the first extent unit, this preceder calculation is wrong.
12120 +          Needs more logic in here. */
12121 +       if (coord_is_leftmost_unit(parent_coord)) {
12122 +               pblk = *znode_get_block(parent_coord->node);
12123 +       } else {
12124 +               pblk = pos->preceder.blk;
12125 +       }
12126 +       check_preceder(pblk);
12127 +
12128 +       /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
12129 +       if (pblk == 0) {
12130 +               return 1;
12131 +       }
12132 +
12133 +       nblk = *jnode_get_block(node);
12134 +
12135 +       if (blocknr_is_fake(&nblk))
12136 +               /* child is unallocated, mark parent dirty */
12137 +               return 1;
12138 +
12139 +       return reverse_relocate_if_close_enough(&pblk, &nblk);
12140 +}
12141 +
12142 +/* If the parent is not already dirty, this function calls reverse_relocate_test to make a
12143 +   reverse-parent-first relocation decision and then, if yes, it marks the parent dirty. */
12144 +static int
12145 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
12146 +                                   flush_pos_t * pos)
12147 +{
12148 +       int ret;
12149 +
12150 +       if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
12151 +               /* parent is clean: decide whether relocating the child should dirty it */
12152 +               ret = reverse_relocate_test(node, parent_coord, pos);
12153 +               if (ret < 0) {
12154 +                       return ret;
12155 +               }
12156 +
12157 +               /* FIXME-ZAM
12158 +                  if parent is already relocated - we do not want to grab space, right? */
12159 +               if (ret == 1) {
12160 +                       int grabbed;
12161 +
12162 +                       grabbed = get_current_context()->grabbed_blocks;
12163 +                       if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
12164 +                           0)
12165 +                               reiser4_panic("umka-1250",
12166 +                                             "No space left during flush.");
12167 +
12168 +                       assert("jmacd-18923",
12169 +                              znode_is_write_locked(parent_coord->node));
12170 +                       znode_make_dirty(parent_coord->node);
12171 +                       grabbed2free_mark(grabbed);
12172 +               }
12173 +       }
12174 +
12175 +       return 0;
12176 +}
12177 +
12178 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
12179 +   PARENT-FIRST LOOP BEGINS) */
12180 +
12181 +/* Get the leftmost child of the unit at @coord; the result is stored in *@child. */
12182 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
12183 +{
12184 +       int ret;
12185 +       /* returns 0 on success, or the non-zero result of item_utmost_child() */
12186 +       ret = item_utmost_child(coord, LEFT_SIDE, child);
12187 +
12188 +       if (ret)
12189 +               return ret;
12190 +       /* an error pointer stored in *child is converted to its error code */
12191 +       if (IS_ERR(*child))
12192 +               return PTR_ERR(*child);
12193 +
12194 +       return 0;
12195 +}
12196 +
12197 +/* This step occurs after the left- and right-scans are completed, before starting the
12198 +   forward parent-first traversal.  Here we attempt to allocate ancestors of the starting
12199 +   flush point, which means continuing in the reverse parent-first direction to the
12200 +   parent, grandparent, and so on (as long as the child is a leftmost child).  This
12201 +   routine calls a recursive process, alloc_one_ancestor, which does the real work,
12202 +   except there is special-case handling here for the first ancestor, which may be a twig.
12203 +   At each level (here and in alloc_one_ancestor), we check for relocation and then, if
12204 +   the child is a leftmost child, repeat at the next level.  On the way back down (the
12205 +   recursion), we allocate the ancestors in parent-first order. */
12206 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
12207 +{
12208 +       int ret = 0;
12209 +       lock_handle plock;
12210 +       load_count pload;
12211 +       coord_t pcoord;
12212 +       /* nothing to do if the node at the flush position is already flushprepped */
12213 +       if (znode_check_flushprepped(pos->lock.node))
12214 +               return 0;
12215 +
12216 +       coord_init_invalid(&pcoord, NULL);
12217 +       init_lh(&plock);
12218 +       init_load_count(&pload);
12219 +
12220 +       if (pos->state == POS_ON_EPOINT) {
12221 +               /* a special case for pos on the twig level, where we already have
12222 +                  a lock on the parent node. */
12223 +               /* The parent may not be dirty, in which case we should decide
12224 +                  whether to relocate the child now. If the decision is made to
12225 +                  relocate the child, the parent is marked dirty. */
12226 +               ret =
12227 +                   reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
12228 +                                                       pos);
12229 +               if (ret)
12230 +                       goto exit;
12231 +
12232 +               /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
12233 +                  is leftmost) and the leaf/child, so recursion is not needed.
12234 +                  Levels above the twig will be allocated for
12235 +                  write-optimization before the transaction commits.  */
12236 +
12237 +               /* Do the recursive step, allocating zero or more of our
12238 +                * ancestors. */
12239 +               ret = alloc_one_ancestor(&pos->coord, pos);
12240 +
12241 +       } else {
12242 +               if (!znode_is_root(pos->lock.node)) {
12243 +                       /* all formatted nodes except tree root */
12244 +                       ret =
12245 +                           reiser4_get_parent(&plock, pos->lock.node,
12246 +                                              ZNODE_WRITE_LOCK);
12247 +                       if (ret)
12248 +                               goto exit;
12249 +
12250 +                       ret = incr_load_count_znode(&pload, plock.node);
12251 +                       if (ret)
12252 +                               goto exit;
12253 +
12254 +                       ret =
12255 +                           find_child_ptr(plock.node, pos->lock.node, &pcoord);
12256 +                       if (ret)
12257 +                               goto exit;
12258 +
12259 +                       ret =
12260 +                           reverse_relocate_check_dirty_parent(ZJNODE
12261 +                                                               (pos->lock.
12262 +                                                                node), &pcoord,
12263 +                                                               pos);
12264 +                       if (ret)
12265 +                               goto exit;
12266 +
12267 +                       ret = alloc_one_ancestor(&pcoord, pos);
12268 +                       if (ret)
12269 +                               goto exit;
12270 +               }
12271 +
12272 +               ret = allocate_znode(pos->lock.node, &pcoord, pos);
12273 +       }
12274 +      exit:
12275 +       done_load_count(&pload);
12276 +       done_lh(&plock);
12277 +       return ret;
12278 +}
12279 +
12280 +/* This is the recursive step described in alloc_pos_and_ancestors, above.  Ignoring the
12281 +   call to set_preceder, which is the next function described, this checks if the
12282 +   child is a leftmost child and returns if it is not.  If the child is a leftmost child
12283 +   it checks for relocation, possibly dirtying the parent.  Then it performs the recursive
12284 +   step and finally calls allocate_znode on coord->node. */
12285 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
12286 +{
12287 +       int ret = 0;
12288 +       lock_handle alock;
12289 +       load_count aload;
12290 +       coord_t acoord;
12291 +
12292 +       /* As we ascend at the left-edge of the region to flush, take this opportunity at
12293 +          the twig level to find our parent-first preceder unless we have already set
12294 +          it. */
12295 +       if (pos->preceder.blk == 0) {
12296 +               ret = set_preceder(coord, pos);
12297 +               if (ret != 0)
12298 +                       return ret;
12299 +       }
12300 +
12301 +       /* If the ancestor is clean or already allocated, or if the child is not a
12302 +          leftmost child, stop going up, even leaving coord->node not flushprepped. */
12303 +       if (znode_check_flushprepped(coord->node)
12304 +           || !coord_is_leftmost_unit(coord))
12305 +               return 0;
12306 +
12307 +       init_lh(&alock);
12308 +       init_load_count(&aload);
12309 +       coord_init_invalid(&acoord, NULL);
12310 +
12311 +       /* Only ascend to the next level if it is a leftmost child, but write-lock the
12312 +          parent in case we will relocate the child. */
12313 +       if (!znode_is_root(coord->node)) {
12314 +
12315 +               ret =
12316 +                   jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
12317 +                                           &alock, &aload, ZNODE_WRITE_LOCK,
12318 +                                           0);
12319 +               if (ret != 0) {
12320 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
12321 +                       goto exit;
12322 +               }
12323 +
12324 +               ret =
12325 +                   reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
12326 +                                                       &acoord, pos);
12327 +               if (ret != 0) {
12328 +                       goto exit;
12329 +               }
12330 +
12331 +               /* Recursive call. */
12332 +               if (!znode_check_flushprepped(acoord.node)) {
12333 +                       ret = alloc_one_ancestor(&acoord, pos);
12334 +                       if (ret)
12335 +                               goto exit;
12336 +               }
12337 +       }
12338 +
12339 +       /* Note: we call allocate with the parent write-locked (except at the root) in
12340 +          case we relocate the child, in which case it will modify the parent during this
12341 +          call. */
12342 +       ret = allocate_znode(coord->node, &acoord, pos);
12343 +
12344 +      exit:
12345 +       done_load_count(&aload);
12346 +       done_lh(&alock);
12347 +       return ret;
12348 +}
12349 +
12350 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
12351 +   a call to this function at the twig level.  During alloc_pos_and_ancestors we may ask:
12352 +   should this node be relocated (in reverse parent-first context)?  We repeat this
12353 +   process as long as the child is the leftmost child, eventually reaching an ancestor of
12354 +   the flush point that is not a leftmost child.  The preceder of that ancestor, which is
12355 +   not a leftmost child, is actually on the leaf level.  The preceder of that block is the
12356 +   left-neighbor of the flush point.  The preceder of that block is the rightmost child of
12357 +   the twig on the left.  So, when alloc_pos_and_ancestors passes upward through the twig
12358 +   level, it stops momentarily to remember the block of the rightmost child of the twig on
12359 +   the left and sets it to the flush_position's preceder_hint.
12360 +
12361 +   There is one other place where we may set the flush_position's preceder hint, which is
12362 +   during scan-left.
12363 +*/
12364 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
12365 +{
12366 +       int ret;
12367 +       coord_t coord;
12368 +       lock_handle left_lock;
12369 +       load_count left_load;
12370 +
12371 +       coord_dup(&coord, coord_in);
12372 +
12373 +       init_lh(&left_lock);
12374 +       init_load_count(&left_load);
12375 +
12376 +       /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
12377 +          coord_is_leftmost_unit is not the right test if the unformatted child is in the
12378 +          middle of the first extent unit. */
12379 +       if (!coord_is_leftmost_unit(&coord)) {
12380 +               coord_prev_unit(&coord);
12381 +       } else {
12382 +               ret =
12383 +                   reiser4_get_left_neighbor(&left_lock, coord.node,
12384 +                                             ZNODE_READ_LOCK, GN_SAME_ATOM);
12385 +               if (ret) {
12386 +                       /* If we fail for one of the reasons listed below it doesn't matter
12387 +                          because the preceder is only a hint.  We are low-priority at this
12388 +                          point, so this must be the case. */
12389 +                       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12390 +                           ret == -ENOENT || ret == -EINVAL
12391 +                           || ret == -E_DEADLOCK) {
12392 +                               ret = 0;
12393 +                       }
12394 +                       goto exit;
12395 +               }
12396 +
12397 +               ret = incr_load_count_znode(&left_load, left_lock.node);
12398 +               if (ret)
12399 +                       goto exit;
12400 +
12401 +               coord_init_last_unit(&coord, left_lock.node);
12402 +       }
12403 +
12404 +       ret =
12405 +           item_utmost_child_real_block(&coord, RIGHT_SIDE,
12406 +                                        &pos->preceder.blk);
12407 +      exit:
12408 +       check_preceder(pos->preceder.blk);
12409 +       done_load_count(&left_load);
12410 +       done_lh(&left_lock);
12411 +       return ret;
12412 +}
12413 +
12414 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12415 +
12416 +/* This procedure implements the outer loop of the flush algorithm.  To put this in
12417 +   context, here is the general list of steps taken by the flush routine as a whole:
12418 +
12419 +   1. Scan-left
12420 +   2. Scan-right (maybe)
12421 +   3. Allocate initial flush position and its ancestors
12422 +   4. <handle extents>
12423 +   5. <squeeze and next position and its ancestors to-the-right,
12424 +       then update position to-the-right>
12425 +   6. <repeat from #4 until flush is stopped>
12426 +
12427 +   This procedure implements the loop in steps 4 through 6 in the above listing.
12428 +
12429 +   Step 4: if the current flush position is an extent item (position on the twig level),
12430 +   it allocates the extent (allocate_extent_item_in_place) then shifts to the next
12431 +   coordinate.  If the next coordinate's leftmost child needs flushprep, we will continue.
12432 +   If the next coordinate is an internal item, we descend back to the leaf level,
12433 +   otherwise we repeat a step #4 (labeled ALLOC_EXTENTS below).  If the "next coordinate"
12434 +   brings us past the end of the twig level, then we call
12435 +   reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
12436 +   step #5 which moves to the right.
12437 +
12438 +   Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
12439 +   tree to allocate any ancestors of the next-right flush position that are not also
12440 +   ancestors of the current position.  Those ancestors (in top-down order) are the next in
12441 +   parent-first order.  We squeeze adjacent nodes on the way up until the right node and
12442 +   current node share the same parent, then allocate on the way back down.  Finally, this
12443 +   step sets the flush position to the next-right node.  Then repeat steps 4 and 5.
12444 +*/
12445 +
12446 +/* SQUEEZE CODE */
12447 +
12448 +/* squeeze_right_twig helper function: cut a range of extent items from node
12449 +   to->node, from its first unit up to coord @to.  @left is not used here. */
12450 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
12451 +                                  znode * left)
12452 +{
12453 +       coord_t from;
12454 +       reiser4_key from_key;
12455 +
12456 +       coord_init_first_unit(&from, to->node);
12457 +       item_key_by_coord(&from, &from_key);
12458 +
12459 +       return cut_node_content(&from, to, &from_key, to_key, NULL);
12460 +}
12461 +
12462 +/* Copy as many of the leading extents from @right to @left as possible, allocating
12463 +   unallocated extents as they are copied.  Returns SQUEEZE_TARGET_FULL or
12464 +   SQUEEZE_SOURCE_EMPTY when no more can be shifted.  If the next item is an
12465 +   internal item it calls shift_one_internal_unit and may then return
12466 +   SUBTREE_MOVED.  Negative return values are errors. */
12467 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
12468 +{
12469 +       int ret = SUBTREE_MOVED;
12470 +       coord_t coord;          /* used to iterate over items */
12471 +       reiser4_key stop_key;
12472 +
12473 +       assert("jmacd-2008", !node_is_empty(right));
12474 +       coord_init_first_unit(&coord, right);
12475 +
12476 +       /* FIXME: can be optimized to cut once */
12477 +       while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12478 +               ON_DEBUG(void *vp);
12479 +
12480 +               assert("vs-1468", coord_is_leftmost_unit(&coord));
12481 +               ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12482 +
12483 +               /* stop_key is used to find what was copied and what to cut */
12484 +               stop_key = *min_key();
12485 +               ret = squalloc_extent(left, &coord, pos, &stop_key);
12486 +               if (ret != SQUEEZE_CONTINUE) {
12487 +                       ON_DEBUG(kfree(vp));
12488 +                       break;
12489 +               }
12490 +               assert("vs-1465", !keyeq(&stop_key, min_key()));
12491 +
12492 +               /* Helper function to do the cutting. */
12493 +               set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12494 +               check_me("vs-1466",
12495 +                        squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12496 +
12497 +               ON_DEBUG(shift_check(vp, left, coord.node));
12498 +       }
12499 +
12500 +       if (node_is_empty(coord.node))
12501 +               ret = SQUEEZE_SOURCE_EMPTY;
12502 +
12503 +       if (ret == SQUEEZE_TARGET_FULL) {
12504 +               goto out;
12505 +       }
12506 +
12507 +       if (node_is_empty(right)) {
12508 +               /* The whole right node was copied into @left. */
12509 +               assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12510 +               goto out;
12511 +       }
12512 +
12513 +       coord_init_first_unit(&coord, right);
12514 +
12515 +       if (!item_is_internal(&coord)) {
12516 +               /* we do not want to squeeze anything else to the left neighbor because
12517 +                  the "slum" is over */
12518 +               ret = SQUEEZE_TARGET_FULL;
12519 +               goto out;
12520 +       }
12521 +       assert("jmacd-433", item_is_internal(&coord));
12522 +
12523 +       /* Shift an internal unit.  The child must be allocated before shifting any more
12524 +          extents, so we stop here. */
12525 +       ret = shift_one_internal_unit(left, right);
12526 +
12527 +      out:
12528 +       assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12529 +              || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12530 +
12531 +       if (ret == SQUEEZE_TARGET_FULL) {
12532 +               /* We submit prepped nodes here and expect that this @left twig
12533 +                * will not be modified again during this jnode_flush() call. */
12534 +               int ret1;
12535 +
12536 +               /* NOTE: seems like io is done under long term locks. */
12537 +               ret1 = write_prepped_nodes(pos);
12538 +               if (ret1 < 0)
12539 +                       return ret1;
12540 +       }
12541 +
12542 +       return ret;
12543 +}
12544 +
12545 +#if REISER4_DEBUG       /* debug-only consistency check used by convert_node() */
12546 +static void item_convert_invariant(flush_pos_t * pos)
12547 +{
12548 +       assert("edward-1225", coord_is_existing_item(&pos->coord));
12549 +       if (chaining_data_present(pos)) {
12550 +               item_plugin *iplug = item_convert_plug(pos);
12551 +               /* the chained plugin must be the plugin of the item at pos->coord and must have ->convert() */
12552 +               assert("edward-1000",
12553 +                      iplug == item_plugin_by_coord(&pos->coord));
12554 +               assert("edward-1001", iplug->f.convert != NULL);
12555 +       } else
12556 +               assert("edward-1226", pos->child == NULL);
12557 +}
12558 +#else
12559 +/* without REISER4_DEBUG the invariant check expands to noop */
12560 +#define item_convert_invariant(pos) noop
12561 +
12562 +#endif
12563 +
12564 +/* Scan the items of @node starting from the first one and apply to each
12565 +   item its flush ->convert() method (if any). That method may
12566 +   resize/kill the item, so the tree may be changed. Nodes that are not on
12567 +   the leaf level are left untouched. Returns 0 or the ->convert() error. */
12568 +static int convert_node(flush_pos_t * pos, znode * node)
12569 +{
12570 +       int ret = 0;
12571 +       item_plugin *iplug;
12572 +
12573 +       assert("edward-304", pos != NULL);
12574 +       assert("edward-305", pos->child == NULL);
12575 +       assert("edward-475", znode_convertible(node));
12576 +       assert("edward-669", znode_is_wlocked(node));
12577 +       assert("edward-1210", !node_is_empty(node));
12578 +
12579 +       if (znode_get_level(node) != LEAF_LEVEL)
12580 +               /* conversion of non-leaf nodes is unsupported */
12581 +               goto exit;
12582 +
12583 +       coord_init_first_unit(&pos->coord, node);
12584 +
12585 +       while (1) {
12586 +               ret = 0;
12587 +               coord_set_to_left(&pos->coord);
12588 +               item_convert_invariant(pos);
12589 +
12590 +               iplug = item_plugin_by_coord(&pos->coord);
12591 +               assert("edward-844", iplug != NULL);
12592 +
12593 +               if (iplug->f.convert) {
12594 +                       ret = iplug->f.convert(pos);
12595 +                       if (ret)
12596 +                               goto exit;
12597 +               }
12598 +               assert("edward-307", pos->child == NULL);
12599 +
12600 +               if (coord_next_item(&pos->coord)) {
12601 +                       /* node is over */
12602 +
12603 +                       if (!chaining_data_present(pos))
12604 +                               /* finished this node */
12605 +                               break;
12606 +                       if (should_chain_next_node(pos)) {
12607 +                               /* go to next node */
12608 +                               move_chaining_data(pos, 0 /* to next node */ );
12609 +                               break;
12610 +                       }
12611 +                       /* repeat this node */
12612 +                       move_chaining_data(pos, 1 /* this node */ );
12613 +                       continue;
12614 +               }
12615 +               /* Node is not over.
12616 +                  Check if there is attached convert data.
12617 +                  If so, roll one item position back and repeat
12618 +                  on this node.
12619 +                */
12620 +               if (chaining_data_present(pos)) {
12621 +
12622 +                       if (iplug != item_plugin_by_coord(&pos->coord))
12623 +                               set_item_convert_count(pos, 0);
12624 +
12625 +                       ret = coord_prev_item(&pos->coord);
12626 +                       assert("edward-1003", !ret);
12627 +
12628 +                       move_chaining_data(pos, 1 /* this node */ );
12629 +               }
12630 +       }
12631 +       JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);        /* reached only when the loop ends via break */
12632 +       znode_make_dirty(node);
12633 +      exit:
12634 +       assert("edward-1004", !ret);
12635 +       return ret;
12636 +}
12637 +
12638 +/* Squeeze and allocate the right neighbor.  This is called after @left and
12639 +   its current children have been squeezed and allocated already.  This
12640 +   procedure's job is to squeeze items from @right to @left.
12641 +
12642 +   If at the twig level, extents are allocated as they are shifted from @right
12643 +   to @left (squeeze_right_twig).
12644 +
12645 +   At any other level, including the leaf level, squeeze_right_non_twig is
12646 +   used.
12647 +
12648 +   NOTE(review): earlier wording said that one internal item is shifted at a
12649 +   time so that the shifted subtree can be processed in parent-first order;
12650 +   confirm this against squeeze_right_non_twig.
12651 +
12652 +   When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED
12653 +   is returned.  When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY
12654 +   is returned.  If nothing can be moved into @left anymore,
12655 +   SQUEEZE_TARGET_FULL is returned.  Negative values are presumably errors.
12656 +*/
12657 +
12658 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12659 +                                 znode * right)
12660 +{
12661 +       int ret;
12662 +
12663 +       /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12664 +        * tree owing to error (for example, ENOSPC) in write */
12665 +       /* assert("jmacd-9321", !node_is_empty(left)); */
12666 +       assert("jmacd-9322", !node_is_empty(right));
12667 +       assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12668 +
12669 +       switch (znode_get_level(left)) {
12670 +       case TWIG_LEVEL:
12671 +               /* Shift with extent allocation until either an internal item
12672 +                  is encountered or everything is shifted or no free space
12673 +                  is left in @left */
12674 +               ret = squeeze_right_twig(left, right, pos);
12675 +               break;
12676 +
12677 +       default:
12678 +               /* All other levels can use shift_everything until we implement per-item
12679 +                  flush plugins. */
12680 +               ret = squeeze_right_non_twig(left, right);
12681 +               break;
12682 +       }
12683 +
12684 +       assert("jmacd-2011", (ret < 0 ||
12685 +                             ret == SQUEEZE_SOURCE_EMPTY
12686 +                             || ret == SQUEEZE_TARGET_FULL
12687 +                             || ret == SUBTREE_MOVED));
12688 +       return ret;
12689 +}
12690 +
12691 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12692 +                                               znode * right)
12693 +{
12694 +       int ret;
12695 +       /* Squeeze @right into the node locked by pos->lock, then reposition pos->coord in that node. */
12696 +       ret = squeeze_right_twig(pos->lock.node, right, pos);
12697 +       if (ret < 0)
12698 +               return ret;
12699 +       if (ret > 0) {  /* positive result: pos->coord goes after the last item */
12700 +               coord_init_after_last_item(&pos->coord, pos->lock.node);
12701 +               return ret;
12702 +       }
12703 +       /* ret == 0: leave pos->coord on the last unit of the left node */
12704 +       coord_init_last_unit(&pos->coord, pos->lock.node);
12705 +       return 0;
12706 +}
12707 +
12708 +/* forward declaration: needed by the inline wrapper below */
12709 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12710 +
12711 +/* Do a fast check for the "same parents" condition: return 0 if it holds,
12712 + * otherwise return the result of squalloc_upper_levels() */
12713 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12714 +                                                         znode * left,
12715 +                                                         znode * right)
12716 +{
12717 +       if (znode_same_parents(left, right))
12718 +               return 0;
12719 +
12720 +       return squalloc_upper_levels(pos, left, right);
12721 +}
12722 +
12723 +/* Check whether the parent of the given @right node needs to be processed
12724 +   ((re)allocated) prior to processing of the child.  If @left and @right do not
12725 +   share a parent, the parent of @right is after @left but before
12726 +   @right in parent-first order, so we have to (re)allocate it before @right
12727 +   gets (re)allocated. */
12728 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12729 +{
12730 +       int ret;
12731 +
12732 +       lock_handle left_parent_lock;
12733 +       lock_handle right_parent_lock;
12734 +
12735 +       load_count left_parent_load;
12736 +       load_count right_parent_load;
12737 +
12738 +       init_lh(&left_parent_lock);
12739 +       init_lh(&right_parent_lock);
12740 +
12741 +       init_load_count(&left_parent_load);
12742 +       init_load_count(&right_parent_load);
12743 +
12744 +       ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12745 +       if (ret)
12746 +               goto out;
12747 +
12748 +       ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12749 +       if (ret)
12750 +               goto out;
12751 +
12752 +       /* Check for same parents */
12753 +       if (left_parent_lock.node == right_parent_lock.node)
12754 +               goto out;
12755 +
12756 +       if (znode_check_flushprepped(right_parent_lock.node)) {
12757 +               /* Keep parent-first order.  In the order, the right parent node stands
12758 +                  before the @right node.  If it is already allocated, we set the
12759 +                  preceder (next block search start point) to its block number, @right
12760 +                  node should be allocated after it.
12761 +
12762 +                  However, preceder is set only if the right parent is on twig level.
12763 +                  The explanation is the following: new branch nodes are allocated over
12764 +                  already allocated children while the tree grows, it is difficult to
12765 +                  keep tree ordered, we assume that only leaves and twigs are correctly
12766 +                  allocated.  So, only twigs are used as a preceder for allocating of the
12767 +                  rest of the slum. */
12768 +               if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12769 +                       pos->preceder.blk =
12770 +                           *znode_get_block(right_parent_lock.node);
12771 +                       check_preceder(pos->preceder.blk);
12772 +               }
12773 +               goto out;
12774 +       }
12775 +
12776 +       ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12777 +       if (ret)
12778 +               goto out;
12779 +
12780 +       ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12781 +       if (ret)
12782 +               goto out;
12783 +
12784 +       ret =
12785 +           squeeze_right_neighbor(pos, left_parent_lock.node,
12786 +                                  right_parent_lock.node);
12787 +       /* We stop on error. We also stop if some items/units were shifted
12788 +        * (ret == 0) and thus @right changed its parent. It means we do not have
12789 +        * to process the right_parent node prior to processing @right. Positive
12790 +        * return values say that shifting of items did not happen because of
12791 +        * "empty source" or "target full" conditions. */
12792 +       if (ret <= 0)
12793 +               goto out;
12794 +
12795 +       /* parent(@left) and parent(@right) may have different parents also. We
12796 +        * do a recursive call for checking that. */
12797 +       ret =
12798 +           check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12799 +                                                   right_parent_lock.node);
12800 +       if (ret)
12801 +               goto out;
12802 +
12803 +       /* allocate znode when going down */
12804 +       ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12805 +
12806 +      out:
12807 +       done_load_count(&left_parent_load);
12808 +       done_load_count(&right_parent_load);
12809 +
12810 +       done_lh(&left_parent_lock);
12811 +       done_lh(&right_parent_lock);
12812 +
12813 +       return ret;
12814 +}
12815 +
12816 +/* Check the "flushprepped" status of the leftmost child; also returns true if the child
12817 + * node was not found in cache.  A non-zero get_leftmost_child_of_unit() result is returned as is.  */
12818 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12819 +{
12820 +       int ret;
12821 +       int prepped;
12822 +
12823 +       jnode *child;
12824 +
12825 +       ret = get_leftmost_child_of_unit(coord, &child);
12826 +
12827 +       if (ret)
12828 +               return ret;
12829 +
12830 +       if (child) {
12831 +               prepped = jnode_check_flushprepped(child);
12832 +               jput(child);
12833 +       } else {
12834 +               /* We consider a non-existent child as a node to which slum
12835 +                  processing should not continue.  A node that is not cached is clean,
12836 +                  so it is flushprepped. */
12837 +               prepped = 1;
12838 +       }
12839 +
12840 +       return prepped;
12841 +}
12842 +
12843 +/* (Re)allocate @node: write-lock and load its parent, find the pointer to @node in it, then call allocate_znode() */
12844 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12845 +{
12846 +       int ret;
12847 +       lock_handle parent_lock;
12848 +       load_count parent_load;
12849 +       coord_t pcoord;         /* set by find_child_ptr() */
12850 +
12851 +       assert("zam-851", znode_is_write_locked(node));
12852 +
12853 +       init_lh(&parent_lock);
12854 +       init_load_count(&parent_load);
12855 +
12856 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12857 +       if (ret)
12858 +               goto out;
12859 +
12860 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
12861 +       if (ret)
12862 +               goto out;
12863 +
12864 +       ret = find_child_ptr(parent_lock.node, node, &pcoord);
12865 +       if (ret)
12866 +               goto out;
12867 +
12868 +       ret = allocate_znode(node, &pcoord, pos);
12869 +
12870 +      out:
12871 +       done_load_count(&parent_load);
12872 +       done_lh(&parent_lock);
12873 +       return ret;
12874 +}
12875 +
12876 +/* Process formatted nodes to the right of pos->lock.node (shared by handle_pos_on_leaf() and
12877 + * handle_pos_on_internal()) until the slum ends, squalloc should terminate, or an error occurs.  */
12878 +static int handle_pos_on_formatted(flush_pos_t * pos)
12879 +{
12880 +       int ret;
12881 +       lock_handle right_lock;
12882 +       load_count right_load;
12883 +
12884 +       init_lh(&right_lock);
12885 +       init_load_count(&right_load);
12886 +
12887 +       if (should_convert_node(pos, pos->lock.node)) {
12888 +               ret = convert_node(pos, pos->lock.node);
12889 +               if (ret)
12890 +                       return ret;
12891 +       }
12892 +
12893 +       while (1) {
12894 +               ret =
12895 +                   neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12896 +                                    ZNODE_WRITE_LOCK,
12897 +                                    !should_convert_next_node(pos,
12898 +                                                              right_lock.
12899 +                                                              node));
12900 +               if (ret)
12901 +                       break;
12902 +
12903 +               /* we don't prep(allocate) nodes for flushing twice.  This can be suboptimal, or it
12904 +                * can be optimal.  For now we choose to live with the risk that it will
12905 +                * be suboptimal because it would be quite complex to code it to be
12906 +                * smarter. */
12907 +               if (znode_check_flushprepped(right_lock.node)
12908 +                   && !znode_convertible(right_lock.node)) {
12909 +                       assert("edward-1005",
12910 +                              !should_convert_next_node(pos, right_lock.node));
12911 +                       pos_stop(pos);
12912 +                       break;
12913 +               }
12914 +
12915 +               ret = incr_load_count_znode(&right_load, right_lock.node);
12916 +               if (ret)
12917 +                       break;
12918 +
12919 +               if (should_convert_node(pos, right_lock.node)) {
12920 +                       ret = convert_node(pos, right_lock.node);
12921 +                       if (ret)
12922 +                               break;
12923 +                       if (node_is_empty(right_lock.node)) {
12924 +                               /* node became empty after converting, repeat */
12925 +                               done_load_count(&right_load);
12926 +                               done_lh(&right_lock);
12927 +                               continue;
12928 +                       }
12929 +               }
12930 +
12931 +               /* squeeze _before_ going upward. */
12932 +               ret =
12933 +                   squeeze_right_neighbor(pos, pos->lock.node,
12934 +                                          right_lock.node);
12935 +               if (ret < 0)
12936 +                       break;
12937 +
12938 +               if (znode_check_flushprepped(right_lock.node)) {
12939 +                       if (should_convert_next_node(pos, right_lock.node)) {
12940 +                               /* in spite of the flushprepped status of the node,
12941 +                                  its right slum neighbor should be converted */
12942 +                               assert("edward-953", convert_data(pos));
12943 +                               assert("edward-954", item_convert_data(pos));
12944 +
12945 +                               if (node_is_empty(right_lock.node)) {
12946 +                                       done_load_count(&right_load);
12947 +                                       done_lh(&right_lock);
12948 +                               } else
12949 +                                       move_flush_pos(pos, &right_lock,
12950 +                                                      &right_load, NULL);
12951 +                               continue;
12952 +                       }
12953 +                       pos_stop(pos);
12954 +                       break;
12955 +               }
12956 +
12957 +               if (node_is_empty(right_lock.node)) {
12958 +                       /* repeat if right node was squeezed completely */
12959 +                       done_load_count(&right_load);
12960 +                       done_lh(&right_lock);
12961 +                       continue;
12962 +               }
12963 +
12964 +               /* parent(right_lock.node) has to be processed before
12965 +                * (right_lock.node) due to "parent-first" allocation order. */
12966 +               ret =
12967 +                   check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12968 +                                                           right_lock.node);
12969 +               if (ret)
12970 +                       break;
12971 +               /* (re)allocate _after_ going upward */
12972 +               ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12973 +               if (ret)
12974 +                       break;
12975 +
12976 +               if (should_terminate_squalloc(pos)) {
12977 +                       set_item_convert_count(pos, 0);
12978 +                       break;
12979 +               }
12980 +
12981 +               /* advance the flush position to the right neighbor */
12982 +               move_flush_pos(pos, &right_lock, &right_load, NULL);
12983 +
12984 +               ret = rapid_flush(pos);
12985 +               if (ret)
12986 +                       break;
12987 +       }
12988 +
12989 +       assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12990 +
12991 +       done_load_count(&right_load);
12992 +       done_lh(&right_lock);
12993 +
12994 +       /* This function indicates via pos whether to stop or go to twig or continue on current
12995 +        * level. */
12996 +       return ret;
12997 +
12998 +}
12999 +
13000 +/* Process nodes on leaf level until an unformatted node or the rightmost node in the
13001 + * slum is reached.  -E_NO_NEIGHBOR is not an error here: pos switches to POS_TO_TWIG.  */
13002 +static int handle_pos_on_leaf(flush_pos_t * pos)
13003 +{
13004 +       int ret;
13005 +
13006 +       assert("zam-845", pos->state == POS_ON_LEAF);
13007 +
13008 +       ret = handle_pos_on_formatted(pos);
13009 +
13010 +       if (ret == -E_NO_NEIGHBOR) {
13011 +               /* cannot get right neighbor, go process extents. */
13012 +               pos->state = POS_TO_TWIG;
13013 +               return 0;
13014 +       }
13015 +
13016 +       return ret;
13017 +}
13018 +
13019 +/* Process slum on level > 1; all the work is delegated to handle_pos_on_formatted() */
13020 +static int handle_pos_on_internal(flush_pos_t * pos)
13021 +{
13022 +       assert("zam-850", pos->state == POS_ON_INTERNAL);
13023 +       return handle_pos_on_formatted(pos);
13024 +}
13025 +
13026 +/* Check whether squalloc should stop before processing the given extent: non-zero means stop (or an error from the child lookup); also sets pos->pos_in_unit */
13027 +static int squalloc_extent_should_stop(flush_pos_t * pos)
13028 +{
13029 +       assert("zam-869", item_is_extent(&pos->coord));
13030 +
13031 +       /* pos->child is a jnode handle_pos_on_extent() should start with
13032 +        * instead of the first child of the first extent unit. */
13033 +       if (pos->child) {
13034 +               int prepped;
13035 +
13036 +               assert("vs-1383", jnode_is_unformatted(pos->child));
13037 +               prepped = jnode_check_flushprepped(pos->child);
13038 +               pos->pos_in_unit =
13039 +                   jnode_get_index(pos->child) -
13040 +                   extent_unit_index(&pos->coord);
13041 +               assert("vs-1470",
13042 +                      pos->pos_in_unit < extent_unit_width(&pos->coord));
13043 +               assert("nikita-3434",
13044 +                      ergo(extent_is_unallocated(&pos->coord),
13045 +                           pos->pos_in_unit == 0));
13046 +               jput(pos->child);       /* the reference held in pos->child is dropped here */
13047 +               pos->child = NULL;
13048 +
13049 +               return prepped;
13050 +       }
13051 +
13052 +       pos->pos_in_unit = 0;
13053 +       if (extent_is_unallocated(&pos->coord))
13054 +               return 0;
13055 +
13056 +       return leftmost_child_of_unit_check_flushprepped(&pos->coord);
13057 +}
13058 +
13059 +/* Handle the case when the regular reiser4 tree (znodes connected to their
13060 + * neighbors by sibling pointers) is interrupted on leaf level by one or more
13061 + * unformatted nodes.  By having a lock on twig level and using extent code
13062 + * routines to process unformatted nodes we swim around an irregular part of
13063 + * the reiser4 tree. */
13064 +static int handle_pos_on_twig(flush_pos_t * pos)
13065 +{
13066 +       int ret;
13067 +
13068 +       assert("zam-844", pos->state == POS_ON_EPOINT);
13069 +       assert("zam-843", item_is_extent(&pos->coord));
13070 +
13071 +       /* We decide whether to continue slum processing with the current extent
13072 +          unit: if the leftmost child of the current extent unit is flushprepped
13073 +          (i.e. clean or already processed by flush) we stop squalloc().  There
13074 +          is a fast check for unallocated extents which we assume contain all
13075 +          not flushprepped nodes. */
13076 +       /* FIXME: Here we implement a simple check, we are only looking at the
13077 +          leftmost child. */
13078 +       ret = squalloc_extent_should_stop(pos);
13079 +       if (ret != 0) {
13080 +               pos_stop(pos);
13081 +               return ret;
13082 +       }
13083 +
13084 +       while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
13085 +              && item_is_extent(&pos->coord)) {
13086 +               ret = alloc_extent(pos);
13087 +               if (ret) {      /* NOTE(review): this value is not returned; every path below returns 0 */
13088 +                       break;
13089 +               }
13090 +               coord_next_unit(&pos->coord);
13091 +       }
13092 +
13093 +       if (coord_is_after_rightmost(&pos->coord)) {
13094 +               pos->state = POS_END_OF_TWIG;
13095 +               return 0;
13096 +       }
13097 +       if (item_is_internal(&pos->coord)) {
13098 +               pos->state = POS_TO_LEAF;
13099 +               return 0;
13100 +       }
13101 +
13102 +       assert("zam-860", item_is_extent(&pos->coord));
13103 +
13104 +       /* "slum" is over */
13105 +       pos->state = POS_INVALID;
13106 +       return 0;
13107 +}
13108 +
13109 +/* When we are about to return the flush position from twig to leaf level we can process
13110 + * the right twig node or move the position to the leaf.  This processes the right twig
13111 + * if it is possible and jumps to leaf level if not. */
13112 +static int handle_pos_end_of_twig(flush_pos_t * pos)
13113 +{
13114 +       int ret;
13115 +       lock_handle right_lock;
13116 +       load_count right_load;
13117 +       coord_t at_right;
13118 +       jnode *child = NULL;    /* released with jput() at "out" if it was obtained */
13119 +
13120 +       assert("zam-848", pos->state == POS_END_OF_TWIG);
13121 +       assert("zam-849", coord_is_after_rightmost(&pos->coord));
13122 +
13123 +       init_lh(&right_lock);
13124 +       init_load_count(&right_load);
13125 +
13126 +       /* We get a lock on the right twig node even if it is not dirty because
13127 +        * the slum continues or discontinues on leaf level, not on the next twig. This
13128 +        * lock on the right twig is needed for getting its leftmost child. */
13129 +       ret =
13130 +           reiser4_get_right_neighbor(&right_lock, pos->lock.node,
13131 +                                      ZNODE_WRITE_LOCK, GN_SAME_ATOM);
13132 +       if (ret)
13133 +               goto out;
13134 +
13135 +       ret = incr_load_count_znode(&right_load, right_lock.node);
13136 +       if (ret)
13137 +               goto out;
13138 +
13139 +       /* right twig could be not dirty */
13140 +       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
13141 +               /* If the right twig node is dirty we always attempt to squeeze its
13142 +                * content to the left... */
13143 +             became_dirty:
13144 +               ret =
13145 +                   squeeze_right_twig_and_advance_coord(pos, right_lock.node);
13146 +               if (ret <= 0) {
13147 +                       /* pos->coord is on internal item, go to leaf level, or
13148 +                        * we have an error which will be caught in squalloc() */
13149 +                       pos->state = POS_TO_LEAF;
13150 +                       goto out;
13151 +               }
13152 +
13153 +               /* If the right twig was squeezed completely we have to re-lock
13154 +                * the right twig. Now it is done through the top-level squalloc
13155 +                * routine. */
13156 +               if (node_is_empty(right_lock.node))
13157 +                       goto out;
13158 +
13159 +               /* ... and prep it if it is not yet prepped */
13160 +               if (!znode_check_flushprepped(right_lock.node)) {
13161 +                       /* As usual, process parent before ... */
13162 +                       ret =
13163 +                           check_parents_and_squalloc_upper_levels(pos,
13164 +                                                                   pos->lock.
13165 +                                                                   node,
13166 +                                                                   right_lock.
13167 +                                                                   node);
13168 +                       if (ret)
13169 +                               goto out;
13170 +
13171 +                       /* ... processing the child */
13172 +                       ret =
13173 +                           lock_parent_and_allocate_znode(right_lock.node,
13174 +                                                          pos);
13175 +                       if (ret)
13176 +                               goto out;
13177 +               }
13178 +       } else {
13179 +               coord_init_first_unit(&at_right, right_lock.node);
13180 +
13181 +               /* check first child of next twig, should we continue there ? */
13182 +               ret = get_leftmost_child_of_unit(&at_right, &child);
13183 +               if (ret || child == NULL || jnode_check_flushprepped(child)) {
13184 +                       pos_stop(pos);
13185 +                       goto out;
13186 +               }
13187 +
13188 +               /* check clean twig for possible relocation */
13189 +               if (!znode_check_flushprepped(right_lock.node)) {
13190 +                       ret =
13191 +                           reverse_relocate_check_dirty_parent(child,
13192 +                                                               &at_right, pos);
13193 +                       if (ret)
13194 +                               goto out;
13195 +                       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
13196 +                               goto became_dirty;      /* take the dirty-twig path above */
13197 +               }
13198 +       }
13199 +
13200 +       assert("zam-875", znode_check_flushprepped(right_lock.node));
13201 +
13202 +       /* Update the preceder with the block number of the just processed right twig
13203 +        * node. The code above could miss the preceder update because
13204 +        * allocate_znode() might not have been called for this node. */
13205 +       pos->preceder.blk = *znode_get_block(right_lock.node);
13206 +       check_preceder(pos->preceder.blk);
13207 +
13208 +       coord_init_first_unit(&at_right, right_lock.node);
13209 +       assert("zam-868", coord_is_existing_unit(&at_right));
13210 +
13211 +       pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
13212 +       move_flush_pos(pos, &right_lock, &right_load, &at_right);
13213 +
13214 +      out:
13215 +       done_load_count(&right_load);
13216 +       done_lh(&right_lock);
13217 +
13218 +       if (child)
13219 +               jput(child);
13220 +
13221 +       return ret;
13222 +}
13223 +
13224 +/* Move pos->lock to the leaf node pointed to by pos->coord and check whether we should
13225 + * continue there. */
13226 +static int handle_pos_to_leaf(flush_pos_t * pos)
13227 +{
13228 +       int ret;
13229 +       lock_handle child_lock;
13230 +       load_count child_load;
13231 +       jnode *child;
13232 +
13233 +       assert("zam-846", pos->state == POS_TO_LEAF);
13234 +       assert("zam-847", item_is_internal(&pos->coord));
13235 +
13236 +       init_lh(&child_lock);
13237 +       init_load_count(&child_load);
13238 +
13239 +       ret = get_leftmost_child_of_unit(&pos->coord, &child);
13240 +       if (ret)
13241 +               return ret;
13242 +       if (child == NULL) {    /* no child jnode was returned: stop the flush position */
13243 +               pos_stop(pos);
13244 +               return 0;
13245 +       }
13246 +
13247 +       if (jnode_check_flushprepped(child)) {
13248 +               pos->state = POS_INVALID;
13249 +               goto out;
13250 +       }
13251 +
13252 +       ret =
13253 +           longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
13254 +                               ZNODE_LOCK_LOPRI);
13255 +       if (ret)
13256 +               goto out;
13257 +
13258 +       ret = incr_load_count_znode(&child_load, JZNODE(child));
13259 +       if (ret)
13260 +               goto out;
13261 +
13262 +       ret = allocate_znode(JZNODE(child), &pos->coord, pos);
13263 +       if (ret)
13264 +               goto out;
13265 +
13266 +       /* move flush position to leaf level */
13267 +       pos->state = POS_ON_LEAF;
13268 +       move_flush_pos(pos, &child_lock, &child_load, NULL);
13269 +
13270 +       if (node_is_empty(JZNODE(child))) {     /* an empty leaf is deleted and the position invalidated */
13271 +               ret = delete_empty_node(JZNODE(child));
13272 +               pos->state = POS_INVALID;
13273 +       }
13274 +      out:
13275 +       done_load_count(&child_load);
13276 +       done_lh(&child_lock);
13277 +       jput(child);
13278 +
13279 +       return ret;
13280 +}
13281 +
13282 +/* POS_TO_TWIG handler: write-lock the parent of pos->lock.node and look at the item that follows */
13283 +/* the pointer to that node; that item selects the next state.  Returns 0 or a callee's error. */
13284 +static int handle_pos_to_twig(flush_pos_t * pos)
13285 +{
13286 +       int ret;
13287 +
13288 +       lock_handle parent_lock;
13289 +       load_count parent_load;
13290 +       coord_t pcoord;
13291 +
13292 +       assert("zam-852", pos->state == POS_TO_TWIG);
13293 +
13294 +       init_lh(&parent_lock);
13295 +       init_load_count(&parent_load);
13296 +
13297 +       ret =
13298 +           reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
13299 +       if (ret)
13300 +               goto out;
13301 +
13302 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
13303 +       if (ret)
13304 +               goto out;
13305 +
13306 +       ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
13307 +       if (ret)
13308 +               goto out;
13309 +
13310 +       assert("zam-870", item_is_internal(&pcoord));
13311 +       coord_next_item(&pcoord);       /* advance from the pointer to our node to the next item */
13312 +
13313 +       if (coord_is_after_rightmost(&pcoord))
13314 +               pos->state = POS_END_OF_TWIG;
13315 +       else if (item_is_extent(&pcoord))
13316 +               pos->state = POS_ON_EPOINT;
13317 +       else {
13318 +               /* The next item is not an extent, so the -E_NO_NEIGHBOR seen in
13319 +                * handle_pos_on_leaf() simply meant that the edge of the slum
13320 +                * was reached: stop here and keep the lock where it is. */
13321 +               pos_stop(pos);
13322 +               goto out;
13323 +       }
13324 +
13325 +       move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
13326 +
13327 +      out:
13328 +       done_load_count(&parent_load);
13329 +       done_lh(&parent_lock);
13330 +
13331 +       return ret;
13332 +}
13333 +
13334 +typedef int (*pos_state_handle_t) (flush_pos_t *);      /* handler for one flush position state */
13335 +static pos_state_handle_t flush_pos_handlers[] = {      /* indexed by pos->state in squalloc() */
13336 +       /* process formatted nodes on leaf level, keep lock on a leaf node */
13337 +       [POS_ON_LEAF] = handle_pos_on_leaf,
13338 +       /* process unformatted nodes, keep lock on twig node; pos->coord points to the extent
13339 +        * currently being processed */
13340 +       [POS_ON_EPOINT] = handle_pos_on_twig,
13341 +       /* move the lock from a leaf node to its parent for further processing of unformatted nodes */
13342 +       [POS_TO_TWIG] = handle_pos_to_twig,
13343 +       /* move the lock from twig to leaf level when processing of unformatted nodes finishes;
13344 +        * pos->coord points to the leaf node we jump to */
13345 +       [POS_TO_LEAF] = handle_pos_to_leaf,
13346 +       /* after processing the last extent in the twig node, attempt to shift items from the twig's
13347 +        * right neighbor and process them while shifting */
13348 +       [POS_END_OF_TWIG] = handle_pos_end_of_twig,
13349 +       /* process formatted nodes on internal level, keep lock on an internal node */
13350 +       [POS_ON_INTERNAL] = handle_pos_on_internal
13351 +};
13352 +
13353 +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
13354 + * encrypt) nodes and their ancestors in "parent-first" order.  Returns 0 or a negative error. */
13355 +static int squalloc(flush_pos_t * pos)
13356 +{
13357 +       int ret = 0;
13358 +
13359 +       /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
13360 +        * greater CPU efficiency? Measure and see.... -Hans */
13361 +       while (pos_valid(pos)) {
13362 +               ret = flush_pos_handlers[pos->state] (pos);     /* dispatch on the current state */
13363 +               if (ret < 0)    /* only negative results end the loop; positive ones continue */
13364 +                       break;
13365 +
13366 +               ret = rapid_flush(pos);
13367 +               if (ret)        /* any non-zero result of rapid_flush() ends the loop */
13368 +                       break;
13369 +       }
13370 +
13371 +       /* any positive value and -E_NO_NEIGHBOR are legal return codes of the handle_pos*
13372 +          routines; -E_NO_NEIGHBOR means that the slum edge was reached.  Both map to success. */
13373 +       if (ret > 0 || ret == -E_NO_NEIGHBOR)
13374 +               ret = 0;
13375 +
13376 +       return ret;
13377 +}
13378 +
13379 +static void update_ldkey(znode * node)
13380 +{      /* set @node's left delimiting key to the leftmost key stored in @node */
13381 +       reiser4_key ldkey;
13382 +
13383 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));       /* caller holds dk_lock for write */
13384 +       if (node_is_empty(node))
13385 +               return; /* empty node has no leftmost key: leave its delimiting key unchanged */
13386 +
13387 +       znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13388 +}
13389 +
13390 +/* To be called after the node plugin's shift method has moved data from @right to @left, with
13391 +   dk_lock held for write and both nodes write-locked.  Sets the left delimiting key of each non-empty
13392 +   node to the key of its first item, and the right delimiting key of @left as described below. */
13393 +static void update_znode_dkeys(znode * left, znode * right)
13394 +{
13395 +       assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13396 +       assert("vs-1629", (znode_is_write_locked(left) &&
13397 +                          znode_is_write_locked(right)));
13398 +
13399 +       /* the left delimiting key of @left needs updating if @left was empty before the shift */
13400 +       update_ldkey(left);
13401 +       update_ldkey(right);
13402 +       if (node_is_empty(right))       /* @right emptied: @left inherits @right's right delimiting key */
13403 +               znode_set_rd_key(left, znode_get_rd_key(right));
13404 +       else    /* otherwise @left ends where @right now begins */
13405 +               znode_set_rd_key(left, znode_get_ld_key(right));
13406 +}
13407 +
13408 +/* Try to shift everything from @right to @left with the node plugin's shift method, collecting carry
13409 +   work in @todo.  If everything was shifted, @right is removed from the tree.  Result is the number of bytes shifted. */
13410 +static int
13411 +shift_everything_left(znode * right, znode * left, carry_level * todo)
13412 +{
13413 +       coord_t from;
13414 +       node_plugin *nplug;
13415 +       carry_plugin_info info;
13416 +
13417 +       coord_init_after_last_item(&from, right);       /* shift starts from past the last item of @right */
13418 +
13419 +       nplug = node_plugin_by_node(right);
13420 +       info.doing = NULL;
13421 +       info.todo = todo;
13422 +       return nplug->shift(&from, left, SHIFT_LEFT,
13423 +                           1 /* delete @right if it becomes empty */ ,
13424 +                           1
13425 +                           /* move coord @from to node @left if everything will be shifted */
13426 +                           ,
13427 +                           &info);
13428 +}
13429 +
13430 +/* Shift as much as possible from @right to @left using the memcpy-optimized shift_everything_left.
13431 +   @left and @right are formatted neighboring nodes on the same level.  Returns SQUEEZE_TARGET_FULL or
13432 +   SQUEEZE_SOURCE_EMPTY if nothing was shifted, carry()'s result otherwise, or PTR_ERR() of the carry pool. */
13433 +static int squeeze_right_non_twig(znode * left, znode * right)
13434 +{
13435 +       int ret;
13436 +       carry_pool *pool;
13437 +       carry_level *todo;
13438 +
13439 +       assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13440 +
13441 +       if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||     /* squeeze only when both nodes are dirty */
13442 +           !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13443 +               return SQUEEZE_TARGET_FULL;
13444 +
13445 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13446 +       if (IS_ERR(pool))
13447 +               return PTR_ERR(pool);
13448 +       todo = (carry_level *) (pool + 1);      /* the carry levels live right behind the pool header */
13449 +       init_carry_level(todo, pool);
13450 +
13451 +       ret = shift_everything_left(right, left, todo);
13452 +       if (ret > 0) {
13453 +               /* something was shifted */
13454 +               reiser4_tree *tree;
13455 +               __u64 grabbed;
13456 +
13457 +               znode_make_dirty(left);
13458 +               znode_make_dirty(right);
13459 +
13460 +               /* update delimiting keys of nodes which participated in
13461 +                  shift. FIXME: it would be better to have this in shift
13462 +                  node's operation. But it can not be done there. Nobody
13463 +                  remembers why, though */
13464 +               tree = znode_get_tree(left);
13465 +               write_lock_dk(tree);
13466 +               update_znode_dkeys(left, right);
13467 +               write_unlock_dk(tree);
13468 +
13469 +               /* Carry is called to update delimiting key and, maybe, to remove empty
13470 +                  node.  The grabbed-blocks mark taken here is restored after carry(). */
13471 +               grabbed = get_current_context()->grabbed_blocks;
13472 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13473 +               assert("nikita-3003", ret == 0);        /* reserved space is exhausted. Ask Hans. */
13474 +               ret = carry(todo, NULL /* previous level */ );
13475 +               grabbed2free_mark(grabbed);
13476 +       } else {
13477 +               /* Nothing shifted (ret <= 0). NOTE(review): a negative ret is also mapped to a SQUEEZE_* code here -- confirm intended */
13478 +               ret =
13479 +                   node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13480 +                   SQUEEZE_TARGET_FULL;
13481 +       }
13482 +
13483 +       done_carry_pool(pool);
13484 +
13485 +       return ret;
13486 +}
13487 +
13488 +#if REISER4_DEBUG
13489 +static int sibling_link_is_ok(const znode *left, const znode *right)
13490 +{      /* debug-only: non-zero iff @left and @right point at each other as siblings */
13491 +       int result;
13492 +
13493 +       read_lock_tree(znode_get_tree(left));   /* both sibling pointers are read under the tree lock */
13494 +       result = (left->right == right && left == right->left);
13495 +       read_unlock_tree(znode_get_tree(left));
13496 +       return result;
13497 +}
13498 +#endif
13499 +
13500 +/* Shift first unit of first item if it is an internal one.  Return
13501 +   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13502 +   SUBTREE_MOVED. */
13503 +static int shift_one_internal_unit(znode * left, znode * right)
13504 +{
13505 +       int ret;
13506 +       carry_pool *pool;
13507 +       carry_level *todo;
13508 +       coord_t *coord;
13509 +       carry_plugin_info *info;
13510 +       int size, moved;
13511 +
13512 +       assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13513 +       assert("nikita-2435", znode_is_write_locked(left));
13514 +       assert("nikita-2436", znode_is_write_locked(right));
13515 +       assert("nikita-2434", sibling_link_is_ok(left, right));
13516 +
13517 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13518 +                              sizeof(*coord) + sizeof(*info)
13519 +#if REISER4_DEBUG
13520 +                              + sizeof(*coord) + 2 * sizeof(reiser4_key)
13521 +#endif
13522 +           );
13523 +       if (IS_ERR(pool))
13524 +               return PTR_ERR(pool);
13525 +       todo = (carry_level *) (pool + 1);
13526 +       init_carry_level(todo, pool);
13527 +
13528 +       coord = (coord_t *) (todo + 3);
13529 +       coord_init_first_unit(coord, right);
13530 +       info = (carry_plugin_info *) (coord + 1);
13531 +
13532 +#if REISER4_DEBUG
13533 +       if (!node_is_empty(left)) {
13534 +               coord_t *last;
13535 +               reiser4_key *right_key;
13536 +               reiser4_key *left_key;
13537 +
13538 +               last = (coord_t *) (info + 1);
13539 +               right_key = (reiser4_key *) (last + 1);
13540 +               left_key = right_key + 1;
13541 +               coord_init_last_unit(last, left);
13542 +
13543 +               assert("nikita-2463",
13544 +                      keyle(item_key_by_coord(last, left_key),
13545 +                            item_key_by_coord(coord, right_key)));
13546 +       }
13547 +#endif
13548 +
13549 +       assert("jmacd-2007", item_is_internal(coord));
13550 +
13551 +       size = item_length_by_coord(coord);
13552 +       info->todo = todo;
13553 +       info->doing = NULL;
13554 +
13555 +       ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13556 +                                              1
13557 +                                              /* delete @right if it becomes empty */
13558 +                                              ,
13559 +                                              0
13560 +                                              /* do not move coord @coord to node @left */
13561 +                                              ,
13562 +                                              info);
13563 +
13564 +       /* If shift returns positive, then we shifted the item. */
13565 +       assert("vs-423", ret <= 0 || size == ret);
13566 +       moved = (ret > 0);
13567 +
13568 +       if (moved) {
13569 +               /* something was moved */
13570 +               reiser4_tree *tree;
13571 +               int grabbed;
13572 +
13573 +               znode_make_dirty(left);
13574 +               znode_make_dirty(right);
13575 +               tree = znode_get_tree(left);
13576 +               write_lock_dk(tree);
13577 +               update_znode_dkeys(left, right);
13578 +               write_unlock_dk(tree);
13579 +
13580 +               /* reserve space for delimiting keys after shifting */
13581 +               grabbed = get_current_context()->grabbed_blocks;
13582 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13583 +               assert("nikita-3003", ret == 0);        /* reserved space is exhausted. Ask Hans. */
13584 +
13585 +               ret = carry(todo, NULL /* previous level */ );
13586 +               grabbed2free_mark(grabbed);
13587 +       }
13588 +
13589 +       done_carry_pool(pool);
13590 +
13591 +       if (ret != 0) {
13592 +               /* Shift or carry operation failed. */
13593 +               assert("jmacd-7325", ret < 0);
13594 +               return ret;
13595 +       }
13596 +
13597 +       return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13598 +}
13599 +
13600 +/* Make the final relocate/wander decision during forward parent-first squalloc for a znode; on
13601 +   success @node's block becomes pos->preceder.blk.  For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13602 +static int
13603 +allocate_znode_loaded(znode * node,
13604 +                     const coord_t * parent_coord, flush_pos_t * pos)
13605 +{
13606 +       int ret;
13607 +       reiser4_super_info_data *sbinfo = get_current_super_private();
13608 +       /* FIXME(D): We have the node write-locked and should have checked for
13609 +          !allocated() somewhere before reaching this point, but there can be a race, so
13610 +          this assertion is bogus. */
13611 +       assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13612 +       assert("jmacd-7988", znode_is_write_locked(node));
13613 +       assert("jmacd-7989", coord_is_invalid(parent_coord)
13614 +              || znode_is_write_locked(parent_coord->node));
13615 +
13616 +       if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13617 +           znode_is_root(node) ||
13618 +           /* We have enough nodes to relocate no matter what. */
13619 +           (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13620 +               /* No need to decide with new nodes, they are treated the same as
13621 +                  relocate. If the root node is dirty, relocate. */
13622 +               if (pos->preceder.blk == 0) {
13623 +                       /* preceder is unknown and we have decided to relocate the node --
13624 +                          using the default value as search start is better than searching
13625 +                          from block #0. */
13626 +                       get_blocknr_hint_default(&pos->preceder.blk);
13627 +                       check_preceder(pos->preceder.blk);
13628 +               }
13629 +
13630 +               goto best_reloc;        /* jumps into the innermost else-branch below */
13631 +
13632 +       } else if (pos->preceder.blk == 0) {
13633 +               /* If we don't know the preceder, leave it where it is. */
13634 +               jnode_make_wander(ZJNODE(node));
13635 +       } else {
13636 +               /* Make a decision based on block distance. */
13637 +               reiser4_block_nr dist;
13638 +               reiser4_block_nr nblk = *znode_get_block(node);
13639 +
13640 +               assert("jmacd-6172", !blocknr_is_fake(&nblk));
13641 +               assert("jmacd-6173", !blocknr_is_fake(&pos->preceder.blk));
13642 +               assert("jmacd-6174", pos->preceder.blk != 0);
13643 +
13644 +               if (pos->preceder.blk == nblk - 1) {
13645 +                       /* Ideal: the node already sits right after the preceder. */
13646 +                       jnode_make_wander(ZJNODE(node));
13647 +               } else {
13648 +
13649 +                       dist =  /* absolute distance between the node's block and the preceder */
13650 +                           (nblk <
13651 +                            pos->preceder.blk) ? (pos->preceder.blk -
13652 +                                                  nblk) : (nblk -
13653 +                                                           pos->preceder.blk);
13654 +
13655 +                       /* See if we can find a closer block (forward direction only). */
13656 +                       pos->preceder.max_dist =
13657 +                           min((reiser4_block_nr) sbinfo->flush.
13658 +                               relocate_distance, dist);
13659 +                       pos->preceder.level = znode_get_level(node);
13660 +
13661 +                       ret = allocate_znode_update(node, parent_coord, pos);
13662 +
13663 +                       pos->preceder.max_dist = 0;     /* reset the distance limit after the attempt */
13664 +
13665 +                       if (ret && (ret != -ENOSPC))    /* -ENOSPC is handled below, other errors are fatal */
13666 +                               return ret;
13667 +
13668 +                       if (ret == 0) {
13669 +                               /* Got a better allocation. */
13670 +                               znode_make_reloc(node, pos->fq);
13671 +                       } else if (dist < sbinfo->flush.relocate_distance) {
13672 +                               /* The present allocation is good enough. */
13673 +                               jnode_make_wander(ZJNODE(node));
13674 +                       } else {
13675 +                               /* Otherwise, try to relocate to the best position. */
13676 +                             best_reloc:
13677 +                               ret =
13678 +                                   allocate_znode_update(node, parent_coord,
13679 +                                                         pos);
13680 +                               if (ret != 0)
13681 +                                       return ret;
13682 +
13683 +                               /* set JNODE_RELOC bit _after_ node gets allocated */
13684 +                               znode_make_reloc(node, pos->fq);
13685 +                       }
13686 +               }
13687 +       }
13688 +
13689 +       /* This is the new preceder. */
13690 +       pos->preceder.blk = *znode_get_block(node);
13691 +       check_preceder(pos->preceder.blk);
13692 +       pos->alloc_cnt += 1;
13693 +
13694 +       assert("jmacd-4277", !blocknr_is_fake(&pos->preceder.blk));
13695 +
13696 +       return 0;
13697 +}
13698 +
13699 +static int
13700 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13701 +{
13702 +       /*
13703 +        * Run allocate_znode_loaded() under WITH_DATA(), i.e. with the znode pinned in
13704 +        * memory, to avoid races with asynchronous emergency flush (which plays
13705 +        * with the JNODE_FLUSH_RESERVED bit).
13706 +        */
13707 +       return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13708 +}
13709 +
13710 +/* A subroutine of allocate_znode: allocate a new block for @node near pos->preceder, defer-free
13711 +   the old one (unless @node is newly created) and store the new address in the parent item, or in
13712 +   tree->root_block for the root.  May return -ENOSPC if there is no close position, in which case
13713 +   the node is not relocated.  Returns 0 on success or a negative error code. */
13714 +static int
13715 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13716 +                     flush_pos_t * pos)
13717 +{
13718 +       int ret;
13719 +       reiser4_block_nr blk;
13720 +       lock_handle uber_lock;
13721 +       int flush_reserved_used = 0;
13722 +       __u64 grabbed;  /* 64-bit grabbed-blocks mark; an int would narrow ctx->grabbed_blocks */
13723 +       reiser4_context *ctx;
13724 +       reiser4_super_info_data *sbinfo;
13725 +
13726 +       init_lh(&uber_lock);
13727 +
13728 +       ctx = get_current_context();
13729 +       sbinfo = get_super_private(ctx->super);
13730 +
13731 +       grabbed = ctx->grabbed_blocks;  /* restored by grabbed2free_mark() on exit */
13732 +
13733 +       /* discard e-flush allocation */
13734 +       ret = zload(node);
13735 +       if (ret)
13736 +               return ret;
13737 +
13738 +       if (ZF_ISSET(node, JNODE_CREATED)) {
13739 +               assert("zam-816", blocknr_is_fake(znode_get_block(node)));
13740 +               pos->preceder.block_stage = BLOCK_UNALLOCATED;
13741 +       } else {
13742 +               pos->preceder.block_stage = BLOCK_GRABBED;
13743 +
13744 +               /* The disk space for relocating the @node is already reserved in "flush reserved"
13745 +                * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
13746 +                * space from whole disk not from only 95%). */
13747 +               if (znode_get_level(node) == LEAF_LEVEL) {
13748 +                       /*
13749 +                        * earlier (during do_jnode_make_dirty()) we decided
13750 +                        * that @node can possibly go into overwrite set and
13751 +                        * reserved block for its wandering location.
13752 +                        */
13753 +                       txn_atom *atom = get_current_atom_locked();
13754 +                       assert("nikita-3449",
13755 +                              ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13756 +                       flush_reserved2grabbed(atom, (__u64) 1);
13757 +                       spin_unlock_atom(atom);
13758 +                       /*
13759 +                        * we are trying to move node into relocate
13760 +                        * set. Allocation of relocated position "uses"
13761 +                        * reserved block.
13762 +                        */
13763 +                       ZF_CLR(node, JNODE_FLUSH_RESERVED);
13764 +                       flush_reserved_used = 1;
13765 +               } else {
13766 +                       ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13767 +                       if (ret != 0)
13768 +                               goto exit;
13769 +               }
13770 +       }
13771 +
13772 +       /* We may not use the 5% of reserved disk space here, and then flush will not pack tightly. */
13773 +       ret = reiser4_alloc_block(&pos->preceder, &blk,
13774 +                                 BA_FORMATTED | BA_PERMANENT);
13775 +       if (ret)
13776 +               goto exit;
13777 +
13778 +       if (!ZF_ISSET(node, JNODE_CREATED) &&   /* a created node has no old block to free */
13779 +           (ret =
13780 +            reiser4_dealloc_block(znode_get_block(node), 0,
13781 +                                  BA_DEFER | BA_FORMATTED)))
13782 +               goto exit;
13783 +
13784 +       if (likely(!znode_is_root(node))) {
13785 +               item_plugin *iplug;
13786 +
13787 +               iplug = item_plugin_by_coord(parent_coord);
13788 +               assert("nikita-2954", iplug->f.update != NULL);
13789 +               iplug->f.update(parent_coord, &blk);    /* store the new block number in the parent item */
13790 +
13791 +               znode_make_dirty(parent_coord->node);
13792 +
13793 +       } else {
13794 +               reiser4_tree *tree = znode_get_tree(node);
13795 +               znode *uber;
13796 +
13797 +               /* We take a longterm lock on the fake node in order to change
13798 +                  the root block number.  This may cause atom fusion. */
13799 +               ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13800 +                                    &uber_lock);
13801 +               /* The fake node cannot be deleted, and we must have priority
13802 +                  here, and may not be confused with ENOSPC. */
13803 +               assert("jmacd-74412",
13804 +                      ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13805 +
13806 +               if (ret)
13807 +                       goto exit;
13808 +
13809 +               uber = uber_lock.node;
13810 +
13811 +               write_lock_tree(tree);
13812 +               tree->root_block = blk;
13813 +               write_unlock_tree(tree);
13814 +
13815 +               znode_make_dirty(uber);
13816 +       }
13817 +
13818 +       ret = znode_rehash(node, &blk);
13819 +      exit:
13820 +       if (ret) {
13821 +               /* Get flush reserved block back if something fails, because
13822 +                * callers assume that on error block wasn't relocated and its
13823 +                * flush reserved block wasn't used. */
13824 +               if (flush_reserved_used) {
13825 +                       /*
13826 +                        * ok, we failed to move node into relocate
13827 +                        * set. Restore status quo.
13828 +                        */
13829 +                       grabbed2flush_reserved((__u64) 1);
13830 +                       ZF_SET(node, JNODE_FLUSH_RESERVED);
13831 +               }
13832 +       }
13833 +       zrelse(node);   /* pairs with the zload() above */
13834 +       done_lh(&uber_lock);
13835 +       grabbed2free_mark(grabbed);
13836 +       return ret;
13837 +}
13838 +
13839 +/* JNODE INTERFACE */
13840 +
13841 +/* Get the parent of @node locked into @parent_lh and set @coord to the child's position in
13842 +   the parent (@coord may be NULL for a formatted node; the parent is then not loaded).  A formatted
13843 +   @node must already be locked by the caller and, per the assertion below, must not be the root.
13844 +   Returns 0 on success, -ENOENT if an unformatted @node has JNODE_HEARD_BANSHEE set or the lookup does not land exactly on a unit, else the callee's error.
13845 +   May cause atom fusion (original note: only used for read locks, so fusion only occurs when the parent is already dirty). */
13846 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13847 +   pointer in jnodes. */
13848 +static int
13849 +jnode_lock_parent_coord(jnode * node,
13850 +                       coord_t * coord,
13851 +                       lock_handle * parent_lh,
13852 +                       load_count * parent_zh,
13853 +                       znode_lock_mode parent_mode, int try)
13854 +{
13855 +       int ret;
13856 +
13857 +       assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13858 +       assert("edward-54", jnode_is_unformatted(node)
13859 +              || znode_is_any_locked(JZNODE(node)));
13860 +
13861 +       if (!jnode_is_znode(node)) {
13862 +               reiser4_key key;
13863 +               tree_level stop_level = TWIG_LEVEL;
13864 +               lookup_bias bias = FIND_EXACT;
13865 +
13866 +               assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13867 +
13868 +               /* The case when node is not znode, but can have parent coord
13869 +                  (unformatted node, node which represents cluster page,
13870 +                  etc..).  Generate a key for the appropriate entry, search
13871 +                  in the tree using coord_by_key, which handles locking for
13872 +                  us. */
13873 +
13874 +               /*
13875 +                * nothing is locked at this moment, so, nothing prevents
13876 +                * concurrent truncate from removing jnode from inode. To
13877 +                * prevent this spin-lock jnode. jnode can be truncated just
13878 +                * after call to the jnode_build_key(), but this is ok,
13879 +                * because coord_by_key() will just fail to find appropriate
13880 +                * extent.
13881 +                */
13882 +               spin_lock_jnode(node);
13883 +               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13884 +                       jnode_build_key(node, &key);
13885 +                       ret = 0;
13886 +               } else
13887 +                       ret = RETERR(-ENOENT);
13888 +               spin_unlock_jnode(node);
13889 +
13890 +               if (ret != 0)
13891 +                       return ret;
13892 +
13893 +               if (jnode_is_cluster_page(node))        /* cluster pages are looked up at leaf level */
13894 +                       stop_level = LEAF_LEVEL;
13895 +
13896 +               assert("jmacd-1812", coord != NULL);
13897 +
13898 +               ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13899 +                                  parent_mode, bias, stop_level, stop_level,
13900 +                                  CBK_UNIQUE, NULL /*ra_info */ );
13901 +               switch (ret) {
13902 +               case CBK_COORD_NOTFOUND:
13903 +                       assert("edward-1038",
13904 +                              ergo(jnode_is_cluster_page(node),
13905 +                                   JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13906 +                       if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13907 +                               warning("nikita-3177", "Parent not found");
13908 +                       return ret;
13909 +               case CBK_COORD_FOUND:
13910 +                       if (coord->between != AT_UNIT) {
13911 +                               /* lookup succeeded but did not land exactly on a unit: drop the lock and report -ENOENT */
13912 +                               done_lh(parent_lh);
13913 +                               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13914 +                                       warning("nikita-3178",
13915 +                                               "Found but not happy: %i",
13916 +                                               coord->between);
13917 +                               }
13918 +                               return RETERR(-ENOENT);
13919 +                       }
13920 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
13921 +                       if (ret != 0)   /* NOTE(review): returns with parent_lh still held -- presumably released by the caller */
13922 +                               return ret;
13923 +                       /* if (jnode_is_cluster_page(node)) {
13924 +                          races with write() are possible
13925 +                          check_child_cluster (parent_lh->node);
13926 +                          }
13927 +                        */
13928 +                       break;
13929 +               default:
13930 +                       return ret;
13931 +               }
13932 +
13933 +       } else {
13934 +               int flags;
13935 +               znode *z;
13936 +
13937 +               z = JZNODE(node);
13938 +               /* Formatted node case: */
13939 +               assert("jmacd-2061", !znode_is_root(z));
13940 +
13941 +               flags = GN_ALLOW_NOT_CONNECTED;
13942 +               if (try)
13943 +                       flags |= GN_TRY_LOCK;
13944 +
13945 +               ret =
13946 +                   reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13947 +               if (ret != 0)
13948 +                       /* -E_REPEAT is ok here, it is handled by the caller. */
13949 +                       return ret;
13950 +
13951 +               /* Make the child's position "hint" up-to-date.  (Unless above
13952 +                  root, which caller must check.) */
13953 +               if (coord != NULL) {
13954 +
13955 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
13956 +                       if (ret != 0) {
13957 +                               warning("jmacd-976812386",
13958 +                                       "incr_load_count_znode failed: %d",
13959 +                                       ret);
13960 +                               return ret;     /* NOTE(review): parent_lh stays held here too -- confirm the caller releases it */
13961 +                       }
13962 +
13963 +                       ret = find_child_ptr(parent_lh->node, z, coord);
13964 +                       if (ret != 0) {
13965 +                               warning("jmacd-976812",
13966 +                                       "find_child_ptr failed: %d", ret);
13967 +                               return ret;
13968 +                       }
13969 +               }
13970 +       }
13971 +
13972 +       return 0;
13973 +}
13974 +
13975 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13976 +   If there is no next neighbor or the neighbor is not in memory or if there is a
13977 +   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned
13978 +   (-ENOENT from reiser4_get_neighbor() is mapped to -E_NO_NEIGHBOR; other errors are returned unchanged).  In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0. */
13979 +static int neighbor_in_slum(znode * node,      /* starting point */
13980 +                           lock_handle * lock, /* lock handle handed to reiser4_get_neighbor(); on success lock->node is the neighbor */
13981 +                           sideof side,        /* left or right direction we seek the next node in */
13982 +                           znode_lock_mode mode,       /* kind of lock we want */
13983 +                           int check_dirty)    /* true if the neighbor should be dirty */
13984 +{
13985 +       int ret;
13986 +
13987 +       assert("jmacd-6334", znode_is_connected(node));
13988 +
13989 +       ret =
13990 +           reiser4_get_neighbor(lock, node, mode,
13991 +                                GN_SAME_ATOM | (side ==
13992 +                                                LEFT_SIDE ? GN_GO_LEFT : 0));
13993 +
13994 +       if (ret) {
13995 +               /* May return -ENOENT or -E_NO_NEIGHBOR; both are reported as -E_NO_NEIGHBOR. */
13996 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
13997 +               if (ret == -ENOENT) {
13998 +                       ret = RETERR(-E_NO_NEIGHBOR);
13999 +               }
14000 +
14001 +               return ret;
14002 +       }
14003 +       if (!check_dirty)
14004 +               return 0;
14005 +       /* Check dirty bit of locked znode, no races here */
14006 +       if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
14007 +               return 0;
14008 +
14009 +       done_lh(lock);  /* neighbor is not dirty: give its lock handle back before failing */
14010 +       return RETERR(-E_NO_NEIGHBOR);
14011 +}
14012 +
14013 +/* Return true if two znodes have the same parent.  This is called with both nodes
14014 +   write-locked (for squeezing); the parent pointers are nevertheless compared under the tree read lock. */
14015 +static int znode_same_parents(znode * a, znode * b)
14016 +{
14017 +       int result;
14018 +
14019 +       assert("jmacd-7011", znode_is_write_locked(a));
14020 +       assert("jmacd-7012", znode_is_write_locked(b));
14021 +
14022 +       /* We lock the whole tree for this check.... I really don't like whole tree
14023 +        * locks... -Hans */
14024 +       read_lock_tree(znode_get_tree(a));
14025 +       result = (znode_parent(a) == znode_parent(b));
14026 +       read_unlock_tree(znode_get_tree(a));
14027 +       return result;
14028 +}
14029 +
14030 +/* FLUSH SCAN */
14031 +
14032 +/* Initialize the flush_scan data structure: zero it, then initialize both lock handles, both load counts, and mark parent_coord invalid. */
14033 +static void scan_init(flush_scan * scan)
14034 +{
14035 +       memset(scan, 0, sizeof(*scan));
14036 +       init_lh(&scan->node_lock);
14037 +       init_lh(&scan->parent_lock);
14038 +       init_load_count(&scan->parent_load);
14039 +       init_load_count(&scan->node_load);
14040 +       coord_init_invalid(&scan->parent_coord, NULL);
14041 +}
14042 +
14043 +/* Release everything held by the flush scan: both load counts, the reference on the current node (if any), and both lock handles. */
14044 +static void scan_done(flush_scan * scan)
14045 +{
14046 +       done_load_count(&scan->node_load);
14047 +       if (scan->node != NULL) {
14048 +               jput(scan->node);
14049 +               scan->node = NULL;      /* so that a repeated scan_done() does not jput twice */
14050 +       }
14051 +       done_load_count(&scan->parent_load);
14052 +       done_lh(&scan->parent_lock);
14053 +       done_lh(&scan->node_lock);
14054 +}
14055 +
14056 +/* Returns true if flush scanning is finished: either scan->stop is set, or this is a rightward scan whose count has reached max_count (max_count is not checked here for leftward scans). */
14057 +int scan_finished(flush_scan * scan)
14058 +{
14059 +       return scan->stop || (scan->direction == RIGHT_SIDE &&
14060 +                             scan->count >= scan->max_count);
14061 +}
14062 +
14063 +/* Return true if the scan should continue to the @tonode.  True if the node meets the
14064 +   same_slum_check condition.  If not, drop the reference on @tonode (jput) and stop the scan. */
14065 +int scan_goto(flush_scan * scan, jnode * tonode)
14066 +{
14067 +       int go = same_slum_check(scan->node, tonode, 1, 0);
14068 +
14069 +       if (!go) {
14070 +               scan->stop = 1;
14071 +               jput(tonode);
14072 +       }
14073 +
14074 +       return go;
14075 +}
14076 +
14077 +/* Set the current scan->node, increment count by the @add_count (number to
14078 +   count, e.g., skipped unallocated nodes), deref previous current, and copy the current
14079 +   parent coordinate (if @parent is not NULL).  No reference is taken here: the callers in this file pass an already referenced @node (jref()/zref() results), which the scan then owns. */
14080 +int
14081 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
14082 +                const coord_t * parent)
14083 +{
14084 +       /* Release the old load count and the old reference, then adopt the new node. */
14085 +       done_load_count(&scan->node_load);
14086 +
14087 +       if (scan->node != NULL) {
14088 +               jput(scan->node);
14089 +       }
14090 +       scan->node = node;
14091 +       scan->count += add_count;
14092 +
14093 +       /* This next stmt is somewhat inefficient.  The scan_extent_coord code could
14094 +          delay this update step until it finishes and update the parent_coord only once.
14095 +          It did that before, but there was a bug and this was the easiest way to make it
14096 +          correct. */
14097 +       if (parent != NULL) {
14098 +               coord_dup(&scan->parent_coord, parent);
14099 +       }
14100 +
14101 +       /* Failure may happen at the incr_load_count call, but the caller can assume the reference
14102 +          is safely stored in scan->node (and will be dropped by scan_done() or the next scan_set_current()). */
14103 +       return incr_load_count_jnode(&scan->node_load, node);
14104 +}
14105 +
14106 +/* Return true if scanning in the leftward direction, i.e. scan->direction is LEFT_SIDE. */
14107 +int scanning_left(flush_scan * scan)
14108 +{
14109 +       return scan->direction == LEFT_SIDE;
14110 +}
14111 +
14112 +/* Performs leftward scanning starting from either kind of node.  Counts the starting
14113 +   node.  The right-scan object is passed in for the left-scan in order to copy the parent
14114 +   of an unformatted starting position.  This way we avoid searching for the unformatted
14115 +   node's parent when scanning in each direction.  If we search for the parent once it is
14116 +   set in both scan objects.  The limit parameter tells flush-scan when to stop.
14117 +
14118 +   Rapid scanning is used only during scan_left, where we are interested in finding the
14119 +   'leftpoint' where we begin flushing.  We are interested in stopping at the left child
14120 +   of a twig that does not have a dirty left neighbor.  THIS IS A SPECIAL CASE.  The
14121 +   problem is finding a way to flush only those nodes without unallocated children, and it
14122 +   is difficult to solve in the bottom-up flushing algorithm we are currently using.  The
14123 +   problem can be solved by scanning left at every level as we go upward, but this would
14124 +   basically bring us back to using a top-down allocation strategy, which we already tried
14125 +   (see BK history from May 2002), and has a different set of problems.  The top-down
14126 +   strategy makes avoiding unallocated children easier, but makes it difficult to
14127 +   properly flush dirty children with clean parents that would otherwise stop the
14128 +   top-down flush, only later to dirty the parent once the children are flushed.  So we
14129 +   solve the problem in the bottom-up algorithm with a special case for twigs and leaves
14130 +   only.
14131 +
14132 +   The first step in solving the problem is this rapid leftward scan.  After we determine
14133 +   that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
14134 +   are no longer interested in the exact count, we are only interested in finding the
14135 +   best place to start the flush.  We could choose one of two possibilities:
14136 +
14137 +   1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
14138 +   This requires checking one leaf per rapid-scan twig
14139 +
14140 +   2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
14141 +   to the left.  This requires checking possibly all of the in-memory children of each
14142 +   twig during the rapid scan.
14143 +
14144 +   For now we implement the first policy.
14145 +*/
14146 +static int
14147 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
14148 +{
14149 +       int ret = 0;
14150 +
14151 +       scan->max_count = limit;
14152 +       scan->direction = LEFT_SIDE;
14153 +
14154 +       ret = scan_set_current(scan, jref(node), 1, NULL);
14155 +       if (ret != 0) {
14156 +               return ret;
14157 +       }
14158 +
14159 +       ret = scan_common(scan, right);
14160 +       if (ret != 0) {
14161 +               return ret;
14162 +       }
14163 +
14164 +       /* Before rapid scanning, we need a lock on scan->node so that we can get its
14165 +          parent, only if formatted.  The result of taking that lock is what we return. */
14166 +       if (jnode_is_znode(scan->node)) {
14167 +               ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
14168 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
14169 +       }
14170 +
14171 +       /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
14172 +       return ret;
14173 +}
14174 +
14175 +/* Performs rightward scanning... Does not count the starting node.  The limit parameter
14176 +   is described in scan_left.  If the starting node is unformatted then the
14177 +   parent_coord was already set during scan_left.  This function takes no rapid-scan
14178 +   argument and passes no second scan object (NULL) to scan_common().
14179 +
14180 +   scan_right is only called if the scan_left operation does not count at least
14181 +   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter is set to
14182 +   the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
14183 +   scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
14184 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
14185 +{
14186 +       int ret;
14187 +
14188 +       scan->max_count = limit;
14189 +       scan->direction = RIGHT_SIDE;
14190 +
14191 +       ret = scan_set_current(scan, jref(node), 0, NULL);      /* add_count 0: starting node is not counted */
14192 +       if (ret != 0) {
14193 +               return ret;
14194 +       }
14195 +
14196 +       return scan_common(scan, NULL);
14197 +}
14198 +
14199 +/* Common code to perform left or right scanning.  @other (may be NULL) is handed to scan_unformatted(), which may duplicate the parent coord/lock/load into it. */
14200 +static int scan_common(flush_scan * scan, flush_scan * other)
14201 +{
14202 +       int ret;
14203 +
14204 +       assert("nikita-2376", scan->node != NULL);
14205 +       assert("edward-54", jnode_is_unformatted(scan->node)
14206 +              || jnode_is_znode(scan->node));
14207 +
14208 +       /* Special case for starting at an unformatted node.  Optimization: we only want
14209 +          to search for the parent (which requires a tree traversal) once.  Obviously, we
14210 +          shouldn't have to call it once for the left scan and once for the right scan.
14211 +          For this reason, if we search for the parent during scan-left we then duplicate
14212 +          the coord/lock/load into the scan-right object. */
14213 +       if (jnode_is_unformatted(scan->node)) {
14214 +               ret = scan_unformatted(scan, other);
14215 +               if (ret != 0)
14216 +                       return ret;
14217 +       }
14218 +       /* This loop expects to start at a formatted position and performs chaining of
14219 +          formatted regions; it ends when scan_finished() becomes true or an error is returned. */
14220 +       while (!scan_finished(scan)) {
14221 +
14222 +               ret = scan_formatted(scan);
14223 +               if (ret != 0) {
14224 +                       return ret;
14225 +               }
14226 +       }
14227 +
14228 +       return 0;
14229 +}
14230 +
14231 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
14232 +{
14233 +       int ret = 0;
14234 +       int try = 0;
14235 +
14236 +       if (!coord_is_invalid(&scan->parent_coord))     /* parent coord already known: go straight to scan_by_coord() */
14237 +               goto scan;
14238 +
14239 +       /* set parent coord from the current scan position */
14240 +       if (!jnode_is_unformatted(scan->node)) {
14241 +               /* formatted position */
14242 +
14243 +               lock_handle lock;
14244 +               assert("edward-301", jnode_is_znode(scan->node));
14245 +               init_lh(&lock);
14246 +
14247 +               /*
14248 +                * when flush starts from unformatted node, first thing it
14249 +                * does is tree traversal to find formatted parent of starting
14250 +                * node. This parent is then kept locked across scans to the
14251 +                * left and to the right. This means that during scan to the
14252 +                * left we cannot take left-ward lock, because this is
14253 +                * dead-lock prone. So, if we are scanning to the left and
14254 +                * there is already lock held by this thread,
14255 +                * jnode_lock_parent_coord() should use try-lock.
14256 +                */
14257 +               try = scanning_left(scan)
14258 +                   && !lock_stack_isclean(get_current_lock_stack());
14259 +               /* Need the node locked to get the parent lock, We have to
14260 +                  take write lock since there is at least one call path
14261 +                  where this znode is already write-locked by us. */
14262 +               ret =
14263 +                   longterm_lock_znode(&lock, JZNODE(scan->node),
14264 +                                       ZNODE_WRITE_LOCK,
14265 +                                       scanning_left(scan) ? ZNODE_LOCK_LOPRI :
14266 +                                       ZNODE_LOCK_HIPRI);
14267 +               if (ret != 0)
14268 +                       /* EINVAL or E_DEADLOCK here mean... try again!  At this point we've
14269 +                          scanned too far and can't back out, just start over. */
14270 +                       return ret;
14271 +
14272 +               ret = jnode_lock_parent_coord(scan->node,
14273 +                                             &scan->parent_coord,
14274 +                                             &scan->parent_lock,
14275 +                                             &scan->parent_load,
14276 +                                             ZNODE_WRITE_LOCK, try);
14277 +
14278 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
14279 +               done_lh(&lock);
14280 +               if (ret == -E_REPEAT) {         /* not an error for the caller: just stop the scan here */
14281 +                       scan->stop = 1;
14282 +                       return 0;
14283 +               }
14284 +               if (ret)
14285 +                       return ret;
14286 +
14287 +       } else {
14288 +               /* unformatted position */
14289 +
14290 +               ret =
14291 +                   jnode_lock_parent_coord(scan->node, &scan->parent_coord,
14292 +                                           &scan->parent_lock,
14293 +                                           &scan->parent_load,
14294 +                                           ZNODE_WRITE_LOCK, try);     /* try is still 0 on this path */
14295 +
14296 +               if (IS_CBKERR(ret))
14297 +                       return ret;
14298 +
14299 +               if (ret == CBK_COORD_NOTFOUND)
14300 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
14301 +                       return ret;
14302 +
14303 +               /* parent was found */
14304 +               assert("jmacd-8661", other != NULL);
14305 +               /* Duplicate the coord, lock and load count into the other flush_scan. */
14306 +               coord_dup(&other->parent_coord, &scan->parent_coord);
14307 +               copy_lh(&other->parent_lock, &scan->parent_lock);
14308 +               copy_load_count(&other->parent_load, &scan->parent_load);
14309 +       }
14310 +      scan:
14311 +       return scan_by_coord(scan);
14312 +}
14313 +
14314 +/* Performs left- or rightward scanning starting from a formatted node. Follow left/right
14315 +   sibling pointers (each read under the tree read lock) as long as:
14316 +
14317 +   - node->left/right is non-NULL
14318 +   - node->left/right is connected, dirty
14319 +   - node->left/right belongs to the same atom
14320 +   - scan has not reached maximum count
14321 +*/
14322 +static int scan_formatted(flush_scan * scan)
14323 +{
14324 +       int ret;
14325 +       znode *neighbor = NULL;
14326 +
14327 +       assert("jmacd-1401", !scan_finished(scan));
14328 +
14329 +       do {
14330 +               znode *node = JZNODE(scan->node);
14331 +
14332 +               /* Node should be connected, but if not stop the scan. */
14333 +               if (!znode_is_connected(node)) {
14334 +                       scan->stop = 1;
14335 +                       break;
14336 +               }
14337 +
14338 +               /* Lock the tree, check-for and reference the next sibling. */
14339 +               read_lock_tree(znode_get_tree(node));
14340 +
14341 +               /* It may be that a node is inserted or removed between a node and its
14342 +                  left sibling while the tree lock is released, but the flush-scan count
14343 +                  does not need to be precise.  Thus, we release the tree lock as soon as
14344 +                  we get the neighboring node. */
14345 +               neighbor = scanning_left(scan) ? node->left : node->right;
14346 +               if (neighbor != NULL) {
14347 +                       zref(neighbor);
14348 +               }
14349 +
14350 +               read_unlock_tree(znode_get_tree(node));
14351 +
14352 +               /* If neighbor is NULL at the leaf level, need to check for an unformatted
14353 +                  sibling using the parent--break in any case. */
14354 +               if (neighbor == NULL) {
14355 +                       break;
14356 +               }
14357 +
14358 +               /* Check the condition for going on in the scan direction, break if it is not met.  This also
14359 +                  releases (jputs) the neighbor if false. */
14360 +               if (!scan_goto(scan, ZJNODE(neighbor))) {
14361 +                       break;
14362 +               }
14363 +
14364 +               /* Advance the flush_scan state to the neighbor, repeat. */
14365 +               ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14366 +               if (ret != 0) {
14367 +                       return ret;
14368 +               }
14369 +
14370 +       } while (!scan_finished(scan));
14371 +
14372 +       /* If neighbor is NULL then we reached the end of a formatted region, or else the
14373 +          sibling is not in memory, now check for an extent in the scan direction (as long as
14374 +          LEAF_LEVEL). */
14375 +       if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14376 +           || scan_finished(scan)) {
14377 +               scan->stop = 1;
14378 +               return 0;
14379 +       }
14380 +       /* Otherwise, calls scan_by_coord for the right(left)most item of the
14381 +          left(right) neighbor on the parent level, then possibly continue. */
14382 +
14383 +       coord_init_invalid(&scan->parent_coord, NULL);
14384 +       return scan_unformatted(scan, NULL);
14385 +}
14386 +
14387 +/* NOTE-EDWARD:
14388 +   This scans adjacent items of the same type and calls scan flush plugin for each one.
14389 +   Performs left(right)ward scanning starting from a (possibly) unformatted node.  If we start
14390 +   from unformatted node, then we continue only if the next neighbor is also unformatted.
14391 +   When called from scan_formatted, we skip first iteration (to make sure that
14392 +   right(left)most item of the left(right) neighbor on the parent level is of the same
14393 +   type and set appropriate coord). */
14394 +static int scan_by_coord(flush_scan * scan)
14395 +{
14396 +       int ret = 0;
14397 +       int scan_this_coord;
14398 +       lock_handle next_lock;
14399 +       load_count next_load;
14400 +       coord_t next_coord;
14401 +       jnode *child;
14402 +       item_plugin *iplug;
14403 +
14404 +       init_lh(&next_lock);
14405 +       init_load_count(&next_load);
14406 +       scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14407 +
14408 +       /* set initial item id */
14409 +       iplug = item_plugin_by_coord(&scan->parent_coord);
14410 +
14411 +       for (; !scan_finished(scan); scan_this_coord = 1) {
14412 +               if (scan_this_coord) {
14413 +                       /* Here we expect that the unit is scannable.  It may not be so due
14414 +                        * to a race with extent->tail conversion.  */
14415 +                       if (iplug->f.scan == NULL) {
14416 +                               scan->stop = 1;
14417 +                               ret = -E_REPEAT;
14418 +                               /* skip the check at the end. */
14419 +                               goto race;
14420 +                       }
14421 +
14422 +                       ret = iplug->f.scan(scan);
14423 +                       if (ret != 0)
14424 +                               goto exit;
14425 +
14426 +                       if (scan_finished(scan)) {
14427 +                               checkchild(scan);
14428 +                               break;
14429 +                       }
14430 +               } else {
14431 +                       /* the same race against truncate as above is possible
14432 +                        * here, it seems */
14433 +
14434 +                       /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
14435 +                          the first coordinate. */
14436 +                       assert("jmacd-1231",
14437 +                              item_is_internal(&scan->parent_coord));
14438 +               }
14439 +
14440 +               if (iplug->f.utmost_child == NULL
14441 +                   || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14442 +                       /* stop this coord and continue on parent level */
14443 +                       ret =
14444 +                           scan_set_current(scan,
14445 +                                            ZJNODE(zref
14446 +                                                   (scan->parent_coord.node)),
14447 +                                            1, NULL);
14448 +                       if (ret != 0)
14449 +                               goto exit;
14450 +                       break;
14451 +               }
14452 +
14453 +               /* Either way, the invariant is that scan->parent_coord is set to the
14454 +                  parent of scan->node. Now get the next unit. */
14455 +               coord_dup(&next_coord, &scan->parent_coord);
14456 +               coord_sideof_unit(&next_coord, scan->direction);
14457 +
14458 +               /* If off-the-end of the twig, try the next twig. */
14459 +               if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14460 +                       /* We take the write lock because we may start flushing from this
14461 +                        * coordinate. */
14462 +                       ret =
14463 +                           neighbor_in_slum(next_coord.node, &next_lock,
14464 +                                            scan->direction, ZNODE_WRITE_LOCK,
14465 +                                            1 /* check dirty */ );
14466 +                       if (ret == -E_NO_NEIGHBOR) {    /* no suitable next twig: stop the scan, not an error */
14467 +                               scan->stop = 1;
14468 +                               ret = 0;
14469 +                               break;
14470 +                       }
14471 +
14472 +                       if (ret != 0) {
14473 +                               goto exit;
14474 +                       }
14475 +
14476 +                       ret = incr_load_count_znode(&next_load, next_lock.node);
14477 +                       if (ret != 0) {
14478 +                               goto exit;
14479 +                       }
14480 +
14481 +                       coord_init_sideof_unit(&next_coord, next_lock.node,
14482 +                                              sideof_reverse(scan->direction));
14483 +               }
14484 +
14485 +               iplug = item_plugin_by_coord(&next_coord);
14486 +
14487 +               /* Get the next child. */
14488 +               ret =
14489 +                   iplug->f.utmost_child(&next_coord,
14490 +                                         sideof_reverse(scan->direction),
14491 +                                         &child);
14492 +               if (ret != 0)
14493 +                       goto exit;
14494 +               /* If the next child is not in memory, or, item_utmost_child
14495 +                  failed (due to race with unlink, most probably), stop
14496 +                  here. */
14497 +               if (child == NULL || IS_ERR(child)) {
14498 +                       scan->stop = 1;
14499 +                       checkchild(scan);
14500 +                       break;
14501 +               }
14502 +
14503 +               assert("nikita-2374", jnode_is_unformatted(child)
14504 +                      || jnode_is_znode(child));
14505 +
14506 +               /* See if it is dirty, part of the same atom. */
14507 +               if (!scan_goto(scan, child)) {
14508 +                       checkchild(scan);
14509 +                       break;
14510 +               }
14511 +
14512 +               /* If so, make this child current. */
14513 +               ret = scan_set_current(scan, child, 1, &next_coord);
14514 +               if (ret != 0)
14515 +                       goto exit;
14516 +
14517 +               /* Now continue.  If formatted we release the parent lock and return, then
14518 +                  proceed. */
14519 +               if (jnode_is_znode(child))
14520 +                       break;
14521 +
14522 +               /* Otherwise, repeat the above loop with next_coord.  If we moved to the next twig, it becomes the parent. */
14523 +               if (next_load.node != NULL) {
14524 +                       done_lh(&scan->parent_lock);
14525 +                       move_lh(&scan->parent_lock, &next_lock);
14526 +                       move_load_count(&scan->parent_load, &next_load);
14527 +               }
14528 +       }
14529 +
14530 +       assert("jmacd-6233", scan_finished(scan) || jnode_is_znode(scan->node));
14531 +      exit:
14532 +       checkchild(scan);
14533 +      race:                    /* skip the above check  */
14534 +       if (jnode_is_znode(scan->node)) {
14535 +               done_lh(&scan->parent_lock);
14536 +               done_load_count(&scan->parent_load);
14537 +       }
14538 +
14539 +       done_load_count(&next_load);
14540 +       done_lh(&next_lock);
14541 +       return ret;
14542 +}
14543 +
14544 +/* FLUSH POS HELPERS */
14545 +
14546 +/* Initialize the fields of a flush_position: zero everything, mark the state POS_INVALID, and set up the coord, lock handle, load count and preceder hint. */
14547 +static void pos_init(flush_pos_t * pos)
14548 +{
14549 +       memset(pos, 0, sizeof *pos);
14550 +
14551 +       pos->state = POS_INVALID;
14552 +       coord_init_invalid(&pos->coord, NULL);
14553 +       init_lh(&pos->lock);
14554 +       init_load_count(&pos->load);
14555 +
14556 +       blocknr_hint_init(&pos->preceder);
14557 +}
14558 +
14559 +/* The flush loop inside squalloc periodically checks pos_valid to
14560 +   determine when "enough flushing" has been performed.  This will return true until one
14561 +   of the following conditions is met:
14562 +
14563 +   1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14564 +   parameter, meaning we have flushed as many blocks as the kernel requested.  When
14565 +   flushing to commit, this parameter is NULL.
14566 +
14567 +   2. pos_stop() is called because squalloc discovers that the "next" node in the
14568 +   flush order is either nonexistent, not dirty, or not in the same atom.
14569 +*/
14570 +
14571 +static int pos_valid(flush_pos_t * pos)
14572 +{
14573 +       return pos->state != POS_INVALID;       /* the only thing tested here is the state field */
14574 +}
14575 +
14576 +/* Release any resources of a flush_position: stop it (pos_stop), finish the preceder hint, and free the convert data if there is any.  Called when jnode_flush finishes. */
14577 +static void pos_done(flush_pos_t * pos)
14578 +{
14579 +       pos_stop(pos);
14580 +       blocknr_hint_done(&pos->preceder);
14581 +       if (convert_data(pos))
14582 +               free_convert_data(pos);
14583 +}
14584 +
14585 +/* Reset the point and parent.  Called during flush subroutines to terminate the
14586 +   squalloc loop.  Marks the position POS_INVALID (so pos_valid() becomes false) and always returns 0. */
14587 +static int pos_stop(flush_pos_t * pos)
14588 +{
14589 +       pos->state = POS_INVALID;
14590 +       done_lh(&pos->lock);
14591 +       done_load_count(&pos->load);
14592 +       coord_init_invalid(&pos->coord, NULL);
14593 +
14594 +       if (pos->child) {
14595 +               jput(pos->child);
14596 +               pos->child = NULL;      /* cleared so a later pos_stop() does not jput it again */
14597 +       }
14598 +
14599 +       return 0;
14600 +}
14601 +
14602 +/* Return the flush_position's block allocator hint (a pointer to pos->preceder, not a copy). */
14603 +reiser4_blocknr_hint *pos_hint(flush_pos_t * pos)
14604 +{
14605 +       return &pos->preceder;
14606 +}
14607 +
14608 +flush_queue_t *pos_fq(flush_pos_t * pos)        /* Return the flush queue pointer stored in the flush_position. */
14609 +{
14610 +       return pos->fq;
14611 +}
14612 +
14613 +/* Make Linus happy.
14614 +   Local variables:
14615 +   c-indentation-style: "K&R"
14616 +   mode-name: "LC"
14617 +   c-basic-offset: 8
14618 +   tab-width: 8
14619 +   fill-column: 90
14620 +   LocalWords:  preceder
14621 +   End:
14622 +*/
14623 diff --git a/fs/reiser4/flush.h b/fs/reiser4/flush.h
14624 new file mode 100644
14625 index 0000000..ec362cc
14626 --- /dev/null
14627 +++ b/fs/reiser4/flush.h
14628 @@ -0,0 +1,274 @@
14629 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14630 +
14631 +/* DECLARATIONS: */
14632 +
14633 +#if !defined(__REISER4_FLUSH_H__)
14634 +#define __REISER4_FLUSH_H__
14635 +
14636 +#include "plugin/cluster.h"
14637 +
14638 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14639 +   single level of the tree.  A flush-scan is used for counting the number of adjacent
14640 +   nodes to flush, which is used to determine whether we should relocate, and it is also
14641 +   used to find a starting point for flush.  A flush-scan object can scan in both right
14642 +   and left directions via the scan_left() and scan_right() interfaces.  The
14643 +   right- and left-variations are similar but perform different functions.  When scanning
14644 +   left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14645 +   When scanning right we are simply counting the number of adjacent, dirty nodes. */
14646 +struct flush_scan {
14647 +
14648 +       /* The current number of nodes scanned on this level. */
14649 +       unsigned count;
14650 +
14651 +       /* There may be a maximum number of nodes for a scan on any single level.  When
14652 +          going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h).  NOTE(review): scan_finished() in flush.c only compares count with max_count for RIGHT_SIDE scans -- confirm this is intended. */
14653 +       unsigned max_count;
14654 +
14655 +       /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14656 +       sideof direction;
14657 +
14658 +       /* Initially @stop is set to false then set true once some condition stops the
14659 +          search (e.g., we found a clean node before reaching max_count or we found a
14660 +          node belonging to another atom). */
14661 +       int stop;
14662 +
14663 +       /* The current scan position.  If @node is non-NULL then its reference count has
14664 +          been incremented to reflect this reference. */
14665 +       jnode *node;
14666 +
14667 +       /* A handle for zload/zrelse of current scan position node. */
14668 +       load_count node_load;
14669 +
14670 +       /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14671 +          node is locked using this lock handle.  The endpoint needs to be locked for
14672 +          transfer to the flush_position object after scanning finishes. */
14673 +       lock_handle node_lock;
14674 +
14675 +       /* When the position is unformatted, its parent, coordinate, and parent
14676 +          zload/zrelse handle. */
14677 +       lock_handle parent_lock;
14678 +       coord_t parent_coord;
14679 +       load_count parent_load;
14680 +
14681 +       /* The block allocator preceder hint.  Sometimes flush_scan determines what the
14682 +          preceder is and if so it sets it here, after which it is copied into the
14683 +          flush_position.  Otherwise, the preceder is computed later. */
14684 +       reiser4_block_nr preceder_blk;
14685 +};
14686 +
14687 +typedef struct convert_item_info {
14688 +       dc_item_stat d_cur;     /* disk cluster state of the current item */
14689 +       dc_item_stat d_next;    /* disk cluster state of the next slum item */
14690 +       struct inode *inode;
14691 +       flow_t flow;
14692 +} convert_item_info_t;
14693 +
14694 +typedef struct convert_info {
14695 +       int count;              /* for squalloc terminating */
14696 +       reiser4_cluster_t clust;        /* transform cluster */
14697 +       item_plugin *iplug;     /* current item plugin */
14698 +       convert_item_info_t *itm;       /* current item info */
14699 +} convert_info_t;
14700 +
14701 +typedef enum flush_position_state {
14702 +       POS_INVALID,            /* Invalid or stopped pos, do not continue slum
14703 +                                * processing */
14704 +       POS_ON_LEAF,            /* pos points to already prepped, locked formatted node at
14705 +                                * leaf level */
14706 +       POS_ON_EPOINT,          /* pos keeps a lock on twig level, "coord" field is used
14707 +                                * to traverse unformatted nodes */
14708 +       POS_TO_LEAF,            /* pos is being moved to leaf level */
14709 +       POS_TO_TWIG,            /* pos is being moved to twig level */
14710 +       POS_END_OF_TWIG,        /* presumably a special case of POS_ON_EPOINT (this enum has no
14711 +                                * POS_ON_TWIG): coord is after rightmost unit of the current twig */
14712 +       POS_ON_INTERNAL         /* same as POS_ON_LEAF, but points to internal node */
14713 +} flushpos_state_t;
14714 +
14715 +/* An encapsulation of the current flush point and all the parameters that are passed
14716 +   through the entire squeeze-and-allocate stage of the flush routine.  A single
14717 +   flush_position object is constructed after left- and right-scanning finishes. */
14718 +struct flush_position {
14719 +       flushpos_state_t state;
14720 +
14721 +       coord_t coord;          /* coord to traverse unformatted nodes */
14722 +       lock_handle lock;       /* current lock we hold */
14723 +       load_count load;        /* load status for current locked formatted node */
14724 +
14725 +       jnode *child;           /* for passing a reference to unformatted child
14726 +                                * across pos state changes */
14727 +
14728 +       reiser4_blocknr_hint preceder;  /* The flush 'hint' state. */
14729 +       int leaf_relocate;      /* True if enough leaf-level nodes were
14730 +                                * found to suggest a relocate policy. */
14731 +       int alloc_cnt;          /* The number of nodes allocated during squeeze and allocate. */
14732 +       int prep_or_free_cnt;   /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14733 +       flush_queue_t *fq;
14734 +       long *nr_written;       /* number of nodes submitted to disk */
14735 +       int flags;              /* a copy of jnode_flush flags argument */
14736 +
14737 +       znode *prev_twig;       /* previous parent pointer value, used to catch
14738 +                                * processing of new twig node */
14739 +       convert_info_t *sq;     /* convert info */
14740 +
14741 +       unsigned long pos_in_unit;      /* for extents only. Position
14742 +                                          within an extent unit of first
14743 +                                          jnode of slum */
14744 +       long nr_to_write;       /* number of unformatted nodes to handle on flush */
14745 +};
14746 +
14747 +static inline int item_convert_count(flush_pos_t * pos)
14748 +{
14749 +       return pos->sq->count;
14750 +}
14751 +static inline void inc_item_convert_count(flush_pos_t * pos)
14752 +{
14753 +       pos->sq->count++;
14754 +}
14755 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14756 +{
14757 +       pos->sq->count = count;
14758 +}
14759 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14760 +{
14761 +       return pos->sq->iplug;
14762 +}
14763 +
14764 +static inline convert_info_t *convert_data(flush_pos_t * pos)
14765 +{
14766 +       return pos->sq;
14767 +}
14768 +
14769 +static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14770 +{
14771 +       assert("edward-955", convert_data(pos));
14772 +       return pos->sq->itm;
14773 +}
14774 +
14775 +static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14776 +{
14777 +       return &pos->sq->clust.tc;
14778 +}
14779 +
14780 +static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14781 +{
14782 +       assert("edward-854", pos->sq != NULL);
14783 +       return tfm_stream(tfm_cluster_sq(pos), id);
14784 +}
14785 +
14786 +static inline int chaining_data_present(flush_pos_t * pos)
14787 +{
14788 +       return convert_data(pos) && item_convert_data(pos);
14789 +}
14790 +
14791 +/* Returns 1 if the next item of the disk cluster is chained (d_next is
14792 +   DC_CHAINED_ITEM), so item convert data should be moved to the right slum
14793 +   neighbor; returns 0 for DC_AFTER_CLUSTER.  Any other state hits impossible(). */
14794 +static inline int should_chain_next_node(flush_pos_t * pos)
14795 +{
14796 +       int result = 0;
14797 +
14798 +       assert("edward-1007", chaining_data_present(pos));
14799 +
14800 +       switch (item_convert_data(pos)->d_next) {
14801 +       case DC_CHAINED_ITEM:
14802 +               result = 1;
14803 +               break;
14804 +       case DC_AFTER_CLUSTER:
14805 +               break;
14806 +       default:
14807 +               impossible("edward-1009", "bad state of next slum item");
14808 +       }
14809 +       return result;
14810 +}
14811 +
14812 +/* update item state in a disk cluster to assign conversion mode */
14813 +static inline void
14814 +move_chaining_data(flush_pos_t * pos, int this_node /* nonzero: next item is in this node */ )
14815 +{
14816 +       /* both branches leave d_next set to DC_INVALID_STATE */
14817 +       assert("edward-1010", chaining_data_present(pos));
14818 +
14819 +       if (this_node == 0) {
14820 +               /* next item is on the right neighbor: d_cur becomes DC_CHAINED_ITEM */
14821 +               assert("edward-1011",
14822 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14823 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14824 +               assert("edward-1012",
14825 +                      item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14826 +
14827 +               item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14828 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
14829 +       } else {
14830 +               /* next item is on the same node: d_cur becomes DC_AFTER_CLUSTER */
14831 +               assert("edward-1013",
14832 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14833 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14834 +               assert("edward-1227",
14835 +                      item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14836 +                      item_convert_data(pos)->d_next == DC_INVALID_STATE);
14837 +
14838 +               item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14839 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
14840 +       }
14841 +}
14842 +
14843 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14844 +{
14845 +       return znode_convertible(node);
14846 +}
14847 +
14848 +/* true if there is attached convert item info */
14849 +static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14850 +{
14851 +       return convert_data(pos) && item_convert_data(pos);
14852 +}
14853 +
14854 +#define SQUALLOC_THRESHOLD 256
14855 +/* true when convert data exists, no item info is attached, and count reached the threshold */
14856 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14857 +{
14858 +       return convert_data(pos) &&
14859 +           !item_convert_data(pos) &&
14860 +           item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14861 +}
14862 +
14863 +void free_convert_data(flush_pos_t * pos);
14864 +/* used in extent.c */
14865 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14866 +                    const coord_t * parent);
14867 +int scan_finished(flush_scan * scan);
14868 +int scanning_left(flush_scan * scan);
14869 +int scan_goto(flush_scan * scan, jnode * tonode);
14870 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14871 +int alloc_extent(flush_pos_t *flush_pos);
14872 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14873 +                              reiser4_key *stop_key);
14874 +extern int init_fqs(void);
14875 +extern void done_fqs(void);
14876 +
14877 +#if REISER4_DEBUG
14878 +
14879 +extern void check_fq(const txn_atom *atom);
14880 +extern atomic_t flush_cnt;
14881 +
14882 +#define check_preceder(blk) /* no trailing ';': the caller supplies it, as for the noop variant */ \
14883 +assert("nikita-2588", (blk) < reiser4_block_count(reiser4_get_current_sb()))
14884 +extern void check_pos(flush_pos_t * pos);
14885 +#else
14886 +#define check_preceder(b) noop
14887 +#define check_pos(pos) noop
14888 +#endif
14889 +
14890 +/* __REISER4_FLUSH_H__ */
14891 +#endif
14892 +
14893 +/* Make Linus happy.
14894 +   Local variables:
14895 +   c-indentation-style: "K&R"
14896 +   mode-name: "LC"
14897 +   c-basic-offset: 8
14898 +   tab-width: 8
14899 +   fill-column: 90
14900 +   LocalWords:  preceder
14901 +   End:
14902 +*/
14903 diff --git a/fs/reiser4/flush_queue.c b/fs/reiser4/flush_queue.c
14904 new file mode 100644
14905 index 0000000..0980e8a
14906 --- /dev/null
14907 +++ b/fs/reiser4/flush_queue.c
14908 @@ -0,0 +1,681 @@
14909 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14910 +
14911 +#include "debug.h"
14912 +#include "super.h"
14913 +#include "txnmgr.h"
14914 +#include "jnode.h"
14915 +#include "znode.h"
14916 +#include "page_cache.h"
14917 +#include "wander.h"
14918 +#include "vfs_ops.h"
14919 +#include "writeout.h"
14920 +#include "flush.h"
14921 +
14922 +#include <linux/bio.h>
14923 +#include <linux/mm.h>
14924 +#include <linux/pagemap.h>
14925 +#include <linux/blkdev.h>
14926 +#include <linux/writeback.h>
14927 +
14928 +/* A flush queue object is an accumulator for keeping jnodes prepared
14929 +   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14930 +   kept on the flush queue until memory pressure or atom commit asks
14931 +   flush queues to write some or all of their jnodes. */
14932 +
14933 +/*
14934 +   LOCKING:
14935 +
14936 +   fq->guard spin lock protects fq->atom pointer and nothing else.  fq->prepped
14937 +   list protected by atom spin lock.  fq->prepped list uses the following
14938 +   locking:
14939 +
14940 +   two ways to protect fq->prepped list for read-only list traversal:
14941 +
14942 +   1. atom is spin-locked.
14943 +   2. fq is IN_USE, atom->nr_running_queues increased.
14944 +
14945 +   and one for list modification:
14946 +
14947 +   1. atom is spin-locked and one condition is true: fq is IN_USE or
14948 +      atom->nr_running_queues == 0.
14949 +
14950 +   The deadlock-safe order for flush queues and atoms is: first lock atom, then
14951 +   lock flush queue, then lock jnode.
14952 +*/
14953 +
14954 +#define fq_in_use(fq)          ((fq)->state & FQ_IN_USE)
14955 +#define fq_ready(fq)           (!fq_in_use(fq))
14956 +
14957 +#define mark_fq_in_use(fq)     do { (fq)->state |= FQ_IN_USE;    } while (0)
14958 +#define mark_fq_ready(fq)      do { (fq)->state &= ~FQ_IN_USE;   } while (0)
14959 +
14960 +/* lock and return fq->atom (NULL if unset); caller holds fq->guard, which may be dropped and retaken */
14961 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14962 +{
14963 +       /* This code is similar to jnode_get_atom(), look at it for the
14964 +        * explanation. */
14965 +       txn_atom *atom;
14966 +
14967 +       assert_spin_locked(&(fq->guard));
14968 +
14969 +       while (1) {
14970 +               atom = fq->atom;
14971 +               if (atom == NULL)
14972 +                       break;
14973 +
14974 +               if (spin_trylock_atom(atom))
14975 +                       break;
14976 +               /* trylock failed: take an atom reference, drop fq->guard, lock atom before fq */
14977 +               atomic_inc(&atom->refcount);
14978 +               spin_unlock(&(fq->guard));
14979 +               spin_lock_atom(atom);
14980 +               spin_lock(&(fq->guard));
14981 +
14982 +               if (fq->atom == atom) {
14983 +                       atomic_dec(&atom->refcount);
14984 +                       break;
14985 +               }
14986 +               /* fq->atom changed while fq->guard was dropped: give this atom up and retry */
14987 +               spin_unlock(&(fq->guard));
14988 +               atom_dec_and_unlock(atom);
14989 +               spin_lock(&(fq->guard));
14990 +       }
14991 +
14992 +       return atom;
14993 +}
14994 +
14995 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14996 +{
14997 +       txn_atom *atom;
14998 +
14999 +       spin_lock(&(fq->guard));
15000 +       atom = atom_locked_by_fq_nolock(fq);
15001 +       spin_unlock(&(fq->guard));
15002 +       return atom;
15003 +}
15004 +
15005 +static void init_fq(flush_queue_t * fq)
15006 +{
15007 +       memset(fq, 0, sizeof *fq);
15008 +
15009 +       atomic_set(&fq->nr_submitted, 0);
15010 +
15011 +       INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
15012 +
15013 +       sema_init(&fq->io_sem, 0);
15014 +       spin_lock_init(&fq->guard);
15015 +}
15016 +
15017 +/* slab for flush queues */
15018 +static kmem_cache_t *fq_slab;
15019 +
15020 +
15021 +/**
15022 + * init_fqs - create flush queue cache
15023 + *
15024 + * Initializes slab cache of flush queues. It is part of reiser4 module
15025 + * initialization.
15026 + */
15027 +int init_fqs(void)
15028 +{
15029 +       fq_slab = kmem_cache_create("fq",
15030 +                                   sizeof(flush_queue_t),
15031 +                                   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
15032 +       if (fq_slab == NULL)
15033 +               return RETERR(-ENOMEM);
15034 +       return 0;
15035 +}
15036 +
15037 +/**
15038 + * done_fqs - delete flush queue cache
15039 + *
15040 + * This is called on reiser4 module unloading or system shutdown.
15041 + */
15042 +void done_fqs(void)
15043 +{
15044 +       destroy_reiser4_cache(&fq_slab);
15045 +}
15046 +
15047 +/* create new flush queue object */
15048 +static flush_queue_t *create_fq(gfp_t gfp)
15049 +{
15050 +       flush_queue_t *fq;
15051 +
15052 +       fq = kmem_cache_alloc(fq_slab, gfp);
15053 +       if (fq)
15054 +               init_fq(fq);
15055 +
15056 +       return fq;
15057 +}
15058 +
15059 +/* adjust atom's and flush queue's counters of queued nodes */
15060 +static void count_enqueued_node(flush_queue_t * fq)
15061 +{
15062 +       ON_DEBUG(fq->atom->num_queued++);
15063 +}
15064 +
15065 +static void count_dequeued_node(flush_queue_t * fq)
15066 +{
15067 +       assert("zam-993", fq->atom->num_queued > 0);
15068 +       ON_DEBUG(fq->atom->num_queued--);
15069 +}
15070 +
15071 +/* attach flush queue object to the atom */
15072 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
15073 +{
15074 +       assert_spin_locked(&(atom->alock));
15075 +       list_add(&fq->alink, &atom->flush_queues);
15076 +       fq->atom = atom;
15077 +       ON_DEBUG(atom->nr_flush_queues++);
15078 +}
15079 +
15080 +static void detach_fq(flush_queue_t * fq)
15081 +{
15082 +       assert_spin_locked(&(fq->atom->alock));
15083 +
15084 +       spin_lock(&(fq->guard));
15085 +       list_del_init(&fq->alink);
15086 +       assert("vs-1456", fq->atom->nr_flush_queues > 0);
15087 +       ON_DEBUG(fq->atom->nr_flush_queues--);
15088 +       fq->atom = NULL;
15089 +       spin_unlock(&(fq->guard));
15090 +}
15091 +
15092 +/* destroy flush queue object */
15093 +static void done_fq(flush_queue_t * fq)
15094 +{
15095 +       assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
15096 +       assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
15097 +
15098 +       kmem_cache_free(fq_slab, fq);
15099 +}
15100 +
15101 +/* flag @node as JNODE_FLUSH_QUEUED and account it via count_enqueued_node() */
15102 +static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
15103 +{
15104 +       JF_SET(node, JNODE_FLUSH_QUEUED);
15105 +       count_enqueued_node(fq);
15106 +}
15107 +
15108 +/* Putting jnode into the flush queue. Both atom and jnode should be
15109 +   spin-locked. */
15110 +void queue_jnode(flush_queue_t * fq, jnode * node)
15111 +{
15112 +       assert_spin_locked(&(node->guard));
15113 +       assert("zam-713", node->atom != NULL);
15114 +       assert_spin_locked(&(node->atom->alock));
15115 +       assert("zam-716", fq->atom != NULL);
15116 +       assert("zam-717", fq->atom == node->atom);
15117 +       assert("zam-907", fq_in_use(fq));
15118 +
15119 +       assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
15120 +       assert("zam-826", JF_ISSET(node, JNODE_RELOC));
15121 +       assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
15122 +       assert("vs-1481", NODE_LIST(node) != FQ_LIST);
15123 +
15124 +       mark_jnode_queued(fq, node);
15125 +       list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
15126 +
15127 +       ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
15128 +                            FQ_LIST, 1));
15129 +}
15130 +
15131 +/* one pass of waiting for i/o completion on @fq; -E_REPEAT means the atom was unlocked, call again */
15132 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
15133 +{
15134 +       assert("zam-738", fq->atom != NULL);
15135 +       assert_spin_locked(&(fq->atom->alock));
15136 +       assert("zam-736", fq_in_use(fq));
15137 +       assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
15138 +
15139 +       if (atomic_read(&fq->nr_submitted) != 0) {
15140 +               struct super_block *super;
15141 +
15142 +               spin_unlock_atom(fq->atom);
15143 +
15144 +               assert("nikita-3013", schedulable());
15145 +
15146 +               super = reiser4_get_current_sb();
15147 +
15148 +               /* FIXME: this is instead of blk_run_queues() */
15149 +               blk_run_address_space(get_super_fake(super)->i_mapping);
15150 +               /* io_sem is upped by end_io_handler() when nr_submitted drops to zero */
15151 +               if (!(super->s_flags & MS_RDONLY))
15152 +                       down(&fq->io_sem);
15153 +
15154 +               /* Ask the caller to re-acquire the locks and call this
15155 +                  function again. Note: this technique is commonly used in
15156 +                  the txnmgr code. */
15157 +               return -E_REPEAT;
15158 +       }
15159 +       /* nothing in flight: add this queue's error count; atom stays locked */
15160 +       *nr_io_errors += atomic_read(&fq->nr_errors);
15161 +       return 0;
15162 +}
15163 +
15164 +/* wait on I/O completion, re-submit dirty nodes to write */
15165 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
15166 +{
15167 +       int ret;
15168 +       txn_atom *atom = fq->atom;
15169 +
15170 +       assert("zam-801", atom != NULL);
15171 +       assert_spin_locked(&(atom->alock));
15172 +       assert("zam-762", fq_in_use(fq));
15173 +
15174 +       ret = wait_io(fq, nr_io_errors);
15175 +       if (ret)
15176 +               return ret;
15177 +
15178 +       detach_fq(fq);
15179 +       done_fq(fq);
15180 +
15181 +       atom_send_event(atom);
15182 +
15183 +       return 0;
15184 +}
15185 +
15186 +/* wait for all i/o for given atom to be completed; actually does one iteration and
15187 +   returns -E_REPEAT (atom unlocked) if more are needed, 0 when no queues remain */
15188 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
15189 +{
15190 +       flush_queue_t *fq;
15191 +
15192 +       assert_spin_locked(&(atom->alock));
15193 +
15194 +       if (list_empty_careful(&atom->flush_queues))
15195 +               return 0;
15196 +
15197 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
15198 +               if (fq_ready(fq)) {
15199 +                       int ret;
15200 +
15201 +                       mark_fq_in_use(fq);
15202 +                       assert("vs-1247", fq->owner == NULL);
15203 +                       ON_DEBUG(fq->owner = current);
15204 +                       ret = finish_fq(fq, nr_io_errors);
15205 +
15206 +                       if (*nr_io_errors)
15207 +                               reiser4_handle_error();
15208 +
15209 +                       if (ret) {
15210 +                               fq_put(fq);
15211 +                               return ret;
15212 +                       }
15213 +                       /* finish_fq() succeeded: fq is detached and freed, so restart the scan */
15214 +                       spin_unlock_atom(atom);
15215 +
15216 +                       return -E_REPEAT;
15217 +               }
15218 +       }
15219 +
15220 +       /* All flush queues are in use; atom remains locked */
15221 +       return -EBUSY;
15222 +}
15223 +
15224 +/* wait all i/o for current atom */
15225 +int current_atom_finish_all_fq(void)
15226 +{
15227 +       txn_atom *atom;
15228 +       int nr_io_errors = 0;
15229 +       int ret = 0;
15230 +
15231 +       do {
15232 +               while (1) {
15233 +                       atom = get_current_atom_locked();
15234 +                       ret = finish_all_fq(atom, &nr_io_errors);
15235 +                       if (ret != -EBUSY)
15236 +                               break;
15237 +                       atom_wait_event(atom);
15238 +               }
15239 +       } while (ret == -E_REPEAT);
15240 +
15241 +       /* we do not need locked atom after this function finishes, SUCCESS or
15242 +          -EBUSY are two return codes when atom remains locked after
15243 +          finish_all_fq */
15244 +       if (!ret)
15245 +               spin_unlock_atom(atom);
15246 +
15247 +       assert_spin_not_locked(&(atom->alock));
15248 +
15249 +       if (ret)
15250 +               return ret;
15251 +
15252 +       if (nr_io_errors)
15253 +               return RETERR(-EIO);
15254 +
15255 +       return 0;
15256 +}
15257 +
15258 +/* change node->atom field for all jnode from given list */
15259 +static void
15260 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
15261 +{
15262 +       jnode *cur;
15263 +
15264 +       list_for_each_entry(cur, list, capture_link) {
15265 +               spin_lock_jnode(cur);
15266 +               cur->atom = atom;
15267 +               spin_unlock_jnode(cur);
15268 +       }
15269 +}
15270 +
15271 +/* support for atom fusion operation */
15272 +void fuse_fq(txn_atom *to, txn_atom *from)
15273 +{
15274 +       flush_queue_t *fq;
15275 +
15276 +       assert_spin_locked(&(to->alock));
15277 +       assert_spin_locked(&(from->alock));
15278 +
15279 +       list_for_each_entry(fq, &from->flush_queues, alink) {
15280 +               scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
15281 +               spin_lock(&(fq->guard));
15282 +               fq->atom = to;
15283 +               spin_unlock(&(fq->guard));
15284 +       }
15285 +
15286 +       list_splice_init(&from->flush_queues, to->flush_queues.prev);
15287 +
15288 +#if REISER4_DEBUG
15289 +       to->num_queued += from->num_queued;
15290 +       to->nr_flush_queues += from->nr_flush_queues;
15291 +       from->nr_flush_queues = 0;
15292 +#endif
15293 +}
15294 +
15295 +#if REISER4_DEBUG
15296 +int atom_fq_parts_are_clean(txn_atom * atom)
15297 +{
15298 +       assert("zam-915", atom != NULL);
15299 +       return list_empty_careful(&atom->flush_queues);
15300 +}
15301 +#endif
15302 +/* Bio i/o completion routine for reiser4 write operations; returns 1 while bio is incomplete, else 0. */
15303 +static int
15304 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
15305 +              int err)
15306 +{
15307 +       int i;
15308 +       int nr_errors = 0;
15309 +       flush_queue_t *fq;
15310 +
15311 +       assert("zam-958", bio->bi_rw & WRITE);
15312 +
15313 +       /* i/o op. is not fully completed */
15314 +       if (bio->bi_size != 0)
15315 +               return 1;
15316 +
15317 +       if (err == -EOPNOTSUPP)
15318 +               set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
15319 +
15320 +       /* we expect that bio->bi_private is set to NULL or fq object which is used
15321 +        * for synchronization and error counting. */
15322 +       fq = bio->bi_private;
15323 +       /* Check all elements of io_vec for correct write completion. */
15324 +       for (i = 0; i < bio->bi_vcnt; i += 1) {
15325 +               struct page *pg = bio->bi_io_vec[i].bv_page;
15326 +
15327 +               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
15328 +                       SetPageError(pg);
15329 +                       nr_errors++;
15330 +               }
15331 +
15332 +               {
15333 +                       /* jnode WRITEBACK ("write is in progress bit") is
15334 +                        * atomically cleared here. */
15335 +                       jnode *node;
15336 +
15337 +                       assert("zam-736", pg != NULL);
15338 +                       assert("zam-736", PagePrivate(pg));
15339 +                       node = jprivate(pg);
15340 +
15341 +                       JF_CLR(node, JNODE_WRITEBACK);
15342 +               }
15343 +
15344 +               end_page_writeback(pg);
15345 +               page_cache_release(pg);
15346 +       }
15347 +
15348 +       if (fq) {
15349 +               /* count i/o error in fq object */
15350 +               atomic_add(nr_errors, &fq->nr_errors);
15351 +
15352 +               /* If all write requests registered in this "fq" are done we up
15353 +                * the semaphore. */
15354 +               if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15355 +                       up(&fq->io_sem);
15356 +       }
15357 +
15358 +       bio_put(bio);
15359 +       return 0;
15360 +}
15361 +
15362 +/* Count I/O requests which will be submitted by @bio in the given flush queue
15363 +   @fq (may be NULL) and install end_io_handler() as the bio completion routine */
15364 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
15365 +{
15366 +       bio->bi_private = fq;
15367 +       bio->bi_end_io = end_io_handler;
15368 +
15369 +       if (fq)
15370 +               atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15371 +}
15372 +
15373 +/* Move all queued nodes out from @fq->prepped list. */
15374 +static void release_prepped_list(flush_queue_t * fq)
15375 +{
15376 +       txn_atom *atom;
15377 +
15378 +       assert("zam-904", fq_in_use(fq));
15379 +       atom = atom_locked_by_fq(fq);
15380 +
15381 +       while (!list_empty(ATOM_FQ_LIST(fq))) {
15382 +               jnode *cur;
15383 +
15384 +               cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15385 +               list_del_init(&cur->capture_link);
15386 +
15387 +               count_dequeued_node(fq);
15388 +               spin_lock_jnode(cur);
15389 +               assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15390 +               assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15391 +               assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15392 +               JF_CLR(cur, JNODE_FLUSH_QUEUED);
15393 +
15394 +               if (JF_ISSET(cur, JNODE_DIRTY)) {
15395 +                       list_add_tail(&cur->capture_link,
15396 +                                     ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
15397 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15398 +                                            DIRTY_LIST, 1));
15399 +               } else {
15400 +                       list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
15401 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15402 +                                            CLEAN_LIST, 1));
15403 +               }
15404 +
15405 +               spin_unlock_jnode(cur);
15406 +       }
15407 +
15408 +       if (--atom->nr_running_queues == 0)
15409 +               atom_send_event(atom);
15410 +
15411 +       spin_unlock_atom(atom);
15412 +}
15413 +
15414 +/* Submit write requests for nodes on the already filled flush queue @fq.
15415 +   @fq: flush queue object which contains jnodes we can (and will) write.
15416 +   @nr_submitted, @flags: passed through to write_jnode_list(); with WRITEOUT_SINGLE_STREAM
15417 +   set in @flags we first wait until no other queue of the atom is being written.
15418 +   @return: result of write_jnode_list() (presumably <0 on error -- confirm there). */
15419 +int write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
15420 +{
15421 +       int ret;
15422 +       txn_atom *atom;
15423 +
15424 +       while (1) {
15425 +               atom = atom_locked_by_fq(fq);
15426 +               assert("zam-924", atom);
15427 +               /* do not write fq in parallel. */
15428 +               if (atom->nr_running_queues == 0
15429 +                   || !(flags & WRITEOUT_SINGLE_STREAM))
15430 +                       break;
15431 +               atom_wait_event(atom);
15432 +       }
15433 +       /* count this queue as running; release_prepped_list() decrements it again */
15434 +       atom->nr_running_queues++;
15435 +       spin_unlock_atom(atom);
15436 +
15437 +       ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15438 +       release_prepped_list(fq);
15439 +
15440 +       return ret;
15441 +}
15442 +
15443 +/* Getting flush queue object for exclusive use by one thread. May require
15444 +   several iterations which is indicated by -E_REPEAT return code: in that case
15445 +   (and on -ENOMEM) the atom has been unlocked; on 0 it is still locked.
15446 +   This function does not contain code for obtaining an atom lock because an
15447 +   atom lock is obtained in different ways in different parts of reiser4;
15448 +   usually it is the current atom, but we need a possibility for getting fq for
15449 +   the atom of given jnode. */
15450 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15451 +{
15452 +       flush_queue_t *fq;
15453 +
15454 +       assert_spin_locked(&(atom->alock));
15455 +
15456 +       fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15457 +       while (&atom->flush_queues != &fq->alink) {
15458 +               spin_lock(&(fq->guard));
15459 +
15460 +               if (fq_ready(fq)) {
15461 +                       mark_fq_in_use(fq);
15462 +                       assert("vs-1246", fq->owner == NULL);
15463 +                       ON_DEBUG(fq->owner = current);
15464 +                       spin_unlock(&(fq->guard));
15465 +
15466 +                       if (*new_fq)
15467 +                               done_fq(*new_fq);
15468 +
15469 +                       *new_fq = fq;
15470 +
15471 +                       return 0;
15472 +               }
15473 +
15474 +               spin_unlock(&(fq->guard));
15475 +
15476 +               fq = list_entry(fq->alink.next, flush_queue_t, alink);
15477 +       }
15478 +
15479 +       /* Use previously allocated fq object */
15480 +       if (*new_fq) {
15481 +               mark_fq_in_use(*new_fq);
15482 +               assert("vs-1248", (*new_fq)->owner == 0);
15483 +               ON_DEBUG((*new_fq)->owner = current);
15484 +               attach_fq(atom, *new_fq);
15485 +
15486 +               return 0;
15487 +       }
15488 +       /* no ready queue and no spare one: allocate with the atom unlocked, caller retries */
15489 +       spin_unlock_atom(atom);
15490 +
15491 +       *new_fq = create_fq(gfp);
15492 +
15493 +       if (*new_fq == NULL)
15494 +               return RETERR(-ENOMEM);
15495 +
15496 +       return RETERR(-E_REPEAT);
15497 +}
15498 +/* fq_by_atom_gfp() with the gfp mask taken from get_gfp_mask() */
15499 +int fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15500 +{
15501 +       return fq_by_atom_gfp(atom, new_fq, get_gfp_mask());
15502 +}
15503 +
15504 +/* Get a flush queue for the current atom: calls fq_by_atom() until it stops
15505 + * returning -E_REPEAT. On success fq->atom remains locked. */
15506 +flush_queue_t *get_fq_for_current_atom(void)
15507 +{
15508 +       flush_queue_t *fq = NULL;
15509 +       txn_atom *atom;
15510 +       int ret;
15511 +
15512 +       do {
15513 +               atom = get_current_atom_locked();
15514 +               ret = fq_by_atom(atom, &fq);
15515 +       } while (ret == -E_REPEAT);
15516 +       /* any other failure is handed to the caller as an ERR_PTR() */
15517 +       if (ret)
15518 +               return ERR_PTR(ret);
15519 +       return fq;
15520 +}
15521 +
15522 +/* Release flush queue after exclusive use: mark ready, drop debug owner */
15523 +void fq_put_nolock(flush_queue_t *fq)
15524 +{
15525 +       assert("zam-747", fq->atom != NULL);
15526 +       assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15527 +       mark_fq_ready(fq);
15528 +       assert("vs-1245", fq->owner == current);
15529 +       ON_DEBUG(fq->owner = NULL);
15530 +}
15531 +/* fq_put_nolock() wrapper that takes fq->guard and works on a locked atom */
15532 +void fq_put(flush_queue_t * fq)
15533 +{
15534 +       txn_atom *atom;
15535 +
15536 +       spin_lock(&(fq->guard));
15537 +       atom = atom_locked_by_fq_nolock(fq);
15538 +
15539 +       assert("zam-746", atom != NULL);
15540 +       /* atom is expected to be locked here; it is unlocked on the way out */
15541 +       fq_put_nolock(fq);
15542 +       atom_send_event(atom);
15543 +
15544 +       spin_unlock(&(fq->guard));
15545 +       spin_unlock_atom(atom);
15546 +}
15547 +
15548 +/* A part of atom object initialization related to the embedded flush queue
15549 +   list head */
15550 +
15551 +void init_atom_fq_parts(txn_atom *atom)
15552 +{
15553 +       INIT_LIST_HEAD(&atom->flush_queues);
15554 +}
15555 +
15556 +#if REISER4_DEBUG
15557 +/* debug check: warn if atom->fq differs from the nodes found on its queues */
15558 +void check_fq(const txn_atom *atom)
15559 +{
15560 +       /* check number of nodes on all atom's flush queues */
15561 +       flush_queue_t *fq;
15562 +       int count;
15563 +       struct list_head *pos;
15564 +
15565 +       count = 0;
15566 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
15567 +               spin_lock(&(fq->guard));
15568 +               /* calculate number of jnodes on fq's list of prepped jnodes */
15569 +               list_for_each(pos, ATOM_FQ_LIST(fq))
15570 +                       count++;
15571 +               spin_unlock(&(fq->guard));
15572 +       }
15573 +       if (count != atom->fq)
15574 +               warning("", "fq counter %d, real %d\n", atom->fq, count);
15575 +
15576 +}
15577 +
15578 +#endif
15579 +
15580 +/*
15581 + * Local variables:
15582 + * c-indentation-style: "K&R"
15583 + * mode-name: "LC"
15584 + * c-basic-offset: 8
15585 + * tab-width: 8
15586 + * fill-column: 79
15587 + * scroll-step: 1
15588 + * End:
15589 + */
15590 diff --git a/fs/reiser4/forward.h b/fs/reiser4/forward.h
15591 new file mode 100644
15592 index 0000000..5f25c03
15593 --- /dev/null
15594 +++ b/fs/reiser4/forward.h
15595 @@ -0,0 +1,258 @@
15596 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15597 +
15598 +/* Forward declarations. Thank you Kernighan. */
15599 +
15600 +#if !defined( __REISER4_FORWARD_H__ )
15601 +#define __REISER4_FORWARD_H__
15602 +
15603 +#include <asm/errno.h>
15604 +#include <linux/types.h>
15605 +
15606 +typedef struct zlock zlock;
15607 +typedef struct lock_stack lock_stack;
15608 +typedef struct lock_handle lock_handle;
15609 +typedef struct znode znode;
15610 +typedef struct flow flow_t;
15611 +typedef struct coord coord_t;
15612 +typedef struct tree_access_pointer tap_t;
15613 +typedef struct item_coord item_coord;
15614 +typedef struct shift_params shift_params;
15615 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15616 +typedef union reiser4_plugin reiser4_plugin;
15617 +typedef __u16 reiser4_plugin_id;
15618 +typedef struct item_plugin item_plugin;
15619 +typedef struct jnode_plugin jnode_plugin;
15620 +typedef struct reiser4_item_data reiser4_item_data;
15621 +typedef union reiser4_key reiser4_key;
15622 +typedef struct reiser4_tree reiser4_tree;
15623 +typedef struct carry_cut_data carry_cut_data;
15624 +typedef struct carry_kill_data carry_kill_data;
15625 +typedef struct carry_tree_op carry_tree_op;
15626 +typedef struct carry_tree_node carry_tree_node;
15627 +typedef struct carry_plugin_info carry_plugin_info;
15628 +typedef struct reiser4_journal reiser4_journal;
15629 +typedef struct txn_atom txn_atom;
15630 +typedef struct txn_handle txn_handle;
15631 +typedef struct txn_mgr txn_mgr;
15632 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15633 +typedef struct reiser4_context reiser4_context;
15634 +typedef struct carry_level carry_level;
15635 +typedef struct blocknr_set blocknr_set;
15636 +typedef struct blocknr_set_entry blocknr_set_entry;
15637 +/* super_block->s_fs_info points to this */
15638 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15639 +/* next two objects are fields of reiser4_super_info_data */
15640 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15641 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15642 +
15643 +typedef struct flush_scan flush_scan;
15644 +typedef struct flush_position flush_pos_t;
15645 +
15646 +typedef unsigned short pos_in_node_t;
15647 +#define MAX_POS_IN_NODE 65535
15648 +
15649 +typedef struct jnode jnode;
15650 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15651 +
15652 +typedef struct uf_coord uf_coord_t;
15653 +typedef struct hint hint_t;
15654 +
15655 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15656 +
15657 +typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15658 +
15659 +struct inode;
15660 +struct page;
15661 +struct file;
15662 +struct dentry;
15663 +struct super_block;
15664 +
15665 +/* return values of coord_by_key(). cbk == coord_by_key */
15666 +typedef enum {
15667 +       CBK_COORD_FOUND = 0,
15668 +       CBK_COORD_NOTFOUND = -ENOENT,
15669 +} lookup_result;
15670 +
15671 +/* results of lookup with directory file */
15672 +typedef enum {
15673 +       FILE_NAME_FOUND = 0,
15674 +       FILE_NAME_NOTFOUND = -ENOENT,
15675 +       FILE_IO_ERROR = -EIO,   /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15676 +       FILE_OOM = -ENOMEM      /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15677 +} file_lookup_result;
15678 +
15679 +/* behaviors of lookup. If coord we are looking for is actually in a tree,
15680 +    both coincide. */
15681 +typedef enum {
15682 +       /* search exactly for the coord with key given */
15683 +       FIND_EXACT,
15684 +       /* search for coord with the maximal key not greater than one
15685 +          given */
15686 +       FIND_MAX_NOT_MORE_THAN  /*LEFT_SLANT_BIAS */
15687 +} lookup_bias;
15688 +
15689 +typedef enum {
15690 +       /* number of leaf level of the tree
15691 +          The fake root has (tree_level=0). */
15692 +       LEAF_LEVEL = 1,
15693 +
15694 +       /* number of level one above leaf level of the tree.
15695 +
15696 +          It is supposed that internal tree used by reiser4 to store file
15697 +          system data and meta data will have height 2 initially (when
15698 +          created by mkfs).
15699 +        */
15700 +       TWIG_LEVEL = 2,
15701 +} tree_level;
15702 +
15703 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15704 +   array, since the zero'th level is not used. */
15705 +#define REAL_MAX_ZTREE_HEIGHT     (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15706 +
15707 +/* enumeration of possible mutual position of item and coord.  This enum is
15708 +    return type of ->is_in_item() item plugin method which see. */
15709 +typedef enum {
15710 +       /* coord is on the left of an item */
15711 +       IP_ON_THE_LEFT,
15712 +       /* coord is inside item */
15713 +       IP_INSIDE,
15714 +       /* coord is inside item, but to the right of the rightmost unit of
15715 +          this item */
15716 +       IP_RIGHT_EDGE,
15717 +       /* coord is on the right of an item */
15718 +       IP_ON_THE_RIGHT
15719 +} interposition;
15720 +
15721 +/* type of lock to acquire on znode before returning it to caller */
15722 +typedef enum {
15723 +       ZNODE_NO_LOCK = 0,
15724 +       ZNODE_READ_LOCK = 1,
15725 +       ZNODE_WRITE_LOCK = 2,
15726 +} znode_lock_mode;
15727 +
15728 +/* type of lock request */
15729 +typedef enum {
15730 +       ZNODE_LOCK_LOPRI = 0,
15731 +       ZNODE_LOCK_HIPRI = (1 << 0),
15732 +
15733 +       /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15734 +          waiting for the lock to become available.  If the lock is unavailable, reiser4_znode_lock will immediately
15735 +          return the value -E_REPEAT. */
15736 +       ZNODE_LOCK_NONBLOCK = (1 << 1),
15737 +       /* An option for longterm_lock_znode which prevents atom fusion */
15738 +       ZNODE_LOCK_DONT_FUSE = (1 << 2)
15739 +} znode_lock_request;
15740 +
15741 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15742 +
15743 +/* used to specify direction of shift. These must be -1 and 1 */
15744 +typedef enum {
15745 +       SHIFT_LEFT = 1,
15746 +       SHIFT_RIGHT = -1
15747 +} shift_direction;
15748 +
15749 +typedef enum {
15750 +       LEFT_SIDE,
15751 +       RIGHT_SIDE
15752 +} sideof;
15753 +
15754 +#define round_up( value, order )                                               \
15755 +       ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) &        \
15756 +                            ~( ( order ) - 1 ) ) )
15757 +
15758 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15759 +typedef enum {
15760 +       /* unit of internal item is moved */
15761 +       SUBTREE_MOVED = 0,
15762 +       /* nothing else can be squeezed into left neighbor */
15763 +       SQUEEZE_TARGET_FULL = 1,
15764 +       /* all content of node is squeezed into its left neighbor */
15765 +       SQUEEZE_SOURCE_EMPTY = 2,
15766 +       /* one more item is copied (this is only returned by
15767 +          allocate_and_copy_extent to squalloc_twig) */
15768 +       SQUEEZE_CONTINUE = 3
15769 +} squeeze_result;
15770 +
15771 +/* Do not change items ids. If you do - there will be format change */
15772 +typedef enum {
15773 +       STATIC_STAT_DATA_ID = 0x0,
15774 +       SIMPLE_DIR_ENTRY_ID = 0x1,
15775 +       COMPOUND_DIR_ID = 0x2,
15776 +       NODE_POINTER_ID = 0x3,
15777 +       EXTENT_POINTER_ID = 0x5,
15778 +       FORMATTING_ID = 0x6,
15779 +       CTAIL_ID = 0x7,
15780 +       BLACK_BOX_ID = 0x8,
15781 +       LAST_ITEM_ID = 0x9
15782 +} item_id;
15783 +
15784 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15785 +   whether commit() was called or VM memory pressure was applied. */
15786 +typedef enum {
15787 +       /* submit flush queue to disk at jnode_flush completion */
15788 +       JNODE_FLUSH_WRITE_BLOCKS = 1,
15789 +
15790 +       /* flush is called for commit */
15791 +       JNODE_FLUSH_COMMIT = 2,
15792 +       /* not implemented */
15793 +       JNODE_FLUSH_MEMORY_FORMATTED = 4,
15794 +
15795 +       /* not implemented */
15796 +       JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15797 +} jnode_flush_flags;
15798 +
15799 +/* Flags to insert/paste carry operations. Currently they are only used in
15800 +   flushing code, but in future, they can be used to optimize for repetitive
15801 +   accesses.  */
15802 +typedef enum {
15803 +       /* carry is not allowed to shift data to the left when trying to find
15804 +          free space  */
15805 +       COPI_DONT_SHIFT_LEFT = (1 << 0),
15806 +       /* carry is not allowed to shift data to the right when trying to find
15807 +          free space  */
15808 +       COPI_DONT_SHIFT_RIGHT = (1 << 1),
15809 +       /* carry is not allowed to allocate new node(s) when trying to find
15810 +          free space */
15811 +       COPI_DONT_ALLOCATE = (1 << 2),
15812 +       /* try to load left neighbor if it is not in a cache */
15813 +       COPI_LOAD_LEFT = (1 << 3),
15814 +       /* try to load right neighbor if it is not in a cache */
15815 +       COPI_LOAD_RIGHT = (1 << 4),
15816 +       /* shift insertion point to the left neighbor */
15817 +       COPI_GO_LEFT = (1 << 5),
15818 +       /* shift insertion point to the right neighbor */
15819 +       COPI_GO_RIGHT = (1 << 6),
15820 +       /* try to step back into original node if insertion into new node
15821 +          fails after shifting data there. */
15822 +       COPI_STEP_BACK = (1 << 7)
15823 +} cop_insert_flag;
15824 +
15825 +typedef enum {
15826 +       SAFE_UNLINK,            /* safe-link for unlink */
15827 +       SAFE_TRUNCATE           /* safe-link for truncate */
15828 +} reiser4_safe_link_t;
15829 +
15830 +/* this is to show on which list of atom jnode is */
15831 +typedef enum {
15832 +       NOT_CAPTURED,
15833 +       DIRTY_LIST,
15834 +       CLEAN_LIST,
15835 +       FQ_LIST,
15836 +       WB_LIST,
15837 +       OVRWR_LIST
15838 +} atom_list;
15839 +
15840 +
15841 +
15842 +/* __REISER4_FORWARD_H__ */
15843 +#endif
15844 +
15845 +/* Make Linus happy.
15846 +   Local variables:
15847 +   c-indentation-style: "K&R"
15848 +   mode-name: "LC"
15849 +   c-basic-offset: 8
15850 +   tab-width: 8
15851 +   fill-column: 120
15852 +   End:
15853 +*/
15854 diff --git a/fs/reiser4/fsdata.c b/fs/reiser4/fsdata.c
15855 new file mode 100644
15856 index 0000000..3907c04
15857 --- /dev/null
15858 +++ b/fs/reiser4/fsdata.c
15859 @@ -0,0 +1,803 @@
15860 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15861 + * reiser4/README */
15862 +
15863 +#include "fsdata.h"
15864 +#include "inode.h"
15865 +
15866 +
15867 +/* cache of dir_cursors */
15868 +static kmem_cache_t *d_cursor_cache;
15869 +static struct shrinker *d_cursor_shrinker;
15870 +
15871 +/* list of unused cursors */
15872 +static LIST_HEAD(cursor_cache);
15873 +
15874 +/* number of cursors in list of unused cursors */
15875 +static unsigned long d_cursor_unused = 0;
15876 +
15877 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15878 +DEFINE_SPINLOCK(d_lock);
15879 +
15880 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15881 +static int file_is_stateless(struct file *file);
15882 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15883 +static void kill_cursor(dir_cursor *);
15884 +
15885 +/**
15886 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15887 + * @nr: number of objects to free; 0 frees nothing and only reports the count
15888 + * @mask: GFP mask (not used here)
15889 + *
15890 + * Shrinks d_cursor_cache. Scans LRU list of unused cursors from its head,
15891 + * killing up to @nr of them. Returns d_cursor_unused (still freeable cursors).
15892 + */
15893 +static int d_cursor_shrink(int nr, gfp_t mask)
15894 +{
15895 +       if (nr != 0) {
15896 +               dir_cursor *scan;
15897 +               int killed;     /* counted below but never read */
15898 +
15899 +               killed = 0;
15900 +               spin_lock(&d_lock);
15901 +               while (!list_empty(&cursor_cache)) {
15902 +                       scan = list_entry(cursor_cache.next, dir_cursor, alist);
15903 +                       assert("nikita-3567", scan->ref == 0);
15904 +                       kill_cursor(scan);
15905 +                       ++killed;
15906 +                       --nr;
15907 +                       if (nr == 0)
15908 +                               break;
15909 +               }
15910 +               spin_unlock(&d_lock);
15911 +       }
15912 +       return d_cursor_unused;
15913 +}
15914 +
15915 +/**
15916 + * init_d_cursor - create d_cursor cache
15917 + *
15918 + * Creates slab cache of d_cursors and registers its shrinker as part of
15919 + * reiser4 module initialization. Returns 0 on success, -ENOMEM on failure.
15920 + */
15921 +int init_d_cursor(void)
15922 +{
15923 +       d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15924 +                                          SLAB_HWCACHE_ALIGN, NULL, NULL);
15925 +       if (d_cursor_cache == NULL)
15926 +               return RETERR(-ENOMEM);
15927 +
15928 +       /*
15929 +        * actually, d_cursors are "priceless", because there is no way to
15930 +        * recover information stored in them. On the other hand, we don't
15931 +        * want to consume all kernel memory by them. As a compromise, just
15932 +        * assign higher "seeks" value to d_cursor cache, so that it will be
15933 +        * shrunk only if system is really tight on memory.
15934 +        */
15935 +       d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15936 +                                        d_cursor_shrink);
15937 +       if (d_cursor_shrinker == NULL) {
15938 +               destroy_reiser4_cache(&d_cursor_cache);
15939 +               d_cursor_cache = NULL;
15940 +               return RETERR(-ENOMEM);
15941 +       }
15942 +       return 0;
15943 +}
15944 +
15945 +/**
15946 + * done_d_cursor - delete d_cursor cache and d_cursor shrinker
15947 + *
15948 + * This is called on reiser4 module unloading or system shutdown.
15949 + */
15950 +void done_d_cursor(void)
15951 +{
15952 +       BUG_ON(d_cursor_shrinker == NULL);
15953 +       remove_shrinker(d_cursor_shrinker);
15954 +       d_cursor_shrinker = NULL;
15955 +
15956 +       destroy_reiser4_cache(&d_cursor_cache);
15957 +}
15958 +
15959 +#define D_CURSOR_TABLE_SIZE (256)
15960 +/* hash bucket of @key; relies on table size being a power of two */
15961 +static inline unsigned long
15962 +d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15963 +{
15964 +       assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15965 +       return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15966 +}
15967 +/* keys are equal when both client id (cid) and object id (oid) match */
15968 +static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15969 +{
15970 +       return k1->cid == k2->cid && k1->oid == k2->oid;
15971 +}
15972 +
15973 +/*
15974 + * define functions to manipulate reiser4 super block's hash table of
15975 + * dir_cursors
15976 + */
15977 +#define KMALLOC(size) kmalloc((size), get_gfp_mask())
15978 +#define KFREE(ptr, size) kfree(ptr)
15979 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15980 +                     dir_cursor,
15981 +                     d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15982 +#undef KFREE
15983 +#undef KMALLOC
15984 +
15985 +/**
15986 + * init_super_d_info - initialize per-super-block d_cursor resources
15987 + * @super: super block to initialize
15988 + *
15989 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15990 + * of mount.
15991 + */
15992 +int init_super_d_info(struct super_block *super)
15993 +{
15994 +       d_cursor_info *p;
15995 +
15996 +       p = &get_super_private(super)->d_info;
15997 +
15998 +       INIT_RADIX_TREE(&p->tree, get_gfp_mask());
15999 +       return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
16000 +}
16001 +
16002 +/**
16003 + * done_super_d_info - release per-super-block d_cursor resources
16004 + * @super: super block being umounted
16005 + *
16006 + * It is called on umount. Kills all directory cursors attached to super block.
16007 + */
16008 +void done_super_d_info(struct super_block *super)
16009 +{
16010 +       d_cursor_info *d_info = &get_super_private(super)->d_info;
16011 +       dir_cursor *cursor, *next;
16012 +
16013 +       spin_lock(&d_lock);     /* shrinker walks the same unused list */
16014 +       for_all_in_htable(&d_info->table, d_cursor, cursor, next)
16015 +               kill_cursor(cursor);
16016 +       spin_unlock(&d_lock);
16017 +       BUG_ON(d_info->tree.rnode != NULL);
16018 +       d_cursor_hash_done(&d_info->table);
16019 +}
16020 +
16021 +/**
16022 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
16023 + * @cursor: cursor to free
16024 + *
16025 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
16026 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
16027 + * indices, hash table, list of unused cursors and frees it.
16028 + */
16029 +static void kill_cursor(dir_cursor *cursor)
16030 +{
16031 +       unsigned long index;
16032 +
16033 +       assert("nikita-3566", cursor->ref == 0);
16034 +       assert("nikita-3572", cursor->fsdata != NULL);
16035 +
16036 +       index = (unsigned long)cursor->key.oid;
16037 +       list_del_init(&cursor->fsdata->dir.linkage);
16038 +       free_fsdata(cursor->fsdata);
16039 +       cursor->fsdata = NULL;
16040 +
16041 +       if (list_empty_careful(&cursor->list))
16042 +               /* this is last cursor for a file. Kill radix-tree entry */
16043 +               radix_tree_delete(&cursor->info->tree, index);
16044 +       else {
16045 +               void **slot;
16046 +
16047 +               /*
16048 +                * there are other cursors for the same oid.
16049 +                */
16050 +
16051 +               /*
16052 +                * if radix tree points to the cursor being removed, re-target
16053 +                * radix tree slot to the next cursor in the (non-empty as was
16054 +                * checked above) element of the circular list of all cursors
16055 +                * for this oid.
16056 +                */
16057 +               slot = radix_tree_lookup_slot(&cursor->info->tree, index);
16058 +               assert("nikita-3571", *slot != NULL);
16059 +               if (*slot == cursor)
16060 +                       *slot = list_entry(cursor->list.next, dir_cursor, list);
16061 +               /* remove cursor from circular list */
16062 +               list_del_init(&cursor->list);
16063 +       }
16064 +       /* remove cursor from the list of unused cursors */
16065 +       list_del_init(&cursor->alist);
16066 +       /* remove cursor from the hash table */
16067 +       d_cursor_hash_remove(&cursor->info->table, cursor);
16068 +       /* and free it */
16069 +       kmem_cache_free(d_cursor_cache, cursor);
16070 +       --d_cursor_unused;      /* ref == 0 cursors are counted as unused */
16071 +}
16072 +
16073 +/* possible actions that can be performed on all cursors for the given file */
16074 +enum cursor_action {
16075 +       /*
16076 +        * load all detached state: this is called when stat-data is loaded
16077 +        * from the disk to recover information about all pending readdirs
16078 +        */
16079 +       CURSOR_LOAD,
16080 +       /*
16081 +        * detach all state from inode, leaving it in the cache. This is called
16082 +        * when inode is removed from the memory by memory pressure
16083 +        */
16084 +       CURSOR_DISPOSE,
16085 +       /*
16086 +        * detach cursors from the inode, and free them. This is called when
16087 +        * inode is destroyed
16088 +        */
16089 +       CURSOR_KILL
16090 +};
16091 +
16092 +/*
16093 + * return d_cursor data for the file system @inode is in.
16094 + */
16095 +static inline d_cursor_info *d_info(struct inode *inode)
16096 +{
16097 +       return &get_super_private(inode->i_sb)->d_info;
16098 +}
16099 +
16100 +/*
16101 + * lookup d_cursor in the per-super-block radix tree.
16102 + */
16103 +static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
16104 +{
16105 +       return (dir_cursor *) radix_tree_lookup(&info->tree, index);
16106 +}
16107 +
16108 +/*
16109 + * attach @cursor to the radix tree. There may be multiple cursors for the
16110 + * same oid, they are chained into circular list.
16111 + */
16112 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
16113 +{
16114 +       dir_cursor *head;
16115 +
16116 +       head = lookup(cursor->info, index);
16117 +       if (head == NULL) {
16118 +               /* this is the first cursor for this index */
16119 +               INIT_LIST_HEAD(&cursor->list);
16120 +               radix_tree_insert(&cursor->info->tree, index, cursor);
16121 +       } else {
16122 +               /* some cursor already exists. Chain ours */
16123 +               list_add(&cursor->list, &head->list);
16124 +       }
16125 +}
16126 +
16127 +/*
16128 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
16129 + * "unused" list. Called when file descriptor is no longer in active use.
16130 + */
16131 +static void clean_fsdata(struct file *file)
16132 +{
16133 +       dir_cursor *cursor;
16134 +       reiser4_file_fsdata *fsdata;
16135 +
16136 +       assert("nikita-3570", file_is_stateless(file));
16137 +
16138 +       fsdata = (reiser4_file_fsdata *) file->private_data;
16139 +       if (fsdata != NULL) {
16140 +               cursor = fsdata->cursor;
16141 +               if (cursor != NULL) {
16142 +                       spin_lock(&d_lock);
16143 +                       --cursor->ref;
16144 +                       if (cursor->ref == 0) {
16145 +                               list_add_tail(&cursor->alist, &cursor_cache);
16146 +                               ++d_cursor_unused;
16147 +                       }
16148 +                       spin_unlock(&d_lock);
16149 +                       file->private_data = NULL; /* fsdata is not freed here */
16150 +               }
16151 +       }
16152 +}
16153 +
16154 +/*
16155 + * global counter used to generate "client ids". These ids are encoded into
16156 + * high bits of fpos.
16157 + */
16158 +static __u32 cid_counter = 0;
16159 +#define CID_SHIFT (20)
16160 +#define CID_MASK  (0xfffffull)
16161 +
16162 +static void free_file_fsdata_nolock(struct file *);
16163 +
16164 +/**
16165 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
16166 + * @cursor: caller-allocated cursor; zeroed and initialized here
16167 + * @file: file whose private_data and f_pos are set up for the new cursor
16168 + * @inode: directory inode; supplies the oid and the per-super-block d_info
16169 + *
16170 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
16171 + * reiser4 super block's hash table and radix tree, adding detachable readdir
16172 + * state to @file. Returns 0, -ENOMEM, or the radix_tree_preload() error; in
16173 + * the latter case the just allocated fsdata is freed again.
16174 + */
16175 +static int insert_cursor(dir_cursor *cursor, struct file *file,
16176 +                        struct inode *inode)
16177 +{
16178 +       int result;
16179 +       reiser4_file_fsdata *fsdata;
16180 +
16181 +       memset(cursor, 0, sizeof *cursor);
16182 +       /* this is either first call to readdir, or rewind. Anyway, create new
16183 +        * cursor. */
16184 +       fsdata = create_fsdata(NULL);
16185 +       if (fsdata != NULL) {
16186 +               result = radix_tree_preload(get_gfp_mask());
16187 +               if (result == 0) {
16188 +                       d_cursor_info *info;
16189 +                       oid_t oid;
16190 +
16191 +                       info = d_info(inode);
16192 +                       oid = get_inode_oid(inode);
16193 +                       /* cid occupies higher 12 bits of f->f_pos. Don't
16194 +                        * allow it to become negative: this confuses
16195 +                        * nfsd_readdir() */
16196 +                       cursor->key.cid = (++cid_counter) & 0x7ff;
16197 +                       cursor->key.oid = oid;
16198 +                       cursor->fsdata = fsdata;
16199 +                       cursor->info = info;
16200 +                       cursor->ref = 1;
16201 +
16202 +                       spin_lock_inode(inode);
16203 +                       /* install cursor as @file's private_data, discarding
16204 +                        * old one if necessary */
16205 +#if REISER4_DEBUG
16206 +                       if (file->private_data)
16207 +                               warning("", "file has fsdata already");
16208 +#endif
16209 +                       clean_fsdata(file);
16210 +                       free_file_fsdata_nolock(file);
16211 +                       file->private_data = fsdata;
16212 +                       fsdata->cursor = cursor;
16213 +                       spin_unlock_inode(inode);
16214 +                       spin_lock(&d_lock);
16215 +                       /* insert cursor into hash table */
16216 +                       d_cursor_hash_insert(&info->table, cursor);
16217 +                       /* and chain it into radix-tree */
16218 +                       bind_cursor(cursor, (unsigned long)oid);
16219 +                       spin_unlock(&d_lock);
16220 +                       radix_tree_preload_end();
16221 +                       file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
16222 +               } else
16223 +                       free_fsdata(fsdata);
16224 +       } else
16225 +               result = RETERR(-ENOMEM);
16226 +       return result;
16227 +}
16228 +
16229 +/**
16230 + * process_cursors - do action on each cursor attached to inode
16231 + * @inode: inode whose cursors are processed
16232 + * @act: action to do
16233 + *
16234 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
16235 + * and performs action specified by @act on each of cursors.
16236 + */
16237 +static void process_cursors(struct inode *inode, enum cursor_action act)
16238 +{
16239 +       oid_t oid;
16240 +       dir_cursor *start;
16241 +       struct list_head *head;
16242 +       reiser4_context *ctx;
16243 +       d_cursor_info *info;
16244 +
16245 +       /* this can be called by
16246 +        *
16247 +        * kswapd->...->prune_icache->..reiser4_destroy_inode
16248 +        *
16249 +        * without reiser4_context
16250 +        */
16251 +       ctx = init_context(inode->i_sb);
16252 +       if (IS_ERR(ctx)) {
16253 +               warning("vs-23", "failed to init context");
16254 +               return;
16255 +       }
16256 +       /* NOTE(review): inode was already dereferenced by the call above */
16257 +       assert("nikita-3558", inode != NULL);
16258 +
16259 +       info = d_info(inode);
16260 +       oid = get_inode_oid(inode);
16261 +       spin_lock_inode(inode);
16262 +       head = get_readdir_list(inode);
16263 +       spin_lock(&d_lock);
16264 +       /* find any cursor for this oid: reference to it is hanging of radix
16265 +        * tree */
16266 +       start = lookup(info, (unsigned long)oid);
16267 +       if (start != NULL) {
16268 +               dir_cursor *scan;
16269 +               reiser4_file_fsdata *fsdata;
16270 +
16271 +               /* process circular list of cursors for this oid */
16272 +               scan = start;
16273 +               do {
16274 +                       dir_cursor *next;
16275 +
16276 +                       next = list_entry(scan->list.next, dir_cursor, list);
16277 +                       fsdata = scan->fsdata;
16278 +                       assert("nikita-3557", fsdata != NULL);
16279 +                       if (scan->key.oid == oid) {
16280 +                               switch (act) {
16281 +                               case CURSOR_DISPOSE:
16282 +                                       list_del_init(&fsdata->dir.linkage);
16283 +                                       break;
16284 +                               case CURSOR_LOAD:
16285 +                                       list_add(&fsdata->dir.linkage, head);
16286 +                                       break;
16287 +                               case CURSOR_KILL:
16288 +                                       kill_cursor(scan);
16289 +                                       break;
16290 +                               }
16291 +                       }
16292 +                       if (scan == next)
16293 +                               /* last cursor was just killed */
16294 +                               break;
16295 +                       scan = next;
16296 +               } while (scan != start);
16297 +       }
16298 +       spin_unlock(&d_lock);
16299 +       /* check that we killed 'em all */
16300 +       assert("nikita-3568",
16301 +              ergo(act == CURSOR_KILL,
16302 +                   list_empty_careful(get_readdir_list(inode))));
16303 +       assert("nikita-3569",
16304 +              ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
16305 +       spin_unlock_inode(inode);
16306 +       reiser4_exit_context(ctx);
16307 +}
16308 +
16309 +/**
16310 + * dispose_cursors - removes cursors from inode's list
16311 + * @inode: inode to dispose cursors of
16312 + *
16313 + * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
16314 + * attached to cursor from inode's readdir list. This is called when inode is
16315 + * removed from the memory by memory pressure.
16316 + */
16317 +void dispose_cursors(struct inode *inode)
16318 +{
16319 +       process_cursors(inode, CURSOR_DISPOSE);
16320 +}
16321 +
16322 +/**
16323 + * load_cursors - link cursors' fsdata into inode's readdir list
16324 + * @inode: inode that cursors are attached to
16325 + *
16326 + * Runs process_cursors() in CURSOR_LOAD mode: for every cursor that belongs
16327 + * to @inode its reiser4_file_fsdata is added to the readdir list. Used when
16328 + * the inode is loaded into memory.
16329 + */
16330 +void load_cursors(struct inode *inode)
16331 +{
16332 +       process_cursors(inode, CURSOR_LOAD);
16333 +}
16334 +
16335 +/**
16336 + * kill_cursors - destroy every cursor of an inode
16337 + * @inode: inode whose cursors are killed
16338 + *
16339 + * Runs process_cursors() in CURSOR_KILL mode. Used when inode is destroyed.
16340 + */
16341 +void kill_cursors(struct inode *inode)
16342 +{
16343 +       process_cursors(inode, CURSOR_KILL);
16344 +}
16345 +
16346 +/**
16347 + * file_is_stateless - was @file opened on demand by the NFS server?
16348 + * @file: file to query
16349 + *
16350 + * If so, the inode may have "detached state". Returns 0 on ERR_PTR() fsdata.
16351 + */
16352 +static int file_is_stateless(struct file *file)
16353 +{
16354 +       reiser4_dentry_fsdata *d = reiser4_get_dentry_fsdata(file->f_dentry);
16355 +
16356 +       return !IS_ERR(d) && d->stateless;
16357 +}
16358 +
16359 +/**
16360 + * get_dir_fpos - directory position encoded in a readdir cookie
16361 + * @dir: directory file being read
16362 + *
16363 + * The user-supplied cookie is dir->f_pos. For a stateless directory
16364 + * operation (readdir-over-nfs) the client id is encoded in the high bits of
16365 + * the cookie, so only the bits selected by CID_MASK are returned.
16366 + */
16367 +loff_t get_dir_fpos(struct file *dir)
16368 +{
16369 +       if (!file_is_stateless(dir))
16370 +               return dir->f_pos;
16371 +       /* stateless: drop the client id bits */
16372 +       return dir->f_pos & CID_MASK;
16373 +}
16374 +
16375 +/**
16376 + * try_to_attach_fsdata - attach detached readdir state to a stateless file
16377 + * @file: file being read; nothing is done unless it is stateless
16378 + * @inode: inode the cursor is created for or looked up by
16379 + *
16380 + * Finds or creates cursor for readdir-over-nfs. Returns 0 or error code.
16381 + */
16382 +int try_to_attach_fsdata(struct file *file, struct inode *inode)
16383 +{
16384 +       loff_t pos;
16385 +       int result;
16386 +       dir_cursor *cursor;
16387 +
16388 +       /*
16389 +        * we are serialized by inode->i_mutex
16390 +        */
16391 +       if (!file_is_stateless(file))
16392 +               return 0;
16393 +
16394 +       pos = file->f_pos;
16395 +       result = 0;
16396 +       if (pos == 0) {
16397 +               /*
16398 +                * first call to readdir (or rewind to the beginning of
16399 +                * directory): allocate a new cursor and insert it
16400 +                */
16401 +               cursor = kmem_cache_alloc(d_cursor_cache, get_gfp_mask());
16402 +               if (cursor != NULL)
16403 +                       result = insert_cursor(cursor, file, inode);
16404 +               else
16405 +                       result = RETERR(-ENOMEM);
16406 +       } else {
16407 +               /* non-zero cookie: look cursor up by client id and oid */
16408 +               d_cursor_key key;
16409 +
16410 +               key.cid = pos >> CID_SHIFT;
16411 +               key.oid = get_inode_oid(inode);
16412 +               spin_lock(&d_lock);
16413 +               cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16414 +               if (cursor != NULL) {
16415 +                       /* cursor was found: take a reference to it */
16416 +                       if (cursor->ref == 0) {
16417 +                               /* first user: take it off the unused list */
16418 +                               list_del_init(&cursor->alist);
16419 +                               --d_cursor_unused;
16420 +                       }
16421 +                       ++cursor->ref;
16422 +               }
16423 +               spin_unlock(&d_lock);
16424 +               if (cursor != NULL) {
16425 +                       spin_lock_inode(inode);
16426 +                       assert("nikita-3556", cursor->fsdata->back == NULL);
16427 +                       clean_fsdata(file);
16428 +                       free_file_fsdata_nolock(file);
16429 +                       file->private_data = cursor->fsdata;
16430 +                       spin_unlock_inode(inode);
16431 +               }
16432 +       }
16433 +       return result;
16434 +}
16435 +
16436 +/**
16437 + * detach_fsdata - detach fsdata from a stateless file
16438 + * @file: file to detach fsdata from
16439 + *
16440 + * Calls clean_fsdata() under the inode spinlock; no-op for stateful files.
16441 + */
16442 +void detach_fsdata(struct file *file)
16443 +{
16444 +       struct inode *inode;
16445 +
16446 +       if (file_is_stateless(file)) {
16447 +               /* stateless file: clean its fsdata under inode lock */
16448 +               inode = file->f_dentry->d_inode;
16449 +               spin_lock_inode(inode);
16450 +               clean_fsdata(file);
16451 +               spin_unlock_inode(inode);
16452 +       }
16453 +}
16454 +
16455 +/* slab cache that reiser4_dentry_fsdata objects are allocated from */
16456 +static kmem_cache_t *dentry_fsdata_cache;
16457 +
16458 +/**
16459 + * init_dentry_fsdata - set up slab cache for reiser4_dentry_fsdata
16460 + *
16461 + * Creates the cache backing dentry->d_fsdata. Done once, when reiser4 module
16462 + * is initialized. Returns 0 on success and -ENOMEM otherwise.
16463 + */
16464 +int init_dentry_fsdata(void)
16465 +{
16466 +       const unsigned long flags = SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT;
16467 +
16468 +       dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16469 +                                               sizeof(reiser4_dentry_fsdata),
16470 +                                               0, flags, NULL, NULL);
16471 +       if (dentry_fsdata_cache != NULL)
16472 +               return 0;
16473 +       /* cache could not be created */
16474 +       return RETERR(-ENOMEM);
16475 +}
16476 +
16477 +/**
16478 + * done_dentry_fsdata - destroy slab cache of reiser4_dentry_fsdata
16479 + *
16480 + * Dual to init_dentry_fsdata(): reiser4 module unload or system shutdown.
16481 + */
16482 +void done_dentry_fsdata(void)
16483 +{
16484 +       destroy_reiser4_cache(&dentry_fsdata_cache);
16485 +}
16486 +
16487 +/**
16488 + * reiser4_get_dentry_fsdata - get fs-specific data of a dentry
16489 + * @dentry: dentry to get the data of, must not be NULL
16490 + *
16491 + * Returns dentry->d_fsdata. When it is not there yet, a zero-filled one is
16492 + * allocated and attached first; an ERR_PTR() is returned if that fails.
16493 + */
16494 +reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16495 +{
16496 +       assert("nikita-1365", dentry != NULL);
16497 +
16498 +       if (dentry->d_fsdata != NULL)
16499 +               return dentry->d_fsdata;
16500 +       dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16501 +                                           get_gfp_mask());
16502 +       if (dentry->d_fsdata == NULL)
16503 +               return ERR_PTR(RETERR(-ENOMEM));
16504 +       memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
16505 +       return dentry->d_fsdata;
16506 +}
16507 +
16508 +/**
16509 + * reiser4_free_dentry_fsdata - release fs-specific data of a dentry
16510 + * @dentry: dentry whose d_fsdata is released
16511 + *
16512 + * Returns d_fsdata to its slab cache and clears the pointer, if it was set.
16513 + */
16514 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
16515 +{
16516 +       if (dentry->d_fsdata == NULL)
16517 +               return;
16518 +       kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16519 +       dentry->d_fsdata = NULL;
16520 +}
16521 +
16522 +
16523 +/* slab cache that reiser4_file_fsdata objects are allocated from */
16524 +static kmem_cache_t *file_fsdata_cache;
16525 +
16526 +/**
16527 + * init_file_fsdata - set up slab cache for reiser4_file_fsdata
16528 + *
16529 + * Creates the cache backing file->private_data. Done once, when reiser4
16530 + * module is initialized. Returns 0 on success and -ENOMEM otherwise.
16531 + */
16532 +int init_file_fsdata(void)
16533 +{
16534 +       const unsigned long flags = SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT;
16535 +
16536 +       file_fsdata_cache = kmem_cache_create("file_fsdata",
16537 +                                             sizeof(reiser4_file_fsdata),
16538 +                                             0, flags, NULL, NULL);
16539 +       if (file_fsdata_cache != NULL)
16540 +               return 0;
16541 +       return RETERR(-ENOMEM);
16542 +}
16543 +
16544 +/**
16545 + * done_file_fsdata - destroy slab cache of reiser4_file_fsdata
16546 + *
16547 + * Dual to init_file_fsdata(): reiser4 module unload or system shutdown.
16548 + */
16549 +void done_file_fsdata(void)
16550 +{
16551 +       destroy_reiser4_cache(&file_fsdata_cache);
16552 +}
16553 +
16554 +/**
16555 + * create_fsdata - allocate and set up a reiser4_file_fsdata
16556 + * @file: file the new fsdata points back to, may be NULL
16557 + *
16558 + * Returns zeroed fsdata with ->back, ->ra1 and list linkage set, or NULL.
16559 + */
16560 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16561 +{
16562 +       reiser4_file_fsdata *fresh;
16563 +
16564 +       fresh = kmem_cache_alloc(file_fsdata_cache, get_gfp_mask());
16565 +       if (fresh == NULL)
16566 +               return NULL;
16567 +       memset(fresh, 0, sizeof(*fresh));
16568 +       fresh->back = file;
16569 +       fresh->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16570 +       INIT_LIST_HEAD(&fresh->dir.linkage);
16571 +       return fresh;
16572 +}
16573 +
16574 +/**
16575 + * free_fsdata - return reiser4_file_fsdata to its slab cache
16576 + * @fsdata: object to free; BUG_ON() fires if it is NULL
16577 + *
16578 + * Dual to create_fsdata().
16579 + */
16580 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16581 +{
16582 +       BUG_ON(fsdata == NULL);
16583 +       kmem_cache_free(file_fsdata_cache, fsdata);
16584 +}
16585 +
16586 +/**
16587 + * reiser4_get_file_fsdata - get fs-specific data of a file
16588 + * @file: file to get the data of, must not be NULL
16589 + *
16590 + * Returns file->private_data, creating and attaching it first if it is not
16591 + * there yet. An ERR_PTR() is returned when the allocation fails.
16592 + */
16593 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16594 +{
16595 +       reiser4_file_fsdata *fresh;
16596 +       struct inode *inode;
16597 +
16598 +       assert("nikita-1603", file != NULL);
16599 +
16600 +       if (file->private_data != NULL)
16601 +               goto attached;
16602 +       fresh = create_fsdata(file);
16603 +       if (fresh == NULL)
16604 +               return ERR_PTR(RETERR(-ENOMEM));
16605 +       inode = file->f_dentry->d_inode;
16606 +       spin_lock_inode(inode);
16607 +       if (file->private_data == NULL) {
16608 +               file->private_data = fresh;
16609 +               fresh = NULL;
16610 +       }
16611 +       spin_unlock_inode(inode);
16612 +       /* other thread attached ->private_data first: drop our copy */
16613 +       if (fresh != NULL)
16614 +               kmem_cache_free(file_fsdata_cache, fresh);
16615 +attached:
16616 +       assert("nikita-2665", file->private_data != NULL);
16617 +       return file->private_data;
16618 +}
16619 +
16620 +/**
16621 + * free_file_fsdata_nolock - detach reiser4_file_fsdata from a file
16622 + * @file: file to detach fsdata from; its inode spinlock must be held
16623 + *
16624 + * Clears file->private_data and unlinks fsdata from readdir list. Fsdata
16625 + * is freed only when no d_cursor is linked to it (->cursor is NULL).
16626 + */
16627 +static void free_file_fsdata_nolock(struct file *file)
16628 +{
16629 +       reiser4_file_fsdata *fsdata = file->private_data;
16630 +
16631 +       assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16632 +       file->private_data = NULL;
16633 +       if (fsdata == NULL)
16634 +               return;
16635 +       list_del_init(&fsdata->dir.linkage);
16636 +       /* fsdata linked to a cursor is not freed here */
16637 +       if (fsdata->cursor == NULL)
16638 +               free_fsdata(fsdata);
16639 +}
16640 +
16641 +/**
16642 + * reiser4_free_file_fsdata - locked variant of free_file_fsdata_nolock()
16643 + * @file: file to detach reiser4_file_fsdata from
16644 + */
16645 +void reiser4_free_file_fsdata(struct file *file)
16646 +{
16647 +       struct inode *inode = file->f_dentry->d_inode;
16648 +
16649 +       spin_lock_inode(inode);
16650 +       free_file_fsdata_nolock(file);
16651 +       spin_unlock_inode(inode);
16652 +}
16653 +
16654 +/*
16655 + * Local variables:
16656 + * c-indentation-style: "K&R"
16657 + * mode-name: "LC"
16658 + * c-basic-offset: 8
16659 + * tab-width: 8
16660 + * fill-column: 79
16661 + * End:
16662 + */
16663 diff --git a/fs/reiser4/fsdata.h b/fs/reiser4/fsdata.h
16664 new file mode 100644
16665 index 0000000..8d89d72
16666 --- /dev/null
16667 +++ b/fs/reiser4/fsdata.h
16668 @@ -0,0 +1,218 @@
16669 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16670 + * reiser4/README */
16671 +
16672 +#if !defined( __REISER4_FSDATA_H__ )
16673 +#define __REISER4_FSDATA_H__
16674 +
16675 +#include "debug.h"
16676 +#include "kassign.h"
16677 +#include "seal.h"
16678 +#include "type_safe_hash.h"
16679 +#include "plugin/file/file.h"
16680 +#include "readahead.h"
16681 +
16682 +/*
16683 + * comment about reiser4_dentry_fsdata
16684 + *
16685 + *
16686 + */
16687 +
16688 +/*
16689 + * locking: fields of per file descriptor readdir_pos and ->f_pos are
16690 + * protected by ->i_mutex on inode. Under this lock following invariant
16691 + * holds:
16692 + *
16693 + *     file descriptor is "looking" at the entry_no-th directory entry from
16694 + *     the beginning of directory. This entry has key dir_entry_key and is
16695 + *     pos-th entry within duplicate-key sequence.
16696 + *
16697 + */
16698 +
16699 +/* logical position within directory: entry key plus ordinal among equal keys */
16700 +typedef struct {
16701 +       /* key of directory entry (actually, the part of a key sufficient to
16702 +          identify directory entry)  */
16703 +       de_id dir_entry_key;
16704 +       /* ordinal number of directory entry among all entries with the same
16705 +          key. (Starting from 0.) */
16706 +       unsigned pos;
16707 +} dir_pos;
16708 +
16709 +typedef struct {       /* readdir position of a file descriptor */
16710 +       /* f_pos corresponding to this readdir position */
16711 +       __u64 fpos;
16712 +       /* logical position within directory */
16713 +       dir_pos position;
16714 +       /* logical number of directory entry, counted from the
16715 +          beginning of directory  */
16716 +       __u64 entry_no;
16717 +} readdir_pos;
16718 +
16719 +/*
16720 + * cached location of a directory entry, used to speed up lookups: on initial
16721 + * call to ->lookup() seal and coord of directory entry (if found) are stored
16722 + * in struct dentry and reused later to avoid tree traversals.
16723 + */
16724 +typedef struct de_location {
16725 +       /* seal covering directory entry */
16726 +       seal_t entry_seal;
16727 +       /* coord of directory entry */
16728 +       coord_t entry_coord;
16729 +       /* ordinal number of directory entry among all entries with the same
16730 +          key. (Starting from 0.) */
16731 +       int pos;
16732 +} de_location;
16733 +
16734 +/**
16735 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16736 + *
16737 + * This is allocated dynamically and released in d_op->d_release()
16738 + *
16739 + * It contains cached location (hint) of directory entry and the "stateless"
16740 + * flag; it is expected that other information will be accumulated here.
16741 + */
16742 +typedef struct reiser4_dentry_fsdata {
16743 +       /*
16744 +        * here will go fields filled by ->lookup() to speedup next
16745 +        * create/unlink, like blocknr of znode with stat-data, or key of
16746 +        * stat-data.
16747 +        */
16748 +       de_location dec;
16749 +       int stateless;          /* created through reiser4_decode_fh, needs special
16750 +                                * treatment in readdir. */
16751 +} reiser4_dentry_fsdata;
16752 +
16753 +extern int init_dentry_fsdata(void);
16754 +extern void done_dentry_fsdata(void);
16755 +extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16756 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16757 +
16758 +
16759 +/**
16760 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16761 + *
16762 + * This is allocated dynamically and released in inode->i_fop->release
16763 + */
16764 +typedef struct reiser4_file_fsdata {
16765 +       /*
16766 +        * pointer back to the struct file which this reiser4_file_fsdata is
16767 +        * part of
16768 +        */
16769 +       struct file *back;
16770 +       /* detached cursor for stateless readdir. */
16771 +       struct dir_cursor *cursor;
16772 +       /*
16773 +        * We need both directory and regular file parts here, because there
16774 +        * are file system objects that are files and directories.
16775 +        */
16776 +       struct {
16777 +               /*
16778 +                * position in directory. It is updated each time directory is
16779 +                * modified
16780 +                */
16781 +               readdir_pos readdir;
16782 +               /* head of this list is reiser4_inode->lists.readdir_list */
16783 +               struct list_head linkage;
16784 +       } dir;
16785 +       /* hints to speed up operations with regular files: read and write. */
16786 +       struct {
16787 +               hint_t hint;
16788 +       } reg;
16789 +       /* readahead hook and its argument */
16790 +       struct {
16791 +               /* this is called by reiser4_readpages if set */
16792 +               void (*readpages) (struct address_space *,
16793 +                                  struct list_head * pages, void *data);
16794 +               /* NOTE(review): presumably the extended coord handed to
16795 +                  ->readpages. It is set by read_extent before
16796 +                  calling page_cache_readahead */
16797 +               void *data;
16798 +       } ra2;
16799 +       struct reiser4_file_ra_state ra1;
16800 +
16801 +} reiser4_file_fsdata;
16801 +
16802 +extern int init_file_fsdata(void);
16803 +extern void done_file_fsdata(void);
16804 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16805 +extern void reiser4_free_file_fsdata(struct file *);
16806 +
16807 +
16808 +/*
16809 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16810 + * used to address problem reiser4 has with readdir accesses via NFS. See
16811 + * plugin/file_ops_readdir.c for more details.
16812 + */
16813 +typedef struct {
16814 +       __u16 cid;      /* client id: readdir cookie shifted by CID_SHIFT */
16815 +       __u64 oid;      /* object id of the inode (get_inode_oid()) */
16816 +} d_cursor_key;
16817 +
16818 +/*
16819 + * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16820 + * maintain hash table of dir_cursor-s in reiser4's super block
16821 + */
16822 +typedef struct dir_cursor dir_cursor;
16823 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16824 +
16825 +typedef struct d_cursor_info d_cursor_info;
16826 +
16827 +struct dir_cursor {
16828 +       int ref;        /* number of users; cursor with 0 is on unused list */
16829 +       reiser4_file_fsdata *fsdata;    /* detached readdir state */
16830 +
16831 +       /* link to reiser4 super block hash table of cursors */
16832 +       d_cursor_hash_link hash;
16833 +
16834 +       /*
16835 +        * this is to link cursors to reiser4 super block's radix tree of
16836 +        * cursors if there is more than one cursor of the same objectid
16837 +        */
16838 +       struct list_head list;
16839 +       d_cursor_key key;       /* (cid, oid) pair the cursor is found by */
16840 +       d_cursor_info *info;    /* NOTE(review): presumably owning d_cursor_info */
16841 +       /* list of unused cursors */
16842 +       struct list_head alist;
16843 +};
16844 +
16845 +extern int init_d_cursor(void);
16846 +extern void done_d_cursor(void);
16847 +
16848 +extern int init_super_d_info(struct super_block *);
16849 +extern void done_super_d_info(struct super_block *);
16850 +
16851 +extern loff_t get_dir_fpos(struct file *);
16852 +extern int try_to_attach_fsdata(struct file *, struct inode *);
16853 +extern void detach_fsdata(struct file *);
16854 +
16855 +
16856 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16857 +   more details */
16858 +void dispose_cursors(struct inode *inode);
16859 +void load_cursors(struct inode *inode);
16860 +void kill_cursors(struct inode *inode);
16861 +void adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj);
16862 +
16863 +/*
16864 + * this structure is embedded into reiser4_super_info_data. It maintains
16865 + * d_cursors (detached readdir state). See plugin/file_ops_readdir.c.
16866 + */
16867 +struct d_cursor_info {
16868 +       d_cursor_hash_table table;      /* cursors hashed by d_cursor_key */
16869 +       struct radix_tree_root tree;    /* NOTE(review): presumably keyed by oid */
16870 +};
16871 +
16872 +/* spinlock protecting readdir cursors */
16873 +extern spinlock_t d_lock;
16874 +
16875 +/* __REISER4_FSDATA_H__ */
16876 +#endif
16877 +
16878 +/*
16879 + * Local variables:
16880 + * c-indentation-style: "K&R"
16881 + * mode-name: "LC"
16882 + * c-basic-offset: 8
16883 + * tab-width: 8
16884 + * fill-column: 120
16885 + * End:
16886 + */
16887 diff --git a/fs/reiser4/init_super.c b/fs/reiser4/init_super.c
16888 new file mode 100644
16889 index 0000000..6d528a5
16890 --- /dev/null
16891 +++ b/fs/reiser4/init_super.c
16892 @@ -0,0 +1,739 @@
16893 +/* Copyright by Hans Reiser, 2003 */
16894 +
16895 +#include "super.h"
16896 +#include "inode.h"
16897 +#include "plugin/plugin_set.h"
16898 +
16899 +#include <linux/swap.h>
16900 +
16901 +
16902 +/**
16903 + * init_fs_info - allocate reiser4 specific super block
16904 + * @super: super block of filesystem
16905 + *
16906 + * Attaches zeroed, initialized reiser4_super_info_data to super->s_fs_info.
16907 + * Returns 0, -ENOMEM or the error of init_super_d_info() (nothing attached).
16908 + */
16909 +int init_fs_info(struct super_block *super)
16910 +{
16911 +       reiser4_super_info_data *sbinfo;
16912 +       int result;
16913 +
16914 +       sbinfo = kmalloc(sizeof(reiser4_super_info_data), get_gfp_mask());
16915 +       if (!sbinfo)
16916 +               return RETERR(-ENOMEM);
16917 +       super->s_fs_info = sbinfo;
16918 +       super->s_op = NULL;
16919 +       memset(sbinfo, 0, sizeof(*sbinfo));
16920 +       ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16921 +       ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16922 +       sema_init(&sbinfo->delete_sema, 1);
16923 +       sema_init(&sbinfo->flush_sema, 1);
16924 +       spin_lock_init(&(sbinfo->guard));
16925 +       /* per-super-block d_cursor resources; undo the above on failure */
16926 +       result = init_super_d_info(super);
16927 +       if (result != 0) {
16928 +               super->s_fs_info = NULL;
16929 +               kfree(sbinfo);
16930 +       }
16931 +       return result;
16932 +}
16933 +
16934 +/**
16935 + * done_fs_info - free reiser4 specific super block
16936 + * @super: super block of filesystem
16937 + *
16938 + * Releases structures maintaining d_cursor-s, performs some sanity checks,
16939 + * frees reiser4_super_info_data and clears super->s_fs_info.
16940 + */
16941 +void done_fs_info(struct super_block *super)
16942 +{
16943 +       assert("zam-990", super->s_fs_info != NULL);
16944 +
16945 +       /* release per-super-block d_cursor resources */
16946 +       done_super_d_info(super);
16947 +
16948 +       /* by now no jnodes and no atom of current context may be left */
16949 +       assert("", list_empty(&get_super_private(super)->all_jnodes));
16950 +       assert("", get_current_context()->trans->atom == NULL);
16951 +       check_block_counters(super);
16952 +       kfree(super->s_fs_info);
16953 +       super->s_fs_info = NULL;
16954 +}
16955 +
16956 +/* type of option parseable by parse_option() */
16957 +typedef enum {
16958 +       /* value of option is arbitrary string */
16959 +       OPT_STRING,
16960 +
16961 +       /*
16962 +        * option specifies bit in a bitmask. When option is set - bit in
16963 +        * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16964 +        * dont_load_bitmap, atomic_write.
16965 +        */
16966 +       OPT_BIT,
16967 +
16968 +       /*
16969 +        * value of option is parsed by sscanf() with option's format. Examples
16970 +        * are tmgr.atom_max_size=N, tmgr.atom_max_age=N
16971 +        */
16972 +       OPT_FORMAT,
16973 +
16974 +       /*
16975 +        * option can take one of predefined values. Example is onerror=panic or
16976 +        * onerror=remount-ro
16977 +        */
16978 +       OPT_ONEOF,
16979 +} opt_type_t;
16980 +
16981 +typedef struct opt_bitmask_bit {       /* element of opt_desc u.bitmask.bits */
16982 +       const char *bit_name;   /* NOTE(review): presumably keyword of the bit */
16983 +       int bit_nr;             /* NOTE(review): presumably bit number in mask */
16984 +} opt_bitmask_bit;
16985 +
16986 +/* description of option parseable by parse_option() */
16987 +typedef struct opt_desc {
16988 +       /* option name.
16989 +
16990 +          parsed portion of string has a form "name=value".
16991 +        */
16992 +       const char *name;
16993 +       /* type of option; selects the member of @u that is used */
16994 +       opt_type_t type;
16995 +       union {
16996 +               /* where to store value of string option (type == OPT_STRING) */
16997 +               char **string;
16998 +               /* description of bits for bit option (type == OPT_BIT) */
16999 +               struct {
17000 +                       int nr;         /* bit number passed to set_bit() */
17001 +                       void *addr;     /* bitmap passed to set_bit() */
17002 +               } bit;
17003 +               /* description of format and targets for format option (type
17004 +                  == OPT_FORMAT) */
17005 +               struct {
17006 +                       const char *format;     /* sscanf() format */
17007 +                       int nr_args;    /* result sscanf() has to return */
17008 +                       void *arg1;
17009 +                       void *arg2;
17010 +                       void *arg3;
17011 +                       void *arg4;
17012 +               } f;
17013 +               struct {        /* allowed values (type == OPT_ONEOF) */
17014 +                       int *result;    /* gets index of matched @list entry */
17015 +                       const char *list[10];   /* NULL-terminated */
17016 +               } oneof;
17017 +               struct {        /* NOTE(review): not used by parse_option() */
17018 +                       void *addr;
17019 +                       int nr_bits;
17020 +                       opt_bitmask_bit *bits;
17021 +               } bitmask;
17022 +       } u;
17023 +} opt_desc_t;
17024 +
17025 +/**
17026 + * parse_option - parse one option
17027 + * @opt_string: option to parse, in "name" or "name=value" form
17028 + * @opt: option description
17029 + *
17030 + * foo=bar,
17031 + * ^   ^  ^
17032 + * |   |  +-- replaced with '\0' (by parse_options())
17033 + * |   +-- val_start ('=' before it is replaced with '\0' here)
17034 + * +-- opt_string
17035 + * Handles option according to its type. Returns 0 or -EINVAL.
17036 + */
17037 +static int parse_option(char *opt_string, opt_desc_t *opt)
17038 +{
17039 +       char *val_start;
17040 +       int result;
17041 +       const char *err_msg;
17042 +
17043 +       /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
17044 +
17045 +       val_start = strchr(opt_string, '=');
17046 +       if (val_start != NULL) {
17047 +               *val_start = '\0';
17048 +               ++val_start;
17049 +       }
17050 +
17051 +       err_msg = NULL;
17052 +       result = 0;
17053 +       switch (opt->type) {
17054 +       case OPT_STRING:
17055 +               if (val_start == NULL) {
17056 +                       err_msg = "String arg missing";
17057 +                       result = RETERR(-EINVAL);
17058 +               } else
17059 +                       *opt->u.string = val_start;
17060 +               break;
17061 +       case OPT_BIT:
17062 +               if (val_start != NULL)
17063 +                       err_msg = "Value ignored"; /* and bit is not set */
17064 +               else
17065 +                       set_bit(opt->u.bit.nr, opt->u.bit.addr);
17066 +               break;
17067 +       case OPT_FORMAT:
17068 +               if (val_start == NULL) {
17069 +                       err_msg = "Formatted arg missing";
17070 +                       result = RETERR(-EINVAL);
17071 +                       break;
17072 +               }
17073 +               if (sscanf(val_start, opt->u.f.format,
17074 +                          opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
17075 +                          opt->u.f.arg4) != opt->u.f.nr_args) {
17076 +                       err_msg = "Wrong conversion";
17077 +                       result = RETERR(-EINVAL);
17078 +               }
17079 +               break;
17080 +       case OPT_ONEOF:
17081 +               {
17082 +                       int i = 0;
17083 +
17084 +                       if (val_start == NULL) {
17085 +                               err_msg = "Value is missing";
17086 +                               result = RETERR(-EINVAL);
17087 +                               break;
17088 +                       }
17089 +                       err_msg = "Wrong option value";
17090 +                       result = RETERR(-EINVAL);
17091 +                       while (opt->u.oneof.list[i]) {
17092 +                               if (!strcmp(opt->u.oneof.list[i], val_start)) {
17093 +                                       result = 0;
17094 +                                       err_msg = NULL;
17095 +                                       *opt->u.oneof.result = i;
17096 +                                       break;
17097 +                               }
17098 +                               i++;
17099 +                       }
17100 +                       break;
17101 +               }
17102 +       default:
17103 +               wrong_return_value("nikita-2100", "opt -> type");
17104 +               break;
17105 +       }
17106 +       if (err_msg != NULL) {
17107 +               warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
17108 +                       err_msg, opt->name, val_start ? "=" : "",
17109 +                       val_start ? : "");
17110 +       }
17111 +       return result;
17112 +}
17113 +
17114 +/**
17115 + * parse_options - parse reiser4 mount options
17116 + * @opt_string: comma separated list of options, modified in place
17117 + * @opts: array of option description
17118 + * @nr_opts: number of elements in @opts
17119 + *
17120 + * Stops at first error. Returns 0, or -EINVAL on unknown or malformed option.
17121 + */
17122 +static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
17123 +{
17124 +       int result = 0;
17125 +
17126 +       while ((result == 0) && opt_string && *opt_string) {
17127 +               char *next = strchr(opt_string, ',');
17128 +               int j;
17129 +
17130 +               if (next != NULL) {
17131 +                       *next = '\0';
17132 +                       ++next;
17133 +               }
17134 +               for (j = 0; j < nr_opts; ++j) {
17135 +                       size_t len = strlen(opts[j].name);
17136 +                       /* whole name has to match, not only its prefix */
17137 +                       if (!strncmp(opt_string, opts[j].name, len) &&
17138 +                           (opt_string[len] == '\0' ||
17139 +                            opt_string[len] == '=')) {
17140 +                               result = parse_option(opt_string, &opts[j]);
17141 +                               break;
17142 +                       }
17143 +               }
17144 +               if (j == nr_opts) {
17145 +                       warning("nikita-2307", "Unrecognized option: \"%s\"",
17146 +                               opt_string);
17147 +                       /* traditionally, -EINVAL is returned on wrong option */
17148 +                       result = RETERR(-EINVAL);
17149 +               }
17150 +               opt_string = next;
17151 +       }
17152 +       return result;
17153 +}
17154 +
17155 +#define NUM_OPT( label, fmt, addr )                            \
17156 +               {                                               \
17157 +                       .name = ( label ),                      \
17158 +                       .type = OPT_FORMAT,                     \
17159 +                       .u = {                                  \
17160 +                               .f = {                          \
17161 +                                       .format  = ( fmt ),     \
17162 +                                       .nr_args = 1,           \
17163 +                                       .arg1 = ( addr ),       \
17164 +                                       .arg2 = NULL,           \
17165 +                                       .arg3 = NULL,           \
17166 +                                       .arg4 = NULL            \
17167 +                               }                               \
17168 +                       }                                       \
17169 +               }
17170 +/* NUM_OPT named after, and stored into, field @field of local sbinfo */
17171 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
17172 +/* OPT_BIT option @label: sets bit @bitnr in fs_flags of local sbinfo */
17173 +#define BIT_OPT(label, bitnr)                                  \
17174 +       {                                                       \
17175 +               .name = label,                                  \
17176 +               .type = OPT_BIT,                                \
17177 +               .u = {                                          \
17178 +                       .bit = {                                \
17179 +                               .nr = bitnr,                    \
17180 +                               .addr = &sbinfo->fs_flags       \
17181 +                       }                                       \
17182 +               }                                               \
17183 +       }
17184 +/* NOTE(review): presumably capacity of option table in init_super_data() */
17185 +#define MAX_NR_OPTIONS (30)
17186 +
17187 +/**
17188 + * init_super_data - initialize reiser4 private super block
17189 + * @super: super block to initialize
17190 + * @opt_string: list of reiser4 mount options
17191 + *
17192 + * Sets various reiser4 parameters to default values. Parses mount options and
17193 + * overwrites default settings.
17194 + */
17195 +int init_super_data(struct super_block *super, char *opt_string)
17196 +{
17197 +       int result;
17198 +       opt_desc_t *opts, *p;
17199 +       reiser4_super_info_data *sbinfo = get_super_private(super);
17200 +
17201 +       /* initialize super, export, dentry operations */
17202 +       sbinfo->ops.super = reiser4_super_operations;
17203 +       sbinfo->ops.export = reiser4_export_operations;
17204 +       sbinfo->ops.dentry = reiser4_dentry_operations;
17205 +       super->s_op = &sbinfo->ops.super;
17206 +       super->s_export_op = &sbinfo->ops.export;
17207 +
17208 +       /* initialize transaction manager parameters to default values */
17209 +       sbinfo->tmgr.atom_max_size = totalram_pages / 4;
17210 +       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
17211 +       sbinfo->tmgr.atom_min_size = 256;
17212 +       sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
17213 +
17214 +       /* initialize cbk cache parameter */
17215 +       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
17216 +
17217 +       /* initialize flush parameters */
17218 +       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
17219 +       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
17220 +       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
17221 +       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
17222 +
17223 +       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
17224 +
17225 +       /* preliminary tree initializations */
17226 +       sbinfo->tree.super = super;
17227 +       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
17228 +       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
17229 +       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
17230 +       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
17231 +       rwlock_init(&(sbinfo->tree.tree_lock));
17232 +       spin_lock_init(&(sbinfo->tree.epoch_lock));
17233 +
17234 +       /* initialize default readahead params */
17235 +       sbinfo->ra_params.max = num_physpages / 4;
17236 +       sbinfo->ra_params.flags = 0;
17237 +
17238 +       /* allocate memory for structure describing reiser4 mount options */
17239 +       opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS, get_gfp_mask());
17240 +       if (opts == NULL)
17241 +               return RETERR(-ENOMEM);
17242 +
17243 +       /* initialize structure describing reiser4 mount options */
17244 +       p = opts;
17245 +
17246 +#if REISER4_DEBUG
17247 +#  define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) {         \
17248 +               warning ("zam-1046", "opt array is overloaded"); break; \
17249 +       }
17250 +#else
17251 +#   define OPT_ARRAY_CHECK noop
17252 +#endif
17253 +
17254 +#define PUSH_OPT(...)                          \
17255 +do {                                           \
17256 +        opt_desc_t o = __VA_ARGS__;            \
17257 +        OPT_ARRAY_CHECK;                       \
17258 +        *p ++ = o;                             \
17259 +} while (0)
17260 +
17261 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
17262 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
17263 +
17264 +       /*
17265 +        * tmgr.atom_max_size=N
17266 +        * Atoms containing more than N blocks will be forced to commit. N is
17267 +        * decimal.
17268 +        */
17269 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
17270 +       /*
17271 +        * tmgr.atom_max_age=N
17272 +        * Atoms older than N seconds will be forced to commit. N is decimal.
17273 +        */
17274 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
17275 +       /*
17276 +        * tmgr.atom_min_size=N
17277 +        * In committing an atom to free dirty pages, force the atom less than
17278 +        * N in size to fuse with another one.
17279 +        */
17280 +       PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
17281 +       /*
17282 +        * tmgr.atom_max_flushers=N
17283 +        * limit of concurrent flushers for one atom. 0 means no limit.
17284 +        */
17285 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
17286 +       /*
17287 +        * tree.cbk_cache_slots=N
17288 +        * Number of slots in the cbk cache.
17289 +        */
17290 +       PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
17291 +       /*
17292 +        * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
17293 +        * leaf-level blocks it will force them to be relocated.
17294 +        */
17295 +       PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
17296 +       /*
17297 +        * If flush finds can find a block allocation closer than at most
17298 +        * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
17299 +        * position.
17300 +        */
17301 +       PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
17302 +       /*
17303 +        * If we have written this much or more blocks before encountering busy
17304 +        * jnode in flush list - abort flushing hoping that next time we get
17305 +        * called this jnode will be clean already, and we will save some
17306 +        * seeks.
17307 +        */
17308 +       PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
17309 +       /* The maximum number of nodes to scan left on a level during flush. */
17310 +       PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
17311 +       /* preferred IO size */
17312 +       PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
17313 +       /* carry flags used for insertion of new nodes */
17314 +       PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
17315 +       /* carry flags used for insertion of new extents */
17316 +       PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
17317 +       /* carry flags used for paste operations */
17318 +       PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
17319 +       /* carry flags used for insert operations */
17320 +       PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
17321 +
17322 +#ifdef CONFIG_REISER4_BADBLOCKS
17323 +       /*
17324 +        * Alternative master superblock location in case if it's original
17325 +        * location is not writeable/accessable. This is offset in BYTES.
17326 +        */
17327 +       PUSH_SB_FIELD_OPT(altsuper, "%lu");
17328 +#endif
17329 +
17330 +       /* turn on BSD-style gid assignment */
17331 +       PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
17332 +       /* turn on 32 bit times */
17333 +       PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
17334 +       /* turn off concurrent flushing */
17335 +       PUSH_BIT_OPT("mtflush", REISER4_MTFLUSH);
17336 +       /*
17337 +        * Don't load all bitmap blocks at mount time, it is useful for
17338 +        * machines with tiny RAM and large disks.
17339 +        */
17340 +       PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
17341 +       /* disable transaction commits during write() */
17342 +       PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
17343 +       /* disable use of write barriers in the reiser4 log writer. */
17344 +       PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
17345 +
17346 +       PUSH_OPT(
17347 +       {
17348 +               /*
17349 +                * tree traversal readahead parameters:
17350 +                * -o readahead:MAXNUM:FLAGS
17351 +                * MAXNUM - max number fo nodes to request readahead for: -1UL
17352 +                * will set it to max_sane_readahead()
17353 +                * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17354 +                * CONTINUE_ON_PRESENT
17355 +                */
17356 +               .name = "readahead",
17357 +               .type = OPT_FORMAT,
17358 +               .u = {
17359 +                       .f = {
17360 +                               .format = "%u:%u",
17361 +                               .nr_args = 2,
17362 +                               .arg1 = &sbinfo->ra_params.max,
17363 +                               .arg2 = &sbinfo->ra_params.flags,
17364 +                               .arg3 = NULL,
17365 +                               .arg4 = NULL
17366 +                       }
17367 +               }
17368 +       }
17369 +       );
17370 +
17371 +       /* What to do in case of fs error */
17372 +       PUSH_OPT(
17373 +       {
17374 +               .name = "onerror",
17375 +               .type = OPT_ONEOF,
17376 +               .u = {
17377 +                       .oneof = {
17378 +                               .result = &sbinfo->onerror,
17379 +                               .list = {
17380 +                                       "panic", "remount-ro", NULL
17381 +                               },
17382 +                       }
17383 +               }
17384 +       }
17385 +       );
17386 +
17387 +       /* modify default settings to values set by mount options */
17388 +       result = parse_options(opt_string, opts, p - opts);
17389 +       kfree(opts);
17390 +       if (result != 0)
17391 +               return result;
17392 +
17393 +       /* correct settings to sanity values */
17394 +       sbinfo->tmgr.atom_max_age *= HZ;
17395 +       if (sbinfo->tmgr.atom_max_age <= 0)
17396 +               /* overflow */
17397 +               sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17398 +
17399 +       /* round optimal io size up to 512 bytes */
17400 +       sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17401 +       sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17402 +       if (sbinfo->optimal_io_size == 0) {
17403 +               warning("nikita-2497", "optimal_io_size is too small");
17404 +               return RETERR(-EINVAL);
17405 +       }
17406 +
17407 +       /* disable single-threaded flush as it leads to deadlock */
17408 +       sbinfo->fs_flags |= (1 << REISER4_MTFLUSH);
17409 +       return result;
17410 +}
17411 +
17412 +/**
17413 + * init_read_super - read reiser4 master super block
17414 + * @super: super block to fill
17415 + * @silent: if 0 - print warnings
17416 + *
17417 + * Reads the master super block (from the predefined location or the altsuper
17418 + * one), looks up the disk format plugin. Returns 0, or RETERR(-EIO/-EINVAL).
17419 + */
17420 +int init_read_super(struct super_block *super, int silent)
17421 +{
17422 +       struct buffer_head *super_bh;
17423 +       struct reiser4_master_sb *master_sb;
17424 +       reiser4_super_info_data *sbinfo = get_super_private(super);
17425 +       unsigned long blocksize;
17426 +
17427 + read_super_block:
17428 +#ifdef CONFIG_REISER4_BADBLOCKS
17429 +       if (sbinfo->altsuper)
17430 +               /*
17431 +                * read reiser4 master super block at position specified by
17432 +                * mount option
17433 +                */
17434 +               super_bh = sb_bread(super,
17435 +                                   (sector_t)(sbinfo->altsuper / super->s_blocksize));
17436 +       else
17437 +#endif
17438 +               /* read reiser4 master super block at 16-th 4096 block */
17439 +               super_bh = sb_bread(super,
17440 +                                   (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17441 +       if (!super_bh)
17442 +               return RETERR(-EIO);
17443 +
17444 +       master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17445 +       /* check reiser4 magic string */
17446 +       if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17447 +                    sizeof(REISER4_SUPER_MAGIC_STRING))) {
17448 +               /* reiser4 master super block contains filesystem blocksize */
17449 +               blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17450 +
17451 +               if (blocksize != PAGE_CACHE_SIZE) {
17452 +                       /*
17453 +                        * currently reiser4's blocksize must be equal to
17454 +                        * pagesize
17455 +                        */
17456 +                       if (!silent)
17457 +                               warning("nikita-2609",
17458 +                                       "%s: wrong block size %lu\n", super->s_id,
17459 +                                       blocksize);
17460 +                       brelse(super_bh);
17461 +                       return RETERR(-EINVAL);
17462 +               }
17463 +               if (blocksize != super->s_blocksize) {
17464 +                       /*
17465 +                        * filesystem uses different blocksize. Reread master
17466 +                        * super block with correct blocksize
17467 +                        */
17468 +                       brelse(super_bh);
17469 +                       if (!sb_set_blocksize(super, (int)blocksize))
17470 +                               return RETERR(-EINVAL);
17471 +                       goto read_super_block;
17472 +               }
17473 +
17474 +               sbinfo->df_plug =
17475 +                       disk_format_plugin_by_id(
17476 +                               le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17477 +               if (sbinfo->df_plug == NULL) {
17478 +                       if (!silent)
17479 +                               warning("nikita-26091",
17480 +                                       "%s: unknown disk format plugin %d\n",
17481 +                                       super->s_id,
17482 +                                       le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17483 +                       brelse(super_bh);
17484 +                       return RETERR(-EINVAL);
17485 +               }
17486 +               sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17487 +               brelse(super_bh);
17488 +               return 0;
17489 +       }
17490 +
17491 +       /* there is no reiser4 on the device */
17492 +       if (!silent)
17493 +               warning("nikita-2608",
17494 +                       "%s: wrong master super block magic", super->s_id);
17495 +       brelse(super_bh);
17496 +       return RETERR(-EINVAL);
17497 +}
17498 +
17499 +static struct {        /* default plugin (type, id) for each pset_member slot */
17500 +       reiser4_plugin_type type;
17501 +       reiser4_plugin_id id;
17502 +} default_plugins[PSET_LAST] = {
17503 +       [PSET_FILE] = {
17504 +               .type = REISER4_FILE_PLUGIN_TYPE,
17505 +               .id = UNIX_FILE_PLUGIN_ID
17506 +       },
17507 +       [PSET_DIR] = {
17508 +               .type = REISER4_DIR_PLUGIN_TYPE,
17509 +               .id = HASHED_DIR_PLUGIN_ID
17510 +       },
17511 +       [PSET_HASH] = {
17512 +               .type = REISER4_HASH_PLUGIN_TYPE,
17513 +               .id = R5_HASH_ID
17514 +       },
17515 +       [PSET_FIBRATION] = {
17516 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
17517 +               .id = FIBRATION_DOT_O
17518 +       },
17519 +       [PSET_PERM] = {
17520 +               .type = REISER4_PERM_PLUGIN_TYPE,
17521 +               .id = NULL_PERM_ID
17522 +       },
17523 +       [PSET_FORMATTING] = {
17524 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
17525 +               .id = SMALL_FILE_FORMATTING_ID
17526 +       },
17527 +       [PSET_SD] = {
17528 +               .type = REISER4_ITEM_PLUGIN_TYPE,
17529 +               .id = STATIC_STAT_DATA_ID
17530 +       },
17531 +       [PSET_DIR_ITEM] = {
17532 +               .type = REISER4_ITEM_PLUGIN_TYPE,
17533 +               .id = COMPOUND_DIR_ID
17534 +       },
17535 +       [PSET_CIPHER] = {
17536 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
17537 +               .id = NONE_CIPHER_ID
17538 +       },
17539 +       [PSET_DIGEST] = {
17540 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
17541 +               .id = SHA256_32_DIGEST_ID
17542 +       },
17543 +       [PSET_COMPRESSION] = {
17544 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17545 +               .id = LZO1_COMPRESSION_ID
17546 +       },
17547 +       [PSET_COMPRESSION_MODE] = {
17548 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17549 +               .id = COL_16_COMPRESSION_MODE_ID
17550 +       },
17551 +       [PSET_CLUSTER] = {
17552 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
17553 +               .id = CLUSTER_64K_ID
17554 +       },
17555 +       [PSET_REGULAR_ENTRY] = {
17556 +               .type = REISER4_REGULAR_PLUGIN_TYPE,
17557 +               .id = UF_REGULAR_ID
17558 +       }
17559 +};
17560 +
17561 +/* look up, via plugin_by_id(), the plugin recorded in default_plugins[memb] */
17562 +static reiser4_plugin *get_default_plugin(pset_member memb)
17563 +{
17564 +       return plugin_by_id(default_plugins[memb].type,
17565 +                           default_plugins[memb].id);
17566 +}
17567 +
17568 +/**
17569 + * init_root_inode - obtain inode of root directory
17570 + * @super: super block of filesystem
17571 + *
17572 + * Obtains inode of root directory with reiser4_iget(), allocates super->s_root,
17573 + * and installs default plugins if the inode was not loaded. Returns 0 or error.
17574 + */
17575 +int init_root_inode(struct super_block *super)
17576 +{
17577 +       reiser4_super_info_data *sbinfo = get_super_private(super);
17578 +       struct inode *inode;
17579 +       int result = 0;
17580 +
17581 +       inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17582 +       if (IS_ERR(inode))
17583 +               return RETERR(PTR_ERR(inode));
17584 +
17585 +       super->s_root = d_alloc_root(inode);
17586 +       if (!super->s_root) {
17587 +               iput(inode);
17588 +               return RETERR(-ENOMEM);
17589 +       }
17590 +
17591 +       super->s_root->d_op = &sbinfo->ops.dentry;
17592 +
17593 +       if (!is_inode_loaded(inode)) {
17594 +               pset_member memb;
17595 +
17596 +               for (memb = 0; memb < PSET_LAST; ++memb) {
17597 +                       reiser4_plugin *plug;
17598 +
17599 +                       plug = get_default_plugin(memb);
17600 +                       result = grab_plugin_from(inode, memb, plug);
17601 +                       if (result != 0)
17602 +                               break;
17603 +               }
17604 +
17605 +               if (result == 0) {
17606 +                       if (REISER4_DEBUG) {
17607 +                               plugin_set *pset;
17608 +
17609 +                               pset = reiser4_inode_data(inode)->pset;
17610 +                               for (memb = 0; memb < PSET_LAST; ++memb)
17611 +                                       assert("nikita-3500",
17612 +                                              pset_get(pset, memb) != NULL);
17613 +                       }
17614 +               } else
17615 +                       warning("nikita-3448", "Cannot set plugins of root: %i",
17616 +                               result);
17617 +               reiser4_iget_complete(inode);
17618 +       }
17619 +       super->s_maxbytes = MAX_LFS_FILESIZE;
17620 +       return result; /* may be non-zero with super->s_root already set */
17621 +}
17622 +
17623 +/*
17624 + * Local variables:
17625 + * c-indentation-style: "K&R"
17626 + * mode-name: "LC"
17627 + * c-basic-offset: 8
17628 + * tab-width: 8
17629 + * fill-column: 79
17630 + * End:
17631 + */
17632 diff --git a/fs/reiser4/inode.c b/fs/reiser4/inode.c
17633 new file mode 100644
17634 index 0000000..2e97861
17635 --- /dev/null
17636 +++ b/fs/reiser4/inode.c
17637 @@ -0,0 +1,727 @@
17638 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17639 +
17640 +/* Inode specific operations. */
17641 +
17642 +#include "forward.h"
17643 +#include "debug.h"
17644 +#include "key.h"
17645 +#include "kassign.h"
17646 +#include "coord.h"
17647 +#include "seal.h"
17648 +#include "dscale.h"
17649 +#include "plugin/item/item.h"
17650 +#include "plugin/security/perm.h"
17651 +#include "plugin/plugin.h"
17652 +#include "plugin/object.h"
17653 +#include "znode.h"
17654 +#include "vfs_ops.h"
17655 +#include "inode.h"
17656 +#include "super.h"
17657 +#include "reiser4.h"
17658 +
17659 +#include <linux/fs.h>          /* for struct super_block,  address_space */
17660 +
17661 +/* return reiser4 internal tree which inode belongs to: get_tree(inode->i_sb) */
17662 +/* Audited by: green(2002.06.17) */
17663 +reiser4_tree *tree_by_inode(const struct inode *inode /* inode queried */ )
17664 +{
17665 +       assert("nikita-256", inode != NULL);
17666 +       assert("nikita-257", inode->i_sb != NULL);
17667 +       return get_tree(inode->i_sb);
17668 +}
17669 +
17670 +/* return pointer to the reiser4-specific flags word kept in @inode's reiser4 data */
17671 +static inline unsigned long *inode_flags(const struct inode *const inode)
17672 +{
17673 +       assert("nikita-2842", inode != NULL);
17674 +       return &reiser4_inode_data(inode)->flags;
17675 +}
17676 +
17677 +/* set reiser4-specific flag @f in @inode (set_bit() on the inode_flags() word) */
17678 +void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17679 +{
17680 +       assert("nikita-2248", inode != NULL);
17681 +       set_bit((int)f, inode_flags(inode));
17682 +}
17683 +
17684 +/* clear reiser4-specific flag @f in @inode (clear_bit() on the inode_flags() word) */
17685 +void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17686 +{
17687 +       assert("nikita-2250", inode != NULL);
17688 +       clear_bit((int)f, inode_flags(inode));
17689 +}
17690 +
17691 +/* true if reiser4-specific flag @f is set in @inode (returns the test_bit() result) */
17692 +int inode_get_flag(const struct inode *inode, reiser4_file_plugin_flags f)
17693 +{
17694 +       assert("nikita-2251", inode != NULL);
17695 +       return test_bit((int)f, inode_flags(inode));
17696 +}
17697 +
17698 +/* convert oid to inode number: a plain cast of @oid to ino_t */
17699 +ino_t oid_to_ino(oid_t oid)
17700 +{
17701 +       return (ino_t) oid;
17702 +}
17703 +
17704 +/* convert oid to user visible inode number; oids that fit in ino_t map to themselves */
17705 +ino_t oid_to_uino(oid_t oid)
17706 +{
17707 +       /* reiser4 object is uniquely identified by oid which is 64 bit
17708 +          quantity. Kernel in-memory inode is indexed (in the hash table) by
17709 +          32 bit i_ino field, but this is not a problem, because there is a
17710 +          way to further distinguish inodes with identical inode numbers
17711 +          (find_actor supplied to iget()).
17712 +
17713 +          But user space expects unique 32 bit inode number. Obviously this
17714 +          is impossible. Work-around is to somehow hash oid into user visible
17715 +          inode number.
17716 +        */
17717 +       oid_t max_ino = (ino_t) ~ 0;
17718 +
17719 +       if (REISER4_INO_IS_OID || (oid <= max_ino))
17720 +               return oid;
17721 +       else
17722 +               /* this is remotely similar to the algorithm used to find the next
17723 +                  pid to use for a process: after wrap-around start from some
17724 +                  offset rather than from 0. Idea is that there are some long
17725 +                  living objects with which we don't want to collide.
17726 +                */
17727 +               return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17728 +}
17729 +
17730 +/* check that "inode" is on reiser4 file-system; a NULL inode yields 0 */
17731 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17732 +{
17733 +       return inode != NULL && is_reiser4_super(inode->i_sb);
17734 +}
17735 +
17736 +/* Maximal length of a name that can be stored in directory @inode.
17737 +   Falls back to 255 if the directory item plugin has no max_name_len method.
17738 +   This is used in check during file creation and lookup. */
17739 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17740 +{
17741 +       assert("nikita-287", is_reiser4_inode(inode));
17742 +       assert("nikita-1710", inode_dir_item_plugin(inode));
17743 +       if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17744 +               return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17745 +       else
17746 +               return 255;
17747 +}
17748 +
17749 +#if REISER4_USE_COLLISION_LIMIT
17750 +/* Maximal number of hash collisions for this directory (plugin.max_collisions). */
17751 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17752 +{
17753 +       assert("nikita-1711", dir != NULL);
17754 +       return reiser4_inode_data(dir)->plugin.max_collisions;
17755 +}
17756 +#endif  /*  REISER4_USE_COLLISION_LIMIT  */
17757 +
17758 +/* Install file, inode, and address_space operations on @inode, depending on its
17759 +   mode. An unknown mode makes the inode bad and yields RETERR(-EINVAL). */
17760 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17761 +                   reiser4_object_create_data * data   /* parameters to create
17762 +                                                        * object */ )
17763 +{
17764 +       reiser4_super_info_data *sinfo;
17765 +       file_plugin *fplug;
17766 +       dir_plugin *dplug;
17767 +
17768 +       fplug = inode_file_plugin(inode);
17769 +       dplug = inode_dir_plugin(inode);
17770 +
17771 +       sinfo = get_super_private(inode->i_sb); /* NOTE(review): sinfo is never read below */
17772 +
17773 +       switch (inode->i_mode & S_IFMT) {
17774 +       case S_IFSOCK:
17775 +       case S_IFBLK:
17776 +       case S_IFCHR:
17777 +       case S_IFIFO:
17778 +               {
17779 +                       dev_t rdev;     /* to keep gcc happy */
17780 +
17781 +                       assert("vs-46", fplug != NULL);
17782 +                       /* ugly hack with rdev: with no @data, reuse (and clear) inode->i_rdev */
17783 +                       if (data == NULL) {
17784 +                               rdev = inode->i_rdev;
17785 +                               inode->i_rdev = 0;
17786 +                       } else
17787 +                               rdev = data->rdev;
17788 +                       inode->i_blocks = 0;
17789 +                       assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17790 +                       inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17791 +                       /* initialize inode->i_fop and inode->i_rdev for block and char
17792 +                          devices */
17793 +                       init_special_inode(inode, inode->i_mode, rdev);
17794 +                       /* all address space operations are null */
17795 +                       inode->i_mapping->a_ops =
17796 +                           &file_plugins[fplug->h.id].as_ops;
17797 +                       break;
17798 +               }
17799 +       case S_IFLNK:
17800 +               assert("vs-46", fplug != NULL);
17801 +               assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17802 +               inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17803 +               inode->i_fop = NULL;
17804 +               /* all address space operations are null */
17805 +               inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17806 +               break;
17807 +       case S_IFDIR:
17808 +               assert("vs-46", dplug != NULL);
17809 +               assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17810 +                                dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17811 +               inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17812 +               inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17813 +               inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17814 +               break;
17815 +       case S_IFREG:
17816 +               assert("vs-46", fplug != NULL);
17817 +               assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17818 +                                fplug->h.id == CRC_FILE_PLUGIN_ID));
17819 +               inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17820 +               inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17821 +               inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17822 +               break;
17823 +       default:
17824 +               warning("nikita-291", "wrong file mode: %o for %llu",
17825 +                       inode->i_mode,
17826 +                       (unsigned long long)get_inode_oid(inode));
17827 +               reiser4_make_bad_inode(inode);
17828 +               return RETERR(-EINVAL);
17829 +       }
17830 +       return 0;
17831 +}
17832 +
17833 +/* initialize inode from the stat-data item at @coord. Called with inode locked.
17834 +    Return inode locked. Result is 0 or the first error encountered. */
17835 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17836 +                     coord_t * coord /* coord of stat data */ )
17837 +{
17838 +       int result;
17839 +       item_plugin *iplug;
17840 +       void *body;
17841 +       int length;
17842 +       reiser4_inode *state;
17843 +
17844 +       assert("nikita-292", coord != NULL);
17845 +       assert("nikita-293", inode != NULL);
17846 +
17847 +       coord_clear_iplug(coord);
17848 +       result = zload(coord->node);
17849 +       if (result)
17850 +               return result;
17851 +       iplug = item_plugin_by_coord(coord);
17852 +       body = item_body_by_coord(coord);
17853 +       length = item_length_by_coord(coord);
17854 +
17855 +       assert("nikita-295", iplug != NULL);
17856 +       assert("nikita-296", body != NULL);
17857 +       assert("nikita-297", length > 0);
17858 +
17859 +       /* inode is under I_LOCK now */
17860 +
17861 +       state = reiser4_inode_data(inode);
17862 +       /* call stat-data plugin method to load sd content into inode */
17863 +       result = iplug->s.sd.init_inode(inode, body, length);
17864 +       plugin_set_sd(&state->pset, iplug); /* done even if the sd method failed */
17865 +       if (result == 0) {
17866 +               result = setup_inode_ops(inode, NULL);
17867 +               if (result == 0 &&
17868 +                   inode->i_sb->s_root && inode->i_sb->s_root->d_inode) {
17869 +                       struct inode *root;
17870 +                       pset_member ind;
17871 +
17872 +                       /* take missing plugins from file-system defaults */
17873 +                       root = inode->i_sb->s_root->d_inode;
17874 +                       /* file and directory plugins are already initialized. */
17875 +                       for (ind = PSET_DIR + 1; ind < PSET_LAST; ++ind) {
17876 +                               result = grab_plugin(inode, root, ind);
17877 +                               if (result != 0)
17878 +                                       break;
17879 +                       }
17880 +                       if (result != 0) {
17881 +                               warning("nikita-3447",
17882 +                                       "Cannot set up plugins for %llu",
17883 +                                       (unsigned long long)
17884 +                                       get_inode_oid(inode));
17885 +                       }
17886 +               }
17887 +       }
17888 +       zrelse(coord->node);
17889 +       return result;
17890 +}
17891 +
17892 +/* read `inode' from the disk. This is what was previously in
17893 +   reiserfs_read_inode2().
17894 +
17895 +   Must be called with inode locked. Return inode still locked.
17896 +*/
17897 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17898 +                     const reiser4_key * key /* key of stat data */ ,
17899 +                     int silent)
17900 +{
17901 +       int result;
17902 +       lock_handle lh;
17903 +       reiser4_inode *info;
17904 +       coord_t coord;
17905 +
17906 +       assert("nikita-298", inode != NULL);
17907 +       assert("nikita-1945", !is_inode_loaded(inode));
17908 +
17909 +       info = reiser4_inode_data(inode);
17910 +       assert("nikita-300", info->locality_id != 0);
17911 +
17912 +       coord_init_zero(&coord);
17913 +       init_lh(&lh);
17914 +       /* locate stat-data in a tree and return znode locked */
17915 +       result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17916 +       assert("nikita-301", !is_inode_loaded(inode));
17917 +       if (result == 0) {
17918 +               /* use stat-data plugin to load sd into inode. */
17919 +               result = init_inode(inode, &coord);
17920 +               if (result == 0) {
17921 +                       /* initialize stat-data seal */
17922 +                       spin_lock_inode(inode);
17923 +                       seal_init(&info->sd_seal, &coord, key);
17924 +                       info->sd_coord = coord;
17925 +                       spin_unlock_inode(inode);
17926 +
17927 +                       /* call file plugin's method to initialize plugin
17928 +                        * specific part of inode */
17929 +                       if (inode_file_plugin(inode)->init_inode_data)
17930 +                               inode_file_plugin(inode)->init_inode_data(inode,
17931 +                                                                         NULL,
17932 +                                                                         0);
17933 +                       /* load detached directory cursors for stateless
17934 +                        * directory readers (NFS). */
17935 +                       load_cursors(inode);
17936 +
17937 +                       /* Check the opened inode for consistency. */
17938 +                       result =
17939 +                           get_super_private(inode->i_sb)->df_plug->
17940 +                           check_open(inode);
17941 +               }
17942 +       }
17943 +       /* lookup_sd() doesn't release coord because we want znode
17944 +          stay read-locked while stat-data fields are accessed in
17945 +          init_inode() */
17946 +       done_lh(&lh);
17947 +
17948 +       if (result != 0)
17949 +               reiser4_make_bad_inode(inode);
17950 +       return result;
17951 +}
17952 +
17953 +/* initialise new reiser4 inode being inserted into hash table. */
17954 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17955 +                            void *opaque       /* key of stat data passed to the
17956 +                                                * iget5_locked as cookie */ )
17957 +{
17958 +       reiser4_key *key;
17959 +
17960 +       assert("nikita-1995", inode != NULL);
17961 +       assert("nikita-1996", opaque != NULL);
17962 +       key = opaque;
17963 +       set_inode_oid(inode, get_key_objectid(key));
17964 +       reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17965 +       return 0;
17966 +}
17967 +
17968 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17969 +
17970 +   This function is called by iget5_locked() to distinguish reiser4 inodes
17971 +   having the same inode numbers. Such inodes can only exist due to some error
17972 +   condition. One of them should be bad. Inodes with identical inode numbers
17973 +   (objectids) are distinguished by their packing locality.
17974 +
17975 +*/
17976 +static int reiser4_inode_find_actor(struct inode *inode        /* inode from hash table to
17977 +                                                        * check */ ,
17978 +                                   void *opaque        /* "cookie" passed to
17979 +                                                        * iget5_locked(). This is stat data
17980 +                                                        * key */ )
17981 +{
17982 +       reiser4_key *key;
17983 +
17984 +       key = opaque;
17985 +       return
17986 +           /* oid is unique, so first term is enough, actually. */
17987 +           get_inode_oid(inode) == get_key_objectid(key) &&
17988 +           /*
17989 +            * also, locality should be checked, but locality is stored in
17990 +            * the reiser4-specific part of the inode, and actor can be
17991 +            * called against arbitrary inode that happened to be in this
17992 +            * hash chain. Hence we first have to check that this is
17993 +            * reiser4 inode at least. is_reiser4_inode() is probably too
17994 +            * early to call, as inode may have ->i_op not yet
17995 +            * initialised.
17996 +            */
17997 +           is_reiser4_super(inode->i_sb) &&
17998 +           /*
17999 +            * usually objectid is unique, but pseudo files use counter to
18000 +            * generate objectid. All pseudo files are placed into special
18001 +            * (otherwise unused) locality.
18002 +            */
18003 +           reiser4_inode_data(inode)->locality_id == get_key_locality(key);
18004 +}
18005 +
18006 +/* hook for kmem_cache_create */
18007 +void loading_init_once(reiser4_inode * info)
18008 +{
18009 +       sema_init(&info->loading, 1);
18010 +}
18011 +
18012 +/* for reiser4_alloc_inode */
18013 +void loading_alloc(reiser4_inode * info)
18014 +{
18015 +#if REISER4_DEBUG
18016 +       assert("vs-1717", down_trylock(&info->loading) == 0);
18017 +       up(&info->loading);
18018 +#endif
18019 +}
18020 +
18021 +/* for reiser4_destroy */
18022 +void loading_destroy(reiser4_inode * info)
18023 +{
18024 +#if REISER4_DEBUG
18025 +       assert("vs-1717", down_trylock(&info->loading) == 0);
18026 +       up(&info->loading);
18027 +#endif
18028 +}
18029 +
18030 +static void loading_down(reiser4_inode * info)
18031 +{
18032 +       down(&info->loading);
18033 +}
18034 +
18035 +static void loading_up(reiser4_inode * info)
18036 +{
18037 +       up(&info->loading);
18038 +}
18039 +
18040 +/**
18041 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
18042 + * @super: super block of filesystem
18043 + * @key: key of inode's stat-data
18044 + * @silent: forwarded to read_inode(), which hands it on to lookup_sd()
18045 + *
18046 + * This is our helper function a la iget(). It is called by
18047 + * reiser4_lookup() and reiser4_read_super(). Return inode locked or error
18048 + * encountered.
18049 + */
18050 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
18051 +                          int silent)
18052 +{
18053 +       struct inode *inode;
18054 +       int result;
18055 +       reiser4_inode *info;
18056 +
18057 +       assert("nikita-302", super != NULL);
18058 +       assert("nikita-303", key != NULL);
18059 +
18060 +       result = 0;
18061 +
18062 +       /* call iget(). Our ->read_inode() is dummy, so this will either
18063 +          find inode in cache or return uninitialised inode */
18064 +       inode = iget5_locked(super,
18065 +                            (unsigned long)get_key_objectid(key),
18066 +                            reiser4_inode_find_actor,
18067 +                            init_locked_inode, (reiser4_key *) key);
18068 +       if (inode == NULL)
18069 +               return ERR_PTR(RETERR(-ENOMEM));
18070 +       if (is_bad_inode(inode)) {
18071 +               warning("nikita-304", "Bad inode found");
18072 +               print_key("key", key);
18073 +               iput(inode);
18074 +               return ERR_PTR(RETERR(-EIO));
18075 +       }
18076 +
18077 +       info = reiser4_inode_data(inode);
18078 +
18079 +       /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
18080 +          loaded and initialized inode from just allocated inode. If
18081 +          REISER4_LOADED bit is not set, reiser4_iget() completes loading under
18082 +          info->loading.  The place in reiser4 which uses not initialized inode
18083 +          is the reiser4 repacker, see repacker-related functions in
18084 +          plugin/item/extent.c */
18085 +       if (!is_inode_loaded(inode)) {
18086 +               loading_down(info);
18087 +               if (!is_inode_loaded(inode)) {
18088 +                       /* locking: iget5_locked returns locked inode */
18089 +                       assert("nikita-1941", !is_inode_loaded(inode));
18090 +                       assert("nikita-1949",
18091 +                              reiser4_inode_find_actor(inode,
18092 +                                                       (reiser4_key *) key));
18093 +                       /* now, inode has objectid as ->i_ino and locality in
18094 +                          reiser4-specific part. This is enough for
18095 +                          read_inode() to read stat data from the disk */
18096 +                       result = read_inode(inode, key, silent);
18097 +               } else
18098 +                       loading_up(info);
18099 +       }
18100 +
18101 +       if (inode->i_state & I_NEW)
18102 +               unlock_new_inode(inode);
18103 +
18104 +       if (is_bad_inode(inode)) {
18105 +               assert("vs-1717", result != 0);
18106 +               loading_up(info);
18107 +               iput(inode);
18108 +               inode = ERR_PTR(result);
18109 +       } else if (REISER4_DEBUG) {
18110 +               reiser4_key found_key;
18111 +
18112 +               assert("vs-1717", result == 0);
18113 +               build_sd_key(inode, &found_key);
18114 +               if (!keyeq(&found_key, key)) {
18115 +                       warning("nikita-305", "Wrong key in sd");
18116 +                       print_key("sought for", key);
18117 +                       print_key("found", &found_key);
18118 +               }
18119 +               if (inode->i_nlink == 0) {
18120 +                       warning("nikita-3559", "Unlinked inode found: %llu\n",
18121 +                               (unsigned long long)get_inode_oid(inode));
18122 +               }
18123 +       }
18124 +       return inode;
18125 +}
18126 +
18127 +/* reiser4_iget() may return an inode that is not fully initialized; this
18128 + * function should be called once the caller has finished initializing it. */
18129 +void reiser4_iget_complete(struct inode *inode)
18130 +{
18131 +       assert("zam-988", is_reiser4_inode(inode));
18132 +
18133 +       if (!is_inode_loaded(inode)) {
18134 +               inode_set_flag(inode, REISER4_LOADED);
18135 +               loading_up(reiser4_inode_data(inode));
18136 +       }
18137 +}
18138 +
18139 +void reiser4_make_bad_inode(struct inode *inode)
18140 +{
18141 +       assert("nikita-1934", inode != NULL);
18142 +
18143 +       /* clear LOADED bit */
18144 +       inode_clr_flag(inode, REISER4_LOADED);
18145 +       make_bad_inode(inode);
18146 +       return;
18147 +}
18148 +
18149 +file_plugin *inode_file_plugin(const struct inode * inode)
18150 +{
18151 +       assert("nikita-1997", inode != NULL);
18152 +       return reiser4_inode_data(inode)->pset->file;
18153 +}
18154 +
18155 +dir_plugin *inode_dir_plugin(const struct inode * inode)
18156 +{
18157 +       assert("nikita-1998", inode != NULL);
18158 +       return reiser4_inode_data(inode)->pset->dir;
18159 +}
18160 +
18161 +#if 0
18162 +perm_plugin *inode_perm_plugin(const struct inode * inode)
18163 +{
18164 +       assert("nikita-1999", inode != NULL);
18165 +       return reiser4_inode_data(inode)->pset->perm;
18166 +}
18167 +#endif  /*  0  */
18168 +
18169 +formatting_plugin *inode_formatting_plugin(const struct inode * inode)
18170 +{
18171 +       assert("nikita-2000", inode != NULL);
18172 +       return reiser4_inode_data(inode)->pset->formatting;
18173 +}
18174 +
18175 +hash_plugin *inode_hash_plugin(const struct inode * inode)
18176 +{
18177 +       assert("nikita-2001", inode != NULL);
18178 +       return reiser4_inode_data(inode)->pset->hash;
18179 +}
18180 +
18181 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)
18182 +{
18183 +       assert("nikita-2001", inode != NULL);
18184 +       return reiser4_inode_data(inode)->pset->fibration;
18185 +}
18186 +
18187 +cipher_plugin *inode_cipher_plugin(const struct inode * inode)
18188 +{
18189 +       assert("edward-36", inode != NULL);
18190 +       return reiser4_inode_data(inode)->pset->cipher;
18191 +}
18192 +
18193 +compression_plugin *inode_compression_plugin(const struct inode * inode)
18194 +{
18195 +       assert("edward-37", inode != NULL);
18196 +       return reiser4_inode_data(inode)->pset->compression;
18197 +}
18198 +
18199 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
18200 +                                                      inode)
18201 +{
18202 +       assert("edward-1330", inode != NULL);
18203 +       return reiser4_inode_data(inode)->pset->compression_mode;
18204 +}
18205 +
18206 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)
18207 +{
18208 +       assert("edward-1328", inode != NULL);
18209 +       return reiser4_inode_data(inode)->pset->cluster;
18210 +}
18211 +
18212 +regular_plugin *inode_regular_plugin(const struct inode * inode)
18213 +{
18214 +       assert("edward-1329", inode != NULL);
18215 +       return reiser4_inode_data(inode)->pset->regular_entry;
18216 +}
18217 +
18218 +digest_plugin *inode_digest_plugin(const struct inode * inode)
18219 +{
18220 +       assert("edward-86", inode != NULL);
18221 +       return reiser4_inode_data(inode)->pset->digest;
18222 +}
18223 +
18224 +item_plugin *inode_sd_plugin(const struct inode * inode)
18225 +{
18226 +       assert("vs-534", inode != NULL);
18227 +       return reiser4_inode_data(inode)->pset->sd;
18228 +}
18229 +
18230 +item_plugin *inode_dir_item_plugin(const struct inode * inode)
18231 +{
18232 +       assert("vs-534", inode != NULL);
18233 +       return reiser4_inode_data(inode)->pset->dir_item;
18234 +}
18235 +
18236 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
18237 +{
18238 +       reiser4_inode *state;
18239 +
18240 +       assert("nikita-2716", inode != NULL);
18241 +       assert("nikita-2717", ext < LAST_SD_EXTENSION);
18242 +       assert("nikita-3491", spin_inode_is_locked(inode));
18243 +
18244 +       state = reiser4_inode_data(inode);
18245 +       state->extmask |= 1 << ext;
18246 +       /* force re-calculation of stat-data length on next call to
18247 +          update_sd(). */
18248 +       inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18249 +}
18250 +
18251 +void
18252 +inode_set_plugin(struct inode *inode, reiser4_plugin * plug, pset_member memb)
18253 +{
18254 +       assert("nikita-2718", inode != NULL);
18255 +       assert("nikita-2719", plug != NULL);
18256 +
18257 +       reiser4_inode_data(inode)->plugin_mask |= (1 << memb);
18258 +}
18259 +
18260 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
18261 +{
18262 +       assert("edward-1287", inode != NULL);
18263 +       if (!dscale_fit(old, new))
18264 +               inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18265 +       return;
18266 +}
18267 +
18268 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
18269 +{
18270 +       assert("nikita-2875", inode != NULL);
18271 +       spin_lock_inode(inode);
18272 +       inode_check_scale_nolock(inode, old, new);
18273 +       spin_unlock_inode(inode);
18274 +}
18275 +
18276 +/*
18277 + * initialize ->ordering field of inode. This field defines how file stat-data
18278 + * and body is ordered within a tree with respect to other objects within the
18279 + * same parent directory.
18280 + */
18281 +void
18282 +init_inode_ordering(struct inode *inode,
18283 +                   reiser4_object_create_data * crd, int create)
18284 +{
18285 +       reiser4_key key;
18286 +
18287 +       if (create) {
18288 +               struct inode *parent;
18289 +
18290 +               parent = crd->parent;
18291 +               assert("nikita-3224", inode_dir_plugin(parent) != NULL);
18292 +               inode_dir_plugin(parent)->build_entry_key(parent,
18293 +                                                         &crd->dentry->d_name,
18294 +                                                         &key);
18295 +       } else {
18296 +               coord_t *coord;
18297 +
18298 +               coord = &reiser4_inode_data(inode)->sd_coord;
18299 +               coord_clear_iplug(coord);
18300 +               /* safe to use ->sd_coord, because node is under long term
18301 +                * lock */
18302 +               WITH_DATA(coord->node, item_key_by_coord(coord, &key));
18303 +       }
18304 +
18305 +       set_inode_ordering(inode, get_key_ordering(&key));
18306 +}
18307 +
18308 +znode *inode_get_vroot(struct inode *inode)
18309 +{
18310 +       reiser4_block_nr blk;
18311 +       znode *result;
18312 +
18313 +       spin_lock_inode(inode);
18314 +       blk = reiser4_inode_data(inode)->vroot;
18315 +       spin_unlock_inode(inode);
18316 +       if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
18317 +               result = zlook(tree_by_inode(inode), &blk);
18318 +       else
18319 +               result = NULL;
18320 +       return result;
18321 +}
18322 +
18323 +void inode_set_vroot(struct inode *inode, znode *vroot)
18324 +{
18325 +       spin_lock_inode(inode);
18326 +       reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
18327 +       spin_unlock_inode(inode);
18328 +}
18329 +
18330 +#if REISER4_DEBUG
18331 +
18332 +void inode_invariant(const struct inode *inode)
18333 +{
18334 +       assert("nikita-3077", spin_inode_is_locked(inode));
18335 +}
18336 +
18337 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
18338 +{
18339 +       return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
18340 +               r4_inode->nr_jnodes == 0;
18341 +}
18342 +
18343 +#endif
18344 +
18345 +/* true if directory is empty (only contains dot and dotdot) */
18346 +/* FIXME: shouldn't it be dir plugin method? */
18347 +int is_dir_empty(const struct inode *dir)
18348 +{
18349 +       assert("nikita-1976", dir != NULL);
18350 +
18351 +       /* rely on our method to maintain directory i_size being equal to the
18352 +          number of entries. */
18353 +       return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18354 +}
18355 +
18356 +/* Make Linus happy.
18357 +   Local variables:
18358 +   c-indentation-style: "K&R"
18359 +   mode-name: "LC"
18360 +   c-basic-offset: 8
18361 +   tab-width: 8
18362 +   fill-column: 120
18363 +   End:
18364 +*/
18365 diff --git a/fs/reiser4/inode.h b/fs/reiser4/inode.h
18366 new file mode 100644
18367 index 0000000..88b155d
18368 --- /dev/null
18369 +++ b/fs/reiser4/inode.h
18370 @@ -0,0 +1,430 @@
18371 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
18372 +
18373 +/* Inode functions. */
18374 +
18375 +#if !defined( __REISER4_INODE_H__ )
18376 +#define __REISER4_INODE_H__
18377 +
18378 +#include "forward.h"
18379 +#include "debug.h"
18380 +#include "key.h"
18381 +#include "seal.h"
18382 +#include "plugin/plugin.h"
18383 +#include "plugin/file/cryptcompress.h"
18384 +#include "plugin/file/file.h"
18385 +#include "plugin/dir/dir.h"
18386 +#include "plugin/plugin_set.h"
18387 +#include "plugin/security/perm.h"
18388 +#include "vfs_ops.h"
18389 +#include "jnode.h"
18390 +#include "fsdata.h"
18391 +
18392 +#include <linux/types.h>       /* for __u?? , ino_t */
18393 +#include <linux/fs.h>          /* for struct super_block, struct
18394 +                                * rw_semaphore, etc  */
18395 +#include <linux/spinlock.h>
18396 +#include <asm/types.h>
18397 +
18398 +/* reiser4-specific inode flags. They are "transient" and are not
18399 +   supposed to be stored on disk. Used to trace "state" of
18400 +   inode
18401 +*/
18402 +typedef enum {
18403 +       /* this is light-weight inode, inheriting some state from its
18404 +          parent  */
18405 +       REISER4_LIGHT_WEIGHT = 0,
18406 +       /* stat data wasn't yet created */
18407 +       REISER4_NO_SD = 1,
18408 +       /* internal immutable flag. Currently is only used
18409 +          to avoid race condition during file creation.
18410 +          See comment in create_object(). */
18411 +       REISER4_IMMUTABLE = 2,
18412 +       /* inode was read from storage */
18413 +       REISER4_LOADED = 3,
18414 +       /* this bit is set for symlinks. inode->i_private points to target
18415 +          name of symlink. */
18416 +       REISER4_GENERIC_PTR_USED = 4,
18417 +       /* set if size of stat-data item for this inode is known. If this is
18418 +        * set we can avoid recalculating size of stat-data on each update. */
18419 +       REISER4_SDLEN_KNOWN = 5,
18420 +       /* reiser4_inode->crypt points to the crypto stat */
18421 +       REISER4_CRYPTO_STAT_LOADED = 6,
18422 +       /* cryptcompress_inode_data points to the secret key */
18423 +       REISER4_SECRET_KEY_INSTALLED = 7,
18424 +       /* File (possibly) has pages corresponding to the tail items, that
18425 +        * were created by ->readpage. It is set by mmap_unix_file() and
18426 +        * sendfile_unix_file(). This bit is inspected by write_unix_file and
18427 +        * kill-hook of tail items. It is never cleared once set. This bit is
18428 +        * modified and inspected under i_mutex. */
18429 +       REISER4_HAS_MMAP = 8,
18430 +
18431 +       REISER4_PART_MIXED = 9,
18432 +       REISER4_PART_IN_CONV = 10
18433 +} reiser4_file_plugin_flags;
18434 +
18435 +/* state associated with each inode.
18436 +   reiser4 inode.
18437 +
18438 +   NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
18439 +   be of the same size. File-system allocates inodes by itself through
18440 +   s_op->allocate_inode() method. So, it is possible to adjust size of inode
18441 +   at the time of its creation.
18442 +
18443 +   Invariants involving parts of this data-type:
18444 +
18445 +      [inode->eflushed]
18446 +
18447 +*/
18448 +
18449 +typedef struct reiser4_inode reiser4_inode;
18450 +/* return pointer to reiser4-specific part of inode */
18451 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18452 +                                               /* inode queried */ );
18453 +
18454 +#if BITS_PER_LONG == 64
18455 +
18456 +#define REISER4_INO_IS_OID (1)
18457 +typedef struct {;
18458 +} oid_hi_t;
18459 +
18460 +/* BITS_PER_LONG == 64 */
18461 +#else
18462 +
18463 +#define REISER4_INO_IS_OID (0)
18464 +typedef __u32 oid_hi_t;
18465 +
18466 +/* BITS_PER_LONG == 64 */
18467 +#endif
18468 +
18469 +struct reiser4_inode {
18470 +       /* spin lock protecting fields of this structure. */
18471 +       spinlock_t guard;
18472 +       /* object plugins */
18473 +       plugin_set *pset;
18474 +       /* plugins set for inheritance */
18475 +       plugin_set *hset;
18476 +       /* high 32 bits of object id */
18477 +       oid_hi_t oid_hi;
18478 +       /* seal for stat-data */
18479 +       seal_t sd_seal;
18480 +       /* locality id for this file */
18481 +       oid_t locality_id;
18482 +#if REISER4_LARGE_KEY
18483 +       __u64 ordering;
18484 +#endif
18485 +       /* coord of stat-data in sealed node */
18486 +       coord_t sd_coord;
18487 +       /* bit-mask of stat-data extensions used by this file */
18488 +       __u64 extmask;
18489 +       /* bitmask of non-default plugins for this inode */
18490 +       __u16 plugin_mask;
18491 +       union {
18492 +               struct list_head readdir_list;
18493 +               struct list_head not_used;
18494 +       } lists;
18495 +       /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18496 +       unsigned long flags;
18497 +       union {
18498 +               /* fields specific to unix_file plugin */
18499 +               unix_file_info_t unix_file_info;
18500 +               /* fields specific to cryptcompress plugin */
18501 +               cryptcompress_info_t cryptcompress_info;
18502 +       } file_plugin_data;
18503 +
18504 +       /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18505 +          tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18506 +       struct radix_tree_root jnodes_tree;
18507 +#if REISER4_DEBUG
18508 +       /* number of unformatted node jnodes of this file in jnode hash table */
18509 +       unsigned long nr_jnodes;
18510 +#endif
18511 +
18512 +       /* block number of virtual root for this object. See comment above
18513 +        * fs/reiser4/search.c:handle_vroot() */
18514 +       reiser4_block_nr vroot;
18515 +       struct semaphore loading;
18516 +};
18517 +
18518 +void loading_init_once(reiser4_inode *);
18519 +void loading_alloc(reiser4_inode *);
18520 +void loading_destroy(reiser4_inode *);
18521 +
18522 +typedef struct reiser4_inode_object {
18523 +       /* private part */
18524 +       reiser4_inode p;
18525 +       /* generic fields not specific to reiser4, but used by VFS */
18526 +       struct inode vfs_inode;
18527 +} reiser4_inode_object;
18528 +
18529 +/* return pointer to the reiser4 specific portion of @inode */
18530 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18531 +                                               /* inode queried */ )
18532 +{
18533 +       assert("nikita-254", inode != NULL);
18534 +       return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
18535 +}
18536 +
18537 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18538 +                                                  r4_inode /* inode queried */
18539 +                                                  )
18540 +{
18541 +       return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
18542 +}
18543 +
18544 +/*
18545 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18546 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18547 + * bits.
18548 + *
18549 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18550 + * of inode, otherwise whole oid is stored in i_ino.
18551 + *
18552 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18553 + */
18554 +
18555 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18556 +
18557 +#if REISER4_INO_IS_OID
18558 +
18559 +static inline oid_t get_inode_oid(const struct inode *inode)
18560 +{
18561 +       return inode->i_ino;
18562 +}
18563 +
18564 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18565 +{
18566 +       inode->i_ino = oid;
18567 +}
18568 +
18569 +/* REISER4_INO_IS_OID */
18570 +#else
18571 +
18572 +static inline oid_t get_inode_oid(const struct inode *inode)
18573 +{
18574 +       return
18575 +           ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18576 +           inode->i_ino;
18577 +}
18578 +
18579 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18580 +{
18581 +       assert("nikita-2519", inode != NULL);
18582 +       inode->i_ino = (ino_t) (oid);
18583 +       reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18584 +       assert("nikita-2521", get_inode_oid(inode) == (oid));
18585 +}
18586 +
18587 +/* REISER4_INO_IS_OID */
18588 +#endif
18589 +
18590 +static inline oid_t get_inode_locality(const struct inode *inode)
18591 +{
18592 +       return reiser4_inode_data(inode)->locality_id;
18593 +}
18594 +
18595 +#if REISER4_LARGE_KEY
18596 +static inline __u64 get_inode_ordering(const struct inode *inode)
18597 +{
18598 +       return reiser4_inode_data(inode)->ordering;
18599 +}
18600 +
18601 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18602 +{
18603 +       reiser4_inode_data(inode)->ordering = ordering;
18604 +}
18605 +
18606 +#else
18607 +
18608 +#define get_inode_ordering(inode) (0)
18609 +#define set_inode_ordering(inode, val) noop
18610 +
18611 +#endif
18612 +
18613 +/* return inode in which @uf_info is embedded */
18614 +static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18615 +                                                   uf_info)
18616 +{
18617 +       return &container_of(uf_info, reiser4_inode_object,
18618 +                            p.file_plugin_data.unix_file_info)->vfs_inode;
18619 +}
18620 +
18621 +
18622 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18623 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18624 +
18625 +extern reiser4_tree *tree_by_inode(const struct inode *inode);
18626 +
18627 +#if REISER4_DEBUG
18628 +extern void inode_invariant(const struct inode *inode);
18629 +extern int inode_has_no_jnodes(reiser4_inode *);
18630 +#else
18631 +#define inode_invariant(inode) noop
18632 +#endif
18633 +
18634 +static inline int spin_inode_is_locked(const struct inode *inode)
18635 +{
18636 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
18637 +       return 1;
18638 +}
18639 +
18640 +/**
18641 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18642 + * @inode: inode to lock
18643 + *
18644 + * In debug mode it checks that lower priority locks are not held and
18645 + * increments reiser4_context's lock counters on which lock ordering checking
18646 + * is based.
18647 + */
18648 +static inline void spin_lock_inode(struct inode *inode)
18649 +{
18650 +       assert("", LOCK_CNT_NIL(spin_locked));
18651 +       /* check lock ordering */
18652 +       assert_spin_not_locked(&d_lock);
18653 +
18654 +       spin_lock(&reiser4_inode_data(inode)->guard);
18655 +
18656 +       LOCK_CNT_INC(spin_locked_inode);
18657 +       LOCK_CNT_INC(spin_locked);
18658 +
18659 +       inode_invariant(inode);
18660 +}
18661 +
18662 +/**
18663 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18664 + * @inode: inode to unlock
18665 + *
18666 + * In debug mode it checks that spinlock is held and decrements
18667 + * reiser4_context's lock counters on which lock ordering checking is based.
18668 + */
18669 +static inline void spin_unlock_inode(struct inode *inode)
18670 +{
18671 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
18672 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18673 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18674 +
18675 +       inode_invariant(inode);
18676 +
18677 +       LOCK_CNT_DEC(spin_locked_inode);
18678 +       LOCK_CNT_DEC(spin_locked);
18679 +
18680 +       spin_unlock(&reiser4_inode_data(inode)->guard);
18681 +}
18682 +
18683 +
18684 +extern znode *inode_get_vroot(struct inode *inode);
18685 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18686 +
18687 +extern int reiser4_max_filename_len(const struct inode *inode);
18688 +extern int max_hash_collisions(const struct inode *dir);
18689 +extern void reiser4_unlock_inode(struct inode *inode);
18690 +extern int is_reiser4_inode(const struct inode *inode);
18691 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18692 +extern struct inode *reiser4_iget(struct super_block *super,
18693 +                                 const reiser4_key * key, int silent);
18694 +extern void reiser4_iget_complete(struct inode *inode);
18695 +extern void inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18696 +extern void inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18697 +extern int inode_get_flag(const struct inode *inode,
18698 +                         reiser4_file_plugin_flags f);
18699 +
18700 +/*  has inode been initialized? */
18701 +static inline int
18702 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18703 +{
18704 +       assert("nikita-1120", inode != NULL);
18705 +       return inode_get_flag(inode, REISER4_LOADED);
18706 +}
18707 +
18708 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18709 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18710 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18711 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18712 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18713 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18714 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18715 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18716 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18717 +                                                             *inode);
18718 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18719 +extern regular_plugin *inode_regular_plugin(const struct inode *inode);
18720 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18721 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18722 +
18723 +extern void inode_set_plugin(struct inode *inode,
18724 +                            reiser4_plugin * plug, pset_member memb);
18725 +extern void reiser4_make_bad_inode(struct inode *inode);
18726 +
18727 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18728 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18729 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18730 +
18731 +/*
18732 + * update field @field in inode @i to contain value @value.
18733 + */
18734 +#define INODE_SET_FIELD(i, field, value)               \
18735 +({                                                     \
18736 +       struct inode *__i;                              \
18737 +       typeof(value) __v;                              \
18738 +                                                       \
18739 +       __i = (i);                                      \
18740 +       __v = (value);                                  \
18741 +       inode_check_scale(__i, __i->field, __v);        \
18742 +       __i->field = __v;                               \
18743 +})
18744 +
18745 +#define INODE_INC_FIELD(i, field)                              \
18746 +({                                                             \
18747 +       struct inode *__i;                                      \
18748 +                                                               \
18749 +       __i = (i);                                              \
18750 +       inode_check_scale(__i, __i->field, __i->field + 1);     \
18751 +       ++ __i->field;                                          \
18752 +})
18753 +
18754 +#define INODE_DEC_FIELD(i, field)                              \
18755 +({                                                             \
18756 +       struct inode *__i;                                      \
18757 +                                                               \
18758 +       __i = (i);                                              \
18759 +       inode_check_scale(__i, __i->field, __i->field - 1);     \
18760 +       -- __i->field;                                          \
18761 +})
18762 +
18763 +/* See comment before readdir_common() for description. */
18764 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18765 +{
18766 +       return &reiser4_inode_data(inode)->lists.readdir_list;
18767 +}
18768 +
18769 +extern void init_inode_ordering(struct inode *inode,
18770 +                               reiser4_object_create_data * crd, int create);
18771 +
18772 +static inline struct radix_tree_root *
18773 +jnode_tree_by_reiser4_inode(reiser4_inode *r4_inode)
18774 +{
18775 +       return &r4_inode->jnodes_tree;
18776 +}
18777 +
18778 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18779 +{
18780 +       return jnode_tree_by_reiser4_inode(reiser4_inode_data(inode));
18781 +}
18782 +
18783 +#if REISER4_DEBUG
18784 +extern void print_inode(const char *prefix, const struct inode *i);
18785 +#endif
18786 +
18787 +int is_dir_empty(const struct inode *);
18788 +
18789 +/* __REISER4_INODE_H__ */
18790 +#endif
18791 +
18792 +/* Make Linus happy.
18793 +   Local variables:
18794 +   c-indentation-style: "K&R"
18795 +   mode-name: "LC"
18796 +   c-basic-offset: 8
18797 +   tab-width: 8
18798 +   fill-column: 120
18799 +   End:
18800 +*/
18801 diff --git a/fs/reiser4/ioctl.h b/fs/reiser4/ioctl.h
18802 new file mode 100644
18803 index 0000000..4d57737
18804 --- /dev/null
18805 +++ b/fs/reiser4/ioctl.h
18806 @@ -0,0 +1,41 @@
18807 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18808 + * reiser4/README */
18809 +
18810 +#if !defined( __REISER4_IOCTL_H__ )
18811 +#define __REISER4_IOCTL_H__
18812 +
18813 +#include <linux/fs.h>
18814 +
18815 +/*
18816 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18817 + * extents and fix in this state. This is used by applications that rely on
18818 + *
18819 + *     . files being block aligned, and
18820 + *
18821 + *     . files never migrating on disk
18822 + *
18823 + * for example, boot loaders (LILO) need this.
18824 + *
18825 + * This ioctl should be used as
18826 + *
18827 + *     result = ioctl(fd, REISER4_IOC_UNPACK);
18828 + *
18829 + * File behind fd descriptor will be converted to the extents (if necessary),
18830 + * and its stat-data will be updated so that it will never be converted back
18831 + * into tails again.
18832 + */
18833 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18834 +
18835 +/* __REISER4_IOCTL_H__ */
18836 +#endif
18837 +
18838 +/* Make Linus happy.
18839 +   Local variables:
18840 +   c-indentation-style: "K&R"
18841 +   mode-name: "LC"
18842 +   c-basic-offset: 8
18843 +   tab-width: 8
18844 +   fill-column: 120
18845 +   scroll-step: 1
18846 +   End:
18847 +*/
18848 diff --git a/fs/reiser4/jnode.c b/fs/reiser4/jnode.c
18849 new file mode 100644
18850 index 0000000..8e4c026
18851 --- /dev/null
18852 +++ b/fs/reiser4/jnode.c
18853 @@ -0,0 +1,1922 @@
18854 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18855 + * reiser4/README */
18856 +/* Jnode manipulation functions. */
18857 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18858 +
18859 +   In particular, jnodes are used to track transactional information
18860 +   associated with each block. Each znode contains jnode as ->zjnode field.
18861 +
18862 +   Jnode stands for either Josh or Journal node.
18863 +*/
18864 +
18865 +/*
18866 + * Taxonomy.
18867 + *
18868 + *     Jnode represents block containing data or meta-data. There are jnodes
18869 + *     for:
18870 + *
18871 + *         unformatted blocks (jnodes proper). There are plans, however to
18872 + *         have a handle per extent unit rather than per each unformatted
18873 + *         block, because there are so many of them.
18874 + *
18875 + *         For bitmaps. Each bitmap is actually represented by two jnodes--one
18876 + *         for working and another for "commit" data, together forming bnode.
18877 + *
18878 + *         For io-heads. These are used by log writer.
18879 + *
18880 + *         For formatted nodes (znode). See comment at the top of znode.c for
18881 + *         details specific to the formatted nodes (znodes).
18882 + *
18883 + * Node data.
18884 + *
18885 + *     Jnode provides access to the data of node it represents. Data are
18886 + *     stored in a page. Page is kept in a page cache. This means, that jnodes
18887 + *     are highly interconnected with page cache and VM internals.
18888 + *
18889 + *     jnode has a pointer to page (->pg) containing its data. Pointer to data
18890 + *     themselves is cached in ->data field to avoid frequent calls to
18891 + *     page_address().
18892 + *
18893 + *     jnode and page are attached to each other by jnode_attach_page(). This
18894 + *     function places pointer to jnode in set_page_private(), sets PG_private
18895 + *     flag and increments page counter.
18896 + *
18897 + *     Opposite operation is performed by page_clear_jnode().
18898 + *
18899 + *     jnode->pg is protected by jnode spin lock, and page->private is
18900 + *     protected by page lock. See comment at the top of page_cache.c for
18901 + *     more.
18902 + *
18903 + *     page can be detached from jnode for two reasons:
18904 + *
18905 + *         . jnode is removed from a tree (file is truncated, or formatted
18906 + *         node is removed by balancing).
18907 + *
18908 + *         . during memory pressure, VM calls ->releasepage() method
18909 + *         (reiser4_releasepage()) to evict page from memory.
18910 + *
18911 + *    (there, of course, is also umount, but this is special case we are not
18912 + *    concerned with here).
18913 + *
18914 + *    To protect jnode page from eviction, one calls jload() function that
18915 + *    "pins" page in memory (loading it if necessary), increments
18916 + *    jnode->d_count, and kmap()s page. Page is unpinned through call to
18917 + *    jrelse().
18918 + *
18919 + * Jnode life cycle.
18920 + *
18921 + *    jnode is created, placed in hash table, and, optionally, in per-inode
18922 + *    radix tree. Page can be attached to jnode, pinned, released, etc.
18923 + *
18924 + *    When jnode is captured into atom its reference counter is
18925 + *    increased. While being part of an atom, jnode can be "early
18926 + *    flushed". This means that as part of flush procedure, jnode is placed
18927 + *    into "relocate set", and its page is submitted to the disk. After io
18928 + *    completes, page can be detached, then loaded again, re-dirtied, etc.
18929 + *
18930 + *    Thread acquires reference to jnode by calling jref() and releases it by
18931 + *    jput(). When last reference is removed, jnode is still retained in
18932 + *    memory (cached) if it has page attached, _unless_ it is scheduled for
18933 + *    destruction (has JNODE_HEARD_BANSHEE bit set).
18934 + *
18935 + *    Tree read-write lock was used as "existential" lock for jnodes. That is,
18936 + *    jnode->x_count could be changed from 0 to 1 only under tree write lock,
18937 + *    that is, tree lock protected unreferenced jnodes stored in the hash
18938 + *    table, from recycling.
18939 + *
18940 + *    This resulted in high contention on tree lock, because jref()/jput() is
18941 + *    frequent operation. To ameliorate this problem, RCU is used: when jput()
18942 + *    is just about to release last reference on jnode it sets JNODE_RIP bit
18943 + *    on it, and then proceed with jnode destruction (removing jnode from hash
18944 + *    table, cbk_cache, detaching page, etc.). All places that change jnode
18945 + *    reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18946 + *    cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18947 + *    jnode_rip_check() function), and pretend that nothing was found in hash
18948 + *    table if bit is set.
18949 + *
18950 + *    jput defers actual return of jnode into slab cache to some later time
18951 + *    (by call_rcu()), this guarantees that other threads can safely continue
18952 + *    working with JNODE_RIP-ped jnode.
18953 + *
18954 + */
18955 +
18956 +#include "reiser4.h"
18957 +#include "debug.h"
18958 +#include "dformat.h"
18959 +#include "jnode.h"
18960 +#include "plugin/plugin_header.h"
18961 +#include "plugin/plugin.h"
18962 +#include "txnmgr.h"
18963 +/*#include "jnode.h"*/
18964 +#include "znode.h"
18965 +#include "tree.h"
18966 +#include "tree_walk.h"
18967 +#include "super.h"
18968 +#include "inode.h"
18969 +#include "page_cache.h"
18970 +
18971 +#include <asm/uaccess.h>       /* UML needs this for PAGE_OFFSET */
18972 +#include <linux/types.h>
18973 +#include <linux/slab.h>
18974 +#include <linux/pagemap.h>
18975 +#include <linux/vmalloc.h>     /* for vmalloc(), vfree() */
18976 +#include <linux/swap.h>
18977 +#include <linux/fs.h>          /* for struct address_space  */
18978 +#include <linux/writeback.h>   /* for inode_lock */
18979 +
18980 +static kmem_cache_t *_jnode_slab = NULL;
18981 +
18982 +static void jnode_set_type(jnode * node, jnode_type type);
18983 +static int jdelete(jnode * node);
18984 +static int jnode_try_drop(jnode * node);
18985 +
18986 +#if REISER4_DEBUG
18987 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18988 +#endif
18989 +
18990 +/* true if valid page is attached to jnode */
18991 +static inline int jnode_is_parsed(jnode * node)
18992 +{
18993 +       return JF_ISSET(node, JNODE_PARSED);
18994 +}
18995 +
18996 +/* hash table support */
18997 +
18998 +/* Equality callback for hash-table macros: two jnode keys are equal when both
18999 +   ->index and ->objectid match (->mapping is not compared). */
19000 +static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
19001 +{
19002 +       assert("nikita-2350", k1 != NULL);
19003 +       assert("nikita-2351", k2 != NULL);
19004 +       return !(k1->index != k2->index || k1->objectid != k2->objectid);
19005 +}
19006 +
19007 +/* Hash jnode by its key (objectid plus index). Used by hash-table macros */
19008 +static inline __u32
19009 +jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
19010 +{
19011 +       assert("nikita-2352", key != NULL);
19012 +       assert("nikita-3346", IS_POW(table->_buckets));
19013 +
19014 +       /* yes, this is a remarkably simple (if not stupid) hash function. */
19015 +       return (key->objectid + key->index) & (table->_buckets - 1);
19016 +}
19017 +
19018 +/* The hash table definition */
19019 +#define KMALLOC(size) vmalloc(size)
19020 +#define KFREE(ptr, size) vfree(ptr)
19021 +TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
19022 +                     jnode_key_eq);
19023 +#undef KFREE
19024 +#undef KMALLOC
19025 +
19026 +/* call this to initialise jnode hash table */
19027 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
19028 +{
19029 +       assert("nikita-2359", tree != NULL);
19030 +       return j_hash_init(&tree->jhash_table, 16384);
19031 +}
19032 +
19033 +/* Destroy jnode hash table of @tree: jdrop() every jnode still hashed, then
19034 +   release the table. Called during umount. Always returns 0. */
19035 +int jnodes_tree_done(reiser4_tree * tree)
19036 +{
19037 +       j_hash_table *htable;
19038 +       jnode *cur;
19039 +       jnode *tmp;
19040 +
19041 +       assert("nikita-2360", tree != NULL);
19042 +
19043 +       htable = &tree->jhash_table;
19044 +       /* ->_table is not set: nothing to scan or release */
19045 +       if (!htable->_table)
19046 +               return 0;
19047 +
19048 +       for_all_in_htable(htable, j, cur, tmp) {
19049 +               assert("nikita-2361", !atomic_read(&cur->x_count));
19050 +               jdrop(cur);
19051 +       }
19052 +
19053 +       j_hash_done(htable);
19054 +       return 0;
19055 +}
19056 +
19057 +/**
19058 + * init_jnodes - create jnode cache
19059 + *
19060 + * Initializes slab cache of jnodes. It is part of reiser4 module
19061 + * initialization.
19062 + *
19063 + * Returns 0 on success, RETERR(-ENOMEM) if the cache cannot be created.
19064 + */
19065 +int init_jnodes(void)
19066 +{
19067 +       assert("umka-168", _jnode_slab == NULL);
19068 +
19069 +       _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
19070 +                                       SLAB_HWCACHE_ALIGN |
19071 +                                       SLAB_RECLAIM_ACCOUNT, NULL, NULL);
19072 +       return _jnode_slab != NULL ? 0 : RETERR(-ENOMEM);
19073 +}
19074 +
19075 +/**
19076 + * done_jnodes - delete jnode cache
19077 + *
19078 + * This is called on reiser4 module unloading or system shutdown.
19079 + */
19080 +void done_jnodes(void)
19081 +{
19082 +       destroy_reiser4_cache(&_jnode_slab);
19083 +}
19084 +
19085 +/* Initialize a jnode. */
19086 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
19087 +{
19088 +       assert("umka-175", node != NULL);
19089 +
19090 +       memset(node, 0, sizeof(jnode));
19091 +       ON_DEBUG(node->magic = JMAGIC);
19092 +       jnode_set_type(node, type);
19093 +       atomic_set(&node->d_count, 0);
19094 +       atomic_set(&node->x_count, 0);
19095 +       spin_lock_init(&node->guard);
19096 +       spin_lock_init(&node->load);
19097 +       node->atom = NULL;
19098 +       node->tree = tree;
19099 +       INIT_LIST_HEAD(&node->capture_link);
19100 +
19101 +       ASSIGN_NODE_LIST(node, NOT_CAPTURED);
19102 +
19103 +       INIT_RCU_HEAD(&node->rcu);
19104 +
19105 +#if REISER4_DEBUG
19106 +       {
19107 +               reiser4_super_info_data *sbinfo;
19108 +
19109 +               sbinfo = get_super_private(tree->super);
19110 +               spin_lock_irq(&sbinfo->all_guard);
19111 +               list_add(&node->jnodes, &sbinfo->all_jnodes);
19112 +               spin_unlock_irq(&sbinfo->all_guard);
19113 +       }
19114 +#endif
19115 +}
19116 +
19117 +#if REISER4_DEBUG
19118 +/*
19119 + * Remove jnode from ->all_jnodes list.
19120 + */
19121 +static void jnode_done(jnode * node, reiser4_tree * tree)
19122 +{
19123 +       reiser4_super_info_data *sbinfo;
19124 +
19125 +       sbinfo = get_super_private(tree->super);
19126 +
19127 +       spin_lock_irq(&sbinfo->all_guard);
19128 +       assert("nikita-2422", !list_empty(&node->jnodes));
19129 +       list_del_init(&node->jnodes);
19130 +       spin_unlock_irq(&sbinfo->all_guard);
19131 +}
19132 +#endif
19133 +
19134 +/* return already existing jnode of page */
19135 +jnode *jnode_by_page(struct page *pg)
19136 +{
19137 +       assert("nikita-2066", pg != NULL);
19138 +       assert("nikita-2400", PageLocked(pg));
19139 +       assert("nikita-2068", PagePrivate(pg));
19140 +       assert("nikita-2067", jprivate(pg) != NULL);
19141 +       return jprivate(pg);
19142 +}
19143 +
19144 +/* Exported allocator: get a jnode from the jnode slab cache with the
19145 +   get_gfp_mask() flags. Result of kmem_cache_alloc() is returned as is. */
19146 +jnode *jalloc(void)
19147 +{
19148 +       return kmem_cache_alloc(_jnode_slab, get_gfp_mask());
19149 +}
19150 +
19151 +/* return jnode back to the slab allocator */
19152 +inline void jfree(jnode * node)
19153 +{
19154 +       assert("zam-449", node != NULL);
19155 +
19156 +       assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
19157 +                              NODE_LIST(node) == NOT_CAPTURED));
19158 +       assert("nikita-3222", list_empty(&node->jnodes));
19159 +       assert("nikita-3221", jnode_page(node) == NULL);
19160 +
19161 +       /* not yet phash_jnode_destroy(node); */
19162 +
19163 +       kmem_cache_free(_jnode_slab, node);
19164 +}
19165 +
19166 +/*
19167 + * RCU callback posted by jnode_free(). Recovers the jnode from its embedded
19168 + * rcu_head and hands it to the deallocator matching its type:
19169 + *
19170 + *     . io-head, bitmap and unformatted jnodes are passed to jfree();
19171 + *
19172 + *     . formatted jnodes are passed to zfree() through JZNODE();
19173 + *
19174 + *     . any other type, JNODE_INODE included, ends in wrong_return_value().
19175 + */
19176 +static void jnode_free_actor(struct rcu_head *head)
19177 +{
19178 +       jnode *victim = container_of(head, jnode, rcu);
19179 +       jnode_type kind = jnode_get_type(victim);
19180 +
19181 +       /* debug builds only: take jnode off ->all_jnodes list first */
19182 +       ON_DEBUG(jnode_done(victim, jnode_get_tree(victim)));
19183 +
19184 +       if (kind == JNODE_IO_HEAD || kind == JNODE_BITMAP ||
19185 +           kind == JNODE_UNFORMATTED_BLOCK) {
19186 +               jfree(victim);
19187 +       } else if (kind == JNODE_FORMATTED_BLOCK) {
19188 +               zfree(JZNODE(victim));
19189 +       } else {
19190 +               /* jnode_free() never posts JNODE_INODE jnodes to RCU */
19191 +               wrong_return_value("nikita-3197", "Wrong jnode type");
19192 +       }
19193 +}
19194 +
19195 +/*
19196 + * Free a jnode. JNODE_INODE jnodes are only passed to jnode_list_remove();
19197 + * any other jnode is freed later by jnode_free_actor(), posted through RCU.
19198 + */
19199 +static inline void jnode_free(jnode * node, jnode_type jtype)
19200 +{
19201 +       if (jtype == JNODE_INODE) {
19202 +               jnode_list_remove(node);
19203 +               return;
19204 +       }
19205 +       call_rcu(&node->rcu, jnode_free_actor);
19206 +}
19207 +
19208 +/* allocate new unformatted jnode */
19209 +static jnode *jnew_unformatted(void)
19210 +{
19211 +       jnode *jal;
19212 +
19213 +       jal = jalloc();
19214 +       if (jal == NULL)
19215 +               return NULL;
19216 +
19217 +       jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
19218 +       jal->key.j.mapping = NULL;
19219 +       jal->key.j.index = (unsigned long)-1;
19220 +       jal->key.j.objectid = 0;
19221 +       return jal;
19222 +}
19223 +
19224 +/* look for jnode with given objectid and index within hash table */
19225 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
19226 +{
19227 +       jnode_key_t jkey;
19228 +       jnode *node;
19229 +
19230 +       assert("nikita-2353", tree != NULL);
19231 +
19232 +       jkey.objectid = objectid;
19233 +       jkey.index = index;
19234 +
19235 +       /*
19236 +        * hash table is _not_ protected by any lock during lookups. All we
19237 +        * have to do is to disable preemption to keep RCU happy.
19238 +        */
19239 +
19240 +       rcu_read_lock();
19241 +       node = j_hash_find(&tree->jhash_table, &jkey);
19242 +       if (node != NULL) {
19243 +               /* protect @node from recycling */
19244 +               jref(node);
19245 +               assert("nikita-2955", jnode_invariant(node, 0, 0));
19246 +               node = jnode_rip_check(tree, node);
19247 +       }
19248 +       rcu_read_unlock();
19249 +       return node;
19250 +}
19251 +
19252 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
19253 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
19254 +{
19255 +       assert("vs-1694", mapping->host != NULL);
19256 +
19257 +       return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
19258 +}
19259 +
19260 +/* find jnode in inode's radix tree under tree read lock and jref() it */
19261 +jnode *jfind(struct address_space * mapping, unsigned long index)
19262 +{
19263 +       reiser4_tree *tree;
19264 +       jnode *found;
19265 +
19266 +       assert("vs-1694", mapping->host != NULL);
19267 +       tree = tree_by_inode(mapping->host);
19268 +       read_lock_tree(tree);
19269 +       found = jfind_nolock(mapping, index);
19270 +       if (found != NULL)
19271 +               jref(found);
19272 +       read_unlock_tree(tree);
19273 +       return found;
19274 +}
19275 +
19276 +static void inode_attach_jnode(jnode * node)
19277 +{
19278 +       struct inode *inode;
19279 +       reiser4_inode *info;
19280 +       struct radix_tree_root *rtree;
19281 +
19282 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19283 +       assert("zam-1043", node->key.j.mapping != NULL);
19284 +       inode = node->key.j.mapping->host;
19285 +       info = reiser4_inode_data(inode);
19286 +       rtree = jnode_tree_by_reiser4_inode(info);
19287 +       if (rtree->rnode == NULL) {
19288 +               /* prevent inode from being pruned when it has jnodes attached
19289 +                  to it */
19290 +               write_lock_irq(&inode->i_data.tree_lock);
19291 +               inode->i_data.nrpages++;
19292 +               write_unlock_irq(&inode->i_data.tree_lock);
19293 +       }
19294 +       assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
19295 +       check_me("zam-1045",
19296 +                !radix_tree_insert(rtree, node->key.j.index, node));
19297 +       ON_DEBUG(info->nr_jnodes++);
19298 +}
19299 +
19300 +static void inode_detach_jnode(jnode * node)
19301 +{
19302 +       struct inode *inode;
19303 +       reiser4_inode *info;
19304 +       struct radix_tree_root *rtree;
19305 +
19306 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19307 +       assert("zam-1044", node->key.j.mapping != NULL);
19308 +       inode = node->key.j.mapping->host;
19309 +       info = reiser4_inode_data(inode);
19310 +       rtree = jnode_tree_by_reiser4_inode(info);
19311 +
19312 +       assert("zam-1051", info->nr_jnodes != 0);
19313 +       assert("zam-1052", rtree->rnode != NULL);
19314 +       ON_DEBUG(info->nr_jnodes--);
19315 +
19316 +       /* delete jnode from inode's radix tree of jnodes */
19317 +       check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
19318 +       if (rtree->rnode == NULL) {
19319 +               /* inode can be pruned now */
19320 +               write_lock_irq(&inode->i_data.tree_lock);
19321 +               inode->i_data.nrpages--;
19322 +               write_unlock_irq(&inode->i_data.tree_lock);
19323 +       }
19324 +}
19325 +
19326 +/* put jnode into hash table (where they can be found by flush who does not know
19327 +   mapping) and to inode's tree of jnodes (where they can be found (hopefully
19328 +   faster) in places where mapping is known). Currently it is used by
19329 +   fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
19330 +   created */
19331 +static void
19332 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
19333 +                      unsigned long index)
19334 +{
19335 +       j_hash_table *jtable;
19336 +
19337 +       assert("vs-1446", jnode_is_unformatted(node));
19338 +       assert("vs-1442", node->key.j.mapping == 0);
19339 +       assert("vs-1443", node->key.j.objectid == 0);
19340 +       assert("vs-1444", node->key.j.index == (unsigned long)-1);
19341 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19342 +
19343 +       node->key.j.mapping = mapping;
19344 +       node->key.j.objectid = get_inode_oid(mapping->host);
19345 +       node->key.j.index = index;
19346 +
19347 +       jtable = &jnode_get_tree(node)->jhash_table;
19348 +
19349 +       /* race with some other thread inserting jnode into the hash table is
19350 +        * impossible, because we keep the page lock. */
19351 +       /*
19352 +        * following assertion no longer holds because of RCU: it is possible
19353 +        * jnode is in the hash table, but with JNODE_RIP bit set.
19354 +        */
19355 +       /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19356 +       j_hash_insert_rcu(jtable, node);
19357 +       inode_attach_jnode(node);
19358 +}
19359 +
19360 +static void unhash_unformatted_node_nolock(jnode * node)
19361 +{
19362 +       assert("vs-1683", node->key.j.mapping != NULL);
19363 +       assert("vs-1684",
19364 +              node->key.j.objectid ==
19365 +              get_inode_oid(node->key.j.mapping->host));
19366 +
19367 +       /* remove jnode from hash-table */
19368 +       j_hash_remove_rcu(&node->tree->jhash_table, node);
19369 +       inode_detach_jnode(node);
19370 +       node->key.j.mapping = NULL;
19371 +       node->key.j.index = (unsigned long)-1;
19372 +       node->key.j.objectid = 0;
19373 +
19374 +}
19375 +
19376 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19377 +   reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19378 +   uncapture_jnode */
19379 +void unhash_unformatted_jnode(jnode * node)
19380 +{
19381 +       assert("vs-1445", jnode_is_unformatted(node));
19382 +
19383 +       write_lock_tree(node->tree);
19384 +       unhash_unformatted_node_nolock(node);
19385 +       write_unlock_tree(node->tree);
19386 +}
19387 +
19388 +/*
19389 + * Return referenced jnode for (@mapping, @index), hashing a new one if the
19390 + * inode's radix tree of jnodes has none. ERR_PTR() on failure. @oid is unused.
19391 + */
19392 +static jnode *find_get_jnode(reiser4_tree * tree,
19393 +                            struct address_space *mapping,
19394 +                            oid_t oid, unsigned long index)
19395 +{
19396 +       jnode *result;
19397 +       jnode *shadow;
19398 +       int preload;
19399 +
19400 +       result = jnew_unformatted();
19401 +       if (unlikely(result == NULL))
19402 +               return ERR_PTR(RETERR(-ENOMEM));
19403 +
19404 +       preload = radix_tree_preload(get_gfp_mask());
19405 +       if (preload != 0) {
19406 +               /* release the jnode allocated above, nobody else will */
19407 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19408 +               return ERR_PTR(preload);
19409 +       }
19410 +
19411 +       write_lock_tree(tree);
19412 +       shadow = jfind_nolock(mapping, index);
19413 +       if (likely(shadow == NULL)) {
19414 +               /* add new jnode to hash table and inode's radix tree of jnodes */
19415 +               jref(result);
19416 +               hash_unformatted_jnode(result, mapping, index);
19417 +       } else {
19418 +               /* jnode is found in inode's radix tree of jnodes */
19419 +               jref(shadow);
19420 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19421 +               assert("vs-1498", shadow->key.j.mapping == mapping);
19422 +               result = shadow;
19423 +       }
19424 +       write_unlock_tree(tree);
19425 +
19426 +       assert("nikita-2955", jnode_invariant(result, 0, 0));
19427 +       radix_tree_preload_end();
19428 +       return result;
19429 +}
19430 +
19431 +/* jget() (a la zget() but for unformatted nodes). Returns referenced jnode of
19432 +   locked page @pg, creating, hashing and attaching it to the page if needed,
19433 +   or ERR_PTR() on failure. Passed @tree is replaced by tree_by_page(@pg). */
19434 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19435 +{
19436 +       /*
19437 +        * There are two ways to create jnode: starting with pre-existing page
19438 +        * and without page.
19439 +        *
19440 +        * When page already exists, jnode is created
19441 +        * (jnode_of_page()->do_jget()) under page lock. This is done in
19442 +        * ->writepage(), or when capturing anonymous page dirtied through
19443 +        * mmap.
19444 +        *
19445 +        * Jnode without page is created by index_extent_jnode().
19446 +        */
19447 +
19448 +       jnode *result;
19449 +       oid_t oid;
19450 +
19451 +       assert("umka-176", pg != NULL);
19452 +       assert("nikita-2394", PageLocked(pg));
19453 +
19454 +       result = jprivate(pg);
19455 +       if (likely(result != NULL))
19456 +               return jref(result);
19457 +
19458 +       tree = tree_by_page(pg);
19459 +
19460 +       /* check inode's radix tree of jnodes first */
19461 +       result = jfind(pg->mapping, pg->index);
19462 +       if (unlikely(result != NULL)) {
19463 +               spin_lock_jnode(result);
19464 +               jnode_attach_page(result, pg);
19465 +               spin_unlock_jnode(result);
19466 +               result->key.j.mapping = pg->mapping;
19467 +               return result;
19468 +       }
19469 +
19470 +       oid = get_inode_oid(pg->mapping->host);
19471 +       result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19472 +       if (unlikely(IS_ERR(result)))
19473 +               return result;
19474 +       /* attach jnode to page */
19475 +       spin_lock_jnode(result);
19476 +       jnode_attach_page(result, pg);
19477 +       spin_unlock_jnode(result);
19478 +       return result;
19479 +}
19480 +
19481 +/*
19482 + * return jnode for @pg, creating it if necessary.
19483 + */
19484 +jnode *jnode_of_page(struct page * pg)
19485 +{
19486 +       jnode *result;
19487 +
19488 +       assert("umka-176", pg != NULL);
19489 +       assert("nikita-2394", PageLocked(pg));
19490 +
19491 +       result = do_jget(tree_by_page(pg), pg);
19492 +
19493 +       if (REISER4_DEBUG && !IS_ERR(result)) {
19494 +               assert("nikita-3210", result == jprivate(pg));
19495 +               assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19496 +               if (jnode_is_unformatted(jprivate(pg))) {
19497 +                       assert("nikita-2364",
19498 +                              jprivate(pg)->key.j.index == pg->index);
19499 +                       assert("nikita-2367",
19500 +                              jprivate(pg)->key.j.mapping == pg->mapping);
19501 +                       assert("nikita-2365",
19502 +                              jprivate(pg)->key.j.objectid ==
19503 +                              get_inode_oid(pg->mapping->host));
19504 +                       assert("vs-1200",
19505 +                              jprivate(pg)->key.j.objectid ==
19506 +                              pg->mapping->host->i_ino);
19507 +                       assert("nikita-2356",
19508 +                              jnode_is_unformatted(jnode_by_page(pg)));
19509 +               }
19510 +               assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19511 +       }
19512 +       return result;
19513 +}
19514 +
19515 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19516 + * page.*/
19517 +void jnode_attach_page(jnode * node, struct page *pg)
19518 +{
19519 +       assert("nikita-2060", node != NULL);
19520 +       assert("nikita-2061", pg != NULL);
19521 +
19522 +       assert("nikita-2050", jprivate(pg) == 0ul);
19523 +       assert("nikita-2393", !PagePrivate(pg));
19524 +       assert("vs-1741", node->pg == NULL);
19525 +
19526 +       assert("nikita-2396", PageLocked(pg));
19527 +       assert_spin_locked(&(node->guard));
19528 +
19529 +       page_cache_get(pg);
19530 +       set_page_private(pg, (unsigned long)node);
19531 +       node->pg = pg;
19532 +       SetPagePrivate(pg);
19533 +}
19534 +
19535 +/* Dual to jnode_attach_page: break a binding between page and jnode */
19536 +void page_clear_jnode(struct page *page, jnode * node)
19537 +{
19538 +       assert("nikita-2424", page != NULL);
19539 +       assert("nikita-2425", PageLocked(page));
19540 +       assert("nikita-2426", node != NULL);
19541 +       assert_spin_locked(&(node->guard));
19542 +       assert("nikita-2428", PagePrivate(page));
19543 +
19544 +       assert("nikita-3551", !PageWriteback(page));
19545 +
19546 +       JF_CLR(node, JNODE_PARSED);
19547 +       set_page_private(page, 0ul);
19548 +       ClearPagePrivate(page);
19549 +       node->pg = NULL;
19550 +       page_cache_release(page);
19551 +}
19552 +
19553 +/* it is only used in one place to handle error */
19554 +void
19555 +page_detach_jnode(struct page *page, struct address_space *mapping,
19556 +                 unsigned long index)
19557 +{
19558 +       assert("nikita-2395", page != NULL);
19559 +
19560 +       lock_page(page);
19561 +       if ((page->mapping == mapping) && (page->index == index)
19562 +           && PagePrivate(page)) {
19563 +               jnode *node;
19564 +
19565 +               node = jprivate(page);
19566 +               spin_lock_jnode(node);
19567 +               page_clear_jnode(page, node);
19568 +               spin_unlock_jnode(node);
19569 +       }
19570 +       unlock_page(page);
19571 +}
19572 +
19573 +/* return @node page locked.
19574 +
19575 +   Locking ordering requires that one first takes page lock and afterwards
19576 +   spin lock on node attached to this page. Sometimes it is necessary to go in
19577 +   the opposite direction. This is done through standard trylock-and-release
19578 +   loop.
19579 +*/
19580 +static struct page *jnode_lock_page(jnode * node)
19581 +{
19582 +       struct page *page;
19583 +
19584 +       assert("nikita-2052", node != NULL);
19585 +       assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19586 +
19587 +       while (1) {
19588 +
19589 +               spin_lock_jnode(node);
19590 +               page = jnode_page(node);
19591 +               if (page == NULL) {
19592 +                       break;
19593 +               }
19594 +
19595 +               /* no need to page_cache_get( page ) here, because page cannot
19596 +                  be evicted from memory without detaching it from jnode and
19597 +                  this requires spin lock on jnode that we already hold.
19598 +                */
19599 +               if (!TestSetPageLocked(page)) {
19600 +                       /* We won a lock on jnode page, proceed. */
19601 +                       break;
19602 +               }
19603 +
19604 +               /* Page is locked by someone else. */
19605 +               page_cache_get(page);
19606 +               spin_unlock_jnode(node);
19607 +               wait_on_page_locked(page);
19608 +               /* it is possible that page was detached from jnode and
19609 +                  returned to the free pool, or re-assigned while we were
19610 +                  waiting on locked bit. This will be rechecked on the next
19611 +                  loop iteration.
19612 +                */
19613 +               page_cache_release(page);
19614 +
19615 +               /* try again */
19616 +       }
19617 +       return page;
19618 +}
19619 +
19620 +/*
19621 + * if JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify
19622 + * validity of jnode content.
19623 + */
19624 +static inline int jparse(jnode * node)
19625 +{
19626 +       int result;
19627 +
19628 +       assert("nikita-2466", node != NULL);
19629 +
19630 +       spin_lock_jnode(node);
19631 +       if (likely(!jnode_is_parsed(node))) {
19632 +               result = jnode_ops(node)->parse(node);
19633 +               if (likely(result == 0))
19634 +                       JF_SET(node, JNODE_PARSED);
19635 +       } else
19636 +               result = 0;
19637 +       spin_unlock_jnode(node);
19638 +       return result;
19639 +}
19640 +
19641 +/* Lock the page attached to jnode; create a page and attach it to the jnode
19642 + * if it had none. */
19643 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19644 +{
19645 +       struct page *page;
19646 +
19647 +       spin_lock_jnode(node);
19648 +       page = jnode_page(node);
19649 +
19650 +       if (page == NULL) {
19651 +               spin_unlock_jnode(node);
19652 +               page = find_or_create_page(jnode_get_mapping(node),
19653 +                                          jnode_get_index(node), gfp_flags);
19654 +               if (page == NULL)
19655 +                       return ERR_PTR(RETERR(-ENOMEM));
19656 +       } else {
19657 +               if (!TestSetPageLocked(page)) {
19658 +                       spin_unlock_jnode(node);
19659 +                       return page;
19660 +               }
19661 +               page_cache_get(page);
19662 +               spin_unlock_jnode(node);
19663 +               lock_page(page);
19664 +               assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19665 +       }
19666 +
19667 +       spin_lock_jnode(node);
19668 +       if (!jnode_page(node))
19669 +               jnode_attach_page(node, page);
19670 +       spin_unlock_jnode(node);
19671 +
19672 +       page_cache_release(page);
19673 +       assert("zam-894", jnode_page(node) == page);
19674 +       return page;
19675 +}
19676 +
19677 +/* Start a read for @node's locked @page; an up-to-date page is just unlocked. */
19678 +static int jnode_start_read(jnode * node, struct page *page)
19679 +{
19680 +       assert("zam-893", PageLocked(page));
19681 +
19682 +       if (PageUptodate(page)) {
19683 +               unlock_page(page);
19684 +               return 0;
19685 +       }
19686 +       return page_io(page, node, READ, get_gfp_mask());
19687 +}
19688 +
19689 +#if REISER4_DEBUG
19690 +static void check_jload(jnode * node, struct page *page)
19691 +{
19692 +       if (jnode_is_znode(node)) {
19693 +               node40_header *nh;
19694 +               znode *z;
19695 +
19696 +               z = JZNODE(node);
19697 +               if (znode_is_any_locked(z)) {
19698 +                       nh = (node40_header *) kmap(page);
19699 +                       /* this only works for node40-only file systems. For
19700 +                        * debugging. */
19701 +                       assert("nikita-3253",
19702 +                              z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19703 +                       kunmap(page);
19704 +               }
19705 +               assert("nikita-3565", znode_invariant(z));
19706 +       }
19707 +}
19708 +#else
19709 +#define check_jload(node, page) noop
19710 +#endif
19711 +
19712 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19713 + * to call jload() shortly. This will bring appropriate portion of jnode into
19714 + * CPU cache. */
19715 +void jload_prefetch(jnode * node)
19716 +{
19717 +       prefetchw(&node->x_count);
19718 +}
19719 +
19720 +/* load jnode's data into memory */
19721 +int jload_gfp(jnode * node /* node to load */ ,
19722 +             gfp_t gfp_flags /* allocation flags */ ,
19723 +             int do_kmap /* true if page should be kmapped */ )
19724 +{
19725 +       struct page *page;
19726 +       int result = 0;
19727 +       int parsed;
19728 +
19729 +       assert("nikita-3010", schedulable());
19730 +
19731 +       prefetchw(&node->pg);
19732 +
19733 +       /* taking d-reference implies taking x-reference. */
19734 +       jref(node);
19735 +
19736 +       /*
19737 +        * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19738 +        * should be atomic, otherwise there is a race against
19739 +        * reiser4_releasepage().
19740 +        */
19741 +       spin_lock(&(node->load));
19742 +       add_d_ref(node);
19743 +       parsed = jnode_is_parsed(node);
19744 +       spin_unlock(&(node->load));
19745 +
19746 +       if (unlikely(!parsed)) {
19747 +               page = jnode_get_page_locked(node, gfp_flags);
19748 +               if (unlikely(IS_ERR(page))) {
19749 +                       result = PTR_ERR(page);
19750 +                       goto failed;
19751 +               }
19752 +
19753 +               result = jnode_start_read(node, page);
19754 +               if (unlikely(result != 0))
19755 +                       goto failed;
19756 +
19757 +               wait_on_page_locked(page);
19758 +               if (unlikely(!PageUptodate(page))) {
19759 +                       result = RETERR(-EIO);
19760 +                       goto failed;
19761 +               }
19762 +
19763 +               if (do_kmap)
19764 +                       node->data = kmap(page);
19765 +
19766 +               result = jparse(node);
19767 +               if (unlikely(result != 0)) {
19768 +                       if (do_kmap)
19769 +                               kunmap(page);
19770 +                       goto failed;
19771 +               }
19772 +               check_jload(node, page);
19773 +       } else {
19774 +               page = jnode_page(node);
19775 +               check_jload(node, page);
19776 +               if (do_kmap)
19777 +                       node->data = kmap(page);
19778 +       }
19779 +
19780 +       if (!is_writeout_mode())
19781 +               /* We do not mark pages active if jload is called as a part of
19782 +                * jnode_flush() or reiser4_write_logs().  Both jnode_flush()
19783 +                * and write_logs() add no value to cached data, there is no
19784 +                * sense to mark pages as active when they go to disk, it just
19785 +                * confuses vm scanning routines because clean page could be
19786 +                * moved out from inactive list as a result of this
19787 +                * mark_page_accessed() call. */
19788 +               mark_page_accessed(page);
19789 +
19790 +       return 0;
19791 +
19792 +      failed:
19793 +       jrelse_tail(node);
19794 +       return result;
19795 +
19796 +}
19797 +
19798 +/* start asynchronous reading for given jnode's page. */
19799 +int jstartio(jnode * node)
19800 +{
19801 +       struct page *page;
19802 +
19803 +       page = jnode_get_page_locked(node, get_gfp_mask());
19804 +       if (IS_ERR(page))
19805 +               return PTR_ERR(page);
19806 +
19807 +       return jnode_start_read(node, page);
19808 +}
19809 +
19810 +/* Initialize a node by calling the jnode plugin's ->init() instead of reading
19811 + * it from disk as in jload(). On error the references taken here are dropped. */
19812 +int jinit_new(jnode * node, gfp_t gfp_flags)
19813 +{
19814 +       struct page *page;
19815 +       int result;
19816 +
19817 +       jref(node);
19818 +       add_d_ref(node);
19819 +
19820 +       page = jnode_get_page_locked(node, gfp_flags);
19821 +       if (IS_ERR(page)) {
19822 +               result = PTR_ERR(page);
19823 +               goto failed;
19824 +       }
19825 +
19826 +       SetPageUptodate(page);
19827 +       unlock_page(page);
19828 +
19829 +       node->data = kmap(page);
19830 +
19831 +       if (!jnode_is_parsed(node)) {
19832 +               jnode_plugin *jplug = jnode_ops(node);
19833 +               spin_lock_jnode(node);
19834 +               result = jplug->init(node);
19835 +               spin_unlock_jnode(node);
19836 +               if (result) {
19837 +                       kunmap(page);
19838 +                       goto failed;
19839 +               }
19840 +               JF_SET(node, JNODE_PARSED);
19841 +       }
19842 +
19843 +       return 0;
19844 +       /* no kmap() is outstanding here: jrelse() would kunmap() a second time */
19845 +      failed:
19846 +       jrelse_tail(node);
19847 +       return result;
19848 +}
19849 +
19850 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19851 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19852 +{
19853 +       assert("nikita-489", atomic_read(&node->d_count) > 0);
19854 +       atomic_dec(&node->d_count);
19855 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
19856 +               LOCK_CNT_DEC(d_refs);
19857 +       /* release reference acquired in jload_gfp() or jinit_new() */
19858 +       jput(node);
19859 +}
19860 +
19861 +/* drop reference to node data. When last reference is dropped, data are
19862 +   unloaded. */
19863 +void jrelse(jnode * node /* jnode to release references to */ )
19864 +{
19865 +       struct page *page;
19866 +
19867 +       assert("nikita-487", node != NULL);
19868 +       assert_spin_not_locked(&(node->guard));
19869 +
19870 +       page = jnode_page(node);
19871 +       if (likely(page != NULL)) {
19872 +               /*
19873 +                * it is safe not to lock jnode here, because at this point
19874 +                * @node->d_count is greater than zero (if jrelse() is used
19875 +                * correctly, that is). JNODE_PARSED may be not set yet, if,
19876 +                * for example, we got here as a result of error handling path
19877 +                * in jload(). Anyway, page cannot be detached by
19878 +                * reiser4_releasepage(). truncate will invalidate page
19879 +                * regardless, but this should not be a problem.
19880 +                */
19881 +               kunmap(page);
19882 +       }
19883 +       jrelse_tail(node);
19884 +}
19885 +
19886 +/* called from jput() to wait for io completion */
19887 +static void jnode_finish_io(jnode * node)
19888 +{
19889 +       struct page *page;
19890 +
19891 +       assert("nikita-2922", node != NULL);
19892 +
19893 +       spin_lock_jnode(node);
19894 +       page = jnode_page(node);
19895 +       if (page != NULL) {
19896 +               page_cache_get(page);
19897 +               spin_unlock_jnode(node);
19898 +               wait_on_page_writeback(page);
19899 +               page_cache_release(page);
19900 +       } else
19901 +               spin_unlock_jnode(node);
19902 +}
19903 +
19904 +/*
19905 + * This is called by jput() when the last reference to jnode is released. It is
19906 + * a separate function, because we want the fast path of jput() to be inline
19907 + * and, therefore, small.
19908 + */
19909 +void jput_final(jnode * node)
19910 +{
19911 +       int r_i_p;
19912 +
19913 +       /* A fast check for keeping node in cache. We always keep node in cache
19914 +        * if its page is present and node was not marked for deletion */
19915 +       if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19916 +               rcu_read_unlock();
19917 +               return;
19918 +       }
19919 +       assert("edward-1432", node->page_count == 0);
19920 +
19921 +       r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19922 +       /*
19923 +        * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19924 +        * this case it is safe to access node after unlock.
19925 +        */
19926 +       rcu_read_unlock();
19927 +       if (r_i_p) {
19928 +               jnode_finish_io(node);
19929 +               if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19930 +                       /* node is removed from the tree. */
19931 +                       jdelete(node);
19932 +               else
19933 +                       jnode_try_drop(node);
19934 +       }
19935 +       /* if !r_i_p some other thread is already killing it */
19936 +}
19937 +
19938 +int jwait_io(jnode * node, int rw)
19939 +{
19940 +       struct page *page;
19941 +       int result;
19942 +
19943 +       assert("zam-447", node != NULL);
19944 +       assert("zam-448", jnode_page(node) != NULL);
19945 +
19946 +       page = jnode_page(node);
19947 +       /* READ waits for the page lock, WRITE for writeback; -EIO on PageError */
19948 +       result = 0;
19949 +       if (rw == READ) {
19950 +               wait_on_page_locked(page);
19951 +       } else {
19952 +               assert("nikita-2227", rw == WRITE);
19953 +               wait_on_page_writeback(page);
19954 +       }
19955 +       if (PageError(page))
19956 +               result = RETERR(-EIO);
19957 +
19958 +       return result;
19959 +}
19960 +
19961 +/*
19962 + * jnode types and plugins.
19963 + *
19964 + * jnode by itself is a "base type". There are several different jnode
19965 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19966 + * has to do different things based on jnode type. In the standard reiser4 way
19967 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19968 + *
19969 + * Functions below deal with jnode types and define methods of jnode plugin.
19970 + *
19971 + */
19972 +
19973 +/* set jnode type bits in ->state. This is done during jnode initialization. */
19974 +static void jnode_set_type(jnode * node, jnode_type type)
19975 +{
19976 +       static const unsigned long type_to_mask[] = {
19977 +               [JNODE_UNFORMATTED_BLOCK] = 1,
19978 +               [JNODE_FORMATTED_BLOCK] = 0,
19979 +               [JNODE_BITMAP] = 2,
19980 +               [JNODE_IO_HEAD] = 6,
19981 +               [JNODE_INODE] = 4
19982 +       };
19983 +
19984 +       assert("zam-647", type < LAST_JNODE_TYPE);
19985 +       assert("nikita-2815", !jnode_is_loaded(node));
19986 +       assert("nikita-3386", node->state == 0);
19987 +       /* the type code is stored in ->state, shifted up to bit JNODE_TYPE_1 */
19988 +       node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19989 +}
19990 +
19991 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19992 + * specific initialization. */
19993 +static int init_noinit(jnode * node UNUSED_ARG)
19994 +{
19995 +       return 0;
19996 +}
19997 +
19998 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19999 + * specific parsing. */
20000 +static int parse_noparse(jnode * node UNUSED_ARG)
20001 +{
20002 +       return 0;
20003 +}
20004 +
20005 +/* ->mapping() method for unformatted jnode */
20006 +struct address_space *mapping_jnode(const jnode * node)
20007 +{
20008 +       struct address_space *map;
20009 +
20010 +       assert("nikita-2713", node != NULL);
20011 +
20012 +       /* mapping is stored in jnode */
20013 +
20014 +       map = node->key.j.mapping;
20015 +       assert("nikita-2714", map != NULL);
20016 +       assert("nikita-2897", is_reiser4_inode(map->host));
20017 +       assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
20018 +       return map;
20019 +}
20020 +
20021 +/* ->index() method for unformatted jnodes */
20022 +unsigned long index_jnode(const jnode * node)
20023 +{
20024 +       /* index is stored in jnode */
20025 +       return node->key.j.index;
20026 +}
20027 +
20028 +/* ->remove() method for unformatted jnodes */
20029 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
20030 +{
20031 +       /* remove jnode from hash table and radix tree */
20032 +       if (node->key.j.mapping)
20033 +               unhash_unformatted_node_nolock(node);
20034 +}
20035 +
20036 +/* ->mapping() method for znodes */
20037 +static struct address_space *mapping_znode(const jnode * node)
20038 +{
20039 +       /* all znodes belong to fake inode */
20040 +       return get_super_fake(jnode_get_tree(node)->super)->i_mapping;
20041 +}
20042 +
20043 +/* ->index() method for znodes */
20044 +static unsigned long index_znode(const jnode * node)
20045 +{
20046 +       unsigned long addr;
20047 +       assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
20048 +
20049 +       /* index of znode is just its address (shifted) */
20050 +       addr = (unsigned long)node;
20051 +       return (addr - PAGE_OFFSET) >> znode_shift_order;
20052 +}
20053 +
20054 +/* ->mapping() method for bitmap jnode */
20055 +static struct address_space *mapping_bitmap(const jnode * node)
20056 +{
20057 +       /* all bitmap blocks belong to special bitmap inode */
20058 +       return get_super_private(jnode_get_tree(node)->super)->bitmap->
20059 +           i_mapping;
20060 +}
20061 +
20062 +/* ->index() method for jnodes that are indexed by address */
20063 +static unsigned long index_is_address(const jnode * node)
20064 +{
20065 +       unsigned long ind;
20066 +
20067 +       ind = (unsigned long)node;
20068 +       return ind - PAGE_OFFSET;
20069 +}
20070 +
20071 +/* resolve race with jput: returns @node, or NULL if @node is marked JNODE_RIP */
20072 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
20073 +{
20074 +       /*
20075 +        * This is used as part of RCU-based jnode handling.
20076 +        *
20077 +        * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
20078 +        * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
20079 +        * not protected during this, so concurrent thread may execute
20080 +        * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
20081 +        * freed in jput_final(). To avoid such races, jput_final() sets
20082 +        * JNODE_RIP on jnode (under tree lock). All places that work with
20083 +        * unreferenced jnodes call this function. It checks for JNODE_RIP bit
20084 +        * (first without taking tree lock), and if this bit is set, releases
20085 +        * reference acquired by the current thread and returns NULL.
20086 +        *
20087 +        * As a result, if jnode is being concurrently freed, NULL is returned
20088 +        * and caller should pretend that jnode wasn't found in the first
20089 +        * place.
20090 +        *
20091 +        * Otherwise it's safe to release "rcu-read-lock" and continue with
20092 +        * jnode.
20093 +        */
20094 +       if (unlikely(JF_ISSET(node, JNODE_RIP))) {
20095 +               read_lock_tree(tree);
20096 +               if (JF_ISSET(node, JNODE_RIP)) {
20097 +                       dec_x_ref(node);
20098 +                       node = NULL;
20099 +               }
20100 +               read_unlock_tree(tree);
20101 +       }
20102 +       return node;
20103 +}
20104 +
20105 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
20106 +{
20107 +       struct inode *inode;
20108 +       item_plugin *iplug;
20109 +       loff_t off;
20110 +
20111 +       assert("nikita-3092", node != NULL);
20112 +       assert("nikita-3093", key != NULL);
20113 +       assert("nikita-3094", jnode_is_unformatted(node));
20114 +
20115 +       off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
20116 +       inode = mapping_jnode(node)->host;
20117 +
20118 +       if (node->parent_item_id != 0)
20119 +               iplug = item_plugin_by_id(node->parent_item_id);
20120 +       else
20121 +               iplug = NULL;
20122 +
20123 +       if (iplug != NULL && iplug->f.key_by_offset)
20124 +               iplug->f.key_by_offset(inode, off, key);
20125 +       else {
20126 +               file_plugin *fplug;
20127 +
20128 +               fplug = inode_file_plugin(inode);
20129 +               assert("zam-1007", fplug != NULL);
20130 +               assert("zam-1008", fplug->key_by_inode != NULL);
20131 +
20132 +               fplug->key_by_inode(inode, off, key);
20133 +       }
20134 +
20135 +       return key;
20136 +}
20137 +
20138 +/* ->parse() method for formatted nodes */
20139 +static int parse_znode(jnode * node)
20140 +{
20141 +       return zparse(JZNODE(node));
20142 +}
20143 +
20144 +/* ->delete() method for formatted nodes */
20145 +static void delete_znode(jnode * node, reiser4_tree * tree)
20146 +{
20147 +       znode *z;
20148 +
20149 +       assert_rw_write_locked(&(tree->tree_lock));
20150 +       assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20151 +
20152 +       z = JZNODE(node);
20153 +       assert("vs-899", z->c_count == 0);
20154 +
20155 +       /* delete znode from sibling list. */
20156 +       sibling_list_remove(z);
20157 +
20158 +       znode_remove(z, tree);
20159 +}
20160 +
20161 +/* ->remove() method for formatted nodes */
20162 +static int remove_znode(jnode * node, reiser4_tree * tree)
20163 +{
20164 +       znode *z;
20165 +
20166 +       assert_rw_write_locked(&(tree->tree_lock));
20167 +       z = JZNODE(node);
20168 +
20169 +       if (z->c_count == 0) {
20170 +               /* detach znode from sibling list. */
20171 +               sibling_list_drop(z);
20172 +               /* this is called with tree spin-lock held, so call
20173 +                  znode_remove() directly (rather than znode_lock_remove()). */
20174 +               znode_remove(z, tree);
20175 +               return 0;
20176 +       }
20177 +       return RETERR(-EBUSY);
20178 +}
20179 +
20180 +/* ->init() method for formatted nodes */
20181 +static int init_znode(jnode * node)
20182 +{
20183 +       znode *z;
20184 +
20185 +       z = JZNODE(node);
20186 +       /* call node plugin to do actual initialization */
20187 +       return z->nplug->init(z);
20188 +}
20189 +
20190 +/* ->clone() method for formatted nodes */
20191 +static jnode *clone_formatted(jnode * node)
20192 +{
20193 +       znode *clone;
20194 +
20195 +       assert("vs-1430", jnode_is_znode(node));
20196 +       clone = zalloc(get_gfp_mask());
20197 +       if (clone == NULL)
20198 +               return ERR_PTR(RETERR(-ENOMEM));
20199 +       zinit(clone, NULL, current_tree);
20200 +       jnode_set_block(ZJNODE(clone), jnode_get_block(node));
20201 +       /* ZJNODE(clone)->key.z is not initialized */
20202 +       clone->level = JZNODE(node)->level;
20203 +
20204 +       return ZJNODE(clone);
20205 +}
20206 +
20207 +/* jplug->clone for unformatted nodes */
20208 +static jnode *clone_unformatted(jnode * node)
20209 +{
20210 +       jnode *clone;
20211 +
20212 +       assert("vs-1431", jnode_is_unformatted(node));
20213 +       clone = jalloc();
20214 +       if (clone == NULL)
20215 +               return ERR_PTR(RETERR(-ENOMEM));
20216 +
20217 +       jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
20218 +       jnode_set_block(clone, jnode_get_block(node));
20219 +
20220 +       return clone;
20221 +
20222 +}
20223 +
20224 +/*
20225 + * Setup jnode plugin methods for various jnode types.
20226 + */
20227 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
20228 +       [JNODE_UNFORMATTED_BLOCK] = {
20229 +               .h = {
20230 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
20231 +                       .id = JNODE_UNFORMATTED_BLOCK,
20232 +                       .pops = NULL,
20233 +                       .label = "unformatted",
20234 +                       .desc = "unformatted node",
20235 +                       .linkage = {NULL, NULL}
20236 +               },
20237 +               .init = init_noinit,
20238 +               .parse = parse_noparse,
20239 +               .mapping = mapping_jnode,
20240 +               .index = index_jnode,
20241 +               .clone = clone_unformatted
20242 +       },
20243 +       [JNODE_FORMATTED_BLOCK] = {
20244 +               .h = {
20245 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
20246 +                       .id = JNODE_FORMATTED_BLOCK,
20247 +                       .pops = NULL,
20248 +                       .label = "formatted",
20249 +                       .desc = "formatted tree node",
20250 +                       .linkage = {NULL, NULL}
20251 +               },
20252 +               .init = init_znode,
20253 +               .parse = parse_znode,
20254 +               .mapping = mapping_znode,
20255 +               .index = index_znode,
20256 +               .clone = clone_formatted
20257 +       },
20258 +       [JNODE_BITMAP] = {
20259 +               .h = {
20260 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
20261 +                       .id = JNODE_BITMAP,
20262 +                       .pops = NULL,
20263 +                       .label = "bitmap",
20264 +                       .desc = "bitmap node",
20265 +                       .linkage = {NULL, NULL}
20266 +               },
20267 +               .init = init_noinit,
20268 +               .parse = parse_noparse,
20269 +               .mapping = mapping_bitmap,
20270 +               .index = index_is_address,
20271 +               .clone = NULL
20272 +       },
20273 +       [JNODE_IO_HEAD] = {
20274 +               .h = {
20275 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
20276 +                       .id = JNODE_IO_HEAD,
20277 +                       .pops = NULL,
20278 +                       .label = "io head",
20279 +                       .desc = "io head",
20280 +                       .linkage = {NULL, NULL}
20281 +               },
20282 +               .init = init_noinit,
20283 +               .parse = parse_noparse,
20284 +               .mapping = mapping_bitmap,
20285 +               .index = index_is_address,
20286 +               .clone = NULL
20287 +       },
20288 +       [JNODE_INODE] = {
20289 +               .h = {
20290 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
20291 +                       .id = JNODE_INODE,
20292 +                       .pops = NULL,
20293 +                       .label = "inode",
20294 +                       .desc = "inode's builtin jnode",
20295 +                       .linkage = {NULL, NULL}
20296 +               },
20297 +               .init = NULL,
20298 +               .parse = NULL,
20299 +               .mapping = NULL,
20300 +               .index = NULL,
20301 +               .clone = NULL
20302 +       }
20303 +};
20304 +
20305 +/*
20306 + * jnode destruction.
20307 + *
20308 + * Thread may use a jnode after it acquired a reference to it. References are
20309 + * counted in ->x_count field. Reference protects jnode from being
20310 + * recycled. This is different from protecting jnode data (that are stored in
20311 + * jnode page) from being evicted from memory. Data are protected by jload()
20312 + * and released by jrelse().
20313 + *
20314 + * If thread already possesses a reference to the jnode it can acquire another
20315 + * one through jref(). Initial reference is obtained (usually) by locating
20316 + * jnode in some indexing structure that depends on jnode type: formatted
20317 + * nodes are kept in global hash table, where they are indexed by block
20318 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
20319 + * table, which is indexed by oid and offset within file, and in per-inode
20320 + * radix tree.
20321 + *
20322 + * Reference to jnode is released by jput(). If last reference is released,
20323 + * jput_final() is called. This function determines whether jnode has to be
20324 + * deleted (this happens when corresponding node is removed from the file
20325 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
20326 + * should be just "removed" (deleted from memory).
20327 + *
20328 + * Jnode destruction is a signally delicate dance because of locking and RCU.
20329 + */
20330 +
20331 +/*
20332 + * Returns true if jnode cannot be removed right now. This check is called
20333 + * under tree lock. If it returns false, jnode is irrevocably committed to be
20334 + * deleted/removed.
20335 + */
20336 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20337 +{
20338 +       /* if other thread managed to acquire a reference to this jnode, don't
20339 +        * free it. */
20340 +       if (atomic_read(&node->x_count) > 0)
20341 +               return 1;
20342 +       /* also, don't free znode that has children in memory */
20343 +       if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20344 +               return 1;
20345 +       return 0;
20346 +}
20347 +
20348 +/*
20349 + * this is called as part of removing jnode. Based on jnode type, call
20350 + * corresponding function that removes jnode from indices and returns it back
20351 + * to the appropriate slab (through RCU).
20352 + */
20353 +static inline void
20354 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20355 +{
20356 +       switch (jtype) {
20357 +       case JNODE_UNFORMATTED_BLOCK:
20358 +               remove_jnode(node, tree);
20359 +               break;
20360 +       case JNODE_IO_HEAD:
20361 +       case JNODE_BITMAP:
20362 +               break;
20363 +       case JNODE_INODE:
20364 +               break;
20365 +       case JNODE_FORMATTED_BLOCK:
20366 +               remove_znode(node, tree);
20367 +               break;
20368 +       default:
20369 +               wrong_return_value("nikita-3196", "Wrong jnode type");
20370 +       }
20371 +}
20372 +
20373 +/*
20374 + * this is called as part of deleting jnode. Based on jnode type, call
20375 + * corresponding function that removes jnode from indices and returns it back
20376 + * to the appropriate slab (through RCU).
20377 + *
20378 + * This differs from jnode_remove() only for formatted nodes---for them
20379 + * sibling list handling is different for removal and deletion.
20380 + */
20381 +static inline void
20382 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
20383 +{
20384 +       switch (jtype) {
20385 +       case JNODE_UNFORMATTED_BLOCK:
20386 +               remove_jnode(node, tree);
20387 +               break;
20388 +       case JNODE_IO_HEAD:
20389 +       case JNODE_BITMAP:
20390 +               break;
20391 +       case JNODE_FORMATTED_BLOCK:
20392 +               delete_znode(node, tree);
20393 +               break;
20394 +       case JNODE_INODE:
20395 +       default:
20396 +               wrong_return_value("nikita-3195", "Wrong jnode type");
20397 +       }
20398 +}
20399 +
20400 +#if REISER4_DEBUG
20401 +/*
20402 + * remove jnode from the debugging list of all jnodes hanging off super-block.
20403 + */
20404 +void jnode_list_remove(jnode * node)
20405 +{
20406 +       reiser4_super_info_data *sbinfo;
20407 +
20408 +       sbinfo = get_super_private(jnode_get_tree(node)->super);
20409 +
20410 +       spin_lock_irq(&sbinfo->all_guard);
20411 +       assert("nikita-2422", !list_empty(&node->jnodes));
20412 +       list_del_init(&node->jnodes);
20413 +       spin_unlock_irq(&sbinfo->all_guard);
20414 +}
20415 +#endif
20416 +
20417 +/*
20418 + * this is called by jput_final() to remove jnode when last reference to it is
20419 + * released.
20420 + */
20421 +static int jnode_try_drop(jnode * node)
20422 +{
20423 +       int result;
20424 +       reiser4_tree *tree;
20425 +       jnode_type jtype;
20426 +
20427 +       assert("nikita-2491", node != NULL);
20428 +       assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20429 +
20430 +       tree = jnode_get_tree(node);
20431 +       jtype = jnode_get_type(node);
20432 +
20433 +       spin_lock_jnode(node);
20434 +       write_lock_tree(tree);
20435 +       /*
20436 +        * if jnode has a page---leave it alone. Memory pressure will
20437 +        * eventually kill page and jnode.
20438 +        */
20439 +       if (jnode_page(node) != NULL) {
20440 +               write_unlock_tree(tree);
20441 +               spin_unlock_jnode(node);
20442 +               JF_CLR(node, JNODE_RIP);
20443 +               return RETERR(-EBUSY);
20444 +       }
20445 +
20446 +       /* re-check ->x_count under tree lock. */
20447 +       result = jnode_is_busy(node, jtype);
20448 +       if (result == 0) {
20449 +               assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20450 +               assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20451 +
20452 +               spin_unlock_jnode(node);
20453 +               /* no page and no references---despatch him. */
20454 +               jnode_remove(node, jtype, tree);
20455 +               write_unlock_tree(tree);
20456 +               jnode_free(node, jtype);
20457 +       } else {
20458 +               /* busy check failed: reference was acquired by concurrent
20459 +                * thread. */
20460 +               write_unlock_tree(tree);
20461 +               spin_unlock_jnode(node);
20462 +               JF_CLR(node, JNODE_RIP);
20463 +       }
20464 +       return result;
20465 +}
20466 +
20467 +/* jdelete() -- Delete jnode from the tree and file system */
20468 +static int jdelete(jnode * node /* jnode to finish with */ )
20469 +{
20470 +       struct page *page;
20471 +       int result;
20472 +       reiser4_tree *tree;
20473 +       jnode_type jtype;
20474 +
20475 +       assert("nikita-467", node != NULL);
20476 +       assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20477 +
20478 +       jtype = jnode_get_type(node);
20479 +
20480 +       page = jnode_lock_page(node);
20481 +       assert_spin_locked(&(node->guard));
20482 +
20483 +       tree = jnode_get_tree(node);
20484 +
20485 +       write_lock_tree(tree);
20486 +       /* re-check ->x_count under tree lock. */
20487 +       result = jnode_is_busy(node, jtype);
20488 +       if (likely(!result)) {
20489 +               assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20490 +               assert("jmacd-511", atomic_read(&node->d_count) == 0);
20491 +
20492 +               /* detach page */
20493 +               if (page != NULL) {
20494 +                       /*
20495 +                        * FIXME this is racy against jnode_extent_write().
20496 +                        */
20497 +                       page_clear_jnode(page, node);
20498 +               }
20499 +               spin_unlock_jnode(node);
20500 +               /* goodbye */
20501 +               jnode_delete(node, jtype, tree);
20502 +               write_unlock_tree(tree);
20503 +               jnode_free(node, jtype);
20504 +               /* @node is no longer valid pointer */
20505 +               if (page != NULL)
20506 +                       drop_page(page);
20507 +       } else {
20508 +               /* busy check failed: reference was acquired by concurrent
20509 +                * thread. */
20510 +               JF_CLR(node, JNODE_RIP);
20511 +               write_unlock_tree(tree);
20512 +               spin_unlock_jnode(node);
20513 +               if (page != NULL)
20514 +                       unlock_page(page);
20515 +       }
20516 +       return result;
20517 +}
20518 +
20519 +/* drop jnode on the floor.
20520 +
20521 +   Return value:
20522 +
20523 +    -EBUSY:  failed to drop jnode, because there are still references to it
20524 +
20525 +    0:       successfully dropped jnode
20526 +
20527 +*/
20528 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20529 +{
20530 +       struct page *page;
20531 +       jnode_type jtype;
20532 +       int result;
20533 +
20534 +       assert("zam-602", node != NULL);
20535 +       assert_rw_not_read_locked(&(tree->tree_lock));
20536 +       assert_rw_not_write_locked(&(tree->tree_lock));
20537 +       assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20538 +
20539 +       jtype = jnode_get_type(node);
20540 +
20541 +       page = jnode_lock_page(node);
20542 +       assert_spin_locked(&(node->guard));
20543 +
20544 +       write_lock_tree(tree);
20545 +
20546 +       /* re-check ->x_count under tree lock. */
20547 +       result = jnode_is_busy(node, jtype);
20548 +       if (!result) {
20549 +               assert("nikita-2488", page == jnode_page(node));
20550 +               assert("nikita-2533", atomic_read(&node->d_count) == 0);
20551 +               if (page != NULL) {
20552 +                       assert("nikita-2126", !PageDirty(page));
20553 +                       assert("nikita-2127", PageUptodate(page));
20554 +                       assert("nikita-2181", PageLocked(page));
20555 +                       page_clear_jnode(page, node);
20556 +               }
20557 +               spin_unlock_jnode(node);
20558 +               jnode_remove(node, jtype, tree);
20559 +               write_unlock_tree(tree);
20560 +               jnode_free(node, jtype);
20561 +               if (page != NULL) {
20562 +                       drop_page(page);
20563 +               }
20564 +       } else {
20565 +               /* busy check failed: reference was acquired by concurrent
20566 +                * thread. */
20567 +               JF_CLR(node, JNODE_RIP);
20568 +               write_unlock_tree(tree);
20569 +               spin_unlock_jnode(node);
20570 +               if (page != NULL)
20571 +                       unlock_page(page);
20572 +       }
20573 +       return result;
20574 +}
20575 +
20576 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20577 +   be 0 (where applicable).  */
20578 +void jdrop(jnode * node)
20579 +{
20580 +       jdrop_in_tree(node, jnode_get_tree(node));
20581 +}
20582 +
20583 +/* IO head jnode implementation; The io heads are simple j-nodes with limited
20584 +   functionality (these j-nodes are not in any hash table) just for reading
20585 +   from and writing to disk. */
20586 +
20587 +jnode *alloc_io_head(const reiser4_block_nr * block)
20588 +{
20589 +       jnode *jal = jalloc();
20590 +
20591 +       /* Returns new io-head jnode for @block with one reference taken,
20592 +        * or NULL if jalloc() failed; never jref() a NULL jnode. */
20593 +       if (jal != NULL) {
20594 +               jnode_init(jal, current_tree, JNODE_IO_HEAD);
20595 +               jnode_set_block(jal, block);
20596 +               jref(jal);
20597 +       }
20598 +       return jal;
20599 +}
20600 +
20601 +void drop_io_head(jnode * node)
20602 +{
20603 +       assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20604 +
20605 +       jput(node);
20606 +       jdrop(node);
20607 +}
20608 +
20609 +/* protect jnode data from reiser4_releasepage() by pinning its page */
20610 +void pin_jnode_data(jnode * node)
20611 +{
20612 +       assert("zam-671", jnode_page(node) != NULL);
20613 +       page_cache_get(jnode_page(node));
20614 +}
20615 +
20616 +/* make jnode data free-able again */
20617 +void unpin_jnode_data(jnode * node)
20618 +{
20619 +       assert("zam-672", jnode_page(node) != NULL);
20620 +       page_cache_release(jnode_page(node));
20621 +}
20622 +
20623 +struct address_space *jnode_get_mapping(const jnode * node)
20624 +{
20625 +       assert("nikita-3162", node != NULL);
20626 +       return jnode_ops(node)->mapping(node);
20627 +}
20628 +
20629 +#if REISER4_DEBUG
20630 +/* debugging aid: jnode invariant */
20631 +int jnode_invariant_f(const jnode * node, char const **msg)
20632 +{
20633 +#define _ergo(ant, con)                                                \
20634 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20635 +#define _check(exp) ((*msg) = #exp, (exp))
20636 +
20637 +       return _check(node != NULL) &&
20638 +           /* [jnode-queued] */
20639 +           /* only relocated node can be queued, except that when znode
20640 +            * is being deleted, its JNODE_RELOC bit is cleared */
20641 +           _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20642 +                 JF_ISSET(node, JNODE_RELOC) ||
20643 +                 JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20644 +           _check(node->jnodes.prev != NULL) &&
20645 +           _check(node->jnodes.next != NULL) &&
20646 +           /* [jnode-dirty] invariant */
20647 +           /* dirty inode is part of atom */
20648 +           _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20649 +           /* [jnode-oid] invariant */
20650 +           /* for unformatted node ->objectid and ->mapping fields are
20651 +            * consistent */
20652 +           _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20653 +                 node->key.j.objectid ==
20654 +                 get_inode_oid(node->key.j.mapping->host)) &&
20655 +           /* [jnode-atom-valid] invariant */
20656 +           /* node atom has valid state */
20657 +           _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20658 +           /* [jnode-page-binding] invariant */
20659 +           /* if node points to page, it points back to node */
20660 +           _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20661 +           /* [jnode-refs] invariant */
20662 +           /* only referenced jnode can be loaded */
20663 +           _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20664 +
20665 +}
20666 +
20667 +static const char *jnode_type_name(jnode_type type)
20668 +{
20669 +       switch (type) {
20670 +       case JNODE_UNFORMATTED_BLOCK:
20671 +               return "unformatted";
20672 +       case JNODE_FORMATTED_BLOCK:
20673 +               return "formatted";
20674 +       case JNODE_BITMAP:
20675 +               return "bitmap";
20676 +       case JNODE_IO_HEAD:
20677 +               return "io head";
20678 +       case JNODE_INODE:
20679 +               return "inode";
20680 +       case LAST_JNODE_TYPE:
20681 +               return "last";
20682 +       default:{
20683 +                       static char unknown[30];
20684 +
20685 +                       sprintf(unknown, "unknown %i", type);
20686 +                       return unknown;
20687 +               }
20688 +       }
20689 +}
20690 +
20691 +#define jnode_state_name( node, flag )                 \
20692 +       ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20693 +
20694 +/* debugging aid: output human readable information about @node */
20695 +static void info_jnode(const char *prefix /* prefix to print */ ,
20696 +                      const jnode * node /* node to print */ )
20697 +{
20698 +       assert("umka-068", prefix != NULL);
20699 +
20700 +       if (node == NULL) {
20701 +               printk("%s: null\n", prefix);
20702 +               return;
20703 +       }
20704 +
20705 +       printk
20706 +           ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20707 +            " block: %s, d_count: %d, x_count: %d, "
20708 +            "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20709 +            node->state,
20710 +            jnode_state_name(node, JNODE_PARSED),
20711 +            jnode_state_name(node, JNODE_HEARD_BANSHEE),
20712 +            jnode_state_name(node, JNODE_LEFT_CONNECTED),
20713 +            jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20714 +            jnode_state_name(node, JNODE_ORPHAN),
20715 +            jnode_state_name(node, JNODE_CREATED),
20716 +            jnode_state_name(node, JNODE_RELOC),
20717 +            jnode_state_name(node, JNODE_OVRWR),
20718 +            jnode_state_name(node, JNODE_DIRTY),
20719 +            jnode_state_name(node, JNODE_IS_DYING),
20720 +            jnode_state_name(node, JNODE_RIP),
20721 +            jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20722 +            jnode_state_name(node, JNODE_WRITEBACK),
20723 +            jnode_state_name(node, JNODE_NEW),
20724 +            jnode_state_name(node, JNODE_DKSET),
20725 +            jnode_state_name(node, JNODE_REPACK),
20726 +            jnode_state_name(node, JNODE_CLUSTER_PAGE),
20727 +            jnode_get_level(node), sprint_address(jnode_get_block(node)),
20728 +            atomic_read(&node->d_count), atomic_read(&node->x_count),
20729 +            jnode_page(node), node->atom, 0, 0,
20730 +            jnode_type_name(jnode_get_type(node)));
20731 +       if (jnode_is_unformatted(node)) {
20732 +               printk("inode: %llu, index: %lu, ",
20733 +                      node->key.j.objectid, node->key.j.index);
20734 +       }
20735 +}
20736 +
20737 +/* debugging aid: check jnode invariant; warn and return 0 if it fails */
20738 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20739 +{
20740 +       char const *failed_msg;
20741 +       int result;
20742 +       reiser4_tree *tree;
20743 +
20744 +       /* check @node before it is used to look up its tree */
20745 +       assert("umka-063312", node != NULL);
20746 +       tree = jnode_get_tree(node);
20747 +       assert("umka-064321", tree != NULL);
20748 +
20749 +       if (!jlocked && !tlocked)
20750 +               spin_lock_jnode((jnode *) node);
20751 +       if (!tlocked)
20752 +               read_lock_tree(jnode_get_tree(node));
20753 +       result = jnode_invariant_f(node, &failed_msg);
20754 +       if (!result) {
20755 +               info_jnode("corrupted node", node);
20756 +               warning("jmacd-555", "Condition %s failed", failed_msg);
20757 +       }
20758 +       if (!tlocked)
20759 +               read_unlock_tree(jnode_get_tree(node));
20760 +       if (!jlocked && !tlocked)
20761 +               spin_unlock_jnode((jnode *) node);
20762 +       return result;
20763 +}
20764 +
20765 +#endif                         /* REISER4_DEBUG */
20766 +
20767 +/* Make Linus happy.
20768 +   Local variables:
20769 +   c-indentation-style: "K&R"
20770 +   mode-name: "LC"
20771 +   c-basic-offset: 8
20772 +   tab-width: 8
20773 +   fill-column: 80
20774 +   End:
20775 +*/
20776 diff --git a/fs/reiser4/jnode.h b/fs/reiser4/jnode.h
20777 new file mode 100644
20778 index 0000000..6f29a66
20779 --- /dev/null
20780 +++ b/fs/reiser4/jnode.h
20781 @@ -0,0 +1,707 @@
20782 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20783 + * reiser4/README */
20784 +
20785 +/* Declaration of jnode. See jnode.c for details. */
20786 +
20787 +#ifndef __JNODE_H__
20788 +#define __JNODE_H__
20789 +
20790 +#include "forward.h"
20791 +#include "type_safe_hash.h"
20792 +#include "txnmgr.h"
20793 +#include "key.h"
20794 +#include "debug.h"
20795 +#include "dformat.h"
20796 +#include "context.h"
20797 +
20798 +#include "plugin/plugin.h"
20799 +
20800 +#include <linux/fs.h>
20801 +#include <linux/mm.h>
20802 +#include <linux/spinlock.h>
20803 +#include <asm/atomic.h>
20804 +#include <asm/bitops.h>
20805 +#include <linux/list.h>
20806 +#include <linux/rcupdate.h>
20807 +
20808 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20809 +   nodes)  */
20810 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20811 +
20812 +/* declare hash table of znodes */
20813 +TYPE_SAFE_HASH_DECLARE(z, znode);
20814 +
20815 +typedef struct {
20816 +       __u64 objectid;
20817 +       unsigned long index;
20818 +       struct address_space *mapping;
20819 +} jnode_key_t;
20820 +
20821 +/*
20822 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
20823 +   be exactly the node we use for unformatted tree nodes.
20824 +
20825 +   Jnode provides following basic functionality:
20826 +
20827 +   . reference counting and indexing.
20828 +
20829 +   . integration with page cache. Jnode has ->pg reference to which page can
20830 +   be attached.
20831 +
20832 +   . interface to transaction manager. It is jnode that is kept in transaction
20833 +   manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20834 +   means, there should be special type of jnode for inode.)
20835 +
20836 +   Locking:
20837 +
20838 +   Spin lock: the following fields are protected by the per-jnode spin lock:
20839 +
20840 +    ->state
20841 +    ->atom
20842 +    ->capture_link
20843 +
20844 +   Following fields are protected by the global tree lock:
20845 +
20846 +    ->link
20847 +    ->key.z (content of ->key.z is only changed in znode_rehash())
20848 +    ->key.j
20849 +
20850 +   Atomic counters
20851 +
20852 +    ->x_count
20853 +    ->d_count
20854 +
20855 +    ->pg, and ->data are protected by spin lock for unused jnode and are
20856 +    immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20857 +    is false).
20858 +
20859 +    ->tree is immutable after creation
20860 +
20861 +   Unclear
20862 +
20863 +    ->blocknr: should be under jnode spin-lock, but current interface is based
20864 +    on passing of block address.
20865 +
20866 +   If you ever need to spin lock two nodes at once, do this in "natural"
20867 +   memory order: lock znode with lower address first. (See lock_two_nodes().)
20868 +
20869 +   Invariants involving this data-type:
20870 +
20871 +      [jnode-dirty]
20872 +      [jnode-refs]
20873 +      [jnode-oid]
20874 +      [jnode-queued]
20875 +      [jnode-atom-valid]
20876 +      [jnode-page-binding]
20877 +*/
20878 +
20879 +struct jnode {
20880 +#if REISER4_DEBUG
20881 +#define JMAGIC 0x52654973      /* "ReIs" */
20882 +       int magic;
20883 +#endif
20884 +       /* FIRST CACHE LINE (16 bytes): data used by jload */
20885 +
20886 +       /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20887 +       /*   0 */ unsigned long state;
20888 +
20889 +       /* lock, protecting jnode's fields. */
20890 +       /*   4 */ spinlock_t load;
20891 +
20892 +       /* counter of references to jnode itself. Increased on jref().
20893 +          Decreased on jput().
20894 +        */
20895 +       /*   8 */ atomic_t x_count;
20896 +
20897 +       /* counter of references to jnode's data. Pin data page(s) in
20898 +          memory while this is greater than 0. Increased on jload().
20899 +          Decreased on jrelse().
20900 +        */
20901 +       /*   12 */ atomic_t d_count;
20902 +
20903 +       /* SECOND CACHE LINE: data used by hash table lookups */
20904 +
20905 +       /*   16 */ union {
20906 +               /* znodes are hashed by block number */
20907 +               reiser4_block_nr z;
20908 +               /* unformatted nodes are hashed by mapping plus offset */
20909 +               jnode_key_t j;
20910 +       } key;
20911 +
20912 +       /* THIRD CACHE LINE */
20913 +
20914 +       /*   32 */ union {
20915 +               /* pointers to maintain hash-table */
20916 +               z_hash_link z;
20917 +               j_hash_link j;
20918 +       } link;
20919 +
20920 +       /* pointer to jnode page.  */
20921 +       /*   36 */ struct page *pg;
20922 +       /* pointer to node itself. This is page_address(node->pg) when page is
20923 +          attached to the jnode
20924 +        */
20925 +       /*   40 */ void *data;
20926 +
20927 +       /*   44 */ reiser4_tree *tree;
20928 +
20929 +       /* FOURTH CACHE LINE: atom related fields */
20930 +
20931 +       /*   48 */ spinlock_t guard;
20932 +
20933 +       /* atom the block is in, if any */
20934 +       /*   52 */ txn_atom *atom;
20935 +
20936 +       /* capture list */
20937 +       /*   56 */ struct list_head capture_link;
20938 +
20939 +       /* FIFTH CACHE LINE */
20940 +
20941 +       /*   64 */ struct rcu_head rcu;
20942 +       /* crosses cache line */
20943 +
20944 +       /* SIXTH CACHE LINE */
20945 +
20946 +       /* the real blocknr (where io is going to/from) */
20947 +       /*   80 */ reiser4_block_nr blocknr;
20948 +       /* Parent item type, unformatted and CRC need it for offset => key conversion.  */
20949 +       /* NOTE: this parent_item_id looks like jnode type. */
20950 +       /*   88 */ reiser4_plugin_id parent_item_id;
20951 +       /*   92 */
20952 +#if REISER4_DEBUG
20953 +       /* number of pages referenced by the jnode (meaningful while capturing of
20954 +          page clusters) */
20955 +       int page_count;
20956 +       /* list of all jnodes for debugging purposes. */
20957 +       struct list_head jnodes;
20958 +       /* how many times this jnode was written in one transaction */
20959 +       int written;
20960 +       /* this indicates which atom's list the jnode is on */
20961 +       atom_list list;
20962 +#endif
20963 +} __attribute__ ((aligned(16)));
20964 +
20965 +/*
20966 + * jnode types. Enumeration of existing jnode types.
20967 + */
20968 +typedef enum {
20969 +       JNODE_UNFORMATTED_BLOCK,        /* unformatted block */
20970 +       JNODE_FORMATTED_BLOCK,  /* formatted block, znode */
20971 +       JNODE_BITMAP,           /* bitmap */
20972 +       JNODE_IO_HEAD,          /* jnode representing a block in the
20973 +                                * wandering log */
20974 +       JNODE_INODE,            /* jnode embedded into inode */
20975 +       LAST_JNODE_TYPE
20976 +} jnode_type;
20977 +
20978 +/* jnode states */
20979 +typedef enum {
20980 +       /* jnode's page is loaded and data checked */
20981 +       JNODE_PARSED = 0,
20982 +       /* node was deleted, not all locks on it were released. This
20983 +          node is empty and is going to be removed from the tree
20984 +          shortly. */
20985 +       JNODE_HEARD_BANSHEE = 1,
20986 +       /* left sibling pointer is valid */
20987 +       JNODE_LEFT_CONNECTED = 2,
20988 +       /* right sibling pointer is valid */
20989 +       JNODE_RIGHT_CONNECTED = 3,
20990 +
20991 +       /* znode was just created and doesn't yet have a pointer from
20992 +          its parent */
20993 +       JNODE_ORPHAN = 4,
20994 +
20995 +       /* this node was created by its transaction and has not been assigned
20996 +          a block address. */
20997 +       JNODE_CREATED = 5,
20998 +
20999 +       /* this node is currently relocated */
21000 +       JNODE_RELOC = 6,
21001 +       /* this node is currently wandered */
21002 +       JNODE_OVRWR = 7,
21003 +
21004 +       /* this znode has been modified */
21005 +       JNODE_DIRTY = 8,
21006 +
21007 +       /* znode lock is being invalidated */
21008 +       JNODE_IS_DYING = 9,
21009 +
21010 +       /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
21011 +
21012 +       /* jnode is queued for flushing. */
21013 +       JNODE_FLUSH_QUEUED = 12,
21014 +
21015 +       /* In the following bits jnode type is encoded. */
21016 +       JNODE_TYPE_1 = 13,
21017 +       JNODE_TYPE_2 = 14,
21018 +       JNODE_TYPE_3 = 15,
21019 +
21020 +       /* jnode is being destroyed */
21021 +       JNODE_RIP = 16,
21022 +
21023 +       /* znode was not captured during locking (it might so be because
21024 +          ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
21025 +       JNODE_MISSED_IN_CAPTURE = 17,
21026 +
21027 +       /* write is in progress */
21028 +       JNODE_WRITEBACK = 18,
21029 +
21030 +       /* FIXME: now it is used by crypto-compress plugin only */
21031 +       JNODE_NEW = 19,
21032 +
21033 +       /* delimiting keys are already set for this znode. */
21034 +       JNODE_DKSET = 20,
21035 +
21036 +       /* when this bit is set page and jnode can not be disconnected */
21037 +       JNODE_WRITE_PREPARED = 21,
21038 +
21039 +       JNODE_CLUSTER_PAGE = 22,
21040 +       /* Jnode is marked for repacking, that means the reiser4 flush and the
21041 +        * block allocator should process this node special way  */
21042 +       JNODE_REPACK = 23,
21043 +       /* node should be converted by flush in squalloc phase */
21044 +       JNODE_CONVERTIBLE = 24,
21045 +       /*
21046 +        * When jnode is dirtied for the first time in given transaction,
21047 +        * do_jnode_make_dirty() checks whether this jnode can possibly become
21048 +        * a member of overwrite set. If so, this bit is set, and one block is
21049 +        * reserved in the ->flush_reserved space of atom.
21050 +        *
21051 +        * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
21052 +        *
21053 +        *     (1) flush decides that we want this block to go into relocate
21054 +        *     set after all.
21055 +        *
21056 +        *     (2) wandering log is allocated (by log writer)
21057 +        *
21058 +        *     (3) extent is allocated
21059 +        *
21060 +        */
21061 +       JNODE_FLUSH_RESERVED = 29
21062 +} reiser4_jnode_state;
21063 +
21064 +/* Macros for accessing the jnode state. */
21065 +
21066 +static inline void JF_CLR(jnode * j, int f)
21067 +{
21068 +       assert("unknown-1", j->magic == JMAGIC);
21069 +       clear_bit(f, &j->state);
21070 +}
21071 +static inline int JF_ISSET(const jnode * j, int f)
21072 +{
21073 +       assert("unknown-2", j->magic == JMAGIC);
21074 +       return test_bit(f, &((jnode *) j)->state);
21075 +}
21076 +static inline void JF_SET(jnode * j, int f)
21077 +{
21078 +       assert("unknown-3", j->magic == JMAGIC);
21079 +       set_bit(f, &j->state);
21080 +}
21081 +
21082 +static inline int JF_TEST_AND_SET(jnode * j, int f)
21083 +{
21084 +       assert("unknown-4", j->magic == JMAGIC);
21085 +       return test_and_set_bit(f, &j->state);
21086 +}
21087 +
21088 +static inline void spin_lock_jnode(jnode *node)
21089 +{
21090 +       /* check that spinlocks of lower priorities are not held */
21091 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
21092 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
21093 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
21094 +                   LOCK_CNT_NIL(rw_locked_dk) &&
21095 +                   LOCK_CNT_LT(spin_locked_jnode, 2)));
21096 +
21097 +       spin_lock(&(node->guard));
21098 +
21099 +       LOCK_CNT_INC(spin_locked_jnode);
21100 +       LOCK_CNT_INC(spin_locked);
21101 +}
21102 +
21103 +static inline void spin_unlock_jnode(jnode *node)
21104 +{
21105 +       assert_spin_locked(&(node->guard));
21106 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
21107 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
21108 +
21109 +       LOCK_CNT_DEC(spin_locked_jnode);
21110 +       LOCK_CNT_DEC(spin_locked);
21111 +
21112 +       spin_unlock(&(node->guard));
21113 +}
21114 +
21115 +static inline int jnode_is_in_deleteset(const jnode * node)
21116 +{
21117 +       return JF_ISSET(node, JNODE_RELOC);
21118 +}
21119 +
21120 +extern int init_jnodes(void);
21121 +extern void done_jnodes(void);
21122 +
21123 +/* Jnode routines */
21124 +extern jnode *jalloc(void);
21125 +extern void jfree(jnode * node) NONNULL;
21126 +extern jnode *jclone(jnode *);
21127 +extern jnode *jlookup(reiser4_tree * tree,
21128 +                     oid_t objectid, unsigned long ind) NONNULL;
21129 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
21130 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
21131 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
21132 +void jnode_attach_page(jnode * node, struct page *pg);
21133 +
21134 +void unhash_unformatted_jnode(jnode *);
21135 +extern jnode *page_next_jnode(jnode * node) NONNULL;
21136 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
21137 +extern void jnode_make_dirty(jnode * node) NONNULL;
21138 +extern void jnode_make_clean(jnode * node) NONNULL;
21139 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
21140 +extern void jnode_make_wander(jnode *) NONNULL;
21141 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
21142 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
21143 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
21144 +
21145 +/**
21146 + * jnode_get_block
21147 + * @node: jnode to query
21148 + *
21149 + */
21150 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
21151 +{
21152 +       assert("nikita-528", node != NULL);
21153 +
21154 +       return &node->blocknr;
21155 +}
21156 +
21157 +/**
21158 + * jnode_set_block
21159 + * @node: jnode to update
21160 + * @blocknr: new block nr
21161 + */
21162 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
21163 +{
21164 +       assert("nikita-2020", node != NULL);
21165 +       assert("umka-055", blocknr != NULL);
21166 +       node->blocknr = *blocknr;
21167 +}
21168 +
21169 +
21170 +/* block number for IO. As written here this is always jnode_get_block();
21171 + * NOTE(review): no emergency flush (eflush) special case is visible in this
21172 + * function. The jnode spin lock (node->guard) has to be held by the caller. */
21173 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
21174 +{
21175 +       assert("nikita-2768", node != NULL);
21176 +       assert_spin_locked(&(node->guard));
21177 +
21178 +       return jnode_get_block(node);
21179 +}
21180 +
21181 +/* Jnode flush interface. */
21182 +extern reiser4_blocknr_hint *pos_hint(flush_pos_t * pos);
21183 +extern flush_queue_t *pos_fq(flush_pos_t * pos);
21184 +
21185 +/* FIXME-VS: these are used in plugin/item/extent.c */
21186 +
21187 +/* does extent_get_block have to be called */
21188 +#define jnode_mapped(node)     JF_ISSET (node, JNODE_MAPPED)
21189 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
21190 +
21191 +/* the node should be converted during flush squalloc phase */
21192 +#define jnode_convertible(node)        JF_ISSET (node, JNODE_CONVERTIBLE)
21193 +#define jnode_set_convertible(node)    JF_SET (node, JNODE_CONVERTIBLE)
21194 +
21195 +/* Macros to convert from jnode to znode, znode to jnode.  These are macros
21196 +   because C doesn't allow overloading of const prototypes. */
21197 +#define ZJNODE(x) (& (x) -> zjnode)
21198 +#define JZNODE(x)                                              \
21199 +({                                                             \
21200 +       typeof (x) __tmp_x;                                     \
21201 +                                                               \
21202 +       __tmp_x = (x);                                          \
21203 +       assert ("jmacd-1300", jnode_is_znode (__tmp_x));        \
21204 +       (znode*) __tmp_x;                                       \
21205 +})
21206 +
21207 +extern int jnodes_tree_init(reiser4_tree * tree);
21208 +extern int jnodes_tree_done(reiser4_tree * tree);
21209 +
21210 +#if REISER4_DEBUG
21211 +
21212 +extern int znode_is_any_locked(const znode * node);
21213 +extern void jnode_list_remove(jnode * node);
21214 +
21215 +#else
21216 +
21217 +#define jnode_list_remove(node) noop
21218 +
21219 +#endif
21220 +
21221 +int znode_is_root(const znode * node) NONNULL;
21222 +
21223 +/* bump reference counter on @node */
21224 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
21225 +{
21226 +       assert("nikita-1911", node != NULL);
21227 +
21228 +       atomic_inc(&node->x_count);
21229 +       LOCK_CNT_INC(x_refs);
21230 +}
21231 +
21232 +static inline void dec_x_ref(jnode * node)
21233 +{
21234 +       assert("nikita-3215", node != NULL);
21235 +       assert("nikita-3216", atomic_read(&node->x_count) > 0);
21236 +
21237 +       atomic_dec(&node->x_count);
21238 +       assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
21239 +       LOCK_CNT_DEC(x_refs);
21240 +}
21241 +
21242 +/* jref() - increase counter of references to jnode/znode (x_count) */
21243 +static inline jnode *jref(jnode * node)
21244 +{
21245 +       assert("jmacd-508", (node != NULL) && !IS_ERR(node));
21246 +       add_x_ref(node);
21247 +       return node;
21248 +}
21249 +
21250 +/* get the page of jnode */
21251 +static inline struct page *jnode_page(const jnode * node)
21252 +{
21253 +       return node->pg;
21254 +}
21255 +
21256 +/* return pointer to jnode data */
21257 +static inline char *jdata(const jnode * node)
21258 +{
21259 +       assert("nikita-1415", node != NULL);
21260 +       assert("nikita-3198", jnode_page(node) != NULL);
21261 +       return node->data;
21262 +}
21263 +
21264 +static inline int jnode_is_loaded(const jnode * node)
21265 +{
21266 +       assert("zam-506", node != NULL);
21267 +       return atomic_read(&node->d_count) > 0;
21268 +}
21269 +
21270 +extern void page_detach_jnode(struct page *page,
21271 +                             struct address_space *mapping,
21272 +                             unsigned long index) NONNULL;
21273 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
21274 +
21275 +static inline void jnode_set_reloc(jnode * node)
21276 +{
21277 +       assert("nikita-2431", node != NULL);
21278 +       assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
21279 +       JF_SET(node, JNODE_RELOC);
21280 +}
21281 +
21282 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
21283 +
21284 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
21285 +
21286 +static inline int jload(jnode *node)
21287 +{
21288 +       return jload_gfp(node, get_gfp_mask(), 1);
21289 +}
21290 +
21291 +extern int jinit_new(jnode *, gfp_t) NONNULL;
21292 +extern int jstartio(jnode *) NONNULL;
21293 +
21294 +extern void jdrop(jnode *) NONNULL;
21295 +extern int jwait_io(jnode *, int rw) NONNULL;
21296 +
21297 +void jload_prefetch(jnode *);
21298 +
21299 +extern jnode *alloc_io_head(const reiser4_block_nr * block) NONNULL;
21300 +extern void drop_io_head(jnode * node) NONNULL;
21301 +
21302 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
21303 +{
21304 +       assert("nikita-2691", node != NULL);
21305 +       return node->tree;
21306 +}
21307 +
21308 +extern void pin_jnode_data(jnode *);
21309 +extern void unpin_jnode_data(jnode *);
21310 +
21311 +static inline jnode_type jnode_get_type(const jnode * node)
21312 +{
21313 +       static const unsigned long state_mask = (1UL << JNODE_TYPE_1) |
21314 +           (1UL << JNODE_TYPE_2) | (1UL << JNODE_TYPE_3);
21315 +       /* read-only decode table, indexed by the three type bits of ->state */
21316 +       static const jnode_type mask_to_type[] = {
21317 +               /*  JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
21318 +
21319 +               /* 000 */
21320 +               [0] = JNODE_FORMATTED_BLOCK,
21321 +               /* 001 */
21322 +               [1] = JNODE_UNFORMATTED_BLOCK,
21323 +               /* 010 */
21324 +               [2] = JNODE_BITMAP,
21325 +               /* 011 */
21326 +               [3] = LAST_JNODE_TYPE,  /*invalid */
21327 +               /* 100 */
21328 +               [4] = JNODE_INODE,
21329 +               /* 101 */
21330 +               [5] = LAST_JNODE_TYPE,  /* invalid */
21331 +               /* 110 */
21332 +               [6] = JNODE_IO_HEAD,
21333 +               /* 111 */
21334 +               [7] = LAST_JNODE_TYPE,  /* invalid */
21335 +       };
21336 +       /* relies on the type bits being adjacent, JNODE_TYPE_1 the lowest */
21337 +       return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
21338 +}
21339 +
21340 +/* returns true if node is a znode */
21341 +static inline int jnode_is_znode(const jnode * node)
21342 +{
21343 +       return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21344 +}
21345 +
21346 +static inline int jnode_is_flushprepped(jnode * node)
21347 +{
21348 +       assert("jmacd-78212", node != NULL);
21349 +       assert_spin_locked(&(node->guard));
21350 +       return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
21351 +               JF_ISSET(node, JNODE_OVRWR);
21352 +}
21353 +
21354 +/* Return true if @node has already been processed by the squeeze and allocate
21355 +   process.  This implies the block address has been finalized for the
21356 +   duration of this atom (or it is clean and will remain in place).  If this
21357 +   returns true you may use the block number as a hint. */
21358 +static inline int jnode_check_flushprepped(jnode * node)
21359 +{
21360 +       int result;
21361 +
21362 +       /* It must be clean or relocated or wandered.  New allocations are set to relocate. */
21363 +       spin_lock_jnode(node);
21364 +       result = jnode_is_flushprepped(node);
21365 +       spin_unlock_jnode(node);
21366 +       return result;
21367 +}
21368 +
21369 +/* returns true if node is unformatted */
21370 +static inline int jnode_is_unformatted(const jnode * node)
21371 +{
21372 +       assert("jmacd-0123", node != NULL);
21373 +       return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21374 +}
21375 +
21376 +/* returns true if node represents a cluster cache page */
21377 +static inline int jnode_is_cluster_page(const jnode * node)
21378 +{
21379 +       assert("edward-50", node != NULL);
21380 +       return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21381 +}
21382 +
21383 +/* returns true if node is builtin inode's jnode (type JNODE_INODE) */
21384 +static inline int jnode_is_inode(const jnode * node)
21385 +{
21386 +       assert("vs-1240", node != NULL);
21387 +       return jnode_get_type(node) == JNODE_INODE;
21388 +}
21389 +
21390 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21391 +{
21392 +       assert("nikita-2367", type < LAST_JNODE_TYPE);
21393 +       return jnode_plugin_by_id((reiser4_plugin_id) type);
21394 +}
21395 +
21396 +static inline jnode_plugin *jnode_ops(const jnode * node)
21397 +{
21398 +       assert("nikita-2366", node != NULL);
21399 +
21400 +       return jnode_ops_of(jnode_get_type(node));
21401 +}
21402 +
21403 +/* Get the index of a block. */
21404 +static inline unsigned long jnode_get_index(jnode * node)
21405 +{
21406 +       return jnode_ops(node)->index(node);
21407 +}
21408 +
21409 +/* return true if "node" is the root */
21410 +static inline int jnode_is_root(const jnode * node)
21411 +{
21412 +       return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21413 +}
21414 +
21415 +extern struct address_space *mapping_jnode(const jnode * node);
21416 +extern unsigned long index_jnode(const jnode * node);
21417 +
21418 +static inline void jput(jnode * node);
21419 +extern void jput_final(jnode * node);
21420 +
21421 +/* bump data counter on @node */
21422 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
21423 +{
21424 +       assert("nikita-1962", node != NULL);
21425 +
21426 +       atomic_inc(&node->d_count);
21427 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
21428 +               LOCK_CNT_INC(d_refs);
21429 +}
21430 +
21431 +/* jput() - decrement x_count reference counter on jnode/znode @node.
21432 +
21433 +   Count may drop to 0, jnode stays in cache until memory pressure causes the
21434 +   eviction of its page. The c_count variable also ensures that children are
21435 +   pressured out of memory before the parent. The jnode remains hashed as
21436 +   long as the VM allows its page to stay in memory.
21437 +*/
21438 +static inline void jput(jnode * node)
21439 +{
21440 +       assert("jmacd-509", node != NULL);
21441 +       assert("jmacd-510", atomic_read(&node->x_count) > 0);
21442 +       assert("zam-926", schedulable());
21443 +       LOCK_CNT_DEC(x_refs);
21444 +
21445 +       rcu_read_lock();
21446 +       /* No other lock is needed here--jput_final() uses RCU.  NOTE(review):
21447 +        * the last-reference path below does not call rcu_read_unlock()
21448 +        * itself, so jput_final() presumably does -- confirm. */
21449 +       if (unlikely(atomic_dec_and_test(&node->x_count))) {
21450 +               jput_final(node);
21451 +       } else
21452 +               rcu_read_unlock();
21453 +       assert("nikita-3473", schedulable());
21454 +}
21455 +
21456 +extern void jrelse(jnode * node);
21457 +extern void jrelse_tail(jnode * node);
21458 +
21459 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21460 +
21461 +/* resolve race with jput */
21462 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21463 +{
21464 +       if (unlikely(JF_ISSET(node, JNODE_RIP)))
21465 +               node = jnode_rip_sync(tree, node);
21466 +       return node;
21467 +}
21468 +
21469 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21470 +
21471 +#if REISER4_DEBUG
21472 +extern int jnode_invariant_f(const jnode *node, char const **msg);
21473 +#endif
21474 +
21475 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21476 +
21477 +/* __JNODE_H__ */
21478 +#endif
21479 +
21480 +/* Make Linus happy.
21481 +   Local variables:
21482 +   c-indentation-style: "K&R"
21483 +   mode-name: "LC"
21484 +   c-basic-offset: 8
21485 +   tab-width: 8
21486 +   fill-column: 120
21487 +   End:
21488 +*/
21489 diff --git a/fs/reiser4/kassign.c b/fs/reiser4/kassign.c
21490 new file mode 100644
21491 index 0000000..68314dd
21492 --- /dev/null
21493 +++ b/fs/reiser4/kassign.c
21494 @@ -0,0 +1,659 @@
21495 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21496 + * reiser4/README */
21497 +
21498 +/* Key assignment policy implementation */
21499 +
21500 +/*
21501 + * In reiser4 every piece of file system data and meta-data has a key. Keys
21502 + * are used to store information in and retrieve it from reiser4 internal
21503 + * tree. In addition to this, keys define _ordering_ of all file system
21504 + * information: things having close keys are placed into the same or
21505 + * neighboring (in the tree order) nodes of the tree. As our block allocator
21506 + * tries to respect tree order (see flush.c), keys also define order in which
21507 + * things are laid out on the disk, and hence, affect performance directly.
21508 + *
21509 + * Obviously, assignment of keys to data and meta-data should be consistent
21510 + * across whole file system. Algorithm that calculates a key for a given piece
21511 + * of data or meta-data is referred to as "key assignment".
21512 + *
21513 + * Key assignment is too expensive to be implemented as a plugin (that is,
21514 + * with an ability to support different key assignment schemas in the same
21515 + * compiled kernel image). As a compromise, all key-assignment functions and
21516 + * data-structures are collected in this single file, so that modifications to
21517 + * key assignment algorithm can be localized. Additional changes may be
21518 + * required in key.[ch].
21519 + *
21520 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21521 + * may guess, there is "Plan B" too.
21522 + *
21523 + */
21524 +
21525 +/*
21526 + * Additional complication with key assignment implementation is a requirement
21527 + * to support different key length.
21528 + */
21529 +
21530 +/*
21531 + *                   KEY ASSIGNMENT: PLAN A, LONG KEYS.
21532 + *
21533 + * DIRECTORY ITEMS
21534 + *
21535 + *  |       60     | 4 | 7 |1|   56        |        64        |        64       |
21536 + *  +--------------+---+---+-+-------------+------------------+-----------------+
21537 + *  |    dirid     | 0 | F |H|  prefix-1   |    prefix-2      |  prefix-3/hash  |
21538 + *  +--------------+---+---+-+-------------+------------------+-----------------+
21539 + *  |                  |                   |                  |                 |
21540 + *  |    8 bytes       |      8 bytes      |     8 bytes      |     8 bytes     |
21541 + *
21542 + * dirid         objectid of directory this item is for
21543 + *
21544 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
21545 + *
21546 + * H             1 if last 8 bytes of the key contain hash,
21547 + *               0 if last 8 bytes of the key contain prefix-3
21548 + *
21549 + * prefix-1      first 7 characters of file name.
21550 + *               Padded by zeroes if name is not long enough.
21551 + *
21552 + * prefix-2      next 8 characters of the file name.
21553 + *
21554 + * prefix-3      next 8 characters of the file name.
21555 + *
21556 + * hash          hash of the rest of file name (i.e., portion of file
21557 + *               name not included into prefix-1 and prefix-2).
21558 + *
21559 + * File names not longer than 23 (== 7 + 8 + 8) characters are completely
21560 + * encoded in the key. Such file names are called "short". They are
21561 + * distinguished by H bit set to 0 in the key.
21562 + *
21563 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21564 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21565 + * key. Last 8 bytes of the key are occupied by hash of the remaining
21566 + * characters of the name.
21567 + *
21568 + * This key assignment reaches following important goals:
21569 + *
21570 + *     (1) directory entries are sorted in approximately lexicographical
21571 + *     order.
21572 + *
21573 + *     (2) collisions (when multiple directory items have the same key), while
21574 + *     principally unavoidable in a tree with fixed length keys, are rare.
21575 + *
21576 + * STAT DATA
21577 + *
21578 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
21579 + *  +--------------+---+-----------------+---+--------------+-----------------+
21580 + *  |  locality id | 1 |    ordering     | 0 |  objectid    |        0        |
21581 + *  +--------------+---+-----------------+---+--------------+-----------------+
21582 + *  |                  |                 |                  |                 |
21583 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
21584 + *
21585 + * locality id     object id of a directory where first name was created for
21586 + *                 the object
21587 + *
21588 + * ordering        copy of second 8-byte portion of the key of directory
21589 + *                 entry for the first name of this object. Ordering has a form
21590 + *                         {
21591 + *                                 fibration :7;
21592 + *                                 h         :1;
21593 + *                                 prefix1   :56;
21594 + *                         }
21595 + *                 see description of key for directory entry above.
21596 + *
21597 + * objectid        object id for this object
21598 + *
21599 + * This key assignment policy is designed to keep stat-data in the same order
21600 + * as corresponding directory items, thus speeding up readdir/stat types of
21601 + * workload.
21602 + *
21603 + * FILE BODY
21604 + *
21605 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
21606 + *  +--------------+---+-----------------+---+--------------+-----------------+
21607 + *  |  locality id | 4 |    ordering     | 0 |  objectid    |      offset     |
21608 + *  +--------------+---+-----------------+---+--------------+-----------------+
21609 + *  |                  |                 |                  |                 |
21610 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
21611 + *
21612 + * locality id     object id of a directory where first name was created for
21613 + *                 the object
21614 + *
21615 + * ordering        the same as in the key of stat-data for this object
21616 + *
21617 + * objectid        object id for this object
21618 + *
21619 + * offset          logical offset from the beginning of this file.
21620 + *                 Measured in bytes.
21621 + *
21622 + *
21623 + *                   KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21624 + *
21625 + * DIRECTORY ITEMS
21626 + *
21627 + *  |       60     | 4 | 7 |1|   56        |        64       |
21628 + *  +--------------+---+---+-+-------------+-----------------+
21629 + *  |    dirid     | 0 | F |H|  prefix-1   |  prefix-2/hash  |
21630 + *  +--------------+---+---+-+-------------+-----------------+
21631 + *  |                  |                   |                 |
21632 + *  |    8 bytes       |      8 bytes      |     8 bytes     |
21633 + *
21634 + * dirid         objectid of directory this item is for
21635 + *
21636 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
21637 + *
21638 + * H             1 if last 8 bytes of the key contain hash,
21639 + *               0 if last 8 bytes of the key contain prefix-2
21640 + *
21641 + * prefix-1      first 7 characters of file name.
21642 + *               Padded by zeroes if name is not long enough.
21643 + *
21644 + * prefix-2      next 8 characters of the file name.
21645 + *
21646 + * hash          hash of the rest of file name (i.e., portion of file
21647 + *               name not included into prefix-1).
21648 + *
21649 + * File names not longer than 15 (== 7 + 8) characters are completely encoded
21650 + * in the key. Such file names are called "short". They are distinguished by H
21651 + * bit set to 0 in the key.
21652 + *
21653 + * Other file names are "long". For long name, H bit is 1, and first 7
21654 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21655 + * key are occupied by hash of the remaining characters of the name.
21656 + *
21657 + * STAT DATA
21658 + *
21659 + *  |       60     | 4 | 4 |     60       |        64       |
21660 + *  +--------------+---+---+--------------+-----------------+
21661 + *  |  locality id | 1 | 0 |  objectid    |        0        |
21662 + *  +--------------+---+---+--------------+-----------------+
21663 + *  |                  |                  |                 |
21664 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
21665 + *
21666 + * locality id     object id of a directory where first name was created for
21667 + *                 the object
21668 + *
21669 + * objectid        object id for this object
21670 + *
21671 + * FILE BODY
21672 + *
21673 + *  |       60     | 4 | 4 |     60       |        64       |
21674 + *  +--------------+---+---+--------------+-----------------+
21675 + *  |  locality id | 4 | 0 |  objectid    |      offset     |
21676 + *  +--------------+---+---+--------------+-----------------+
21677 + *  |                  |                  |                 |
21678 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
21679 + *
21680 + * locality id     object id of a directory where first name was created for
21681 + *                 the object
21682 + *
21683 + * objectid        object id for this object
21684 + *
21685 + * offset          logical offset from the beginning of this file.
21686 + *                 Measured in bytes.
21687 + *
21688 + *
21689 + */
21690 +
21691 +#include "debug.h"
21692 +#include "key.h"
21693 +#include "kassign.h"
21694 +#include "vfs_ops.h"
21695 +#include "inode.h"
21696 +#include "super.h"
21697 +#include "dscale.h"
21698 +
21699 +#include <linux/types.h>       /* for __u??  */
21700 +#include <linux/fs.h>          /* for struct super_block, etc  */
21701 +
21702 +/* bitmask for H bit (see comment at the beginning of this file) */
21703 +static const __u64 longname_mark = 0x0100000000000000ull;
21704 +/* bitmask for F and H portions of the key. */
21705 +static const __u64 fibration_mask = 0xff00000000000000ull;
21706 +
21707 +/* return true if name is not completely encoded in @key */
21708 +int is_longname_key(const reiser4_key * key)
21709 +{
21710 +       __u64 highpart;
21711 +
21712 +       assert("nikita-2863", key != NULL);
21713 +       if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21714 +               print_key("oops", key);
21715 +       assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21716 +
21717 +       if (REISER4_LARGE_KEY)
21718 +               highpart = get_key_ordering(key);
21719 +       else
21720 +               highpart = get_key_objectid(key);
21721 +
21722 +       return (highpart & longname_mark) ? 1 : 0;
21723 +}
21724 +
21725 +/* return true if @name is too long to be completely encoded in the key */
21726 +int is_longname(const char *name UNUSED_ARG, int len)
21727 +{
21728 +       if (REISER4_LARGE_KEY)
21729 +               return len > 23;
21730 +       else
21731 +               return len > 15;
21732 +}
21733 +
21734 +/* code ascii string into __u64.
21735 +
21736 +   Put characters of @name into result (@str) one after another starting
21737 +   from @start_idx-th highest (arithmetically) byte. This produces
21738 +   endian-safe encoding. memcpy(2) will not do. Bytes past the end of
21739 +   @name are left zero; an empty @name yields 0.
21740 +*/
21741 +static __u64 pack_string(const char *name /* string to encode */ ,
21742 +                        int start_idx  /* highest byte in result from
21743 +                                        * which to start encoding */ )
21744 +{
21745 +       unsigned i;
21746 +       __u64 str;
21747 +
21748 +       str = 0;
21749 +       for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21750 +               str <<= 8;
21751 +               str |= (unsigned char)name[i];
21752 +       }
21753 +       /* nothing packed: result is 0, and a shift by full width is undefined */
21754 +       return i ? str << ((sizeof str - i - start_idx) << 3) : 0;
21755 +}
21756 +
21757 +/* opposite to pack_string(). Decodes non-zero bytes of @value, highest first,
21758 + * into @buf, NUL-terminates it and returns pointer to that terminating NUL */
21759 +char *unpack_string(__u64 value, char *buf)
21760 +{
21761 +       do {
21762 +               *buf = value >> (64 - 8);
21763 +               if (*buf)
21764 +                       ++buf;
21765 +               value <<= 8;
21766 +       } while (value != 0);
21767 +       *buf = 0;
21768 +       return buf;
21769 +}
21770 +
21771 +/* obtain name encoded in @key and store it in @buf */
21772 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21773 +{
21774 +       char *c;
21775 +
21776 +       assert("nikita-2868", !is_longname_key(key));
21777 +
21778 +       c = buf;
21779 +       if (REISER4_LARGE_KEY) {
21780 +               c = unpack_string(get_key_ordering(key) & ~fibration_mask, c);
21781 +               c = unpack_string(get_key_fulloid(key), c);
21782 +       } else
21783 +               c = unpack_string(get_key_fulloid(key) & ~fibration_mask, c);
21784 +       unpack_string(get_key_offset(key), c);
21785 +       return buf;
21786 +}
21787 +
21788 +/**
21789 + * complete_entry_key - calculate entry key by name
21790 + * @dir: directory where entry is (or will be) in
21791 + * @name: name to calculate key of
21792 + * @len: length of name
21793 + * @result: place to store result in
21794 + *
21795 + * Sets fields of entry key @result which depend on file name.
21796 + * When REISER4_LARGE_KEY is non-zero three fields of @result are set: ordering,
21797 + * objectid and offset. Otherwise, objectid and offset are set.
21798 + */
21799 +void complete_entry_key(const struct inode *dir, const char *name,
21800 +                       int len, reiser4_key *result)
21801 +{
21802 +#if REISER4_LARGE_KEY
21803 +       __u64 ordering;
21804 +       __u64 objectid;
21805 +       __u64 offset;
21806 +
21807 +       assert("nikita-1139", dir != NULL);
21808 +       assert("nikita-1142", result != NULL);
21809 +       assert("nikita-2867", strlen(name) == len);
21810 +
21811 +       /*
21812 +        * key allocation algorithm for directory entries in case of large
21813 +        * keys:
21814 +        *
21815 +        * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21816 +        * characters into ordering field of key, next 8 characters (if any)
21817 +        * into objectid field of key and next 8 ones (if any) into offset
21818 +        * field of key
21819 +        *
21820 +        * If file name is longer than 23 characters, put first 7 characters
21821 +        * into key's ordering, next 8 to objectid and hash of remaining
21822 +        * characters into offset field.
21823 +        *
21824 +        * To distinguish above cases, in latter set up unused high bit in
21825 +        * ordering field.
21826 +        */
21827 +
21828 +       /* [0-6] characters to ordering */
21829 +       ordering = pack_string(name, 1);
21830 +       if (len > 7) {
21831 +               /* [7-14] characters to objectid */
21832 +               objectid = pack_string(name + 7, 0);
21833 +               if (len > 15) {
21834 +                       if (len <= 23) {
21835 +                               /* [15-22] characters to offset */
21836 +                               offset = pack_string(name + 15, 0);
21837 +                       } else {
21838 +                               /* note in a key the fact that offset contains hash. */
21839 +                               ordering |= longname_mark;
21840 +
21841 +                               /* offset is the hash of the file name's tail. */
21842 +                               offset = inode_hash_plugin(dir)->hash(name + 15,
21843 +                                                                     len - 15);
21844 +                       }
21845 +               } else {
21846 +                       offset = 0ull;
21847 +               }
21848 +       } else {
21849 +               objectid = 0ull;
21850 +               offset = 0ull;
21851 +       }
21852 +
21853 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21854 +       ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21855 +
21856 +       set_key_ordering(result, ordering);
21857 +       set_key_fulloid(result, objectid);
21858 +       set_key_offset(result, offset);
21859 +       return;
21860 +
21861 +#else
21862 +       __u64 objectid;
21863 +       __u64 offset;
21864 +
21865 +       assert("nikita-1139", dir != NULL);
21866 +       assert("nikita-1142", result != NULL);
21867 +       assert("nikita-2867", strlen(name) == len);
21868 +
21869 +       /*
21870 +        * key allocation algorithm for directory entries in case of not large
21871 +        * keys:
21872 +        *
21873 +        * If name is not longer than 7 + 8 = 15 characters, put first 7
21874 +        * characters into objectid field of key, next 8 characters (if any)
21875 +        * into offset field of key
21876 +        *
21877 +        * If file name is longer than 15 characters, put first 7 characters
21878 +        * into key's objectid, and hash of remaining characters into offset
21879 +        * field.
21880 +        *
21881 +        * To distinguish above cases, in latter set up unused high bit in
21882 +        * objectid field.
21883 +        */
21884 +
21885 +       /* [0-6] characters to objectid */
21886 +       objectid = pack_string(name, 1);
21887 +       if (len > 7) {
21888 +               if (len <= 15) {
21889 +                       /* [7-14] characters to offset */
21890 +                       offset = pack_string(name + 7, 0);
21891 +               } else {
21892 +                       /* note in a key the fact that offset contains hash. */
21893 +                       objectid |= longname_mark;
21894 +
21895 +                       /* offset is the hash of the file name's tail. */
21896 +                       offset = inode_hash_plugin(dir)->hash(name + 7,
21897 +                                                             len - 7);
21898 +               }
21899 +       } else
21900 +               offset = 0ull;
21901 +
21902 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21903 +       objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21904 +
21905 +       set_key_fulloid(result, objectid);
21906 +       set_key_offset(result, offset);
21907 +       return;
21908 +#endif                         /* ! REISER4_LARGE_KEY */
21909 +}
21910 +
21911 +/* true, if @key is the key of "." */
21912 +int is_dot_key(const reiser4_key * key /* key to check */ )
21913 +{
21914 +       assert("nikita-1717", key != NULL);
21915 +       assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21916 +       return
21917 +           (get_key_ordering(key) == 0ull) &&
21918 +           (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21919 +}
21920 +
21921 +/* build key for stat-data.
21922 +
21923 +   Fill @result with the key of stat-data of @target (locality, ordering and
21924 +   objectid taken from the inode, type KEY_SD_MINOR, offset 0) and return
21925 +   @result. This should become sd plugin method in the future.
21926 +*/
21927 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21928 +                         reiser4_key * result  /* resulting key of @target
21929 +                                                  stat-data */ )
21930 +{
21931 +       assert("nikita-261", result != NULL);
21932 +
21933 +       reiser4_key_init(result);
21934 +       set_key_locality(result, reiser4_inode_data(target)->locality_id);
21935 +       set_key_ordering(result, get_inode_ordering(target));
21936 +       set_key_objectid(result, get_inode_oid(target));
21937 +       set_key_type(result, KEY_SD_MINOR);
21938 +       set_key_offset(result, (__u64) 0);
21939 +       return result;
21940 +}
21941 +
21942 +/* encode part of key into &obj_key_id
21943 +
21944 +   This copies into @id the leading sizeof(*id) bytes of @key, which is
21945 +   sufficient to restore @key later, given that the latter is key of object (key of stat-data).
21946 +
21947 +   See &obj_key_id
21948 +*/
21949 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21950 +                    obj_key_id * id /* id where key is encoded in */ )
21951 +{
21952 +       assert("nikita-1151", key != NULL);
21953 +       assert("nikita-1152", id != NULL);
21954 +
21955 +       memcpy(id, key, sizeof *id);    /* byte copy, so @id needs no particular alignment */
21956 +       return 0;                       /* the only value this function returns */
21957 +}
21958 +
21959 +/* encode reference to @obj in @id.
21960 +
21961 +   This is like build_obj_key_id() above, but takes inode as parameter: the stat-data key of @obj is built with build_sd_key() and then encoded. Always returns 0. */
21962 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21963 +                      obj_key_id * id /* result */ )
21964 +{
21965 +       reiser4_key sdkey;      /* temporary: full stat-data key of @obj */
21966 +
21967 +       assert("nikita-1166", obj != NULL);
21968 +       assert("nikita-1167", id != NULL);
21969 +
21970 +       build_sd_key(obj, &sdkey);
21971 +       build_obj_key_id(&sdkey, id);
21972 +       return 0;
21973 +}
21974 +
21975 +/* decode @id back into @key
21976 +
21977 +   Restore key of object stat-data from @id. This is dual to
21978 +   build_obj_key_id() above. Always returns 0.
21979 +*/
21980 +int extract_key_from_id(const obj_key_id * id  /* object key id to extract key
21981 +                                                * from */ ,
21982 +                       reiser4_key * key /* result */ )
21983 +{
21984 +       assert("nikita-1153", id != NULL);
21985 +       assert("nikita-1154", key != NULL);
21986 +
21987 +       reiser4_key_init(key);  /* bytes of @key beyond sizeof(*id) stay zero */
21988 +       memcpy(key, id, sizeof *id);
21989 +       return 0;
21990 +}
21991 +
21992 +/* extract objectid of directory from key of directory entry within said
21993 +   directory. This is the locality field of @de_key.
21994 +   */
21995 +oid_t extract_dir_id_from_key(const reiser4_key * de_key       /* key of
21996 +                                                                * directory
21997 +                                                                * entry */ )
21998 +{
21999 +       assert("nikita-1314", de_key != NULL);
22000 +       return get_key_locality(de_key);
22001 +}
22002 +
22003 +/* encode into @id key of directory entry named @name in @dir.
22004 +
22005 +   Encode into @id information sufficient to later distinguish directory
22006 +   entries within the same directory. This is not whole key, because all
22007 +   directory entries within directory item share locality which is equal
22008 +   to objectid of their directory. Returns what build_de_id_by_key() returns.
22009 +
22010 +*/
22011 +int build_de_id(const struct inode *dir /* inode of directory */ ,
22012 +               const struct qstr *name /* name to be given to @obj by
22013 +                                        * directory entry being
22014 +                                        * constructed */ ,
22015 +               de_id * id /* short key of directory entry */ )
22016 +{
22017 +       reiser4_key key;        /* full entry key, built only to be shortened again */
22018 +
22019 +       assert("nikita-1290", dir != NULL);
22020 +       assert("nikita-1292", id != NULL);
22021 +
22022 +       /* NOTE-NIKITA this is suboptimal. */
22023 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
22024 +       return build_de_id_by_key(&key, id);
22025 +}
22026 +
22027 +/* encode into @id key of directory entry, given its full key @entry_key.
22028 +
22029 +   Encode into @id information sufficient to later distinguish directory
22030 +   entries within the same directory. This is not whole key, because all
22031 +   directory entries within directory item share locality which is equal
22032 +   to objectid of their directory. Always returns 0.
22033 +
22034 +*/
22035 +int build_de_id_by_key(const reiser4_key * entry_key   /* full key of directory
22036 +                                                        * entry */ ,
22037 +                      de_id * id /* short key of directory entry */ )
22038 +{
22039 +       memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);      /* skip the first 64 bit element of the key */
22040 +       return 0;
22041 +}
22042 +
22043 +/* restore from @id key of directory entry.
22044 +
22045 +   Function dual to build_de_id(): given @id and locality, build full
22046 +   key of directory entry within directory item. Always returns 0.
22047 +
22048 +*/
22049 +int extract_key_from_de_id(const oid_t locality        /* locality of directory
22050 +                                                * entry */ ,
22051 +                          const de_id * id /* directory entry id */ ,
22052 +                          reiser4_key * key /* result */ )
22053 +{
22054 +       /* no need to initialise key here: all fields are overwritten. NOTE(review): set_key_locality() still reads the first element before masking it; the locality and type setters together replace all of its 64 bits */
22055 +       memcpy(((__u64 *) key) + 1, id, sizeof *id);    /* elements after the first come from @id */
22056 +       set_key_locality(key, locality);
22057 +       set_key_type(key, KEY_FILE_NAME_MINOR);
22058 +       return 0;
22059 +}
22060 +
22061 +/* compare two &de_id's by expanding both into full keys with locality 0 and passing those to keycmp() */
22062 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
22063 +               const de_id * id2 /* second &de_id to compare */ )
22064 +{
22065 +       /* NOTE-NIKITA ugly implementation */
22066 +       reiser4_key k1;
22067 +       reiser4_key k2;
22068 +
22069 +       extract_key_from_de_id((oid_t) 0, id1, &k1);    /* same locality for both keys */
22070 +       extract_key_from_de_id((oid_t) 0, id2, &k2);
22071 +       return keycmp(&k1, &k2);
22072 +}
22073 +
22074 +/* compare &de_id with key: elements 1 and up of @key are compared with @id, the first element of @key is ignored */
22075 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
22076 +                   const reiser4_key * key /* key to compare */ )
22077 +{
22078 +       cmp_t result;
22079 +       reiser4_key *k1;
22080 +
22081 +       k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]); /* pretend key whose element 1 starts at @id; its element 0 is never read below */
22082 +       result = KEY_DIFF_EL(k1, key, 1);
22083 +       if (result == EQUAL_TO) {
22084 +               result = KEY_DIFF_EL(k1, key, 2);
22085 +               if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22086 +                       result = KEY_DIFF_EL(k1, key, 3);       /* fourth element is compared for large keys only */
22087 +               }
22088 +       }
22089 +       return result;
22090 +}
22091 +
22092 +/*
22093 + * return number of bytes necessary to encode @inode identity: dscale_bytes() of its oid and of its locality, plus the size of the ordering if REISER4_LARGE_KEY.
22094 + */
22095 +int inode_onwire_size(const struct inode *inode)
22096 +{
22097 +       int result;
22098 +
22099 +       result = dscale_bytes(get_inode_oid(inode));
22100 +       result += dscale_bytes(get_inode_locality(inode));
22101 +
22102 +       /*
22103 +        * ordering is large (it usually has highest bits set), so it makes
22104 +        * little sense to dscale it.
22105 +        */
22106 +       if (REISER4_LARGE_KEY)
22107 +               result += sizeof(get_inode_ordering(inode));    /* sizeof of the call expression: the call is not evaluated */
22108 +       return result;
22109 +}
22110 +
22111 +/*
22112 + * encode @inode identity at @start: locality, then oid (both through dscale_write()), then, if REISER4_LARGE_KEY, the ordering as an unaligned little-endian 64 bit value. Returns the advanced @start.
22113 + */
22114 +char *build_inode_onwire(const struct inode *inode, char *start)
22115 +{
22116 +       start += dscale_write(start, get_inode_locality(inode));        /* presumably dscale_write() returns the number of bytes it wrote - confirm */
22117 +       start += dscale_write(start, get_inode_oid(inode));
22118 +
22119 +       if (REISER4_LARGE_KEY) {
22120 +               put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
22121 +               start += sizeof(get_inode_ordering(inode));
22122 +       }
22123 +       return start;
22124 +}
22125 +
22126 +/*
22127 + * extract key that was previously encoded by build_inode_onwire() at @addr into @key_id. Returns @addr advanced past the data that was read.
22128 + */
22129 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
22130 +{
22131 +       __u64 val;
22132 +
22133 +       addr += dscale_read(addr, &val);        /* locality was written first */
22134 +       val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;       /* locality and type share one element; type is that of stat-data */
22135 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
22136 +       addr += dscale_read(addr, &val);        /* then the oid */
22137 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
22138 +#if REISER4_LARGE_KEY
22139 +       memcpy(&key_id->ordering, addr, sizeof key_id->ordering);       /* already little-endian on the wire, copied verbatim */
22140 +       addr += sizeof key_id->ordering;
22141 +#endif
22142 +       return addr;
22143 +}
22144 +
22145 +/* Make Linus happy.
22146 +   Local variables:
22147 +   c-indentation-style: "K&R"
22148 +   mode-name: "LC"
22149 +   c-basic-offset: 8
22150 +   tab-width: 8
22151 +   fill-column: 120
22152 +   End:
22153 +*/
22154 diff --git a/fs/reiser4/kassign.h b/fs/reiser4/kassign.h
22155 new file mode 100644
22156 index 0000000..b63b388
22157 --- /dev/null
22158 +++ b/fs/reiser4/kassign.h
22159 @@ -0,0 +1,110 @@
22160 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
22161 + * reiser4/README */
22162 +
22163 +/* Key assignment policy interface. See kassign.c for details. */
22164 +
22165 +#if !defined( __KASSIGN_H__ )
22166 +#define __KASSIGN_H__
22167 +
22168 +#include "forward.h"
22169 +#include "key.h"
22170 +#include "dformat.h"
22171 +
22172 +#include <linux/types.h>       /* for __u??  */
22173 +#include <linux/fs.h>          /* for struct super_block, etc  */
22174 +#include <linux/dcache.h>      /* for struct qstr */
22175 +
22176 +/* key assignment functions */
22177 +
22178 +/* Information from which key of file stat-data can be uniquely
22179 +   restored. This depends on key assignment policy for
22180 +   stat-data. Currently it's enough to store object id and locality id
22181 +   (60+60==120 bits), because minor packing locality and offset of
22182 +   stat-data key are always known constants: KEY_SD_MINOR and 0
22183 +   respectively. For simplicity 4 bits are wasted in each id, and just
22184 +   two 64 bit integers are stored (plus the ordering wrapped in ON_LARGE_KEY() below).
22185 +
22186 +   This field has to be byte-aligned, because we don't want to waste
22187 +   space in directory entries. There is another side of a coin of
22188 +   course: we waste CPU and bus bandwidth instead, by copying data back
22189 +   and forth.
22190 +
22191 +   Next optimization: &obj_key_id is mainly used to address stat data from
22192 +   directory entries. Under the assumption that majority of files have
22193 +   only one name (one hard link) from *the* parent directory it seems reasonable
22194 +   to only store objectid of stat data and take its locality from key of
22195 +   directory item.
22196 +
22197 +   This requires some flag to be added to the &obj_key_id to distinguish
22198 +   between these two cases. Remaining bits in flag byte are then asking to be
22199 +   used to store file type.
22200 +
22201 +   This optimization requires changes in directory item handling code.
22202 +
22203 +*/
22204 +typedef struct obj_key_id {
22205 +       d8 locality[sizeof(__u64)];     /* first key element: locality and type, as raw bytes */
22206 +        ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
22207 +           )
22208 +       d8 objectid[sizeof(__u64)];
22209 +}
22210 +obj_key_id;
22211 +
22212 +/* Information sufficient to uniquely identify directory entry within
22213 +   compressed directory item: the entry key without its first element.
22214 +
22215 +   For alignment issues see &obj_key_id above.
22216 +*/
22217 +typedef struct de_id {
22218 +       ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
22219 +       d8 objectid[sizeof(__u64)];
22220 +       d8 offset[sizeof(__u64)];
22221 +}
22222 +de_id;
22223 +
22224 +extern int inode_onwire_size(const struct inode *obj); /* bytes needed to encode identity of @obj */
22225 +extern char *build_inode_onwire(const struct inode *obj, char *area);  /* returns address past the encoded data */
22226 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);  /* returns address past the decoded data */
22227 +
22228 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
22229 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
22230 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
22231 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
22232 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
22233 +                      de_id * id);
22234 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
22235 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
22236 +                                 reiser4_key * key);
22237 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
22238 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
22239 +
22240 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
22241 +extern void build_entry_key_common(const struct inode *dir,
22242 +                                  const struct qstr *name,
22243 +                                  reiser4_key * result);
22244 +extern void build_entry_key_stable_entry(const struct inode *dir,
22245 +                                        const struct qstr *name,
22246 +                                        reiser4_key * result);
22247 +extern int is_dot_key(const reiser4_key * key);        /* non-zero iff @key is the key of "." */
22248 +extern reiser4_key *build_sd_key(const struct inode *target,
22249 +                                reiser4_key * result); /* returns @result */
22250 +
22251 +extern int is_longname_key(const reiser4_key * key);
22252 +extern int is_longname(const char *name, int len);
22253 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
22254 +extern char *unpack_string(__u64 value, char *buf);
22255 +extern void complete_entry_key(const struct inode *dir, const char *name,
22256 +                              int len, reiser4_key *result);
22257 +
22258 +/* __KASSIGN_H__ */
22259 +#endif
22260 +
22261 +/* Make Linus happy.
22262 +   Local variables:
22263 +   c-indentation-style: "K&R"
22264 +   mode-name: "LC"
22265 +   c-basic-offset: 8
22266 +   tab-width: 8
22267 +   fill-column: 120
22268 +   End:
22269 +*/
22270 diff --git a/fs/reiser4/key.c b/fs/reiser4/key.c
22271 new file mode 100644
22272 index 0000000..962c8bb
22273 --- /dev/null
22274 +++ b/fs/reiser4/key.c
22275 @@ -0,0 +1,137 @@
22276 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22277 +
22278 +/* Key manipulations. */
22279 +
22280 +#include "debug.h"
22281 +#include "key.h"
22282 +#include "super.h"
22283 +#include "reiser4.h"
22284 +
22285 +#include <linux/types.h>       /* for __u??  */
22286 +
22287 +/* Minimal possible key: all components are zero. It is presumed that this is
22288 +   independent of key scheme. Exposed through min_key(). */
22289 +static const reiser4_key MINIMAL_KEY = {
22290 +       .el = {
22291 +               0ull,
22292 +               ON_LARGE_KEY(0ull,)     /* extra element, only when ON_LARGE_KEY() emits its argument */
22293 +               0ull,
22294 +               0ull
22295 +       }
22296 +};
22297 +
22298 +/* Maximal possible key: all components are ~0. It is presumed that this is
22299 +   independent of key scheme. Exposed through max_key(). */
22300 +static const reiser4_key MAXIMAL_KEY = {
22301 +       .el = {
22302 +               __constant_cpu_to_le64(~0ull),
22303 +               ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)    /* extra element, only when ON_LARGE_KEY() emits its argument */
22304 +               __constant_cpu_to_le64(~0ull),
22305 +               __constant_cpu_to_le64(~0ull)
22306 +       }
22307 +};
22308 +
22309 +/* Initialize key: set every byte of @key to zero. */
22310 +void reiser4_key_init(reiser4_key * key /* key to init */ )
22311 +{
22312 +       assert("nikita-1169", key != NULL);
22313 +       memset(key, 0, sizeof *key);
22314 +}
22315 +
22316 +/* minimal possible key in the tree. Return pointer to the static, read-only MINIMAL_KEY. */
22317 +const reiser4_key *min_key(void)
22318 +{
22319 +       return &MINIMAL_KEY;
22320 +}
22321 +
22322 +/* maximum possible key in the tree. Return pointer to the static, read-only MAXIMAL_KEY. */
22323 +const reiser4_key *max_key(void)
22324 +{
22325 +       return &MAXIMAL_KEY;
22326 +}
22327 +
22328 +#if REISER4_DEBUG
22329 +/* debugging aid: return symbolic name of key type as a string literal; nothing is printed here */
22330 +static const char *type_name(unsigned int key_type /* key type */ )
22331 +{
22332 +       switch (key_type) {
22333 +       case KEY_FILE_NAME_MINOR:
22334 +               return "file name";
22335 +       case KEY_SD_MINOR:
22336 +               return "stat data";
22337 +       case KEY_ATTR_NAME_MINOR:
22338 +               return "attr name";
22339 +       case KEY_ATTR_BODY_MINOR:
22340 +               return "attr body";
22341 +       case KEY_BODY_MINOR:
22342 +               return "file body";
22343 +       default:                /* any value that is not a key_minor_locality constant */
22344 +               return "unknown";
22345 +       }
22346 +}
22347 +
22348 +/* debugging aid: print human readable information about key: @prefix, the key fields in hex, then in brackets either the name fragment packed into a directory entry key or the symbolic key type */
22349 +void print_key(const char *prefix /* prefix to print */ ,
22350 +              const reiser4_key * key /* key to print; NULL is reported as "null key" */ )
22351 +{
22352 +       /* turn bold on */
22353 +       /* printf ("\033[1m"); */
22354 +       if (key == NULL)
22355 +               printk("%s: null key\n", prefix);
22356 +       else {
22357 +               if (REISER4_LARGE_KEY)  /* large keys have one more field to show: ordering */
22358 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22359 +                              get_key_locality(key),
22360 +                              get_key_type(key),
22361 +                              get_key_ordering(key),
22362 +                              get_key_band(key),
22363 +                              get_key_objectid(key), get_key_offset(key));
22364 +               else
22365 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22366 +                              get_key_locality(key),
22367 +                              get_key_type(key),
22368 +                              get_key_band(key),
22369 +                              get_key_objectid(key), get_key_offset(key));
22370 +               /*
22371 +                * if this is a key of directory entry, try to decode part of
22372 +                * a name stored in the key, and output it.
22373 +                */
22374 +               if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22375 +                       char buf[DE_NAME_BUF_LEN];
22376 +                       char *c;
22377 +
22378 +                       c = buf;
22379 +                       c = unpack_string(get_key_ordering(key), c);    /* presumably returns where the next fragment goes - confirm against unpack_string() */
22380 +                       unpack_string(get_key_fulloid(key), c);
22381 +                       printk("[%s", buf);
22382 +                       if (is_longname_key(key))
22383 +                               /*
22384 +                                * only part of the name is stored in the key.
22385 +                                */
22386 +                               printk("...]\n");
22387 +                       else {
22388 +                               /*
22389 +                                * whole name is stored in the key.
22390 +                                */
22391 +                               unpack_string(get_key_offset(key), buf);        /* buf is reused from its start for the fragment held in the offset */
22392 +                               printk("%s]\n", buf);
22393 +                       }
22394 +               } else {
22395 +                       printk("[%s]\n", type_name(get_key_type(key)));
22396 +               }
22397 +       }
22398 +       /* turn bold off */
22399 +       /* printf ("\033[m\017"); */
22400 +}
22401 +
22402 +#endif
22403 +
22404 +/* Make Linus happy.
22405 +   Local variables:
22406 +   c-indentation-style: "K&R"
22407 +   mode-name: "LC"
22408 +   c-basic-offset: 8
22409 +   tab-width: 8
22410 +   fill-column: 120
22411 +   End:
22412 +*/
22413 diff --git a/fs/reiser4/key.h b/fs/reiser4/key.h
22414 new file mode 100644
22415 index 0000000..e29df83
22416 --- /dev/null
22417 +++ b/fs/reiser4/key.h
22418 @@ -0,0 +1,384 @@
22419 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22420 +
22421 +/* Declarations of key-related data-structures and operations on keys. */
22422 +
22423 +#if !defined( __REISER4_KEY_H__ )
22424 +#define __REISER4_KEY_H__
22425 +
22426 +#include "dformat.h"
22427 +#include "forward.h"
22428 +#include "debug.h"
22429 +
22430 +#include <linux/types.h>       /* for __u??  */
22431 +
22432 +/* Operations on keys in reiser4 tree */
22433 +
22434 +/* No access to any of these fields shall be done except via a
22435 +   wrapping macro/function, and that wrapping macro/function shall
22436 +   convert to and from little endian (disk) order.  Key comparison operates on values converted to cpu byte order. */
22437 +
22438 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
22439 +   which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
22440 +   within that directory, and not near to the file itself.  It is interesting to consider whether this is the wrong
22441 +   approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
22442 +   right one.  */
22443 +
22444 +/* possible values for minor packing locality, also called key type (4 bits required) */
22445 +typedef enum {
22446 +       /* file name */
22447 +       KEY_FILE_NAME_MINOR = 0,
22448 +       /* stat-data */
22449 +       KEY_SD_MINOR = 1,
22450 +       /* file attribute name */
22451 +       KEY_ATTR_NAME_MINOR = 2,
22452 +       /* file attribute value */
22453 +       KEY_ATTR_BODY_MINOR = 3,
22454 +       /* file body (tail or extent) */
22455 +       KEY_BODY_MINOR = 4,
22456 +} key_minor_locality;
22457 +
22458 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
22459 +   Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
22460 +   and by the repacker.  It is stylistically better to put aggregation information into the key.  Thus, if you want to
22461 +   segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
22462 +   block_alloc.c to check the node type when deciding where to allocate the node.
22463 +
22464 +   The need to randomly displace new directories and large files disturbs this symmetry unfortunately.  However, it
22465 +   should be noted that this is a need that is not clearly established given the existence of a repacker.  Also, in our
22466 +   current implementation tails have a different minor packing locality from extents, and no files have both extents and
22467 +   tails, so maybe symmetry can be had without performance cost after all.  Symmetry is what we ship for now....
22468 +*/
22469 +
22470 +/* Arbitrary major packing localities can be assigned to objects using
22471 +   the reiser4(filenameA/..packing<=some_number) system call.
22472 +
22473 +   In reiser4, the creat() syscall creates a directory
22474 +
22475 +   whose default flow (that which is referred to if the directory is
22476 +   read as a file) is the traditional unix file body.
22477 +
22478 +   whose directory plugin is the 'filedir'
22479 +
22480 +   whose major packing locality is that of the parent of the object created.
22481 +
22482 +   The static_stat item is a particular commonly used directory
22483 +   compression (the one for normal unix files).
22484 +
22485 +   The filedir plugin checks to see if the static_stat item exists.
22486 +   There is a unique key for static_stat.  If yes, then it uses the
22487 +   static_stat item for all of the values that it contains.  The
22488 +   static_stat item contains a flag for each stat it contains which
22489 +   indicates whether one should look outside the static_stat item for its
22490 +   contents.
22491 +*/
22492 +
22493 +/* offset of fields in reiser4_key. Value of each element of this enum
22494 +    is index within key (thought as array of __u64's) where this field
22495 +    is. The "1st/2nd/3rd element" remarks below describe the layout without KEY_ORDERING_INDEX; when ON_LARGE_KEY() emits it, it gets index 1 and every later index is one higher. */
22496 +typedef enum {
22497 +       /* major "locale", aka dirid. Sits in 1st element */
22498 +       KEY_LOCALITY_INDEX = 0,
22499 +       /* minor "locale", aka item type. Sits in 1st element */
22500 +       KEY_TYPE_INDEX = 0,
22501 +       ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22502 +           /* "object band". Sits in 2nd element */
22503 +           KEY_BAND_INDEX,
22504 +       /* objectid. Sits in 2nd element */
22505 +       KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22506 +       /* full objectid. Sits in 2nd element */
22507 +       KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22508 +       /* Offset. Sits in 3rd element */
22509 +       KEY_OFFSET_INDEX,
22510 +       /* Name hash. Sits in 3rd element */
22511 +       KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22512 +       KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22513 +       KEY_LAST_INDEX          /* one past the last index: number of elements in reiser4_key.el[] */
22514 +} reiser4_key_field_index;
22515 +
22516 +/* key in reiser4 internal "balanced" tree. It is just array of
22517 +    KEY_LAST_INDEX 64bit integers in disk byte order (little-endian by default). This
22518 +    array is actually indexed by reiser4_key_field_index.  Each __u64 within
22519 +    this array is called "element". Logical key component encoded within
22520 +    elements are called "fields".
22521 +
22522 +    We declare this as union with second component dummy to suppress
22523 +    inconvenient array<->pointer casts implied in C. */
22524 +union reiser4_key {
22525 +       __le64 el[KEY_LAST_INDEX];
22526 +       int pad;        /* the dummy second component mentioned above */
22527 +};
22528 +
22529 +/* bitmasks showing where within a reiser4_key element particular field is stored. */
22530 +/* major locality occupies higher 60 bits of the first element */
22531 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22532 +
22533 +/* minor locality occupies lower 4 bits of the first element */
22534 +#define KEY_TYPE_MASK 0xfull
22535 +
22536 +/* controversial band occupies higher 4 bits of the 2nd element */
22537 +#define KEY_BAND_MASK 0xf000000000000000ull
22538 +
22539 +/* objectid occupies lower 60 bits of the 2nd element */
22540 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22541 +
22542 +/* full 64bit objectid: band and objectid taken together */
22543 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22544 +
22545 +/* offset is just 3rd element itself */
22546 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22547 +
22548 +/* ordering is whole second element */
22549 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22550 +
22551 +/* bit position of each field within its key element: setters shift a value left by this much, getters shift the masked element right by it */
22552 +typedef enum {
22553 +       KEY_LOCALITY_SHIFT = 4,
22554 +       KEY_TYPE_SHIFT = 0,
22555 +       KEY_BAND_SHIFT = 60,
22556 +       KEY_OBJECTID_SHIFT = 0,
22557 +       KEY_FULLOID_SHIFT = 0,
22558 +       KEY_OFFSET_SHIFT = 0,
22559 +       KEY_ORDERING_SHIFT = 0,
22560 +} reiser4_key_field_shift;
22561 +
22562 +static inline __u64    /* element @off of @key, converted from little-endian to cpu byte order */
22563 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22564 +{
22565 +       assert("nikita-753", key != NULL);
22566 +       assert("nikita-754", off < KEY_LAST_INDEX);
22567 +       return le64_to_cpu(get_unaligned(&key->el[off]));       /* unaligned read: @key need not be 8-byte aligned */
22568 +}
22569 +
22570 +static inline void     /* store cpu-order @value into element @off of @key as little-endian */
22571 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22572 +{
22573 +       assert("nikita-755", key != NULL);
22574 +       assert("nikita-756", off < KEY_LAST_INDEX);
22575 +       put_unaligned(cpu_to_le64(value), &key->el[off]);       /* unaligned write: @key need not be 8-byte aligned */
22576 +}
22577 +
22578 +/* macro to define getter get_key_L() and setter set_key_L() for a field of type T; U is the upper-case field name selecting KEY_U_INDEX, KEY_U_MASK and KEY_U_SHIFT */
22579 +#define DEFINE_KEY_FIELD( L, U, T )                                    \
22580 +static inline T get_key_ ## L ( const reiser4_key *key )               \
22581 +{      /* NOTE(review): ( T ) binds tighter than >>, so the cast precedes the shift */ \
22582 +       assert( "nikita-750", key != NULL );                            \
22583 +       return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) &         \
22584 +                KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT;           \
22585 +}                                                                      \
22586 +                                                                       \
22587 +static inline void set_key_ ## L ( reiser4_key *key, T loc )           \
22588 +{                                                                      \
22589 +       __u64 el;       /* whole element holding the field, cpu order */ \
22590 +                                                                       \
22591 +       assert( "nikita-752", key != NULL );                            \
22592 +                                                                       \
22593 +       el = get_key_el( key, KEY_ ## U ## _INDEX );                    \
22594 +       /* clear field bits in the key */                               \
22595 +       el &= ~KEY_ ## U ## _MASK;                                      \
22596 +       /* actually it should be                                        \
22597 +                                                                       \
22598 +          el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK;   \
22599 +                                                                       \
22600 +          but we trust user to never pass values that wouldn't fit     \
22601 +          into field. Clearing extra bits is one operation, but this   \
22602 +          function is time-critical.                                   \
22603 +          But check this in assertion. */                              \
22604 +       assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) &        \
22605 +               ~KEY_ ## U ## _MASK ) == 0 );                           \
22606 +       el |= ( loc << KEY_ ## U ## _SHIFT );                           \
22607 +       set_key_el( key, KEY_ ## U ## _INDEX, el );                     \
22608 +}
22609 +
22610 +typedef __u64 oid_t;   /* type used for the locality, objectid and fulloid fields below */
22611 +
22612 +/* define get_key_locality(), set_key_locality() */
22613 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22614 +/* define get_key_type(), set_key_type() */
22615 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22616 +/* define get_key_band(), set_key_band() */
22617 +DEFINE_KEY_FIELD(band, BAND, __u64);
22618 +/* define get_key_objectid(), set_key_objectid() */
22619 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22620 +/* define get_key_fulloid(), set_key_fulloid() */
22621 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22622 +/* define get_key_offset(), set_key_offset() */
22623 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22624 +#if (REISER4_LARGE_KEY)
22625 +/* define get_key_ordering(), set_key_ordering() */
22626 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22627 +#else  /* no real accessors for ordering: the getter yields 0 and the setter does nothing */
22628 +static inline __u64 get_key_ordering(const reiser4_key * key)
22629 +{
22630 +       return 0;
22631 +}
22632 +
22633 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22634 +{
22635 +}
22636 +#endif
22637 +
22638 +/* key comparison result, as produced by KEY_DIFF(), KEY_DIFF_EL() and keycmp() */
22639 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22640 +       EQUAL_TO = 0,           /* if keys are equal */
22641 +       GREATER_THAN = +1       /* if first key is greater than second */
22642 +} cmp_t;
22643 +
22644 +void reiser4_key_init(reiser4_key * key);      /* zero all bytes of @key */
22645 +
22646 +/* minimal and maximal possible keys in the tree. Both return pointer to the static storage. */
22647 +extern const reiser4_key *min_key(void);
22648 +extern const reiser4_key *max_key(void);
22649 +
22650 +/* helper macro for keycmp(): compare field @field of @k1 and @k2 as read by get_key_<field>(); a GNU statement expression whose value is LESS_THAN, EQUAL_TO or GREATER_THAN */
22651 +#define KEY_DIFF(k1, k2, field)                                                        \
22652 +({                                                                             \
22653 +       typeof (get_key_ ## field (k1)) f1;                                     \
22654 +       typeof (get_key_ ## field (k2)) f2;                                     \
22655 +                                                                               \
22656 +       f1 = get_key_ ## field (k1);                                            \
22657 +       f2 = get_key_ ## field (k2);                                            \
22658 +                                                                               \
22659 +       (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN);         \
22660 +})
22661 +
22662 +/* helper macro for keycmp(): compare whole element number @off of @k1 and @k2 as cpu-order __u64 values; a GNU statement expression whose value is LESS_THAN, EQUAL_TO or GREATER_THAN */
22663 +#define KEY_DIFF_EL(k1, k2, off)                                               \
22664 +({                                                                             \
22665 +       __u64 e1;                                                               \
22666 +       __u64 e2;                                                               \
22667 +                                                                               \
22668 +       e1 = get_key_el(k1, off);                                               \
22669 +       e2 = get_key_el(k2, off);                                               \
22670 +                                                                               \
22671 +       (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN);         \
22672 +})
22673 +
22674 +/* compare `k1' and `k2' and return LESS_THAN, EQUAL_TO or GREATER_THAN. This
22675 +    is the heart of "key allocation policy": to implement a new policy, add
22676 +    yet another clause here. */
22677 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22678 +                          const reiser4_key * k2 /* second key to compare */ )
22679 +{
22680 +       cmp_t result;
22681 +
22682 +       /*
22683 +        * This function is the heart of reiser4 tree-routines. Key comparison
22684 +        * is among the most heavily used operations in the file system.
22685 +        */
22686 +
22687 +       assert("nikita-439", k1 != NULL);
22688 +       assert("nikita-440", k2 != NULL);
22689 +
22690 +       /* there is no actual branch here: condition is compile time constant
22691 +        * and constant folding and propagation ensures that only one branch
22692 +        * is actually compiled in. */
22693 +
22694 +       if (REISER4_PLANA_KEY_ALLOCATION) {
22695 +               /* if physical order of fields in a key is identical
22696 +                  with logical order, we can implement key comparison
22697 +                  as three (four if REISER4_LARGE_KEY) 64bit comparisons. */
22698 +               /* logical order of fields in plan-a:
22699 +                  locality->type->objectid->offset. */
22700 +               /* compare locality and type at once */
22701 +               result = KEY_DIFF_EL(k1, k2, 0);
22702 +               if (result == EQUAL_TO) {
22703 +                       /* compare objectid (and band if it's there) */
22704 +                       result = KEY_DIFF_EL(k1, k2, 1);
22705 +                       /* compare offset */
22706 +                       if (result == EQUAL_TO) {
22707 +                               result = KEY_DIFF_EL(k1, k2, 2);
22708 +                               if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22709 +                                       result = KEY_DIFF_EL(k1, k2, 3);
22710 +                               }
22711 +                       }
22712 +               }
22713 +       } else if (REISER4_3_5_KEY_ALLOCATION) {
22714 +               result = KEY_DIFF(k1, k2, locality);
22715 +               if (result == EQUAL_TO) {
22716 +                       result = KEY_DIFF(k1, k2, objectid);
22717 +                       if (result == EQUAL_TO) {
22718 +                               result = KEY_DIFF(k1, k2, type);
22719 +                               if (result == EQUAL_TO)
22720 +                                       result = KEY_DIFF(k1, k2, offset);
22721 +                       }
22722 +               }
22723 +       } else
22724 +               impossible("nikita-441", "Unknown key allocation scheme!");
22725 +       return result;
22726 +}
22727 +
22728 +/* true if @k1 equals @k2; the keys are compared bytewise with memcmp() */
22729 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22730 +                       const reiser4_key * k2 /* second key to compare */ )
22731 +{
22732 +       assert("nikita-1879", k1 != NULL);
22733 +       assert("nikita-1880", k2 != NULL);
22734 +       return !memcmp(k1, k2, sizeof *k1);
22735 +}
22736 +
22737 +/* true if @k1 is less than @k2 in keycmp() order */
22738 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22739 +                       const reiser4_key * k2 /* second key to compare */ )
22740 +{
22741 +       assert("nikita-1952", k1 != NULL);
22742 +       assert("nikita-1953", k2 != NULL);
22743 +       return keycmp(k1, k2) == LESS_THAN;
22744 +}
22745 +
22746 +/* true if @k1 is less than or equal to @k2 in keycmp() order */
22747 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22748 +                       const reiser4_key * k2 /* second key to compare */ )
22749 +{
22750 +       assert("nikita-1954", k1 != NULL);
22751 +       assert("nikita-1955", k2 != NULL);
22752 +       return keycmp(k1, k2) != GREATER_THAN;
22753 +}
22754 +
22755 +/* true if @k1 is greater than @k2 in keycmp() order */
22756 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22757 +                       const reiser4_key * k2 /* second key to compare */ )
22758 +{
22759 +       assert("nikita-1959", k1 != NULL);
22760 +       assert("nikita-1960", k2 != NULL);
22761 +       return keycmp(k1, k2) == GREATER_THAN;
22762 +}
22763 +
22764 +/* true if @k1 is greater than or equal to @k2 in keycmp() order */
22765 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22766 +                       const reiser4_key * k2 /* second key to compare */ )
22767 +{
22768 +       assert("nikita-1956", k1 != NULL);
22769 +       assert("nikita-1957", k2 != NULL);      /* October  4: sputnik launched
22770 +                                                * November 3: Laika */
22771 +       return keycmp(k1, k2) != LESS_THAN;
22772 +}
22773 +
22774 +static inline void prefetchkey(reiser4_key * key)
22775 +{
22776 +       prefetch(key);  /* start of the key */
22777 +       prefetch(&key->el[KEY_CACHELINE_END]);  /* and its element at index KEY_CACHELINE_END */
22778 +}
22779 +
22780 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22781 +           1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22782 +/* size of a buffer suitable to hold human readable key representation */
22783 +#define KEY_BUF_LEN (80)
22784 +
22785 +#if REISER4_DEBUG
22786 +extern void print_key(const char *prefix, const reiser4_key * key);
22787 +#else
22788 +#define print_key(p,k) noop
22789 +#endif
22790 +
22791 +/* __FS_REISERFS_KEY_H__ */
22792 +#endif
22793 +
22794 +/* Make Linus happy.
22795 +   Local variables:
22796 +   c-indentation-style: "K&R"
22797 +   mode-name: "LC"
22798 +   c-basic-offset: 8
22799 +   tab-width: 8
22800 +   fill-column: 120
22801 +   End:
22802 +*/
22803 diff --git a/fs/reiser4/ktxnmgrd.c b/fs/reiser4/ktxnmgrd.c
22804 new file mode 100644
22805 index 0000000..1955066
22806 --- /dev/null
22807 +++ b/fs/reiser4/ktxnmgrd.c
22808 @@ -0,0 +1,214 @@
22809 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22810 +/* Transaction manager daemon. */
22811 +
22812 +/*
22813 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22814 + * needed/important for the following reasons:
22815 + *
22816 + *     1. in reiser4 atom is not committed immediately when last transaction
22817 + *     handle closes, unless atom is either too old or too large (see
22818 + *     atom_should_commit()). This is done to avoid committing too
22819 + *     frequently; such atoms are left for ktxnmgrd to commit later.
22820 + *
22821 + *     2. sometimes we don't want to commit atom when closing last transaction
22822 + *     handle even if it is old and fat enough. For example, because we are at
22823 + *     this point under directory semaphore, and committing would stall all
22824 + *     accesses to this directory.
22825 + *
22826 + * ktxnmgrd bides its time sleeping on a wait queue. When it awakes
22827 + * either due to (tunable) timeout or because it was explicitly woken up by
22828 + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
22829 + * eligible.
22830 + *
22831 + */
22832 +
22833 +#include "debug.h"
22834 +#include "txnmgr.h"
22835 +#include "tree.h"
22836 +#include "ktxnmgrd.h"
22837 +#include "super.h"
22838 +#include "reiser4.h"
22839 +
22840 +#include <linux/sched.h>       /* for struct task_struct */
22841 +#include <linux/wait.h>
22842 +#include <linux/suspend.h>
22843 +#include <linux/kernel.h>
22844 +#include <linux/writeback.h>
22845 +#include <linux/kthread.h>
22846 +
22847 +static int scan_mgr(struct super_block *);
22848 +
22849 +/*
22850 + * change current->comm so that ps, top, and friends will see the daemon
22851 + * state. Uses `super' from the enclosing scope. This costs nothing, and
22852 + * maybe makes a lonely system administrator feel less alone at 3 A.M.
22853 + */
22854 +#define set_comm( state )                                              \
22855 +       snprintf( current -> comm, sizeof( current -> comm ),   \
22856 +                 "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22857 +
22858 +/**
22859 + * ktxnmgrd - kernel txnmgr daemon
22860 + * @arg: pointer to super block
22861 + *
22862 + * The background transaction manager daemon, started as a kernel thread by
22863 + * init_ktxnmgrd(). Runs until kthread_should_stop() is seen, then returns 0.
22864 + */
22865 +static int ktxnmgrd(void *arg)
22866 +{
22867 +       struct super_block *super;
22868 +       ktxnmgrd_context *ctx;
22869 +       txn_mgr *mgr;
22870 +       int done = 0;
22871 +
22872 +       super = arg;
22873 +       mgr = &get_super_private(super)->tmgr;
22874 +
22875 +       /*
22876 +        * do_fork() just copies task_struct into the new thread. ->journal_info
22877 +        * shouldn't be inherited of course, so reset it. This shouldn't be a
22878 +        * problem for the rest of the code though.
22879 +        */
22880 +       current->journal_info = NULL;
22881 +       ctx = mgr->daemon;
22882 +       while (1) {
22883 +               try_to_freeze();
22884 +               set_comm("wait");
22885 +               {
22886 +                       DEFINE_WAIT(__wait);
22887 +
22888 +                       prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22889 +                       if (kthread_should_stop()) {
22890 +                               done = 1;
22891 +                       } else
22892 +                               schedule_timeout(ctx->timeout);
22893 +                       finish_wait(&ctx->wait, &__wait);
22894 +               }
22895 +               if (done)
22896 +                       break;
22897 +               set_comm("run");
22898 +               spin_lock(&ctx->guard);
22899 +               /*
22900 +                * wait timed out or ktxnmgrd was woken up by explicit request
22901 +                * to commit something. Scan list of atoms in txnmgr and look
22902 +                * for too old atoms.
22903 +                */
22904 +               do {
22905 +                       ctx->rescan = 0;
22906 +                       scan_mgr(super);
22907 +                       spin_lock(&ctx->guard);
22908 +                       if (ctx->rescan) {
22909 +                               /*
22910 +                                * rescan requested (guard presumably dropped in scan_mgr()).
22911 +                                * NOTE(review): intent is to repeat the scan, but break
22912 +                                * exits the do-while, so it waits for next wakeup - confirm
22913 +                                */
22914 +                               break;
22915 +                       }
22916 +               } while (ctx->rescan);
22917 +               spin_unlock(&ctx->guard);
22918 +       }
22919 +       return 0;
22920 +}
22921 +
22922 +#undef set_comm
22923 +
22924 +/**
22925 + * init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22926 + * @super: pointer to super block
22927 + *
22928 + * Allocates ktxnmgrd_context, attaches it to transaction manager and starts
22929 + * the daemon (on mount). Returns 0, -ENOMEM or the kthread_run() error.
22930 + */
22931 +int init_ktxnmgrd(struct super_block *super)
22932 +{
22933 +       txn_mgr *mgr;
22934 +       ktxnmgrd_context *ctx;
22935 +
22936 +       mgr = &get_super_private(super)->tmgr;
22937 +
22938 +       assert("zam-1014", mgr->daemon == NULL);
22939 +
22940 +       ctx = kmalloc(sizeof(ktxnmgrd_context), get_gfp_mask());
22941 +       if (ctx == NULL)
22942 +               return RETERR(-ENOMEM);
22943 +
22944 +       assert("nikita-2442", ctx != NULL);
22945 +
22946 +       memset(ctx, 0, sizeof *ctx);
22947 +       init_waitqueue_head(&ctx->wait);
22948 +
22949 +       /* set up everything the daemon reads before kthread_run() starts it */
22950 +       spin_lock_init(&ctx->guard);
22951 +       ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22952 +       ctx->rescan = 1;
22953 +       mgr->daemon = ctx;
22954 +
22955 +       ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22956 +       if (IS_ERR(ctx->tsk)) {
22957 +               int ret = PTR_ERR(ctx->tsk);
22958 +               mgr->daemon = NULL;
22959 +               kfree(ctx);
22960 +               return RETERR(ret);
22961 +       }
22962 +       return 0;
22963 +}
22964 +
22965 +void ktxnmgrd_kick(txn_mgr *mgr)
22966 +{
22967 +       assert("nikita-3234", mgr != NULL);
22968 +       assert("nikita-3235", mgr->daemon != NULL);
22969 +       wake_up(&mgr->daemon->wait);    /* the daemon sleeps on this queue in ktxnmgrd() */
22970 +}
22971 +
22972 +int is_current_ktxnmgrd(void)
22973 +{      /* true if the calling task is the daemon task recorded in ->tsk */
22974 +       return (get_current_super_private()->tmgr.daemon->tsk == current);
22975 +}
22976 +
22977 +/**
22978 + * scan_mgr - commit atoms which are to be committed
22979 + * @super: super block to commit atoms of
22980 + *
22981 + * Runs commit_some_atoms() inside a temporary stack context; returns its result.
22982 + */
22983 +static int scan_mgr(struct super_block *super)
22984 +{
22985 +       int ret;
22986 +       reiser4_context ctx;
22987 +
22988 +       init_stack_context(&ctx, super);
22989 +
22990 +       ret = commit_some_atoms(&get_super_private(super)->tmgr);
22991 +
22992 +       reiser4_exit_context(&ctx);
22993 +       return ret;
22994 +}
22996 +/**
22997 + * done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22998 + * @super: pointer to super block
22999 + *
23000 + * This is called on umount. Stops ktxnmgrd and frees its context.
23001 + */
23002 +void done_ktxnmgrd(struct super_block *super)
23003 +{
23004 +       txn_mgr *mgr;
23005 +
23006 +       mgr = &get_super_private(super)->tmgr;
23007 +       assert("zam-1012", mgr->daemon != NULL);
23008 +
23009 +       kthread_stop(mgr->daemon->tsk);
23010 +       kfree(mgr->daemon);
23011 +       mgr->daemon = NULL;
23012 +}
23013 +
23014 +/*
23015 + * Local variables:
23016 + * c-indentation-style: "K&R"
23017 + * mode-name: "LC"
23018 + * c-basic-offset: 8
23019 + * tab-width: 8
23020 + * fill-column: 120
23021 + * End:
23022 + */
23023 diff --git a/fs/reiser4/ktxnmgrd.h b/fs/reiser4/ktxnmgrd.h
23024 new file mode 100644
23025 index 0000000..5d497c0
23026 --- /dev/null
23027 +++ b/fs/reiser4/ktxnmgrd.h
23028 @@ -0,0 +1,52 @@
23029 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23030 + * reiser4/README */
23031 +
23032 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
23033 +
23034 +#ifndef __KTXNMGRD_H__
23035 +#define __KTXNMGRD_H__
23036 +
23037 +#include "txnmgr.h"
23038 +
23039 +#include <linux/fs.h>
23040 +#include <linux/wait.h>
23041 +#include <linux/completion.h>
23042 +#include <linux/spinlock.h>
23043 +#include <asm/atomic.h>
23044 +#include <linux/sched.h>       /* for struct task_struct */
23045 +
23046 +/* in this structure all data necessary to start up, shut down and communicate
23047 + * with ktxnmgrd are kept. */
23048 +struct ktxnmgrd_context {
23049 +       /* wait queue head on which ktxnmgrd sleeps */
23050 +       wait_queue_head_t wait;
23051 +       /* spin lock protecting all fields of this structure */
23052 +       spinlock_t guard;
23053 +       /* timeout of sleeping on ->wait (passed to schedule_timeout()) */
23054 +       signed long timeout;
23055 +       /* kernel thread running ktxnmgrd */
23056 +       struct task_struct *tsk;
23057 +       /* list of all file systems served by this ktxnmgrd; not referenced in ktxnmgrd.c */
23058 +       struct list_head queue;
23059 +       /* should ktxnmgrd repeat scanning of atoms? */
23060 +       unsigned int rescan:1;
23061 +};
23062 +
23063 +extern int init_ktxnmgrd(struct super_block *);
23064 +extern void done_ktxnmgrd(struct super_block *);
23065 +
23066 +extern void ktxnmgrd_kick(txn_mgr * mgr);
23067 +extern int is_current_ktxnmgrd(void);
23068 +
23069 +/* __KTXNMGRD_H__ */
23070 +#endif
23071 +
23072 +/* Make Linus happy.
23073 +   Local variables:
23074 +   c-indentation-style: "K&R"
23075 +   mode-name: "LC"
23076 +   c-basic-offset: 8
23077 +   tab-width: 8
23078 +   fill-column: 120
23079 +   End:
23080 +*/
23081 diff --git a/fs/reiser4/lock.c b/fs/reiser4/lock.c
23082 new file mode 100644
23083 index 0000000..b781d30
23084 --- /dev/null
23085 +++ b/fs/reiser4/lock.c
23086 @@ -0,0 +1,1261 @@
23087 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23088 + * reiser4/README */
23089 +
23090 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
23091 +   order.  V4 balances the tree from the bottom up, and searches the tree from
23092 +   the top down, and that is really the way we want it, so tradition won't work
23093 +   for us.
23094 +
23095 +   Instead we have two lock orderings, a high priority lock ordering, and a low
23096 +   priority lock ordering.  Each node in the tree has a lock in its znode.
23097 +
23098 +   Suppose we have a set of processes which lock (R/W) tree nodes. Each process
23099 +   has a set (maybe empty) of already locked nodes ("process locked set"). Each
23100 +   process may have a pending lock request to a node locked by another process.
23101 +   Note: we lock and unlock, but do not transfer locks: it is possible
23102 +   transferring locks instead would save some bus locking....
23103 +
23104 +   Deadlock occurs when we have a loop constructed from process locked sets and
23105 +   lock request vectors.
23106 +
23107 +   NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
23108 +   memory is extended with "znodes" with which we connect nodes with their left
23109 +   and right neighbors using sibling pointers stored in the znodes.  When we
23110 +   perform balancing operations we often go from left to right and from right to
23111 +   left.
23112 +
23113 +   +-P1-+          +-P3-+
23114 +   |+--+|   V1     |+--+|
23115 +   ||N1|| -------> ||N3||
23116 +   |+--+|          |+--+|
23117 +   +----+          +----+
23118 +     ^               |
23119 +     |V2             |V3
23120 +     |               v
23121 +   +---------P2---------+
23122 +   |+--+            +--+|
23123 +   ||N2|  --------  |N4||
23124 +   |+--+            +--+|
23125 +   +--------------------+
23126 +
23127 +   We solve this by ensuring that only low priority processes lock in top to
23128 +   bottom order and from right to left, and high priority processes lock from
23129 +   bottom to top and left to right.
23130 +
23131 +   ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
23132 +   kill those damn busy loops.
23133 +   ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
23134 +   stage) cannot be ordered that way. There are no rules what nodes can belong
23135 +   to the atom and what nodes cannot.  We cannot define what is right or left
23136 +   direction, what is top or bottom.  We can take immediate parent or side
23137 +   neighbor of one node, but nobody guarantees that, say, left neighbor node is
23138 +   not a far right neighbor for other nodes from the same atom.  It breaks
23139 +   deadlock avoidance rules and hi-low priority locking cannot be applied for
23140 +   atom locks.
23141 +
23142 +   How does it help to avoid deadlocks ?
23143 +
23144 +   Suppose we have a deadlock with n processes. Processes from one priority
23145 +   class never deadlock because they take locks in one consistent
23146 +   order.
23147 +
23148 +   So, any possible deadlock loop must have low priority as well as high
23149 +   priority processes.  There are no other lock priority levels except low and
23150 +   high. We know that any deadlock loop contains at least one node locked by a
23151 +   low priority process and requested by a high priority process. If this
23152 +   situation is caught and resolved it is sufficient to avoid deadlocks.
23153 +
23154 +   V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
23155 +
23156 +   The deadlock prevention algorithm is based on comparing
23157 +   priorities of node owners (processes which keep znode locked) and
23158 +   requesters (processes which want to acquire a lock on znode).  We
23159 +   implement a scheme where low-priority owners yield locks to
23160 +   high-priority requesters. We created a signal passing system that
23161 +   is used to ask low-priority processes to yield one or more locked
23162 +   znodes.
23163 +
23164 +   The condition when a znode needs to change its owners is described by the
23165 +   following formula:
23166 +
23167 +   #############################################
23168 +   #                                           #
23169 +   # (number of high-priority requesters) >  0 #
23170 +   #                AND                        #
23171 +   # (numbers of high-priority owners)    == 0 #
23172 +   #                                           #
23173 +   #############################################
23174 +
23175 +   Note that a low-priority process delays node releasing if another
23176 +   high-priority process owns this node.  So, slightly more strictly speaking,
23177 +   to have a deadlock capable cycle you must have a loop in which a high
23178 +   priority process is waiting on a low priority process to yield a node, which
23179 +   is slightly different from saying a high priority process is waiting on a
23180 +   node owned by a low priority process.
23181 +
23182 +   It is enough to avoid deadlocks if we prevent any low-priority process from
23183 +   falling asleep if its locked set contains a node which satisfies the
23184 +   deadlock condition.
23185 +
23186 +   That condition is implicitly or explicitly checked in all places where new
23187 +   high-priority requests may be added or removed from node request queue or
23188 +   high-priority process takes or releases a lock on node. The main
23189 +   goal of these checks is to never lose the moment when node becomes "has
23190 +   wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
23191 +   at that time.
23192 +
23193 +   The information about received signals is stored in the per-process
23194 +   structure (lock stack) and analyzed before a low-priority process goes to
23195 +   sleep but after a "fast" attempt to lock a node fails. Any signal wakes
23196 +   sleeping process up and forces him to re-check lock status and received
23197 +   signal info. If "must-yield-this-lock" signals were received the locking
23198 +   primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
23199 +
23200 +   V4 LOCKING DRAWBACKS
23201 +
23202 +   If we have already balanced on one level, and we are propagating our changes
23203 +   upward to a higher level, it could be very messy to surrender all locks on
23204 +   the lower level because we put so much computational work into it, and
23205 +   reverting them to their state before they were locked might be very complex.
23206 +   We also don't want to acquire all locks before performing balancing because
23207 +   that would either be almost as much work as the balancing, or it would be
23208 +   too conservative and lock too much.  We want balancing to be done only at
23209 +   high priority.  Yet, we might want to go to the left one node and use some
23210 +   of its empty space... So we make one attempt at getting the node to the left
23211 +   using try_lock, and if it fails we do without it, because we didn't really
23212 +   need it, it was only a nice to have.
23213 +
23214 +   LOCK STRUCTURES DESCRIPTION
23215 +
23216 +   The following data structures are used in the reiser4 locking
23217 +   implementation:
23218 +
23219 +   All fields related to long-term locking are stored in znode->lock.
23220 +
23221 +   The lock stack is a per thread object.  It owns all znodes locked by the
23222 +   thread. One znode may be locked by several threads in case of read lock or
23223 +   one znode may be write locked by one thread several times. The special link
23224 +   objects (lock handles) support n<->m relation between znodes and lock
23225 +   owners.
23226 +
23227 +   <Thread 1>                       <Thread 2>
23228 +
23229 +   +---------+                     +---------+
23230 +   |  LS1    |                     |  LS2    |
23231 +   +---------+                     +---------+
23232 +       ^                                ^
23233 +       |---------------+                +----------+
23234 +       v               v                v          v
23235 +   +---------+      +---------+    +---------+   +---------+
23236 +   |  LH1    |      |   LH2   |    |  LH3    |   |   LH4   |
23237 +   +---------+      +---------+    +---------+   +---------+
23238 +       ^                   ^            ^           ^
23239 +       |                   +------------+           |
23240 +       v                   v                        v
23241 +   +---------+      +---------+                  +---------+
23242 +   |  Z1     |      |   Z2    |                  |  Z3     |
23243 +   +---------+      +---------+                  +---------+
23244 +
23245 +   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
23246 +   picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
23247 +   LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it.  Znode
23248 +   Z1 is locked by only one thread, znode has only one lock handle LH1 on its
23249 +   list, similar situation is for Z3 which is locked by the thread 2 only. Z2
23250 +   is locked (for read) twice by different threads and two lock handles are on
23251 +   its list. Each lock handle represents a single relation of a locking of a
23252 +   znode by a thread. Locking of a znode is an establishing of a locking
23253 +   relation between the lock stack and the znode by adding of a new lock handle
23254 +   to a list of lock handles, the lock stack.  The lock stack links all lock
23255 +   handles for all znodes locked by the lock stack.  The znode list groups all
23256 +   lock handles for all locks stacks which locked the znode.
23257 +
23258 +   Yet another relation may exist between znode and lock owners.  If lock
23259 +   procedure cannot immediately take lock on an object it adds the lock owner
23260 +   on a special `requestors' list belonging to the znode.  That list represents
23261 +   a queue of pending lock requests.  Because one lock owner may request
23262 +   only one lock object at a time, it is a 1->n relation between lock objects
23263 +   and a lock owner implemented as it is described above. Full information
23264 +   (priority, pointers to lock and link objects) about each lock request is
23265 +   stored in lock owner structure in `request' field.
23266 +
23267 +   SHORT_TERM LOCKING
23268 +
23269 +   This is a list of primitive operations over lock stacks / lock handles /
23270 +   znodes and locking descriptions for them.
23271 +
23272 +   1. locking / unlocking which is done by two list insertion/deletion, one
23273 +      to/from znode's list of lock handles, another one is to/from lock stack's
23274 +      list of lock handles.  The first insertion is protected by
23275 +      znode->lock.guard spinlock.  The list owned by the lock stack can be
23276 +      modified only by thread who owns the lock stack and nobody else can
23277 +      modify/read it. There is nothing to be protected by a spinlock or
23278 +      something else.
23279 +
23280 +   2. adding/removing a lock request to/from znode requesters list. The rule is
23281 +      that znode->lock.guard spinlock should be taken for this.
23282 +
23283 +   3. we can traverse list of lock handles and use references to lock stacks who
23284 +      locked given znode if znode->lock.guard spinlock is taken.
23285 +
23286 +   4. If a lock stack is associated with a znode as a lock requestor or lock
23287 +      owner its existence is guaranteed by znode->lock.guard spinlock.  Some of its
23288 +      (lock stack's) fields should be protected from being accessed in parallel
23289 +      by two or more threads. Please look at  lock_stack structure definition
23290 +      for the info how those fields are protected. */
23291 +
23292 +/* Znode lock and capturing intertwining. */
23293 +/* In current implementation we capture formatted nodes before locking
23294 +   them. Take a look at longterm_lock_znode(): the try_capture() request precedes
23295 +   locking requests.  The longterm_lock_znode function unconditionally captures
23296 +   znode before even checking of locking conditions.
23297 +
23298 +   Another variant is to capture znode after locking it.  It was not tested, but
23299 +   at least one deadlock condition is supposed to be there.  One thread has
23300 +   locked a znode (Node-1) and calls try_capture() for it.  Try_capture() sleeps
23301 +   because znode's atom has CAPTURE_WAIT state.  Second thread is a flushing
23302 +   thread, its current atom is the atom Node-1 belongs to. Second thread wants
23303 +   to lock Node-1 and sleeps because Node-1 is locked by the first thread.  The
23304 +   described situation is a deadlock. */
23305 +
23306 +#include "debug.h"
23307 +#include "txnmgr.h"
23308 +#include "znode.h"
23309 +#include "jnode.h"
23310 +#include "tree.h"
23311 +#include "plugin/node/node.h"
23312 +#include "super.h"
23313 +
23314 +#include <linux/spinlock.h>
23315 +
23316 +#if REISER4_DEBUG
23317 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
23318 +                                   znode_lock_request);
23319 +#endif
23320 +
23321 +/* Returns the lock owner (lock stack) embedded in the current thread's context */
23322 +lock_stack *get_current_lock_stack(void)
23323 +{
23324 +       return &get_current_context()->stack;
23325 +}
23326 +
23327 +/* Signals and wakes each not-yet-signaled owner of @node about possible deadlock (no priority check here) */
23328 +static void wake_up_all_lopri_owners(znode * node)
23329 +{
23330 +       lock_handle *handle;
23331 +
23332 +       assert_spin_locked(&(node->lock.guard));
23333 +       list_for_each_entry(handle, &node->lock.owners, owners_link) {
23334 +               assert("nikita-1832", handle->node == node);
23335 +               /* count this signal in owner->nr_signaled, once per handle */
23336 +               if (!handle->signaled) {
23337 +                       handle->signaled = 1;
23338 +                       atomic_inc(&handle->owner->nr_signaled);
23339 +                       /* Wake up the process owning this handle */
23340 +                       reiser4_wake_up(handle->owner);
23341 +               }
23342 +       }
23343 +}
23344 +
23345 +/* Adds a lock to a lock owner, which means creating a link to the lock and
23346 +   putting the link into the two lists all links are on (the doubly linked list
23347 +   that forms the lock_stack, and the doubly linked list of links attached
23348 +   to a lock). node->lock.guard must be held.
23349 +*/
23350 +static inline void
23351 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
23352 +{
23353 +       assert("jmacd-810", handle->owner == NULL);
23354 +       assert_spin_locked(&(node->lock.guard));
23355 +
23356 +       handle->owner = owner;
23357 +       handle->node = node;
23358 +
23359 +       assert("reiser4-4",
23360 +              ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23361 +
23362 +       /* add lock handle to the end of lock_stack's list of locks */
23363 +       list_add_tail(&handle->locks_link, &owner->locks);
23364 +       ON_DEBUG(owner->nr_locks++);
23365 +       set_gfp_mask();
23366 +
23367 +       /* add lock handle to the head of znode's list of owners */
23368 +       list_add(&handle->owners_link, &node->lock.owners);
23369 +       handle->signaled = 0;
23370 +}
23371 +
23372 +/* Breaks a relation between a lock and its owner: takes @handle off both lists it is on */
23373 +static inline void unlink_object(lock_handle * handle)
23374 +{
23375 +       assert("zam-354", handle->owner != NULL);
23376 +       assert("nikita-1608", handle->node != NULL);
23377 +       assert_spin_locked(&(handle->node->lock.guard));
23378 +       assert("nikita-1829", handle->owner == get_current_lock_stack());
23379 +       assert("reiser4-5", handle->owner->nr_locks > 0);
23380 +
23381 +       /* remove lock handle from lock_stack's list of locks */
23382 +       list_del(&handle->locks_link);
23383 +       ON_DEBUG(handle->owner->nr_locks--);
23384 +       set_gfp_mask();
23385 +       assert("reiser4-6",
23386 +              ergo(list_empty_careful(&handle->owner->locks),
23387 +                   handle->owner->nr_locks == 0));
23388 +       /* remove lock handle from znode's list of owners */
23389 +       list_del(&handle->owners_link);
23390 +       /* indicates that lock handle is free now */
23391 +       handle->node = NULL;
23392 +#if REISER4_DEBUG
23393 +       INIT_LIST_HEAD(&handle->locks_link);
23394 +       INIT_LIST_HEAD(&handle->owners_link);
23395 +       handle->owner = NULL;
23396 +#endif
23397 +}
23398 +
23399 +/* Actually locks an object knowing that we are able to do this; node, mode and handle are taken from owner->request */
23400 +static void lock_object(lock_stack * owner)
23401 +{
23402 +       lock_request *request;
23403 +       znode *node;
23404 +
23405 +       request = &owner->request;
23406 +       node = request->node;
23407 +       assert_spin_locked(&(node->lock.guard));
23408 +       if (request->mode == ZNODE_READ_LOCK) {
23409 +               node->lock.nr_readers++;
23410 +       } else {
23411 +               /* check that we are not switching from read to write lock */
23412 +               assert("nikita-1840", node->lock.nr_readers <= 0);
23413 +               /* We allow recursive locking; a node can be locked several
23414 +                  times for write by same process (nr_readers goes negative) */
23415 +               node->lock.nr_readers--;
23416 +       }
23417 +
23418 +       link_object(request->handle, owner, node);
23419 +       /* a high priority owner is counted in nr_hipri_owners */
23420 +       if (owner->curpri) {
23421 +               node->lock.nr_hipri_owners++;
23422 +       }
23423 +}
23424 +
23425 +/* Check for recursive write locking: non-zero if the first owner of the requested node is @owner itself */
23426 +static int recursive(lock_stack * owner)
23427 +{
23428 +       int ret;
23429 +       znode *node;
23430 +       lock_handle *lh;
23431 +
23432 +       node = owner->request.node;
23433 +
23434 +       /* Owners list is not empty for a locked node */
23435 +       assert("zam-314", !list_empty_careful(&node->lock.owners));
23436 +       assert("nikita-1841", owner == get_current_lock_stack());
23437 +       assert_spin_locked(&(node->lock.guard));
23438 +
23439 +       /* only the first handle in the owners list is inspected */
23440 +       lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23441 +       ret = (lh->owner == owner);
23442 +
23443 +       /* Recursive read locking should be done the usual way */
23444 +       assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23445 +       /* mixing of read/write locks is not allowed */
23446 +       assert("zam-341", !ret || znode_is_wlocked(node));
23447 +
23448 +       return ret;
23449 +}
23450 +
23451 +#if REISER4_DEBUG
23452 +/* Returns true if @node is locked and a handle on the calling thread's lock stack points to it (debug builds only). */
23453 +int znode_is_any_locked(const znode * node)
23454 +{
23455 +       lock_handle *handle;
23456 +       lock_stack *stack;
23457 +       int ret;
23458 +
23459 +       if (!znode_is_locked(node)) {
23460 +               return 0;
23461 +       }
23462 +
23463 +       stack = get_current_lock_stack();
23464 +
23465 +       spin_lock_stack(stack);
23466 +
23467 +       ret = 0;
23468 +       /* scan the handles of the current lock stack for one that references @node */
23469 +       list_for_each_entry(handle, &stack->locks, locks_link) {
23470 +               if (handle->node == node) {
23471 +                       ret = 1;
23472 +                       break;
23473 +               }
23474 +       }
23475 +
23476 +       spin_unlock_stack(stack);
23477 +
23478 +       return ret;
23479 +}
23480 +
23481 +#endif
23482 +
23483 +/* Returns true if a write lock on @node is held by the calling thread. */
23484 +int znode_is_write_locked(const znode * node)
23485 +{
23486 +       lock_stack *stack;
23487 +       lock_handle *handle;
23488 +
23489 +       assert("jmacd-8765", node != NULL);
23490 +
23491 +       if (!znode_is_wlocked(node)) {
23492 +               return 0;
23493 +       }
23494 +
23495 +       stack = get_current_lock_stack();
23496 +
23497 +       /*
23498 +        * When znode is write locked, all owner handles point to the same lock
23499 +        * stack. Take the owner from the first lock handle in the znode's
23500 +        * owner list and compare it with the lock stack of the current thread.
23501 +        */
23502 +       handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23503 +
23504 +       return (handle->owner == stack);
23505 +}
23506 +
23507 +/* This "deadlock" condition is the essential part of reiser4 locking
23508 +   implementation: it holds when @node has at least one high priority
23509 +   request and no high priority owners. It is checked explicitly by calling
23510 +   check_deadlock_condition() or implicitly in all places where znode lock
23511 +   state (set of owners and request queue) is changed. Locking code uses
23512 +   it to trigger passing of an object from low priority owner(s) to high
23513 +   priority one(s).
23514 +   The procedure results in passing an event (setting lock_handle->signaled
23515 +   flag), counting this event in the nr_signaled field of the owner's lock
23516 +   stack object, and waking up the owner's process.
23517 +*/
23518 +static inline int check_deadlock_condition(znode * node)
23519 +{
23520 +       assert_spin_locked(&(node->lock.guard));
23521 +       return node->lock.nr_hipri_requests > 0
23522 +           && node->lock.nr_hipri_owners == 0;
23523 +}
23524 +
23525 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23526 +{
23527 +       zlock * lock = &node->lock;
23528 +       /* true for a read request on a node with nr_readers >= 0 (no write lock) while high priority write requests are pending */
23529 +       return mode == ZNODE_READ_LOCK &&
23530 +               lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23531 +}
23532 +
23533 +/* checks lock/request compatibility: 0 - can lock, -EINVAL - node is dying, -E_REPEAT - lock is unavailable now */
23534 +static int can_lock_object(lock_stack * owner)
23535 +{
23536 +       znode *node = owner->request.node;
23537 +
23538 +       assert_spin_locked(&(node->lock.guard));
23539 +
23540 +       /* See if the node is dying (JNODE_IS_DYING is set). */
23541 +       if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23542 +               return RETERR(-EINVAL);
23543 +
23544 +       /* Do not ever try to take a lock if we are going in low priority
23545 +          direction and the node has a high priority request without high
23546 +          priority owners. */
23547 +       if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23548 +               return RETERR(-E_REPEAT);
23549 +       if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23550 +               return RETERR(-E_REPEAT);
23551 +       if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23552 +               return RETERR(-E_REPEAT);
23553 +       return 0;
23554 +}
23555 +
23556 +/* Sets a high priority to the process. It clears the "signaled" flags,
23557 +   because a znode locked by a high-priority process can't satisfy our
23558 +   "deadlock condition". */
23559 +static void set_high_priority(lock_stack * owner)
23560 +{
23561 +       assert("nikita-1846", owner == get_current_lock_stack());
23562 +       /* Do nothing if current priority is already high */
23563 +       if (!owner->curpri) {
23564 +               /* We don't need locking for owner->locks list, because, this
23565 +                * function is only called with the lock stack of the current
23566 +                * thread, and no other thread can play with owner->locks list
23567 +                * and/or change ->node pointers of lock handles in this list.
23568 +                *
23569 +                * (Interrupts also are not involved.)
23570 +                */
23571 +               lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23572 +               while (&owner->locks != &item->locks_link) {
23573 +                       znode *node = item->node;
23574 +
23575 +                       spin_lock_zlock(&node->lock);
23576 +
23577 +                       node->lock.nr_hipri_owners++;
23578 +
23579 +                       /* we can safely set signaled to zero, because
23580 +                          previous statement (nr_hipri_owners ++) guarantees
23581 +                          that signaled will never be set again. */
23582 +                       item->signaled = 0;
23583 +                       spin_unlock_zlock(&node->lock);
23584 +
23585 +                       item = list_entry(item->locks_link.next, lock_handle, locks_link);
23586 +               }
23587 +               owner->curpri = 1;
23588 +               atomic_set(&owner->nr_signaled, 0);
23589 +       }
23590 +}
23591 +
23592 +/* Sets a low priority to the process: every node locked by @owner loses one high priority owner. */
23593 +static void set_low_priority(lock_stack * owner)
23594 +{
23595 +       assert("nikita-3075", owner == get_current_lock_stack());
23596 +       /* Do nothing if current priority is already low */
23597 +       if (owner->curpri) {
23598 +               /* scan all locks (lock handles) held by @owner, which is
23599 +                  actually the current thread, and check whether we are
23600 +                  reaching a deadlock possibility anywhere.
23601 +                */
23602 +               lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23603 +               while (&owner->locks != &handle->locks_link) {
23604 +                       znode *node = handle->node;
23605 +                       spin_lock_zlock(&node->lock);
23606 +                       /* this thread just was hipri owner of @node, so
23607 +                          nr_hipri_owners has to be greater than zero. */
23608 +                       assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23609 +                       node->lock.nr_hipri_owners--;
23610 +                       /* If we have deadlock condition, adjust the nr_signaled
23611 +                          field. It is enough to set "signaled" flag only for
23612 +                          current process, other low-pri owners will be
23613 +                          signaled and woken up after current process unlocks
23614 +                          this object and any high-priority requestor takes
23615 +                          control. */
23616 +                       if (check_deadlock_condition(node)
23617 +                           && !handle->signaled) {
23618 +                               handle->signaled = 1;
23619 +                               atomic_inc(&owner->nr_signaled);
23620 +                       }
23621 +                       spin_unlock_zlock(&node->lock);
23622 +                       handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23623 +               }
23624 +               owner->curpri = 0;
23625 +       }
23626 +}
23627 +
23628 +static void remove_lock_request(lock_stack * requestor)
23629 +{
23630 +       zlock * lock = &requestor->request.node->lock;
23631 +       /* undo the hipri request accounting, then dequeue; no zlock assertion here, callers in this file hold the zlock guard */
23632 +       if (requestor->curpri) {
23633 +               assert("nikita-1838", lock->nr_hipri_requests > 0);
23634 +               lock->nr_hipri_requests--;
23635 +               if (requestor->request.mode == ZNODE_WRITE_LOCK)
23636 +                       lock->nr_hipri_write_requests --;
23637 +       }
23638 +       list_del(&requestor->requestors_link);
23639 +}
23640 +
23641 +
23642 +static void invalidate_all_lock_requests(znode * node)
23643 +{
23644 +       lock_stack *requestor, *tmp;
23645 +       /* fail every queued request with -EINVAL, wake its requestor and mark the request done (ZNODE_NO_LOCK) */
23646 +       assert_spin_locked(&(node->lock.guard));
23647 +
23648 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23649 +               remove_lock_request(requestor);
23650 +               requestor->request.ret_code = -EINVAL;
23651 +               reiser4_wake_up(requestor);
23652 +               requestor->request.mode = ZNODE_NO_LOCK;
23653 +       }
23654 +}
23655 +
23656 +static void dispatch_lock_requests(znode * node)
23657 +{
23658 +       lock_stack *requestor, *tmp;
23659 +       /* grant each queued request that can_lock_object() accepts; NOTE(review): the break tests ownership by the current thread - confirm znode_is_wlocked() was not intended */
23660 +       assert_spin_locked(&(node->lock.guard));
23661 +
23662 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23663 +               if (znode_is_write_locked(node))
23664 +                       break;
23665 +               if (!can_lock_object(requestor)) {
23666 +                       lock_object(requestor);
23667 +                       remove_lock_request(requestor);
23668 +                       requestor->request.ret_code = 0;
23669 +                       reiser4_wake_up(requestor);
23670 +                       requestor->request.mode = ZNODE_NO_LOCK;
23671 +               }
23672 +       }
23673 +}
23674 +
23675 +/* release long-term lock, acquired by longterm_lock_znode() */
23676 +void longterm_unlock_znode(lock_handle * handle)
23677 +{
23678 +       znode *node = handle->node;
23679 +       lock_stack *oldowner = handle->owner;
23680 +       int hipri;
23681 +       int readers;
23682 +       int rdelta;
23683 +       int youdie;
23684 +
23685 +       /*
23686 +        * this is time-critical and highly optimized code. Modify carefully.
23687 +        */
23688 +       /* NOTE: @handle has already been dereferenced by the initializers above when "jmacd-1021" is checked */
23689 +       assert("jmacd-1021", handle != NULL);
23690 +       assert("jmacd-1022", handle->owner != NULL);
23691 +       assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23692 +
23693 +       assert("zam-130", oldowner == get_current_lock_stack());
23694 +
23695 +       LOCK_CNT_DEC(long_term_locked_znode);
23696 +
23697 +       /*
23698 +        * to minimize amount of operations performed under lock, pre-compute
23699 +        * all variables used within critical section. This makes code
23700 +        * obscure. Note that nr_readers is read here before the zlock is taken.
23701 +        */
23702 +
23703 +       /* -1 if this lock was of high priority, 0 if of low priority */
23704 +       hipri = oldowner->curpri ? -1 : 0;
23705 +       /* number of readers */
23706 +       readers = node->lock.nr_readers;
23707 +       /* +1 if write lock, -1 if read lock */
23708 +       rdelta = (readers > 0) ? -1 : +1;
23709 +       /* true if node is to die and write lock is released */
23710 +       youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23711 +
23712 +       spin_lock_zlock(&node->lock);
23713 +
23714 +       assert("zam-101", znode_is_locked(node));
23715 +
23716 +       /* Adjust a number of high priority owners of this lock */
23717 +       node->lock.nr_hipri_owners += hipri;
23718 +       assert("nikita-1836", node->lock.nr_hipri_owners >= 0);
23719 +
23720 +       /* Handle znode deallocation on last write-lock release. This path returns without unlocking the zlock here; presumably forget_znode() releases it - verify. */
23721 +       if (znode_is_wlocked_once(node)) {
23722 +               if (youdie) {
23723 +                       forget_znode(handle);
23724 +                       assert("nikita-2191", znode_invariant(node));
23725 +                       zput(node);
23726 +                       return;
23727 +               }
23728 +       }
23729 +
23730 +       if (handle->signaled)
23731 +               atomic_dec(&oldowner->nr_signaled);
23732 +
23733 +       /* Unlocking means owner<->object link deletion */
23734 +       unlink_object(handle);
23735 +
23736 +       /* This is enough to be sure whether an object is completely
23737 +          unlocked. */
23738 +       node->lock.nr_readers += rdelta;
23739 +
23740 +       /* If the node is locked it must have an owners list.  Likewise, if
23741 +          the node is unlocked it must have an empty owners list. */
23742 +       assert("zam-319", equi(znode_is_locked(node),
23743 +                              !list_empty_careful(&node->lock.owners)));
23744 +
23745 +#if REISER4_DEBUG
23746 +       if (!znode_is_locked(node))
23747 +               ++node->times_locked;
23748 +#endif
23749 +
23750 +       /* If there are pending lock requests we wake up a requestor */
23751 +       if (!znode_is_wlocked(node))
23752 +               dispatch_lock_requests(node);
23753 +       if (check_deadlock_condition(node))
23754 +               wake_up_all_lopri_owners(node);
23755 +       spin_unlock_zlock(&node->lock);
23756 +
23757 +       /* minus one reference from handle->node */
23758 +       assert("nikita-2190", znode_invariant(node));
23759 +       ON_DEBUG(check_lock_data());
23760 +       ON_DEBUG(check_lock_node_data(node));
23761 +       zput(node);
23762 +}
23763 +
23764 +/* final portion of longterm-lock: takes the lock if @ok is 0, then drops the zlock guard and returns @ok (@mode is unused) */
23765 +static int
23766 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23767 +{
23768 +       znode *node = owner->request.node;
23769 +
23770 +       assert_spin_locked(&(node->lock.guard));
23771 +
23772 +       /* If we broke with (ok == 0) it means we can_lock, now do it. */
23773 +       if (ok == 0) {
23774 +               lock_object(owner);
23775 +               owner->request.mode = 0;
23776 +               /* count a reference from lockhandle->node
23777 +
23778 +                  znode was already referenced at the entry to this function,
23779 +                  hence taking spin-lock here is not necessary (see comment
23780 +                  in the zref()).
23781 +                */
23782 +               zref(node);
23783 +
23784 +               LOCK_CNT_INC(long_term_locked_znode);
23785 +       }
23786 +       spin_unlock_zlock(&node->lock);
23787 +       ON_DEBUG(check_lock_data());
23788 +       ON_DEBUG(check_lock_node_data(node));
23789 +       return ok;
23790 +}
23791 +
23792 +/*
23793 + * version of longterm_lock_znode() optimized for the most common case: read
23794 + * lock without any special flags, as taken by every tree traversal on the
23795 + * root node. Returns 1 if the caller has to fall back to the slow path.
23796 + */
23797 +static int longterm_lock_tryfast(lock_stack * owner)
23798 +{
23799 +       int result;
23800 +       znode *node;
23801 +       zlock *lock;
23802 +
23803 +       node = owner->request.node;
23804 +       lock = &node->lock;
23805 +
23806 +       assert("nikita-3340", schedulable());
23807 +       assert("nikita-3341", request_is_deadlock_safe(node,
23808 +                                                      ZNODE_READ_LOCK,
23809 +                                                      ZNODE_LOCK_LOPRI));
23810 +       spin_lock_zlock(lock);
23811 +       result = can_lock_object(owner);
23812 +       spin_unlock_zlock(lock);
23813 +       /* the first check only filters out a dying node; its result is replaced by the capture attempt below */
23814 +       if (likely(result != -EINVAL)) {
23815 +               spin_lock_znode(node);
23816 +               result = try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23817 +               spin_unlock_znode(node);
23818 +               spin_lock_zlock(lock);
23819 +               if (unlikely(result != 0)) {
23820 +                       owner->request.mode = 0;
23821 +               } else {
23822 +                       result = can_lock_object(owner);
23823 +                       if (unlikely(result == -E_REPEAT)) {
23824 +                               /* fall back to longterm_lock_znode() */
23825 +                               spin_unlock_zlock(lock);
23826 +                               return 1;
23827 +                       }
23828 +               }
23829 +               return lock_tail(owner, result, ZNODE_READ_LOCK);
23830 +       } else
23831 +               return 1;
23832 +}
23833 +
23834 +/* locks given lock object; returns 0 on success, -EINVAL if @node is dying, or another non-zero code (e.g. -E_REPEAT for a non-blocking request) */
23835 +int longterm_lock_znode(
23836 +                              /* local link object (allocated by lock owner thread, usually on its own
23837 +                               * stack) */
23838 +                              lock_handle * handle,
23839 +                              /* znode we want to lock. */
23840 +                              znode * node,
23841 +                              /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23842 +                              znode_lock_mode mode,
23843 +                              /* flags tested below: ZNODE_LOCK_HIPRI, ZNODE_LOCK_NONBLOCK, ZNODE_LOCK_DONT_FUSE */
23844 +                              znode_lock_request request) {
23845 +       int ret;
23846 +       int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23847 +       int non_blocking = 0;
23848 +       int has_atom;
23849 +       txn_capture cap_flags;
23850 +       zlock *lock;
23851 +       txn_handle *txnh;
23852 +       tree_level level;
23853 +
23854 +       /* Get current process context */
23855 +       lock_stack *owner = get_current_lock_stack();
23856 +
23857 +       /* Check that the lock handle is initialized and isn't already being
23858 +        * used. */
23859 +       assert("jmacd-808", handle->owner == NULL);
23860 +       assert("nikita-3026", schedulable());
23861 +       assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23862 +       assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23863 +       /* long term locks are not allowed in the VM contexts (->writepage(),
23864 +        * prune_{d,i}cache()).
23865 +        *
23866 +        * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23867 +        * bug caused by d_splice_alias() only working for directories.
23868 +        */
23869 +       assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23870 +       assert ("zam-1055", mode != ZNODE_NO_LOCK);
23871 +
23872 +       cap_flags = 0;
23873 +       if (request & ZNODE_LOCK_NONBLOCK) {
23874 +               cap_flags |= TXN_CAPTURE_NONBLOCKING;
23875 +               non_blocking = 1;
23876 +       }
23877 +
23878 +       if (request & ZNODE_LOCK_DONT_FUSE)
23879 +               cap_flags |= TXN_CAPTURE_DONT_FUSE;
23880 +
23881 +       /* If we are changing our process priority we must adjust a number
23882 +          of high priority owners for each znode that we already lock */
23883 +       if (hipri) {
23884 +               set_high_priority(owner);
23885 +       } else {
23886 +               set_low_priority(owner);
23887 +       }
23888 +       /* NOTE: @level is assigned here but not read anywhere else in this function */
23889 +       level = znode_get_level(node);
23890 +
23891 +       /* Fill request structure with our values. */
23892 +       owner->request.mode = mode;
23893 +       owner->request.handle = handle;
23894 +       owner->request.node = node;
23895 +
23896 +       txnh = get_current_context()->trans;
23897 +       lock = &node->lock;
23898 +       /* fast path for a plain read lock; a positive result means "fall back to the code below" */
23899 +       if (mode == ZNODE_READ_LOCK && request == 0) {
23900 +               ret = longterm_lock_tryfast(owner);
23901 +               if (ret <= 0)
23902 +                       return ret;
23903 +       }
23904 +
23905 +       has_atom = (txnh->atom != NULL);
23906 +
23907 +       /* Synchronize on node's zlock guard lock. */
23908 +       spin_lock_zlock(lock);
23909 +
23910 +       if (znode_is_locked(node) &&
23911 +           mode == ZNODE_WRITE_LOCK && recursive(owner))
23912 +               return lock_tail(owner, 0, mode);
23913 +
23914 +       for (;;) {
23915 +               /* Check the lock's availability: if it is unavailable we get
23916 +                  E_REPEAT, 0 indicates "can_lock", otherwise the node is
23917 +                  invalid.  */
23918 +               ret = can_lock_object(owner);
23919 +
23920 +               if (unlikely(ret == -EINVAL)) {
23921 +                       /* @node is dying. Leave it alone. */
23922 +                       break;
23923 +               }
23924 +
23925 +               if (unlikely(ret == -E_REPEAT && non_blocking)) {
23926 +                       /* either locking of @node by the current thread will
23927 +                        * lead to the deadlock, or lock modes are
23928 +                        * incompatible. */
23929 +                       break;
23930 +               }
23931 +
23932 +               assert("nikita-1844", (ret == 0)
23933 +                      || ((ret == -E_REPEAT) && !non_blocking));
23934 +               /* If we can get the lock... Try to capture first before
23935 +                  taking the lock. */
23936 +
23937 +               /* first handle commonest case where node and txnh are already
23938 +                * in the same atom. */
23939 +               /* safe to do without taking locks, because:
23940 +                *
23941 +                * 1. read of aligned word is atomic with respect to writes to
23942 +                * this word
23943 +                *
23944 +                * 2. false negatives are handled in try_capture().
23945 +                *
23946 +                * 3. false positives are impossible.
23947 +                *
23948 +                * PROOF: left as an exercise to the curious reader.
23949 +                *
23950 +                * Just kidding. Here is one:
23951 +                *
23952 +                * At the time T0 txnh->atom is stored in txnh_atom.
23953 +                *
23954 +                * At the time T1 node->atom is stored in node_atom.
23955 +                *
23956 +                * At the time T2 we observe that
23957 +                *
23958 +                *     txnh_atom != NULL && node_atom == txnh_atom.
23959 +                *
23960 +                * Imagine that at this moment we acquire node and txnh spin
23961 +                * lock in this order. Suppose that under spin lock we have
23962 +                *
23963 +                *     node->atom != txnh->atom,                       (S1)
23964 +                *
23965 +                * at the time T3.
23966 +                *
23967 +                * txnh->atom != NULL still, because txnh is open by the
23968 +                * current thread.
23969 +                *
23970 +                * Suppose node->atom == NULL, that is, node was un-captured
23971 +                * between T1, and T3. But un-capturing of formatted node is
23972 +                * always preceded by the call to invalidate_lock(), which
23973 +                * marks znode as JNODE_IS_DYING under zlock spin
23974 +                * lock. Contradiction, because can_lock_object() above checks
23975 +                * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23976 +                *
23977 +                * Suppose that node->atom != node_atom, that is, atom, node
23978 +                * belongs to was fused into another atom: node_atom was fused
23979 +                * into node->atom. Atom of txnh was equal to node_atom at T2,
23980 +                * which means that under spin lock, txnh->atom == node->atom,
23981 +                * because txnh->atom can only follow fusion
23982 +                * chain. Contradicts S1.
23983 +                *
23984 +                * The same for hypothesis txnh->atom != txnh_atom. Hence,
23985 +                * node->atom == node_atom == txnh_atom == txnh->atom. Again
23986 +                * contradicts S1. Hence S1 is false. QED.
23987 +                *
23988 +                */
23989 +
23990 +               if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23991 +                       ;
23992 +               } else {
23993 +                       /*
23994 +                        * unlock zlock spin lock here. It is possible for
23995 +                        * longterm_unlock_znode() to sneak in here, but there
23996 +                        * is no harm: invalidate_lock() will mark znode as
23997 +                        * JNODE_IS_DYING and this will be noted by
23998 +                        * can_lock_object() below.
23999 +                        */
24000 +                       spin_unlock_zlock(lock);
24001 +                       spin_lock_znode(node);
24002 +                       ret = try_capture(ZJNODE(node), mode, cap_flags);
24003 +                       spin_unlock_znode(node);
24004 +                       spin_lock_zlock(lock);
24005 +                       if (unlikely(ret != 0)) {
24006 +                               /* In the failure case, the txnmgr releases
24007 +                                  the znode's lock (or in some cases, it was
24008 +                                  released a while ago).  There's no need to
24009 +                                  reacquire it so we should return here,
24010 +                                  avoid releasing the lock. */
24011 +                               owner->request.mode = 0;
24012 +                               break;
24013 +                       }
24014 +
24015 +                       /* Check the lock's availability again -- this is
24016 +                          because under some circumstances the capture code
24017 +                          has to release and reacquire the znode spinlock. */
24018 +                       ret = can_lock_object(owner);
24019 +               }
24020 +
24021 +               /* This time, a return of (ret == 0) means we can lock, so we
24022 +                  should break out of the loop. */
24023 +               if (likely(ret != -E_REPEAT || non_blocking)) {
24024 +                       break;
24025 +               }
24026 +
24027 +               /* Lock is unavailable, we have to wait. */
24028 +
24029 +               /* By having semaphore initialization here we cannot lose
24030 +                  wakeup signal even if it comes after `nr_signaled' field
24031 +                  check. */
24032 +               ret = prepare_to_sleep(owner);
24033 +               if (unlikely(ret != 0)) {
24034 +                       break;
24035 +               }
24036 +
24037 +               assert_spin_locked(&(node->lock.guard));
24038 +               if (hipri) {
24039 +                       /* If we are going in high priority direction then
24040 +                          increase high priority requests counter for the
24041 +                          node */
24042 +                       lock->nr_hipri_requests++;
24043 +                       if (mode == ZNODE_WRITE_LOCK)
24044 +                               lock->nr_hipri_write_requests ++;
24045 +                       /* If there are no high priority owners for a node,
24046 +                          then immediately wake up low priority owners, so
24047 +                          they can detect possible deadlock */
24048 +                       if (lock->nr_hipri_owners == 0)
24049 +                               wake_up_all_lopri_owners(node);
24050 +               }
24051 +               list_add_tail(&owner->requestors_link, &lock->requestors);
24052 +
24053 +               /* Ok, here we have prepared a lock request, so unlock
24054 +                  a znode ... */
24055 +               spin_unlock_zlock(lock);
24056 +               /* ... and sleep; request.mode == ZNODE_NO_LOCK afterwards means the request was completed for us and ret_code holds the result */
24057 +               go_to_sleep(owner);
24058 +               if (owner->request.mode == ZNODE_NO_LOCK)
24059 +                       goto request_is_done;
24060 +               spin_lock_zlock(lock);
24061 +               if (owner->request.mode == ZNODE_NO_LOCK) {
24062 +                       spin_unlock_zlock(lock);
24063 +               request_is_done:
24064 +                       if (owner->request.ret_code == 0) {
24065 +                               LOCK_CNT_INC(long_term_locked_znode);
24066 +                               zref(node);
24067 +                       }
24068 +                       return owner->request.ret_code;
24069 +               }
24070 +               remove_lock_request(owner);
24071 +       }
24072 +
24073 +       return lock_tail(owner, ret, mode);
24074 +}
24075 +
24076 +/* Lock object invalidation: marks the znode JNODE_IS_DYING, drops the caller's
24077 +   write lock and fails all pending lock requests with -EINVAL. */
24078 +void invalidate_lock(lock_handle * handle      /* handle linking
24079 +                                                * the lock owner with
24080 +                                                * the lock object being
24081 +                                                * invalidated. */ )
24082 +{
24083 +       znode *node = handle->node;
24084 +       lock_stack *owner = handle->owner;
24085 +
24086 +       assert("zam-325", owner == get_current_lock_stack());
24087 +       assert("zam-103", znode_is_write_locked(node));
24088 +       assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
24089 +       assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
24090 +       assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
24091 +       assert("nikita-3097", znode_is_wlocked_once(node));
24092 +       assert_spin_locked(&(node->lock.guard));
24093 +       /* entered with the zlock guard held; it is released before returning */
24094 +       if (handle->signaled)
24095 +               atomic_dec(&owner->nr_signaled);
24096 +
24097 +       ZF_SET(node, JNODE_IS_DYING);
24098 +       unlink_object(handle);
24099 +       node->lock.nr_readers = 0;
24100 +
24101 +       invalidate_all_lock_requests(node);
24102 +       spin_unlock_zlock(&node->lock);
24103 +}
24104 +
24105 +/* Initializes lock_stack: empty lists, curpri set to 1 (high priority), semaphore count 0. */
24106 +void init_lock_stack(lock_stack * owner        /* pointer to
24107 +                                        * allocated
24108 +                                        * structure. */ )
24109 +{
24110 +       INIT_LIST_HEAD(&owner->locks);
24111 +       INIT_LIST_HEAD(&owner->requestors_link);
24112 +       spin_lock_init(&owner->sguard);
24113 +       owner->curpri = 1;
24114 +       sema_init(&owner->sema, 0);
24115 +}
24116 +
24117 +/* Bring the uninitialized zlock @lock into its initial state: all
24118 +   counters zero, both lists empty, guard unlocked. */
24119 +void reiser4_init_lock(zlock * lock)
24120 +{
24121 +       /* zero everything first; this is what clears the counters */
24122 +       memset(lock, 0, sizeof(*lock));
24123 +       INIT_LIST_HEAD(&lock->owners);
24124 +       INIT_LIST_HEAD(&lock->requestors);
24125 +       spin_lock_init(&lock->guard);
24126 +}
24127 +
24128 +/* Make @new refer to the lock held through @old. With @unlink_old set the lock is
24129 +   moved (@old is unlinked); otherwise @new becomes one more handle on the same lock. */
24130 +static void
24131 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
24132 +{
24133 +       znode *node = old->node;
24134 +       lock_stack *owner = old->owner;
24135 +       int signaled;
24136 +
24137 +       /* locks_list, modified by link_object() is not protected by
24138 +          anything. This is valid because only current thread ever modifies
24139 +          locks_list of its lock_stack.
24140 +        */
24141 +       assert("nikita-1827", owner == get_current_lock_stack());
24142 +       assert("nikita-1831", new->owner == NULL);
24143 +
24144 +       spin_lock_zlock(&node->lock);
24145 +
24146 +       signaled = old->signaled;
24147 +       if (unlink_old) {
24148 +               unlink_object(old);
24149 +       } else {
24150 +               if (node->lock.nr_readers > 0) {
24151 +                       node->lock.nr_readers += 1;     /* one more reader */
24152 +               } else {
24153 +                       node->lock.nr_readers -= 1;     /* one more recursive write lock */
24154 +               }
24155 +               if (signaled) {
24156 +                       atomic_inc(&owner->nr_signaled);
24157 +               }
24158 +               if (owner->curpri) {
24159 +                       node->lock.nr_hipri_owners += 1;
24160 +               }
24161 +               LOCK_CNT_INC(long_term_locked_znode);
24162 +
24163 +               zref(node);
24164 +       }
24165 +       link_object(new, owner, node);
24166 +       new->signaled = signaled;
24167 +
24168 +       spin_unlock_zlock(&node->lock);
24169 +}
24170 +
24171 +void move_lh(lock_handle * new, lock_handle * old)
24172 +{
24173 +       move_lh_internal(new, old, /* unlink_old: yes, @old is unlinked */ 1);
24174 +}
24175 +
24176 +void copy_lh(lock_handle * new, lock_handle * old)
24177 +{
24178 +       move_lh_internal(new, old, /* unlink_old: no, @old stays linked */ 0);
24179 +}
24180 +
24181 +/* true while requests to yield locks are counted against the current thread;
24182 +   after getting -E_DEADLOCK we unlock znodes until this returns false */
24183 +int check_deadlock(void)
24184 +{
24185 +       return atomic_read(&get_current_lock_stack()->nr_signaled) != 0;
24186 +}
24187 +
24188 +/* Before going to sleep we re-check "release lock" requests which might come from hi-pri threads.
24189 +   Returns 0 when no such request is counted, an -E_DEADLOCK error otherwise. */
24190 +int prepare_to_sleep(lock_stack * owner)
24191 +{
24192 +       assert("nikita-1847", owner == get_current_lock_stack());
24193 +       /* NOTE(Zam): We cannot reset the lock semaphore here because it may
24194 +          clear wake-up signal. The initial design was to re-check all
24195 +          conditions under which we continue locking, release locks or sleep
24196 +          until conditions are changed. However, even lock.c does not follow
24197 +          that design.  So, wake-up signal which is stored in semaphore state
24198 +          could be lost by semaphore reset.  The less complex scheme without
24199 +          resetting the semaphore is enough not to lose wake-ups.
24200 +
24201 +          if (0) {
24202 +
24203 +          NOTE-NIKITA: I commented call to sema_init() out hoping
24204 +          that it is the reason for a thread sleeping in
24205 +          down(&owner->sema) without any other thread running.
24206 +
24207 +          Anyway, it is just an optimization: if semaphore is not
24208 +          reinitialised at this point, in the worst case
24209 +          longterm_lock_znode() would have to iterate its loop once
24210 +          more.
24211 +          spin_lock_stack(owner);
24212 +          sema_init(&owner->sema, 0);
24213 +          spin_unlock_stack(owner);
24214 +          }
24215 +        */
24216 +
24217 +       /* We return -E_DEADLOCK if one or more "give me the lock" messages are
24218 +        * counted in nr_signaled */
24219 +       if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
24220 +               assert("zam-959", !owner->curpri);
24221 +               return RETERR(-E_DEADLOCK);
24222 +       }
24223 +       return 0;
24224 +}
24225 +
24226 +/* Wakes up a single thread: up() on @owner's semaphore, the one go_to_sleep() waits on */
24227 +void __reiser4_wake_up(lock_stack * owner)
24228 +{
24229 +       up(&owner->sema);
24230 +}
24231 +
24232 +/* Puts the calling thread to sleep on @owner's semaphore until somebody does up() on it */
24233 +void go_to_sleep(lock_stack * owner)
24234 +{
24235 +       /* Well, we might sleep here, so holding of any spinlocks is no-no */
24236 +       assert("nikita-3027", schedulable());
24237 +       /* the interruptible variant, down_interruptible(&owner->sema), is not used */
24238 +       down(&owner->sema);
24239 +}
24240 +
24241 +int lock_stack_isclean(lock_stack * owner)
24242 +{
24243 +       /* a stack that still has lock handles on it is not clean */
24244 +       if (!list_empty_careful(&owner->locks))
24245 +               return 0;
24246 +
24247 +       assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
24248 +       return 1;
24249 +}
24250 +
24251 +#if REISER4_DEBUG
24252 +
24253 +/*
24254 + * debugging functions
24255 + */
24256 +/* assert that every link on the list @head agrees with both of its neighbours */
24257 +static void list_check(struct list_head *head)
24258 +{
24259 +       struct list_head *pos;
24260 +
24261 +       list_for_each(pos, head)
24262 +               assert("", (pos->prev != NULL && pos->next != NULL &&
24263 +                           pos->prev->next == pos && pos->next->prev == pos));
24264 +}
24265 +
24266 +/* check consistency of locking data-structures hanging off the @stack */
24267 +static void check_lock_stack(lock_stack * stack)
24268 +{
24269 +       spin_lock_stack(stack);
24270 +       /* check that stack->locks is not corrupted */
24271 +       list_check(&stack->locks);
24272 +       spin_unlock_stack(stack);
24273 +}
24274 +
24275 +/* check consistency of the lock stack embedded in the current reiser4 context */
24276 +void check_lock_data(void)
24277 +{
24278 +       check_lock_stack(&get_current_context()->stack);
24279 +}
24280 +
24281 +/* check, under the zlock spin lock, the owners and requestors lists of @node's lock */
24282 +void check_lock_node_data(znode * node)
24283 +{
24284 +       spin_lock_zlock(&node->lock);
24285 +       list_check(&node->lock.owners);
24286 +       list_check(&node->lock.requestors);
24287 +       spin_unlock_zlock(&node->lock);
24288 +}
24289 +
24290 +/* debugging check that a lock request cannot dead lock against locks already
24291 + * held by the current thread. This check is, of course, not exhaustive. */
24292 +static int
24293 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
24294 +                        znode_lock_request request)
24295 +{
24296 +       lock_stack *stack = get_current_lock_stack();
24297 +       lock_handle *held;
24298 +
24299 +       /* only blocking hipri requests are examined */
24300 +       if (!(request & ZNODE_LOCK_HIPRI) || (request & ZNODE_LOCK_NONBLOCK))
24301 +               return 1;
24302 +       if (znode_get_level(node) == 0)
24303 +               return 1;
24304 +
24305 +       /*
24306 +        * such a request must not be issued while a node at a higher level
24307 +        * is locked; nodes reporting level 0 are not taken into account.
24308 +        */
24309 +       list_for_each_entry(held, &stack->locks, locks_link) {
24310 +               znode *locked = held->node;
24311 +
24312 +               if (znode_get_level(locked) == 0)
24313 +                       continue;
24314 +               if (znode_get_level(locked) > znode_get_level(node))
24315 +                       return 0;
24316 +       }
24317 +
24318 +       return 1;
24319 +}
24320 +
24321 +#endif
24322 +
24323 +/* debugging helper: name of lock mode @lock. The name of an unknown mode is
24324 +    formatted into static storage shared by all callers */
24325 +const char *lock_mode_name(znode_lock_mode lock)
24326 +{
24327 +       static char buf[30];
24328 +
24329 +       if (lock == ZNODE_READ_LOCK)
24330 +               return "read";
24331 +       if (lock == ZNODE_WRITE_LOCK)
24332 +               return "write";
24333 +
24334 +       /* neither of the two modes handled above: report the raw value */
24335 +       sprintf(buf, "unknown: %i", lock);
24336 +       return buf;
24337 +}
24338 +
24339 +/* Make Linus happy.
24340 +   Local variables:
24341 +   c-indentation-style: "K&R"
24342 +   mode-name: "LC"
24343 +   c-basic-offset: 8
24344 +   tab-width: 8
24345 +   fill-column: 79
24346 +   End:
24347 +*/
24348 diff --git a/fs/reiser4/lock.h b/fs/reiser4/lock.h
24349 new file mode 100644
24350 index 0000000..ebfccb0
24351 --- /dev/null
24352 +++ b/fs/reiser4/lock.h
24353 @@ -0,0 +1,272 @@
24354 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
24355 +
24356 +/* Long term locking data structures. See lock.c for details. */
24357 +
24358 +#ifndef __LOCK_H__
24359 +#define __LOCK_H__
24360 +
24361 +#include "forward.h"
24362 +#include "debug.h"
24363 +#include "dformat.h"
24364 +#include "key.h"
24365 +#include "coord.h"
24366 +#include "plugin/node/node.h"
24367 +#include "txnmgr.h"
24368 +#include "readahead.h"
24369 +
24370 +#include <linux/types.h>
24371 +#include <linux/spinlock.h>
24372 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
24373 +#include <asm/atomic.h>
24374 +#include <asm/semaphore.h>
24375 +
24376 +/* Per-znode lock object */
24377 +struct zlock {
24378 +       spinlock_t guard;       /* the zlock spin lock, see spin_lock_zlock() */
24379 +       /* The number of readers if positive; the number of recursively taken
24380 +          write locks if negative. Protected by zlock spin lock. */
24381 +       int nr_readers;
24382 +       /* A number of processes (lock_stacks) that have this object
24383 +          locked with high priority */
24384 +       unsigned nr_hipri_owners;
24385 +       /* A number of attempts to lock znode in high priority direction */
24386 +       unsigned nr_hipri_requests;
24387 +       /* NOTE(review): presumably the number of those attempts that ask for
24388 +          a write lock -- no user of this counter is visible here, confirm */
24389 +       unsigned nr_hipri_write_requests;
24390 +       struct list_head owners;        /* lock_handles of all lock_stacks which have this lock object locked */
24391 +       /* A linked list of lock_stacks that wait for this lock */
24392 +       struct list_head requestors;
24393 +};
24394 +
24395 +static inline void spin_lock_zlock(zlock *lock)
24396 +{
24397 +       /* check that no zlock spin lock is counted as held yet (see LOCK_CNT_INC below) */
24398 +       assert("", LOCK_CNT_NIL(spin_locked_zlock));
24399 +       /* lock stack spin lock is taken after zlock spin lock, so it must not be held here */
24400 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
24401 +
24402 +       spin_lock(&lock->guard);
24403 +
24404 +       LOCK_CNT_INC(spin_locked_zlock);
24405 +       LOCK_CNT_INC(spin_locked);
24406 +}
24407 +
24408 +static inline void spin_unlock_zlock(zlock *lock)
24409 +{
24410 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24411 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24412 +
24413 +       LOCK_CNT_DEC(spin_locked_zlock);        /* undo the counting done in spin_lock_zlock() */
24414 +       LOCK_CNT_DEC(spin_locked);
24415 +
24416 +       spin_unlock(&lock->guard);
24417 +}
24418 +
24419 +#define lock_is_locked(lock)          ((lock)->nr_readers != 0)
24420 +#define lock_is_rlocked(lock)         ((lock)->nr_readers > 0)
24421 +#define lock_is_wlocked(lock)         ((lock)->nr_readers < 0)
24422 +#define lock_is_wlocked_once(lock)    ((lock)->nr_readers == -1)
24423 +#define lock_can_be_rlocked(lock)     ((lock)->nr_readers >=0)
24424 +#define lock_mode_compatible(lock, mode)                               \
24425 +             (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24426 +              ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
24427 +
24428 +/* Since we have R/W znode locks we need additional bidirectional `link'
24429 +   objects to implement n<->m relationship between lock owners and lock
24430 +   objects. We call them `lock handles'.
24431 +
24432 +   Locking: see lock.c/"SHORT-TERM LOCKING"
24433 +*/
24434 +struct lock_handle {
24435 +       /* This flag indicates that a signal to yield a lock was passed to
24436 +          lock owner and counted in owner->nr_signaled
24437 +
24438 +          Locking: this is accessed under spin lock on ->node.
24439 +        */
24440 +       int signaled;
24441 +       /* A link to owner of a lock */
24442 +       lock_stack *owner;
24443 +       /* A link to znode locked; NULL when nothing is locked through this handle */
24444 +       znode *node;
24445 +       /* Linkage into owner->locks, the list of all locks for a process */
24446 +       struct list_head locks_link;
24447 +       /* A list of all owners for a znode (presumably linkage into zlock->owners) */
24448 +       struct list_head owners_link;
24449 +};
24450 +
24451 +typedef struct lock_request {
24452 +       /* A pointer to uninitialized link object */
24453 +       lock_handle *handle;
24454 +       /* A pointer to the object we want to lock */
24455 +       znode *node;
24456 +       /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24457 +       znode_lock_mode mode;
24458 +       /* how dispatch_lock_requests() returns lock request result code */
24459 +       int ret_code;
24460 +} lock_request;
24461 +
24462 +/* A lock stack structure for accumulating locks owned by a process */
24463 +struct lock_stack {
24464 +       /* A guard lock protecting a lock stack */
24465 +       spinlock_t sguard;
24466 +       /* number of znodes which were requested by high priority processes */
24467 +       atomic_t nr_signaled;
24468 +       /* Current priority of a process
24469 +
24470 +          This is only accessed by the current thread and thus requires no
24471 +          locking.
24472 +        */
24473 +       int curpri;
24474 +       /* A list of all locks owned by this process. Elements can be added to
24475 +        * this list only by the current thread. ->node pointers in this list
24476 +        * can be only changed by the current thread. */
24477 +       struct list_head locks;
24478 +       /* When lock_stack waits for the lock, it puts itself on double-linked
24479 +          requestors list of that lock */
24480 +       struct list_head requestors_link;
24481 +       /* Current lock request info.
24482 +
24483 +          This is only accessed by the current thread and thus requires no
24484 +          locking.
24485 +        */
24486 +       lock_request request;
24487 +       /* It is a lock_stack's synchronization object: the process sleeps on
24488 +          it when a lock that it wishes to add to this lock_stack is not
24489 +          immediately available. It is used instead of wait_queue_t object
24490 +          due to locking problems (lost wake up). "lost wakeup" occurs when
24491 +          process is woken up before it actually goes to sleep (through
24492 +          sleep_on()). Using a semaphore object is the simplest way to avoid
24493 +          that problem.
24494 +
24495 +          A semaphore is used in the following way: only the process that is
24496 +          the owner of the lock_stack initializes it (to zero) and calls
24497 +          down(sema) on it. Usually this causes the process to sleep on the
24498 +          semaphore. Other processes may wake it up by calling up(sema). The
24499 +          advantage to a semaphore is that up() and down() calls are not
24500 +          required to preserve order. Unlike wait_queue it works when process
24501 +          is woken up before getting to sleep.
24502 +
24503 +          NOTE-NIKITA: Transaction manager is going to have condition variables
24504 +          (&kcondvar_t) anyway, so this probably will be replaced with
24505 +          one in the future.
24506 +
24507 +          After further discussion, Nikita has shown me that Zam's implementation is
24508 +          exactly a condition variable.  The znode's {zguard,requestors_list} represents
24509 +          condition variable and the lock_stack's {sguard,semaphore} guards entry and
24510 +          exit from the condition variable's wait queue.  But the existing code can't
24511 +          just be replaced with a more general abstraction, and I think it's fine the way
24512 +          it is. */
24513 +       struct semaphore sema;
24514 +#if REISER4_DEBUG
24515 +       int nr_locks;           /* number of lock handles in the ->locks list */
24516 +#endif
24517 +};
24518 +
24519 +
24520 +/*
24521 +  User-visible znode locking functions
24522 +*/
24523 +
24524 +extern int longterm_lock_znode(lock_handle * handle,
24525 +                              znode * node,
24526 +                              znode_lock_mode mode,
24527 +                              znode_lock_request request);
24528 +
24529 +extern void longterm_unlock_znode(lock_handle * handle);
24530 +
24531 +extern int check_deadlock(void);
24532 +
24533 +extern lock_stack *get_current_lock_stack(void);
24534 +
24535 +extern void init_lock_stack(lock_stack * owner);
24536 +extern void reiser4_init_lock(zlock * lock);
24537 +
24538 +static inline void init_lh(lock_handle *lh)
24539 +{
24540 +#if !REISER4_DEBUG
24541 +       lh->node = NULL;        /* done_lh() tests ->node only */
24542 +#else
24543 +       memset(lh, 0, sizeof(*lh));
24544 +       INIT_LIST_HEAD(&lh->owners_link);
24545 +       INIT_LIST_HEAD(&lh->locks_link);
24546 +#endif
24547 +}
24548 +
24549 +static inline  void done_lh(lock_handle *lh)
24550 +{
24551 +       assert("zam-342", lh != NULL);
24552 +       if (lh->node != NULL)   /* NULL ->node: nothing is locked through @lh */
24553 +               longterm_unlock_znode(lh);
24554 +}
24555 +
24556 +extern void move_lh(lock_handle * new, lock_handle * old);
24557 +extern void copy_lh(lock_handle * new, lock_handle * old);
24558 +
24559 +extern int prepare_to_sleep(lock_stack * owner);
24560 +extern void go_to_sleep(lock_stack * owner);
24561 +extern void __reiser4_wake_up(lock_stack * owner);
24562 +
24563 +extern int lock_stack_isclean(lock_stack * owner);
24564 +
24565 +/* zlock object state check macros: only used in assertions.  Both forms imply that the
24566 +   lock is held by the current thread. */
24567 +extern int znode_is_write_locked(const znode *);
24568 +extern void invalidate_lock(lock_handle *);
24569 +
24570 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24571 +#define spin_ordering_pred_stack(stack)                        \
24572 +       (LOCK_CNT_NIL(spin_locked_stack) &&             \
24573 +        LOCK_CNT_NIL(spin_locked_txnmgr) &&            \
24574 +        LOCK_CNT_NIL(spin_locked_inode) &&             \
24575 +        LOCK_CNT_NIL(rw_locked_cbk_cache) &&           \
24576 +        LOCK_CNT_NIL(spin_locked_super_eflush) )
24577 +
24578 +static inline void spin_lock_stack(lock_stack *stack)
24579 +{
24580 +       assert("", spin_ordering_pred_stack(stack));
24581 +       spin_lock(&(stack->sguard));
24582 +       LOCK_CNT_INC(spin_locked_stack);
24583 +       LOCK_CNT_INC(spin_locked);
24584 +}
24585 +
24586 +static inline void spin_unlock_stack(lock_stack *stack)
24587 +{
24588 +       assert_spin_locked(&(stack->sguard));
24589 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24590 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24591 +       LOCK_CNT_DEC(spin_locked_stack);
24592 +       LOCK_CNT_DEC(spin_locked);
24593 +       spin_unlock(&(stack->sguard));
24594 +}
24595 +
24596 +/* __reiser4_wake_up() for @owner, done under the lock stack spin lock */
24597 +static inline void reiser4_wake_up(lock_stack * owner)
24598 +{
24599 +       spin_lock_stack(owner);
24600 +       __reiser4_wake_up(owner);
24601 +       spin_unlock_stack(owner);
24602 +}
24603 +
24604 +const char *lock_mode_name(znode_lock_mode lock);
24605 +
24606 +#if REISER4_DEBUG
24607 +extern void check_lock_data(void);
24608 +extern void check_lock_node_data(znode * node);
24609 +#else                          /* checks compile away; macro arity matches the prototypes above */
24610 +#define check_lock_data() noop
24611 +#define check_lock_node_data(node) noop
24612 +#endif
24613 +
24614 +/* __LOCK_H__ */
24615 +#endif
24616 +
24617 +/* Make Linus happy.
24618 +   Local variables:
24619 +   c-indentation-style: "K&R"
24620 +   mode-name: "LC"
24621 +   c-basic-offset: 8
24622 +   tab-width: 8
24623 +   fill-column: 120
24624 +   End:
24625 +*/
24626 diff --git a/fs/reiser4/oid.c b/fs/reiser4/oid.c
24627 new file mode 100644
24628 index 0000000..f311d06
24629 --- /dev/null
24630 +++ b/fs/reiser4/oid.c
24631 @@ -0,0 +1,141 @@
24632 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24633 +
24634 +#include "debug.h"
24635 +#include "super.h"
24636 +#include "txnmgr.h"
24637 +
24638 +/* we used to have oid allocation plugin. It was removed because it
24639 +   was recognized as providing unneeded level of abstraction. If one
24640 +   ever will find it useful - look at yet_unneeded_abstractions/oid
24641 +*/
24642 +
24643 +/*
24644 + * Seed the in-memory oid allocator of @super: @next becomes the next oid to
24645 + * hand out and @nr_files the number of oids in use. Both come from the disk
24646 + * format plugin, which reads them from the disk during mount. Returns 0.
24647 + */
24648 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24649 +{
24650 +       reiser4_super_info_data *info = get_super_private(super);
24651 +
24652 +       /* no locking is done here, unlike in the other oid_*() functions */
24653 +       info->oids_in_use = nr_files;
24654 +       info->next_to_use = next;
24655 +
24656 +       return 0;
24657 +}
24658 +
24659 +/*
24660 + * hand out the next unused oid and count it as being in use. When allocator
24661 + * has run out of oids, ABSOLUTE_MAX_OID is returned and nothing is counted.
24662 + */
24663 +oid_t oid_allocate(struct super_block * super)
24664 +{
24665 +       reiser4_super_info_data *info = get_super_private(super);
24666 +       oid_t result = ABSOLUTE_MAX_OID;
24667 +
24668 +       spin_lock_reiser4_super(info);
24669 +       /* ABSOLUTE_MAX_OID itself is never handed out as a valid oid */
24670 +       if (info->next_to_use != ABSOLUTE_MAX_OID) {
24671 +               result = info->next_to_use;
24672 +               info->next_to_use++;
24673 +               info->oids_in_use++;
24674 +       }
24675 +       spin_unlock_reiser4_super(info);
24676 +
24677 +       return result;
24678 +}
24679 +
24680 +/*
24681 + * Tell oid allocator that @oid is now free. Only the count of oids in use
24682 + * is decremented; @oid itself is not looked at. Returns 0.
24683 + */
24684 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24685 +{
24686 +       reiser4_super_info_data *info = get_super_private(super);
24687 +
24688 +       spin_lock_reiser4_super(info);
24689 +       info->oids_in_use -= 1;
24690 +       spin_unlock_reiser4_super(info);
24691 +
24692 +       return 0;
24693 +}
24694 +
24695 +/*
24696 + * peek at the oid that the next oid_allocate() would return, without
24697 + * allocating it. This is used by disk format plugin to save oid allocator
24698 + * state on the disk.
24699 + */
24700 +oid_t oid_next(const struct super_block * super)
24701 +{
24702 +       reiser4_super_info_data *info = get_super_private(super);
24703 +       oid_t next;
24704 +
24705 +       /* take a snapshot of the counter under the super block spin lock */
24706 +       spin_lock_reiser4_super(info);
24707 +       next = info->next_to_use;
24708 +       spin_unlock_reiser4_super(info);
24709 +
24710 +       return next;
24711 +}
24712 +
24713 +/*
24714 + * returns number of currently used oids, or -1 when that number is LONG_MAX
24715 + * or more. This is used by statfs(2) to report number of "inodes" and by
24716 + * disk format plugin to save oid allocator state on the disk.
24717 + */
24718 +long oids_used(const struct super_block *super)
24719 +{
24720 +       reiser4_super_info_data *sbinfo = get_super_private(super);
24721 +       oid_t used;
24722 +
24723 +       spin_lock_reiser4_super(sbinfo);
24724 +       used = sbinfo->oids_in_use;
24725 +       spin_unlock_reiser4_super(sbinfo);
24726 +       /* the limit is LONG_MAX, computed as ~0UL >> 1. Casting (long)~0 to
24727 +        * __u64 before shifting would sign-extend and yield 2^63 - 1 even
24728 +        * where long is 32 bits, leaving the truncation below undetected */
24729 +       if (used < (__u64) (~0UL >> 1))
24730 +               return (long)used;
24731 +       return (long)-1;
24732 +}
24733 +
24734 +/*
24735 + * Account one created object in the current atom. This is done after call
24736 + * to oid_allocate() at the point when we are irrevocably committed to
24737 + * creation of the new file (i.e., when oid allocation cannot be any longer
24738 + * rolled back due to some error).
24739 + */
24740 +void oid_count_allocated(void)
24741 +{
24742 +       txn_atom *cur = get_current_atom_locked();
24743 +
24744 +       /* the atom comes back locked; drop its lock once counted */
24745 +       cur->nr_objects_created += 1;
24746 +       spin_unlock_atom(cur);
24747 +}
24748 +
24749 +/*
24750 + * Account one deleted object in the current atom. This is done after call
24751 + * to oid_release() at the point when we are irrevocably committed to the
24752 + * deletion of the file (i.e., when oid release cannot be rolled back).
24753 + */
24754 +void oid_count_released(void)
24755 +{
24756 +       txn_atom *cur = get_current_atom_locked();
24757 +
24758 +       /* the atom comes back locked; drop its lock once counted */
24759 +       cur->nr_objects_deleted += 1;
24760 +       spin_unlock_atom(cur);
24761 +}
24762 +
24763 +/*
24764 +   Local variables:
24765 +   c-indentation-style: "K&R"
24766 +   mode-name: "LC"
24767 +   c-basic-offset: 8
24768 +   tab-width: 8
24769 +   fill-column: 120
24770 +   scroll-step: 1
24771 +   End:
24772 +*/
24773 diff --git a/fs/reiser4/page_cache.c b/fs/reiser4/page_cache.c
24774 new file mode 100644
24775 index 0000000..185ecc9
24776 --- /dev/null
24777 +++ b/fs/reiser4/page_cache.c
24778 @@ -0,0 +1,712 @@
24779 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24780 + * reiser4/README */
24781 +
24782 +/* Memory pressure hooks. Fake inodes handling. */
24783 +/* We store all file system meta data (and data, of course) in the page cache.
24784 +
24785 +   What does this mean? Instead of using bread/brelse we create special
24786 +   "fake" inode (one per super block) and store content of formatted nodes
24787 +   into pages bound to this inode in the page cache. In newer kernels bread()
24788 +   already uses inode attached to block device (bd_inode). Advantage of having
24789 +   our own fake inode is that we can install appropriate methods in its
24790 +   address_space operations. Such methods are called by VM on memory pressure
24791 +   (or during background page flushing) and we can use them to react
24792 +   appropriately.
24793 +
24794 +   In initial version we only support one block per page. Support for multiple
24795 +   blocks per page is complicated by relocation.
24796 +
24797 +   To each page, used by reiser4, jnode is attached. jnode is analogous to
24798 +   buffer head. Difference is that jnode is bound to the page permanently:
24799 +   jnode cannot be removed from memory until its backing page is.
24800 +
24801 +   jnode contain pointer to page (->pg field) and page contain pointer to
24802 +   jnode in ->private field. Pointer from jnode to page is protected by
24803 +   jnode's spinlock and pointer from page to jnode is protected by page lock
24804 +   (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24805 +   lock. To go into reverse direction use jnode_lock_page() function that uses
24806 +   standard try-lock-and-release device.
24807 +
24808 +   Properties:
24809 +
24810 +   1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24811 +   reference counter is increased.
24812 +
24813 +   2. when jnode-to-page mapping is destroyed (by jnode_detach_page() and
24814 +   page_detach_jnode()), page reference counter is decreased.
24815 +
24816 +   3. on jload() reference counter on jnode page is increased, page is
24817 +   kmapped and `referenced'.
24818 +
24819 +   4. on jrelse() inverse operations are performed.
24820 +
24821 +   5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24822 +
24823 +   DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24824 +   historically.]
24825 +
24826 +   [In the following discussion, `lock' invariably means long term lock on
24827 +   znode.] (What about page locks?)
24828 +
24829 +   There is some special class of deadlock possibilities related to memory
24830 +   pressure. Locks acquired by other reiser4 threads are accounted for in
24831 +   deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24832 +   invoked additional hidden arc is added to the locking graph: thread that
24833 +   tries to allocate memory waits for ->vm_writeback() to finish. If this
24834 +   thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24835 +   prevention is useless.
24836 +
24837 +   Another related problem is possibility for ->vm_writeback() to run out of
24838 +   memory itself. This is not a problem for ext2 and friends, because their
24839 +   ->vm_writeback() don't allocate much memory, but reiser4 flush is
24840 +   definitely able to allocate huge amounts of memory.
24841 +
24842 +   It seems that there is no reliable way to cope with the problems above.
24843 +   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24844 +   context) wouldn't perform any flushing itself, but rather should just wake
24845 +   up some auxiliary thread dedicated for this purpose (or, the same thread
24846 +   that does periodic commit of old atoms (ktxnmgrd.c)).
24847 +
24848 +   Details:
24849 +
24850 +   1. Page is called `reclaimable' against particular reiser4 mount F if this
24851 +   page can be ultimately released by try_to_free_pages() under presumptions
24852 +   that:
24853 +
24854 +    a. ->vm_writeback() for F is no-op, and
24855 +
24856 +    b. none of the threads accessing F are making any progress, and
24857 +
24858 +    c. other reiser4 mounts obey the same memory reservation protocol as F
24859 +    (described below).
24860 +
24861 +   For example, clean un-pinned page, or page occupied by ext2 data are
24862 +   reclaimable against any reiser4 mount.
24863 +
24864 +   When there is more than one reiser4 mount in a system, condition (c) makes
24865 +   reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24866 +
24867 +   THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24868 +
24869 +   Fake inode is used to bind formatted nodes and each node is indexed within
24870 +   fake inode by its block number. If block size is smaller than page size, it
24871 +   may so happen that block mapped to the page with formatted node is occupied
24872 +   by unformatted node or is unallocated. This leads to some complications,
24873 +   because flushing whole page can lead to an incorrect overwrite of
24874 +   unformatted node that, moreover, can be cached in some other place as
24875 +   part of the file body. To avoid this, buffers for unformatted nodes are
24876 +   never marked dirty. Also pages in the fake inode are never marked dirty.
24877 +   This rules out usage of ->writepage() as memory pressure hook. Instead
24878 +   ->releasepage() is used.
24879 +
24880 +   Josh is concerned that page->buffer is going to die. This should not pose
24881 +   significant problem though, because we need to add some data structures to
24882 +   the page anyway (jnode) and all necessary book keeping can be put there.
24883 +
24884 +*/
24885 +
24886 +/* Life cycle of pages/nodes.
24887 +
24888 +   jnode contains reference to page and page contains reference back to
24889 +   jnode. This reference is counted in page ->count. Thus, page bound to jnode
24890 +   cannot be released back into free pool.
24891 +
24892 +    1. Formatted nodes.
24893 +
24894 +      1. formatted node is represented by znode. When new znode is created its
24895 +      ->pg pointer is NULL initially.
24896 +
24897 +      2. when node content is loaded into znode (by call to zload()) for the
24898 +      first time following happens (in call to ->read_node() or
24899 +      ->allocate_node()):
24900 +
24901 +        1. new page is added to the page cache.
24902 +
24903 +        2. this page is attached to znode and its ->count is increased.
24904 +
24905 +        3. page is kmapped.
24906 +
24907 +      3. if more calls to zload() follow (without corresponding zrelses), page
24908 +      counter is left intact and in its stead ->d_count is increased in znode.
24909 +
24910 +      4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24911 +      ->release_node() is called and page is kunmapped as result.
24912 +
24913 +      5. at some moment node can be captured by a transaction. Its ->x_count
24914 +      is then increased by transaction manager.
24915 +
24916 +      6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24917 +      bit set) following will happen (also see comment at the top of znode.c):
24918 +
24919 +        1. when last lock is released, node will be uncaptured from
24920 +        transaction. This releases reference that transaction manager acquired
24921 +        at the step 5.
24922 +
24923 +        2. when last reference is released, zput() detects that node is
24924 +        actually deleted and calls ->delete_node()
24925 +        operation. page_cache_delete_node() implementation detaches jnode from
24926 +        page and releases page.
24927 +
24928 +      7. otherwise (node wasn't removed from the tree), last reference to
24929 +      znode will be released after transaction manager committed transaction
24930 +      node was in. This implies squallocing of this node (see
24931 +      flush.c). Nothing special happens at this point. Znode is still in the
24932 +      hash table and page is still attached to it.
24933 +
24934 +      8. znode is actually removed from the memory because of the memory
24935 +      pressure, or during umount (znodes_tree_done()). Anyway, znode is
24936 +      removed by the call to zdrop(). At this moment, page is detached from
24937 +      znode and removed from the inode address space.
24938 +
24939 +*/
24940 +
24941 +#include "debug.h"
24942 +#include "dformat.h"
24943 +#include "key.h"
24944 +#include "txnmgr.h"
24945 +#include "jnode.h"
24946 +#include "znode.h"
24947 +#include "block_alloc.h"
24948 +#include "tree.h"
24949 +#include "vfs_ops.h"
24950 +#include "inode.h"
24951 +#include "super.h"
24952 +#include "entd.h"
24953 +#include "page_cache.h"
24954 +#include "ktxnmgrd.h"
24955 +
24956 +#include <linux/types.h>
24957 +#include <linux/fs.h>
24958 +#include <linux/mm.h>          /* for struct page */
24959 +#include <linux/swap.h>                /* for struct page */
24960 +#include <linux/pagemap.h>
24961 +#include <linux/bio.h>
24962 +#include <linux/writeback.h>
24963 +#include <linux/blkdev.h>
24964 +
24965 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24966 +/* defined below; declared here because init_fake_inode() refers to it */
24967 +static struct address_space_operations formatted_fake_as_ops;
24968 +/* object ids of the inodes obtained by init_formatted_fake() */
24969 +static const oid_t fake_ino = 0x1;
24970 +static const oid_t bitmap_ino = 0x2;
24971 +static const oid_t cc_ino = 0x3;
24972 +
24973 +/* attach address space operations to new inode @fake and store it in @pfake */
24974 +static void init_fake_inode(struct super_block *super, struct inode *fake,
24975 +                           struct inode **pfake)
24976 +{
24977 +       assert("nikita-2168", fake->i_state & I_NEW);
24978 +       *pfake = fake;
24979 +       fake->i_mapping->a_ops = &formatted_fake_as_ops;
24980 +       /* set-up is complete, inode can be unlocked */
24981 +       unlock_new_inode(fake);
24982 +}
24983 +
24984 +/**
24985 + * init_formatted_fake - iget inodes for formatted nodes, bitmaps and cc
24986 + * @super: super block to init fake inodes for
24987 + *
24988 + * Each inode is obtained with iget_locked() and stored in the super block
24989 + * private data by init_fake_inode(). Returns 0, or RETERR(-ENOMEM) after
24990 + * putting the inodes stored so far if one of them cannot be obtained.
24991 + */
24992 +int init_formatted_fake(struct super_block *super)
24993 +{
24994 +       reiser4_super_info_data *sinfo;
24995 +       struct inode *fake;
24996 +       struct inode *bitmap;
24997 +       struct inode *cc;
24998 +
24999 +       assert("nikita-1703", super != NULL);
25000 +       sinfo = get_super_private_nocheck(super);
25001 +
25002 +       fake = iget_locked(super, oid_to_ino(fake_ino));
25003 +       if (fake == NULL)
25004 +               return RETERR(-ENOMEM);
25005 +       init_fake_inode(super, fake, &sinfo->fake);
25006 +
25007 +       bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
25008 +       if (bitmap == NULL) {
25009 +               /* undo the first step */
25010 +               iput(sinfo->fake);
25011 +               sinfo->fake = NULL;
25012 +               return RETERR(-ENOMEM);
25013 +       }
25014 +       init_fake_inode(super, bitmap, &sinfo->bitmap);
25015 +
25016 +       cc = iget_locked(super, oid_to_ino(cc_ino));
25017 +       if (cc == NULL) {
25018 +               iput(sinfo->fake);
25019 +               iput(sinfo->bitmap);
25020 +               sinfo->fake = NULL;
25021 +               sinfo->bitmap = NULL;
25022 +               return RETERR(-ENOMEM);
25023 +       }
25024 +       init_fake_inode(super, cc, &sinfo->cc);
25025 +       return 0;
25026 +}
25027 +
25028 +/**
25029 + * done_formatted_fake - release inodes used by formatted nodes, bitmaps and cc
25030 + * @super: super block whose fake inodes are released
25031 + *
25032 + * Counterpart of init_formatted_fake(). Every inode still recorded in the
25033 + * super block private data is put and its pointer is reset to NULL, so
25034 + * calling this after a failed or partial set-up is harmless as long as the
25035 + * unset pointers are NULL.
25036 + */
25037 +void done_formatted_fake(struct super_block *super)
25038 +{
25039 +       reiser4_super_info_data *sinfo = get_super_private_nocheck(super);
25040 +
25041 +       if (sinfo->fake) {
25042 +               /* all pages of formatted nodes have to be gone by now */
25043 +               assert("vs-1426", sinfo->fake->i_data.nrpages == 0);
25044 +               iput(sinfo->fake);
25045 +               sinfo->fake = NULL;
25046 +       }
25047 +
25048 +       if (sinfo->bitmap) {
25049 +               iput(sinfo->bitmap);
25050 +               sinfo->bitmap = NULL;
25051 +       }
25052 +
25053 +       if (sinfo->cc) {
25054 +               iput(sinfo->cc);
25055 +               sinfo->cc = NULL;
25056 +       }
25057 +}
25058 +
25059 +void reiser4_wait_page_writeback(struct page *page)
25060 +{
25061 +       assert("zam-783", PageLocked(page));
25062 +       /* called and returns with @page locked; lock is dropped while waiting */
25063 +       do {
25064 +               unlock_page(page);
25065 +               wait_on_page_writeback(page);
25066 +               lock_page(page);
25067 +       } while (PageWriteback(page));  /* recheck with page locked again */
25068 +}
25069 +
25070 +/* reiser4 tree of the super block that host inode of @page's mapping is in */
25071 +reiser4_tree *tree_by_page(const struct page *page)
25072 +{
25073 +       assert("nikita-2461", page != NULL);
25074 +       return &(get_super_private(page->mapping->host->i_sb)->tree);
25075 +}
25076 +
25077 +/* bio completion callback for single page read. Sets page up-to-date or in
25078 +   error according to BIO_UPTODATE, unlocks the page and puts the bio.
25079 +
25080 +   mpage_end_io_read() would also do. But it's static.
25081 +*/
25082 +static int end_bio_single_page_read(struct bio *bio,
25083 +                                   unsigned int bytes_done UNUSED_ARG,
25084 +                                   int err UNUSED_ARG)
25085 +{
25086 +       struct page *pg;
25087 +
25088 +       if (bio->bi_size != 0) {
25089 +               warning("nikita-3332", "Truncated single page read: %i",
25090 +                       bio->bi_size);
25091 +               return 1;
25092 +       }
25093 +
25094 +       pg = bio->bi_io_vec[0].bv_page;
25095 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
25096 +               ClearPageUptodate(pg);
25097 +               SetPageError(pg);
25098 +       } else {
25099 +               SetPageUptodate(pg);
25100 +       }
25101 +
25102 +       unlock_page(pg);
25103 +       bio_put(bio);
25104 +       return 0;
25105 +}
25106 +
25107 +/* bio completion callback for single page write. Flags the page with an
25108 +   error unless BIO_UPTODATE is set, ends page writeback and puts the bio.
25109 +
25110 +   mpage_end_io_write() would also do. But it's static.
25111 +*/
25112 +static int end_bio_single_page_write(struct bio *bio,
25113 +                                    unsigned int bytes_done UNUSED_ARG,
25114 +                                    int err UNUSED_ARG)
25115 +{
25116 +       struct page *pg;
25117 +
25118 +       if (bio->bi_size != 0) {
25119 +               warning("nikita-3333", "Truncated single page write: %i",
25120 +                       bio->bi_size);
25121 +               return 1;
25122 +       }
25123 +
25124 +       pg = bio->bi_io_vec[0].bv_page;
25125 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
25126 +               SetPageError(pg);
25127 +
25128 +       end_page_writeback(pg);
25129 +       bio_put(bio);
25130 +       return 0;
25131 +}
25132 +
25133 +/* ->readpage() method for formatted nodes: page_io() READ of @page's jnode */
25134 +static int formatted_readpage(struct file *f UNUSED_ARG,
25135 +                             struct page *page /* page to read */ )
25136 +{
25137 +       assert("nikita-2412", PagePrivate(page) && jprivate(page));
25138 +       return page_io(page, jprivate(page), READ, get_gfp_mask());
25139 +}
25140 +
25141 +/**
25142 + * page_io - submit single-page bio request
25143 + * @page: locked page to perform io for
25144 + * @node: jnode of page
25145 + * @rw: READ or WRITE
25146 + * @gfp: gfp mask for bio allocation
25147 + *
25148 + * Builds a single-page bio with page_bio() and submits it. For WRITE the page
25149 + * is marked as being under writeback and unlocked before submission. For READ
25150 + * the page is left locked (end_bio_single_page_read() unlocks it). If bio
25151 + * cannot be built, or on WRITE to read-only file system, page is unlocked
25152 + * and nothing is submitted. Returns 0 or error code from page_bio().
25153 + */
25154 +int page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
25155 +{
25156 +       struct bio *bio;
25157 +
25158 +       assert("nikita-2094", page != NULL);
25159 +       assert("nikita-2226", PageLocked(page));
25160 +       assert("nikita-2634", node != NULL);
25161 +       assert("nikita-2893", rw == READ || rw == WRITE);
25162 +
25163 +       /* nothing is written to a read-only file system */
25164 +       if (rw && unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
25165 +               unlock_page(page);
25166 +               return 0;
25167 +       }
25168 +
25169 +       bio = page_bio(page, node, rw, gfp);
25170 +       if (IS_ERR(bio)) {
25171 +               unlock_page(page);
25172 +               return PTR_ERR(bio);
25173 +       }
25174 +
25175 +       if (rw == WRITE) {
25176 +               SetPageWriteback(page);
25177 +               unlock_page(page);
25178 +       }
25179 +       reiser4_submit_bio(rw, bio);
25180 +       return 0;
25181 +}
25182 +
25183 +/* helper function to construct bio for page: one block of @node placed at
25184 +   its io block number, completion handler chosen by @rw. Returns the bio, or
25185 +   ERR_PTR() of -ENOMEM (no bio) or -EINVAL (page cannot be added to bio). */
25186 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
25187 +{
25188 +       struct bio *bio;
25189 +       int blksz;
25190 +       struct super_block *super;
25191 +       reiser4_block_nr blocknr;
25192 +
25193 +       assert("nikita-2092", page != NULL);
25194 +       assert("nikita-2633", node != NULL);
25195 +
25196 +       /* Simple implementation in the assumption that blocksize == pagesize.
25197 +          We only have to submit one block, but submit_bh() will allocate bio
25198 +          anyway, so lets use all the bells-and-whistles of bio code. */
25199 +       bio = bio_alloc(gfp, 1);
25200 +       if (bio == NULL)
25201 +               return ERR_PTR(RETERR(-ENOMEM));
25202 +
25203 +       super = page->mapping->host->i_sb;
25204 +       assert("nikita-2029", super != NULL);
25205 +       blksz = super->s_blocksize;
25206 +       assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
25207 +
25208 +       spin_lock_jnode(node);
25209 +       blocknr = *jnode_get_io_block(node);
25210 +       spin_unlock_jnode(node);
25211 +
25212 +       assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
25213 +       assert("nikita-2276", !blocknr_is_fake(&blocknr));
25214 +
25215 +       bio->bi_bdev = super->s_bdev;
25216 +       /* fill bio->bi_sector before calling bio_add_page(), because
25217 +        * q->merge_bvec_fn may want to inspect it (see
25218 +        * drivers/md/linear.c:linear_mergeable_bvec() for example. */
25219 +       bio->bi_sector = blocknr * (blksz >> 9);
25220 +
25221 +       if (!bio_add_page(bio, page, blksz, 0)) {
25222 +               warning("nikita-3452",
25223 +                       "Single page bio cannot be constructed");
25224 +               /* put the bio allocated above, it would be leaked otherwise */
25225 +               bio_put(bio);
25226 +               return ERR_PTR(RETERR(-EINVAL));
25227 +       }
25228 +
25229 +       /* bio -> bi_idx is filled by bio_init() */
25230 +       bio->bi_end_io = (rw == READ) ?
25231 +           end_bio_single_page_read : end_bio_single_page_write;
25232 +       return bio;
25233 +}
25234 +
25235 +/* this function is internally called by jnode_make_dirty(); always returns 0 */
25236 +int set_page_dirty_internal(struct page *page)
25237 +{
25238 +       struct address_space *mapping;
25239 +
25240 +       mapping = page->mapping;
25241 +       BUG_ON(mapping == NULL);
25242 +       /* whoever sets the dirty bit first does accounting and dirties inode */
25243 +       if (!TestSetPageDirty(page)) {
25244 +               if (mapping_cap_account_dirty(mapping))
25245 +                       inc_zone_page_state(page, NR_FILE_DIRTY);
25246 +
25247 +               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
25248 +       }
25249 +
25250 +       /* page of the fake inode: its jnode is expected to be dirty already */
25251 +       if (mapping->host == get_super_fake(mapping->host->i_sb))
25252 +               assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
25253 +       return 0;
25254 +}
25255 +
25256 +#if REISER4_DEBUG
25257 +/**
25258 + * can_hit_entd - debugging check for the assertion in reiser4_writepage()
25259 + * @ctx: current reiser4 context (may be NULL or carry wrong magic)
25260 + * @s: super block @ctx is compared against
25261 + * Returns 0 if current task is entd of @s, if lock stack of @ctx is not clean
25262 + * or if transaction handle of @ctx has an atom. Returns 1 otherwise. */
25263 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25264 +{
25265 +       if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25266 +               return 1;
25267 +       if (ctx->super != s)
25268 +               return 1;
25269 +       if (get_super_private(s)->entd.tsk == current)
25270 +               return 0;
25271 +       if (!lock_stack_isclean(&ctx->stack))
25272 +               return 0;
25273 +       if (ctx->trans->atom != NULL)
25274 +               return 0;
25275 +       return 1;
25276 +}
25277 +
25278 +#endif
25279 +
25280 +/**
25281 + * reiser4_writepage - writepage of struct address_space_operations
25282 + * @page: locked page to write
25283 + * @wbc: writeback control, passed through to write_page_by_ent()
25284 + *
25285 + * Hands the page over to write_page_by_ent() and returns its result.
25286 + */
25287 +/* Common memory pressure notification. */
25288 +int reiser4_writepage(struct page *page,
25289 +                     struct writeback_control *wbc)
25290 +{
25291 +       struct super_block *s;
25292 +       reiser4_context *ctx;
25293 +
25294 +       assert("vs-828", PageLocked(page));
25295 +
25296 +       s = page->mapping->host->i_sb;
25297 +       ctx = get_current_context_check();
25298 +
25299 +       assert("", can_hit_entd(ctx, s));
25300 +
25301 +       return write_page_by_ent(page, wbc);
25302 +}
25303 +
25304 +/* ->set_page_dirty() method of formatted address_space: calling it is a BUG() */
25305 +static int formatted_set_page_dirty(struct page *page)
25306 +{
25307 +       assert("nikita-2173", page != NULL);
25308 +       BUG();
25309 +       return __set_page_dirty_nobuffers(page);        /* follows BUG() */
25310 +}
25311 +
25312 +/* writepages method of address space operations in reiser4 is used to involve
25313 +   pages which are dirtied via mmap into transactions. Only regular files can
25314 +   have such pages. Fake inode is used to access formatted nodes via page
25315 +   cache. As formatted nodes can never be mmaped, fake inode's writepages has
25316 +   nothing to do: it always returns 0 */
25317 +static int
25318 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25319 +{
25320 +       return 0;
25321 +}
25322 +
25323 +/* address space operations for the fake inode */
25324 +static struct address_space_operations formatted_fake_as_ops = {
25325 +       /* Perform a writeback of a single page as a memory-freeing
25326 +        * operation. */
25327 +       .writepage = reiser4_writepage,
25328 +       /* this is called to read formatted node */
25329 +       .readpage = formatted_readpage,
25330 +       /* ->sync_page() method of fake inode address space operations. Called
25331 +          from wait_on_page() and lock_page().
25332 +
25333 +          This is a most annoyingly misnamed method. Actually it is called
25334 +          from wait_on_page_bit() and lock_page() and its purpose is to
25335 +          actually start io by jabbing device drivers.
25336 +        */
25337 +       .sync_page = block_sync_page,
25338 +       /* Write back some dirty pages from this mapping. Called during sync
25339 +          (pdflush). Does nothing for the fake inode, see writepages_fake() */
25340 +       .writepages = writepages_fake,
25341 +       /* Set a page dirty. BUG()s, see formatted_set_page_dirty() */
25342 +       .set_page_dirty = formatted_set_page_dirty,
25343 +       /* used for read-ahead. Not applicable */
25344 +       .readpages = NULL,
25345 +       .prepare_write = NULL,
25346 +       .commit_write = NULL,
25347 +       .bmap = NULL,
25348 +       /* called just before page is being detached from inode mapping and
25349 +          removed from memory. Called on truncate, cut/squeeze, and
25350 +          umount. */
25351 +       .invalidatepage = reiser4_invalidatepage,
25352 +       /* this is called by shrink_cache() so that file system can try to
25353 +          release objects (jnodes, buffers, journal heads) attached to page
25354 +          and, maybe, make page itself free-able.
25355 +        */
25356 +       .releasepage = reiser4_releasepage,
25357 +       .direct_IO = NULL
25358 +};
25359 +
25360 +/* drop @page no longer used by reiser4. Callers: jdelete(), extent2tail(). */
25361 +void drop_page(struct page *page)
25362 +{
25363 +       assert("nikita-2181", PageLocked(page));
25364 +       clear_page_dirty_for_io(page);
25365 +       ClearPageUptodate(page);
25366 +#if defined(PG_skipped)
25367 +       ClearPageSkipped(page);
25368 +#endif
25369 +       if (page->mapping == NULL) {
25370 +               unlock_page(page);
25371 +               return;
25372 +       }
25373 +       remove_from_page_cache(page);
25374 +       unlock_page(page);
25375 +       page_cache_release(page);
25376 +}
25377 +
25378 +/* this is called by truncate_jnodes_range which in its turn is always called
25379 +   after truncate_inode_pages_range. Therefore, jnode is not expected to have
25380 +   a page here (a page found nevertheless is truncated). New pages can not be
25381 +   created because truncate_jnodes_range runs under exclusive access to the
25382 +   file, whereas new page creation requires non-exclusive access */
25383 +static void invalidate_unformatted(jnode * node)
25384 +{
25385 +       struct page *page;
25386 +
25387 +       spin_lock_jnode(node);
25388 +       page = node->pg;
25389 +       if (page) {
25390 +               loff_t from, to;
25391 +               /* keep page referenced while jnode spin lock is dropped */
25392 +               page_cache_get(page);
25393 +               spin_unlock_jnode(node);
25394 +               /* FIXME: use truncate_complete_page instead */
25395 +               from = (loff_t) page->index << PAGE_CACHE_SHIFT;
25396 +               to = from + PAGE_CACHE_SIZE - 1;
25397 +               truncate_inode_pages_range(page->mapping, from, to);
25398 +               page_cache_release(page);
25399 +       } else {
25400 +               JF_SET(node, JNODE_HEARD_BANSHEE);
25401 +               uncapture_jnode(node);  /* NOTE(review): jnode spin lock is still held here, presumably released inside - verify */
25402 +               unhash_unformatted_jnode(node);
25403 +       }
25404 +}
25405 +
25406 +#define JNODE_GANG_SIZE (16)   /* jnodes looked up per radix tree scan */
25407 +/* find all jnodes of @inode with index in [@from, @from + @count), invalidate
25408 +   them with invalidate_unformatted() and return their number */
25409 +static int
25410 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25411 +{
25412 +       reiser4_inode *info;
25413 +       int truncated_jnodes;
25414 +       reiser4_tree *tree;
25415 +       unsigned long index;
25416 +       unsigned long end;
25417 +
25418 +       truncated_jnodes = 0;
25419 +
25420 +       info = reiser4_inode_data(inode);
25421 +       tree = tree_by_inode(inode);
25422 +
25423 +       index = from;
25424 +       end = from + count;
25425 +
25426 +       while (1) {
25427 +               jnode *gang[JNODE_GANG_SIZE];
25428 +               int taken;
25429 +               int i;
25430 +               jnode *node;
25431 +
25432 +               assert("nikita-3466", index <= end);
25433 +               /* reference jnodes of the range under tree lock, drop the rest */
25434 +               read_lock_tree(tree);
25435 +               taken =
25436 +                   radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25437 +                                          (void **)gang, index,
25438 +                                          JNODE_GANG_SIZE);
25439 +               for (i = 0; i < taken; ++i) {
25440 +                       node = gang[i];
25441 +                       if (index_jnode(node) < end)
25442 +                               jref(node);
25443 +                       else
25444 +                               gang[i] = NULL;
25445 +               }
25446 +               read_unlock_tree(tree);
25447 +               /* invalidate referenced jnodes; first NULL slot is past the range */
25448 +               for (i = 0; i < taken; ++i) {
25449 +                       node = gang[i];
25450 +                       if (node != NULL) {
25451 +                               index = max(index, index_jnode(node));
25452 +                               invalidate_unformatted(node);
25453 +                               truncated_jnodes++;
25454 +                               jput(node);
25455 +                       } else
25456 +                               break;
25457 +               }
25458 +               if (i != taken || taken == 0)   /* past the range, or nothing found */
25459 +                       break;
25460 +       }
25461 +       return truncated_jnodes;
25462 +}
25463 +
25464 +/* get rid of pages and jnodes of @mapping: @count pages from index @from */
25465 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25466 +                             unsigned long count, int even_cows)
25467 +{
25468 +       loff_t start, len;
25469 +
25470 +       if (!count)
25471 +               return;
25472 +       start = ((loff_t) from) << PAGE_CACHE_SHIFT;
25473 +       len = ((loff_t) count) << PAGE_CACHE_SHIFT;
25474 +
25475 +       /* unmap the range first, then drop its pages, then left-over jnodes */
25476 +       unmap_mapping_range(mapping, start, len, even_cows);
25477 +       truncate_inode_pages_range(mapping, start, start + len - 1);
25478 +       truncate_jnodes_range(mapping->host, from, count);
25479 +}
25480 +
25481 +/*
25482 + * Local variables:
25483 + * c-indentation-style: "K&R"
25484 + * mode-name: "LC"
25485 + * c-basic-offset: 8
25486 + * tab-width: 8
25487 + * fill-column: 120
25488 + * scroll-step: 1
25489 + * End:
25490 + */
25491 diff --git a/fs/reiser4/page_cache.h b/fs/reiser4/page_cache.h
25492 new file mode 100644
25493 index 0000000..4cb0ce9
25494 --- /dev/null
25495 +++ b/fs/reiser4/page_cache.h
25496 @@ -0,0 +1,62 @@
25497 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25498 + * reiser4/README */
25499 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25500 +
25501 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25502 +#define __REISER4_PAGE_CACHE_H__
25503 +
25504 +#include "forward.h"
25505 +#include "debug.h"
25506 +
25507 +#include <linux/fs.h>          /* for struct super_block, address_space  */
25508 +#include <linux/mm.h>          /* for struct page  */
25509 +#include <linux/pagemap.h>     /* for lock_page()  */
25510 +
25511 +/* set-up and release of the fake inodes of a super block */
25512 +extern int init_formatted_fake(struct super_block *);
25513 +extern void done_formatted_fake(struct super_block *);
25514 +/* tree of the file system a page belongs to */
25515 +extern reiser4_tree *tree_by_page(const struct page *);
25516 +/* called by jnode_make_dirty() */
25517 +extern int set_page_dirty_internal(struct page *);
25518 +/* reiser4 wrapper around submit_bio() */
25519 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25520 +/* lock page and, if it is under writeback, wait until writeback is over */
25521 +extern void reiser4_wait_page_writeback(struct page *);
25522 +static inline void lock_and_wait_page_writeback(struct page *page)
25523 +{
25524 +       lock_page(page);
25525 +       if (unlikely(PageWriteback(page)))
25526 +               reiser4_wait_page_writeback(page);
25527 +}
25528 +/* jnode stored in the private field of page */
25529 +#define jprivate(page) ((jnode *)page_private(page))
25530 +
25531 +extern int page_io(struct page *, jnode *, int rw, gfp_t);
25532 +extern void drop_page(struct page *);
25533 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25534 +                                    unsigned long count, int even_cows);
25535 +extern void capture_reiser4_inodes(struct super_block *,
25536 +                                  struct writeback_control *);
25537 +
25538 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25539 +/* print_page() exists in debugging builds only, otherwise it is a noop */
25540 +#if REISER4_DEBUG
25541 +extern void print_page(const char *prefix, struct page *page);
25542 +#else
25543 +#define print_page(prf, p) noop
25544 +#endif
25545 +
25546 +/* __REISER4_PAGE_CACHE_H__ */
25547 +#endif
25548 +
25549 +/* Make Linus happy.
25550 +   Local variables:
25551 +   c-indentation-style: "K&R"
25552 +   mode-name: "LC"
25553 +   c-basic-offset: 8
25554 +   tab-width: 8
25555 +   fill-column: 120
25556 +   scroll-step: 1
25557 +   End:
25558 +*/
25559 diff --git a/fs/reiser4/plugin/Makefile b/fs/reiser4/plugin/Makefile
25560 new file mode 100644
25561 index 0000000..4b2c9f8
25562 --- /dev/null
25563 +++ b/fs/reiser4/plugin/Makefile
25564 @@ -0,0 +1,26 @@
25565 +obj-$(CONFIG_REISER4_FS) += plugins.o
25566 +# objects that make up plugins.o
25567 +plugins-objs :=                        \
25568 +       plugin.o                \
25569 +       plugin_set.o            \
25570 +       object.o                \
25571 +       inode_ops.o             \
25572 +       inode_ops_rename.o      \
25573 +       file_ops.o              \
25574 +       file_ops_readdir.o      \
25575 +       file_plugin_common.o    \
25576 +       dir_plugin_common.o     \
25577 +       digest.o                \
25578 +       hash.o                  \
25579 +       fibration.o             \
25580 +       tail_policy.o           \
25581 +       regular.o
25582 +# plugin sub-directories built under the same config option
25583 +obj-$(CONFIG_REISER4_FS) += item/
25584 +obj-$(CONFIG_REISER4_FS) += file/
25585 +obj-$(CONFIG_REISER4_FS) += dir/
25586 +obj-$(CONFIG_REISER4_FS) += node/
25587 +obj-$(CONFIG_REISER4_FS) += compress/
25588 +obj-$(CONFIG_REISER4_FS) += space/
25589 +obj-$(CONFIG_REISER4_FS) += disk_format/
25590 +obj-$(CONFIG_REISER4_FS) += security/
25591 diff --git a/fs/reiser4/plugin/cluster.c b/fs/reiser4/plugin/cluster.c
25592 new file mode 100644
25593 index 0000000..55cce66
25594 --- /dev/null
25595 +++ b/fs/reiser4/plugin/cluster.c
25596 @@ -0,0 +1,66 @@
25597 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25598 +
25599 +/* Contains reiser4 cluster plugins (see
25600 +   http://www.namesys.com/cryptcompress_design.html
25601 +   "Concepts of clustering" for details). */
25602 +
25603 +#include "plugin_header.h"
25604 +#include "plugin.h"
25605 +#include "../inode.h"
25606 +
25607 +/* ->change() method of cluster plugins. Cluster plugin can be changed for
25608 +   directories only, for other file plugins RETERR(-EINVAL) is returned. */
25609 +static int change_cluster(struct inode *inode, reiser4_plugin * plugin)
25610 +{
25611 +       assert("edward-1324", inode != NULL);
25612 +       assert("edward-1325", plugin != NULL);
25613 +       assert("edward-1326", is_reiser4_inode(inode));
25614 +       assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25615 +
25616 +       if (inode_file_plugin(inode)->h.id != DIRECTORY_FILE_PLUGIN_ID)
25617 +               return RETERR(-EINVAL);
25618 +
25619 +       /* directory: put cluster plugin into plugin set of @inode */
25620 +       return plugin_set_cluster(&reiser4_inode_data(inode)->pset,
25621 +                                 &plugin->clust);
25622 +}
25623 +/* of all plugin operations cluster plugins only provide ->change() */
25624 +static reiser4_plugin_ops cluster_plugin_ops = {
25625 +       .init = NULL,
25626 +       .load = NULL,
25627 +       .save_len = NULL,
25628 +       .save = NULL,
25629 +       .change = &change_cluster
25630 +};
25631 +/* initializer of cluster_plugins[CLUSTER_<ID>_ID] with cluster shift SHIFT */
25632 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC)                        \
25633 +       [CLUSTER_ ## ID ## _ID] = {                             \
25634 +               .h = {                                          \
25635 +                       .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25636 +                       .id = CLUSTER_ ## ID ## _ID,            \
25637 +                       .pops = &cluster_plugin_ops,            \
25638 +                       .label = LABEL,                         \
25639 +                       .desc = DESC,                           \
25640 +                       .linkage = {NULL, NULL}                 \
25641 +               },                                              \
25642 +               .shift = SHIFT                                  \
25643 +       }
25644 +/* supported cluster sizes, from 64K (shift 16) down to 4K (shift 12) */
25645 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25646 +       SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25647 +       SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25648 +       SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25649 +       SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25650 +       SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25651 +};
25652 +
25653 +/*
25654 +  Local variables:
25655 +  c-indentation-style: "K&R"
25656 +  mode-name: "LC"
25657 +  c-basic-offset: 8
25658 +  tab-width: 8
25659 +  fill-column: 120
25660 +  scroll-step: 1
25661 +  End:
25662 +*/
25663 diff --git a/fs/reiser4/plugin/cluster.h b/fs/reiser4/plugin/cluster.h
25664 new file mode 100644
25665 index 0000000..0c6413e
25666 --- /dev/null
25667 +++ b/fs/reiser4/plugin/cluster.h
25668 @@ -0,0 +1,315 @@
25669 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25670 +
25671 +/* This file contains page/cluster index translators and offset modulators
25672 +   See http://www.namesys.com/cryptcompress_design.html for details */
25673 +
25674 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25675 +#define __FS_REISER4_CLUSTER_H__
25676 +
25677 +#include "../inode.h"
25678 +/* cluster shift of @inode, as recorded in its cluster plugin */
25679 +static inline int inode_cluster_shift(struct inode *inode)
25680 +{
25681 +       assert("edward-92", inode != NULL);
25682 +       assert("edward-93", reiser4_inode_data(inode) != NULL);
25683 +
25684 +       return inode_cluster_plugin(inode)->shift;
25685 +}
25686 +/* cluster shift of @inode counted in pages rather than in bytes */
25687 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25688 +{
25689 +       return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25690 +}
25691 +
25692 +/* cluster size of @inode in page units */
25693 +static inline unsigned cluster_nrpages(struct inode *inode)
25694 +{
25695 +       return 1U << cluster_nrpages_shift(inode);
25696 +}
25697 +/* cluster size of @inode in bytes */
25698 +static inline size_t inode_cluster_size(struct inode *inode)
25699 +{
25700 +       assert("edward-96", inode != NULL);
25701 +
25702 +       return 1U << inode_cluster_shift(inode);
25703 +}
25704 +/* index of the cluster which page of index @idx belongs to */
25705 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25706 +{
25707 +       return idx >> cluster_nrpages_shift(inode);
25708 +}
25709 +/* index of the first page of cluster of index @idx */
25710 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25711 +{
25712 +       return idx << cluster_nrpages_shift(inode);
25713 +}
25714 +/* index of the first page of the cluster which page @idx belongs to */
25715 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25716 +{
25717 +       return clust_to_pg(pg_to_clust(idx, inode), inode);
25718 +}
25719 +/* index of the page which contains file offset @off */
25720 +static inline pgoff_t off_to_pg(loff_t off)
25721 +{
25722 +       return (off >> PAGE_CACHE_SHIFT);
25723 +}
25724 +/* file offset of the first byte of page of index @idx */
25725 +static inline loff_t pg_to_off(pgoff_t idx)
25726 +{
25727 +       return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25728 +}
25729 +/* index of the cluster which contains file offset @off */
25730 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25731 +{
25732 +       return off >> inode_cluster_shift(inode);
25733 +}
25734 +/* file offset of the first byte of cluster of index @idx */
25735 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25736 +{
25737 +       return (loff_t) idx << inode_cluster_shift(inode);
25738 +}
25739 +/* number of units of size 2^@shift needed to hold @count, rounded up */
25740 +static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25741 +{
25742 +       return (count + (1UL << shift) - 1) >> shift;
25743 +}
25744 +
25745 +/* number of pages occupied by @count bytes (partial page counts as one) */
25746 +static inline pgoff_t count_to_nrpages(loff_t count)
25747 +{
25748 +       return count_to_nr(count, PAGE_CACHE_SHIFT);
25749 +}
25750 +
25751 +/* number of clusters of @inode needed to hold @count bytes (rounded up) */
25752 +static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25753 +{
25754 +       return count_to_nr(count, inode_cluster_shift(inode));
25755 +}
25756 +
25757 +/* number of clusters of @inode needed to hold @count pages (rounded up) */
25758 +static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25759 +{
25760 +       return count_to_nr(count, cluster_nrpages_shift(inode));
25761 +}
25762 +
25763 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25764 +{      /* round byte offset @off down to the start of its cluster */
25765 +       return clust_to_off(off_to_clust(off, inode), inode);
25766 +}
25767 +
25768 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25769 +{      /* index of the first page of the cluster that contains byte offset @off */
25770 +       return clust_to_pg(off_to_clust(off, inode), inode);
25771 +}
25772 +
25773 +static inline unsigned off_to_pgoff(loff_t off)
25774 +{      /* position of byte offset @off inside its page */
25775 +       return off & (PAGE_CACHE_SIZE - 1);
25776 +}
25777 +
25778 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25779 +{      /* position of byte offset @off inside its cluster */
25780 +       return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25781 +}
25782 +
25783 +static inline unsigned
25784 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25785 +{      /* position, inside its cluster, of the first byte of the page of index @idx */
25786 +       return off_to_cloff(pg_to_off(idx), inode);
25787 +}
25788 +
25789 +/* index of the page holding the last byte of a file of @size bytes;
25790 +   an empty file yields 0 as well */
25791 +static inline pgoff_t size_to_pg(loff_t size)
25792 +{
25793 +       return size == 0 ? 0 : off_to_pg(size - 1);
25794 +}
25795 +
25796 +/* smallest page index that holds no data of a file of @size bytes;
25797 +   0 for an empty file */
25798 +static inline pgoff_t size_to_next_pg(loff_t size)
25799 +{
25800 +       return size == 0 ? 0 : off_to_pg(size - 1) + 1;
25801 +}
25802 +
25803 +/* number of bytes of a file of size @cnt that fall into the page of index
25804 +   @idx: a full page before the page holding offset @cnt, nothing after it */
25805 +static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25806 +{
25807 +       pgoff_t last = off_to_pg(cnt);
25808 +
25809 +       if (idx != last)
25810 +               return idx < last ? PAGE_CACHE_SIZE : 0;
25811 +       return off_to_pgoff(cnt);
25812 +}
25813 +
25814 +/* number of bytes of a file of size @cnt that fall into the logical cluster
25815 +   of index @idx: a full cluster before the one holding offset @cnt, 0 after */
25816 +static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25817 +                                   struct inode *inode)
25818 +{
25819 +       cloff_t last = off_to_clust(cnt, inode);
25820 +
25821 +       if (idx != last)
25822 +               return idx < last ? inode_cluster_size(inode) : 0;
25823 +       return off_to_cloff(cnt, inode);
25824 +}
25825 +
25826 +static inline unsigned
25827 +fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25828 +{
25829 +       assert("edward-288", clust != NULL);
25830 +       assert("edward-289", inode != NULL);
25831 +       /* bytes of the current file size (i_size) that land in cluster clust->index */
25832 +       return cnt_to_clcnt(inode->i_size, clust->index, inode);
25833 +}
25834 +
25835 +static inline int
25836 +cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25837 +{      /* non-zero when clust->tc.lsize equals the full cluster size of @inode */
25838 +       return clust->tc.lsize == inode_cluster_size(inode);
25839 +}
25840 +
25841 +static inline void reiser4_slide_init(reiser4_slide_t * win)
25842 +{      /* reset every field of the window @win to zero */
25843 +       assert("edward-1084", win != NULL);
25844 +       memset(win, 0, sizeof *win);
25845 +}
25846 +
25847 +static inline void
25848 +tfm_cluster_init_act(tfm_cluster_t * tc, tfm_action act)
25849 +{      /* store the transform action @act in @tc; nothing else is touched */
25850 +       assert("edward-1356", tc != NULL);
25851 +       tc->act = act;
25852 +}
25853 +
25854 +static inline void cluster_init_act(reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window)
25855 +{
25856 +       assert("edward-84", clust != NULL);
25857 +       memset(clust, 0, sizeof *clust);        /* start from an all-zero handle */
25858 +       tfm_cluster_init_act(&clust->tc, act);
25859 +       clust->win = window;
25860 +       clust->dstat = INVAL_DISK_CLUSTER;
25861 +}
25862 +
25863 +static inline void
25864 +cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25865 +{      /* initialize @clust for the TFM_READ_ACT action, attaching @window */
25866 +       cluster_init_act (clust, TFM_READ_ACT, window);
25867 +}
25868 +
25869 +static inline void
25870 +cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25871 +{      /* initialize @clust for the TFM_WRITE_ACT action, attaching @window */
25872 +       cluster_init_act (clust, TFM_WRITE_ACT, window);
25873 +}
25874 +
25875 +static inline int dclust_get_extension(hint_t * hint)
25876 +{      /* read back the ctail shift stored in @hint (see dclust_set_extension()) */
25877 +       return hint->ext_coord.extension.ctail.shift;
25878 +}
25879 +
25880 +static inline void dclust_set_extension(hint_t * hint)
25881 +{      /* store in @hint the cluster shift of the ctail item found at its coord */
25882 +       assert("edward-1270",
25883 +              item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
25884 +       hint->ext_coord.extension.ctail.shift =
25885 +           cluster_shift_by_coord(&hint->ext_coord.coord);
25886 +}
25887 +
25888 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25889 +{      /* non-zero when the shift stored in @hint equals UCTAIL_SHIFT */
25890 +       return dclust_get_extension(hint) == (int)UCTAIL_SHIFT;
25891 +}
25892 +
25893 +static inline void coord_set_between_clusters(coord_t * coord)
25894 +{      /* make sure @coord is positioned between items */
25895 +#if REISER4_DEBUG
25896 +       int result;
25897 +       result = zload(coord->node);    /* debug builds only: zload()/zrelse() bracket the check */
25898 +       assert("edward-1296", !result);
25899 +#endif
25900 +       if (!coord_is_between_items(coord)) {
25901 +               coord->between = AFTER_ITEM;    /* not between items yet: go after the item */
25902 +               coord->unit_pos = 0;
25903 +       }
25904 +#if REISER4_DEBUG
25905 +       zrelse(coord->node);
25906 +#endif
25907 +}
25908 +
25909 +int inflate_cluster(reiser4_cluster_t *, struct inode *);
25910 +int find_cluster(reiser4_cluster_t *, struct inode *, int read, int write);
25911 +int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25912 +int deflate_cluster(reiser4_cluster_t *, struct inode *);
25913 +void truncate_page_cluster(struct inode *inode, cloff_t start);
25914 +void invalidate_hint_cluster(reiser4_cluster_t * clust);
25915 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25916 +                     znode_lock_mode mode);
25917 +int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25918 +                           znode_lock_mode lock_mode);
25919 +void reset_cluster_params(reiser4_cluster_t * clust);
25920 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25921 +                       int count);
25922 +int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25923 +                        int capture);
25924 +void release_cluster_pages(reiser4_cluster_t *);
25925 +void put_cluster_handle(reiser4_cluster_t * clust);
25926 +int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25927 +int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25928 +void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25929 +void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25930 +
25931 +/* Move the cluster handle @clust to the cluster that contains the page of
25932 +   index @pgidx.  *@progress is non-zero on every call but the first one. */
25933 +static inline void
25934 +move_cluster_forward(reiser4_cluster_t * clust, struct inode *inode,
25935 +                    pgoff_t pgidx, int *progress)
25936 +{
25937 +       cloff_t target;
25938 +
25939 +       assert("edward-1297", clust != NULL);
25940 +       assert("edward-1298", inode != NULL);
25941 +
25942 +       reset_cluster_params(clust);
25943 +       target = pg_to_clust(pgidx, inode);
25944 +       /* Hole in the indices: the hint became invalid and can not be used by
25945 +          find_cluster_item() even if seal/node versions will coincide */
25946 +       if (*progress && target != clust->index + 1) {
25947 +               unset_hint(clust->hint);
25948 +               invalidate_hint_cluster(clust);
25949 +       }
25950 +       *progress = 1;
25951 +       clust->index = target;
25952 +}
25953 +
25954 +static inline int
25955 +alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25956 +{
25957 +       assert("edward-791", clust != NULL);
25958 +       assert("edward-792", inode != NULL);
25959 +       /* NOTE(review): the element size is shifted by the cluster *byte* shift, i.e.
25960 +          one slot per byte of cluster; cluster_nrpages_shift() is per page - confirm */
25961 +       clust->pages =
25962 +               kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25963 +                       GFP_KERNEL);
25964 +       return clust->pages ? 0 : -ENOMEM;
25965 +}
25966 +
25967 +static inline void free_clust_pages(reiser4_cluster_t * clust)
25968 +{      /* release the array that alloc_clust_pages() stored in clust->pages */
25969 +       kfree(clust->pages);
25970 +}
25971 +
25972 +#endif                         /* __FS_REISER4_CLUSTER_H__ */
25973 +
25974 +/* Make Linus happy.
25975 +   Local variables:
25976 +   c-indentation-style: "K&R"
25977 +   mode-name: "LC"
25978 +   c-basic-offset: 8
25979 +   tab-width: 8
25980 +   fill-column: 120
25981 +   scroll-step: 1
25982 +   End:
25983 +*/
25984 diff --git a/fs/reiser4/plugin/compress/Makefile b/fs/reiser4/plugin/compress/Makefile
25985 new file mode 100644
25986 index 0000000..82793a4
25987 --- /dev/null
25988 +++ b/fs/reiser4/plugin/compress/Makefile
25989 @@ -0,0 +1,6 @@
25990 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
25991 +
25992 +compress_plugins-objs :=       \
25993 +       compress.o              \
25994 +       minilzo.o               \
25995 +       compress_mode.o
25996 diff --git a/fs/reiser4/plugin/compress/compress.c b/fs/reiser4/plugin/compress/compress.c
25997 new file mode 100644
25998 index 0000000..1a7122d
25999 --- /dev/null
26000 +++ b/fs/reiser4/plugin/compress/compress.c
26001 @@ -0,0 +1,369 @@
26002 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26003 +/* reiser4 compression transform plugins */
26004 +
26005 +#include "../../debug.h"
26006 +#include "../../inode.h"
26007 +#include "../plugin.h"
26008 +#include "minilzo.h"
26009 +
26010 +#include <linux/zlib.h>
26011 +#include <linux/types.h>
26012 +#include <linux/hardirq.h>
26013 +
26014 +static int change_compression(struct inode *inode, reiser4_plugin * plugin)
26015 +{      /* ->change handler of compression_plugin_ops: checks its arguments, then refuses */
26016 +       assert("edward-1316", inode != NULL);
26017 +       assert("edward-1317", plugin != NULL);
26018 +       assert("edward-1318", is_reiser4_inode(inode));
26019 +       assert("edward-1319",
26020 +              plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
26021 +       /* the compression plugin of an already existing object cannot be changed */
26022 +       return RETERR(-EINVAL);
26023 +}
26024 +
26025 +static reiser4_plugin_ops compression_plugin_ops = {
26026 +       /*
26027 +        * Only ->change is implemented.  The ->init, ->load, ->save_len and
26028 +        * ->save slots are not named here, which leaves them NULL.
26029 +        */
26030 +       .change = change_compression
26031 +};
26032 +
26033 +/******************************************************************************/
26034 +/*                         gzip1 compression                                  */
26035 +/******************************************************************************/
26036 +
26037 +#define GZIP1_DEF_LEVEL                        Z_BEST_SPEED    /* level given to zlib_deflateInit2() */
26038 +#define GZIP1_DEF_WINBITS              15              /* always passed negated to zlib */
26039 +#define GZIP1_DEF_MEMLEVEL             MAX_MEM_LEVEL   /* memLevel given to zlib_deflateInit2() */
26040 +
26041 +static int gzip1_init(void)
26042 +{
26043 +       /* gzip1 can be initialized only when REISER4_ZLIB is set */
26044 +#if REISER4_ZLIB
26045 +       return 0;
26046 +#else
26047 +       warning("edward-1337", "Zlib not compiled into kernel");
26048 +       return -EINVAL;
26049 +#endif
26050 +}
26051 +
26052 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
26053 +{      /* gzip1 reports an overrun of 0 whatever the source length is */
26054 +       return 0;
26055 +}
26056 +
26057 +/* Allocate a zeroed zlib workspace for the action @act.  Returns the
26058 +   workspace, ERR_PTR(-ENOMEM) when vmalloc() fails, or NULL when the action
26059 +   is unknown or REISER4_ZLIB is not set. */
26060 +static coa_t gzip1_alloc(tfm_action act)
26061 +{
26062 +       coa_t coa = NULL;
26063 +#if REISER4_ZLIB
26064 +       unsigned long wsize;
26065 +
26066 +       /* the two actions differ only in the size of the workspace */
26067 +       switch (act) {
26068 +       case TFM_WRITE_ACT:     /* compress */
26069 +               wsize = zlib_deflate_workspacesize();
26070 +               break;
26071 +       case TFM_READ_ACT:      /* decompress */
26072 +               wsize = zlib_inflate_workspacesize();
26073 +               break;
26074 +       default:
26075 +               impossible("edward-767",
26076 +                          "trying to alloc workspace for unknown tfm action");
26077 +               return coa;
26078 +       }
26079 +
26080 +       coa = vmalloc(wsize);
26081 +       if (!coa) {
26082 +               warning("edward-768",
26083 +                       "alloc workspace for gzip1 (tfm action = %d) failed\n",
26084 +                       act);
26085 +               return ERR_PTR(-ENOMEM);
26086 +       }
26087 +
26088 +       memset(coa, 0, wsize);
26089 +#endif
26090 +       return coa;
26091 +}
26092 +
26093 +/* Release a gzip1 workspace.  Workspaces of both actions are vmalloc()ed
26094 +   by gzip1_alloc(), so both are released the same way. */
26095 +static void gzip1_free(coa_t coa, tfm_action act)
26096 +{
26097 +       assert("edward-769", coa != NULL);
26098 +
26099 +       switch (act) {
26100 +       case TFM_WRITE_ACT:     /* compress */
26101 +       case TFM_READ_ACT:      /* decompress */
26102 +               vfree(coa);
26103 +               break;
26104 +       default:
26105 +               impossible("edward-770", "unknown tfm action");
26106 +               break;
26107 +       }
26108 +}
26109 +
26110 +static int gzip1_min_size_deflate(void)
26111 +{      /* presumably the smallest input, in bytes, worth deflating with gzip1 -- TODO confirm */
26112 +       return 64;
26113 +}
26114 +
26115 +static void
26116 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26117 +              __u8 * dst_first, unsigned *dst_len)
26118 +{      /* deflate @src_len bytes into @dst_first; *@dst_len: capacity in, result out */
26119 +#if REISER4_ZLIB
26120 +       int ret = 0;
26121 +       struct z_stream_s stream;
26122 +
26123 +       memset(&stream, 0, sizeof(stream));
26124 +
26125 +       assert("edward-842", coa != NULL);
26126 +       assert("edward-875", src_len != 0);
26127 +
26128 +       stream.workspace = coa; /* zlib works inside the caller-provided workspace */
26129 +       ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
26130 +                               -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
26131 +                               Z_DEFAULT_STRATEGY);
26132 +       if (ret != Z_OK) {
26133 +               warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
26134 +               goto rollback;
26135 +       }
26136 +       ret = zlib_deflateReset(&stream);
26137 +       if (ret != Z_OK) {
26138 +               warning("edward-772", "zlib_deflateReset returned %d\n", ret);
26139 +               goto rollback;
26140 +       }
26141 +       stream.next_in = src_first;
26142 +       stream.avail_in = src_len;
26143 +       stream.next_out = dst_first;
26144 +       stream.avail_out = *dst_len;
26145 +
26146 +       ret = zlib_deflate(&stream, Z_FINISH);  /* the whole input goes in with one call */
26147 +       if (ret != Z_STREAM_END) {
26148 +               if (ret != Z_OK)        /* a plain Z_OK is rolled back without a warning */
26149 +                       warning("edward-773",
26150 +                               "zlib_deflate returned %d\n", ret);
26151 +               goto rollback;
26152 +       }
26153 +       *dst_len = stream.total_out;
26154 +       return;
26155 +      rollback:
26156 +       *dst_len = src_len;     /* every failure is reported as "output as long as input" */
26157 +#endif
26158 +       return;
26159 +}
26160 +
26161 +static void
26162 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26163 +                __u8 * dst_first, unsigned *dst_len)
26164 +{      /* inflate @src_len bytes into @dst_first; *@dst_len: capacity in, result out */
26165 +#if REISER4_ZLIB
26166 +       int ret = 0;
26167 +       struct z_stream_s stream;
26168 +
26169 +       memset(&stream, 0, sizeof(stream));
26170 +
26171 +       assert("edward-843", coa != NULL);
26172 +       assert("edward-876", src_len != 0);
26173 +
26174 +       stream.workspace = coa; /* zlib works inside the caller-provided workspace */
26175 +       ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
26176 +       if (ret != Z_OK) {
26177 +               warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
26178 +               return; /* NOTE(review): the error returns here leave *dst_len untouched */
26179 +       }
26180 +       ret = zlib_inflateReset(&stream);
26181 +       if (ret != Z_OK) {
26182 +               warning("edward-775", "zlib_inflateReset returned %d\n", ret);
26183 +               return;
26184 +       }
26185 +
26186 +       stream.next_in = src_first;
26187 +       stream.avail_in = src_len;
26188 +       stream.next_out = dst_first;
26189 +       stream.avail_out = *dst_len;
26190 +
26191 +       ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
26192 +       /*
26193 +        * Work around a bug in zlib, which sometimes wants to taste an extra
26194 +        * byte when being used in the (undocumented) raw deflate mode.
26195 +        * (From USAGI).
26196 +        */
26197 +       if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
26198 +               u8 zerostuff = 0;
26199 +               stream.next_in = &zerostuff;    /* feed one extra zero byte and finish */
26200 +               stream.avail_in = 1;
26201 +               ret = zlib_inflate(&stream, Z_FINISH);
26202 +       }
26203 +       if (ret != Z_STREAM_END) {
26204 +               warning("edward-776", "zlib_inflate returned %d\n", ret);
26205 +               return;
26206 +       }
26207 +       *dst_len = stream.total_out;    /* number of bytes produced by zlib */
26208 +#endif
26209 +       return;
26210 +}
26211 +
26212 +/******************************************************************************/
26213 +/*                            lzo1 compression                                */
26214 +/******************************************************************************/
26215 +
26216 +static int lzo1_init(void)
26217 +{
26218 +       int rc = lzo_init();    /* LZO status code, handed back to the caller as is */
26219 +
26220 +       if (rc != LZO_E_OK)
26221 +               warning("edward-848", "lzo_init() failed with ret = %d\n", rc);
26222 +       return rc;
26223 +}
26224 +
26225 +static int lzo1_overrun(unsigned in_len)
26226 +{      /* presumably LZO's worst-case output growth for @in_len input bytes -- TODO confirm */
26227 +       return in_len / 64 + 16 + 3;
26228 +}
26229 +
26230 +#define LZO_HEAP_SIZE(size) /* @size rounded up to a whole number of lzo_align_t units */ \
26231 +       (sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t)))
26232 +
26233 +/* Allocate the lzo1 workspace for the action @act.  Only compression gets
26234 +   one; for decompression (and for an unknown action) NULL is returned.
26235 +   ERR_PTR(-ENOMEM) is returned when vmalloc() fails. */
26236 +static coa_t lzo1_alloc(tfm_action act)
26237 +{
26238 +       coa_t coa = NULL;
26239 +
26240 +       switch (act) {
26241 +       case TFM_READ_ACT:      /* decompress: nothing is allocated */
26242 +               break;
26243 +       case TFM_WRITE_ACT:     /* compress */
26244 +               coa = vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
26245 +               if (!coa) {
26246 +                       warning("edward-878",
26247 +                               "alloc workspace for lzo1 (tfm action = %d) failed\n",
26248 +                               act);
26249 +                       return ERR_PTR(-ENOMEM);
26250 +               }
26251 +               memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
26252 +               break;
26253 +       default:
26254 +               impossible("edward-877",
26255 +                          "trying to alloc workspace for unknown tfm action");
26256 +               break;
26257 +       }
26258 +       return coa;
26259 +}
26260 +
26261 +/* free an lzo1 workspace; lzo1_alloc() creates one for TFM_WRITE_ACT only */
26262 +static void lzo1_free(coa_t coa, tfm_action act)
26263 +{
26264 +       assert("edward-879", coa != NULL);
26265 +       switch (act) {
26266 +       case TFM_WRITE_ACT:     /* compress */
26267 +               vfree(coa);
26268 +               break;
26269 +       case TFM_READ_ACT:      /* decompress */
26270 +               impossible("edward-1304",
26271 +                          "trying to free non-allocated workspace");
26272 +               break;  /* keep this case out of the "unknown tfm action" report */
26273 +       default:
26274 +               impossible("edward-880", "unknown tfm action");
26275 +       }
26276 +}
26277 +
26278 +static int lzo1_min_size_deflate(void)
26279 +{      /* presumably the smallest input, in bytes, worth deflating with lzo1 -- TODO confirm */
26280 +       return 256;
26281 +}
26282 +
26283 +/* Compress @src_len bytes at @src_first into @dst_first with lzo1x_1.
26284 +   *@dst_len receives the compressed length, or @src_len when the compressor
26285 +   fails or the output is not shorter than the input. */
26286 +static void
26287 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26288 +             __u8 * dst_first, unsigned *dst_len)
26289 +{
26290 +       int result;
26291 +
26292 +       assert("edward-846", coa != NULL);
26293 +       assert("edward-847", src_len != 0);
26294 +
26295 +       result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26296 +
26297 +       if (result != LZO_E_OK)
26298 +               warning("edward-849", "lzo1x_1_compress failed\n");
26299 +       else if (*dst_len < src_len)
26300 +               return;         /* real gain: keep the compressed length */
26301 +
26302 +       /* failure or incompressible data: no warning for the latter, both
26303 +          are reported as "output as long as input" */
26304 +       *dst_len = src_len;
26305 +}
26306 +
26307 +/* decompress @src_len bytes into @dst_first; a failure is only logged */
26308 +static void
26309 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26310 +               __u8 * dst_first, unsigned *dst_len)
26311 +{
26312 +       int result;
26313 +
26314 +       assert("edward-851", coa == NULL);      /* no workspace exists for decompression */
26315 +       assert("edward-852", src_len != 0);
26316 +
26317 +       result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
26318 +       if (result != LZO_E_OK)
26319 +               warning("edward-853", "lzo1x_decompress failed\n");
26320 +}
26321 +
26322 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = { /* indexed by reiser4_compression_id */
26323 +       [LZO1_COMPRESSION_ID] = {
26324 +               .h = {
26325 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26326 +                       .id = LZO1_COMPRESSION_ID,
26327 +                       .pops = &compression_plugin_ops,
26328 +                       .label = "lzo1",
26329 +                       .desc = "lzo1 compression transform",
26330 +                       .linkage = {NULL, NULL}
26331 +               },
26332 +               .init = lzo1_init,
26333 +               .overrun = lzo1_overrun,
26334 +               .alloc = lzo1_alloc,
26335 +               .free = lzo1_free,
26336 +               .min_size_deflate = lzo1_min_size_deflate,
26337 +               .checksum = reiser4_adler32,
26338 +               .compress = lzo1_compress,
26339 +               .decompress = lzo1_decompress
26340 +       },
26341 +       [GZIP1_COMPRESSION_ID] = {
26342 +               .h = {
26343 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26344 +                       .id = GZIP1_COMPRESSION_ID,
26345 +                       .pops = &compression_plugin_ops,
26346 +                       .label = "gzip1",
26347 +                       .desc = "gzip1 compression transform",
26348 +                       .linkage = {NULL, NULL}
26349 +               },
26350 +               .init = gzip1_init,     /* fails with -EINVAL unless REISER4_ZLIB is set */
26351 +               .overrun = gzip1_overrun,
26352 +               .alloc = gzip1_alloc,
26353 +               .free = gzip1_free,
26354 +               .min_size_deflate = gzip1_min_size_deflate,
26355 +               .checksum = NULL,       /* unlike lzo1, gzip1 sets no checksum routine */
26356 +               .compress = gzip1_compress,
26357 +               .decompress = gzip1_decompress
26358 +       }
26359 +};
26360 +
26361 +/*
26362 +  Local variables:
26363 +  c-indentation-style: "K&R"
26364 +  mode-name: "LC"
26365 +  c-basic-offset: 8
26366 +  tab-width: 8
26367 +  fill-column: 120
26368 +  scroll-step: 1
26369 +  End:
26370 +*/
26371 diff --git a/fs/reiser4/plugin/compress/compress.h b/fs/reiser4/plugin/compress/compress.h
26372 new file mode 100644
26373 index 0000000..89aabcb
26374 --- /dev/null
26375 +++ b/fs/reiser4/plugin/compress/compress.h
26376 @@ -0,0 +1,38 @@
26377 +#if !defined( __FS_REISER4_COMPRESS_H__ )
26378 +#define __FS_REISER4_COMPRESS_H__
26379 +
26380 +#include <linux/types.h>
26381 +#include <linux/string.h>
26382 +
26383 +typedef enum {
26384 +       TFM_READ_ACT,           /* decompress */
26385 +       TFM_WRITE_ACT,          /* compress */
26386 +       TFM_LAST_ACT            /* number of actions; second dimension of coa_set */
26387 +} tfm_action;
26388 +
26389 +/* builtin compression plugins */
26390 +
26391 +typedef enum {
26392 +       LZO1_COMPRESSION_ID,
26393 +       GZIP1_COMPRESSION_ID,
26394 +       LAST_COMPRESSION_ID,    /* number of ids; first dimension of coa_set */
26395 +} reiser4_compression_id;
26396 +
26397 +typedef unsigned long cloff_t;  /* index of a logical cluster */
26398 +typedef void *coa_t;            /* opaque workspace pointer handed to a compression plugin */
26399 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFM_LAST_ACT];       /* one slot per (plugin id, action) */
26400 +
26401 +__u32 reiser4_adler32(char *data, __u32 len);
26402 +
26403 +#endif                         /* __FS_REISER4_COMPRESS_H__ */
26404 +
26405 +/* Make Linus happy.
26406 +   Local variables:
26407 +   c-indentation-style: "K&R"
26408 +   mode-name: "LC"
26409 +   c-basic-offset: 8
26410 +   tab-width: 8
26411 +   fill-column: 120
26412 +   scroll-step: 1
26413 +   End:
26414 +*/
26415 diff --git a/fs/reiser4/plugin/compress/compress_mode.c b/fs/reiser4/plugin/compress/compress_mode.c
26416 new file mode 100644
26417 index 0000000..ae3aea1
26418 --- /dev/null
26419 +++ b/fs/reiser4/plugin/compress/compress_mode.c
26420 @@ -0,0 +1,163 @@
26421 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26422 +/* This file contains Reiser4 compression mode plugins.
26423 +
26424 +   A compression mode plugin is a set of handlers called by the compressor
26425 +   at flush time. The handlers implement heuristics, including the ones
26426 +   that avoid compressing incompressible data; see
26427 +   http://www.namesys.com/cryptcompress_design.html for more details.
26428 +*/
26429 +#include "../../inode.h"
26430 +#include "../plugin.h"
26431 +
26432 +static int should_deflate_test(struct inode * inode, cloff_t index)
26433 +{      /* deflate only clusters whose index has bit 0 clear, i.e. even indexes */
26434 +       return !test_bit(0, &index);
26435 +}
26436 +
26437 +static int should_deflate_none(struct inode * inode, cloff_t index)
26438 +{      /* never deflate, whatever the inode and the cluster index are */
26439 +       return 0;
26440 +}
26441 +
26442 +static int should_deflate_common(struct inode * inode, cloff_t index)
26443 +{      /* answer is compression_is_on() for @inode's cryptcompress data; @index is unused */
26444 +       return compression_is_on(cryptcompress_inode_data(inode));
26445 +}
26446 +
26447 +static int turn_off_compression(struct inode *inode, cloff_t index)
26448 +{      /* hook: toggle compression to 0 for @inode; @index is unused; always returns 0 */
26449 +       toggle_compression(cryptcompress_inode_data(inode), 0);
26450 +       return 0;
26451 +}
26452 +
26453 +static int turn_on_compression(struct inode *inode, cloff_t index)
26454 +{      /* hook: toggle compression to 1 for @inode; @index is unused; always returns 0 */
26455 +       toggle_compression(cryptcompress_inode_data(inode), 1);
26456 +       return 0;
26457 +}
26458 +
26459 +static int turn_off_compression_on_zero(struct inode *inode, cloff_t index)
26460 +{      /* hook: toggle compression to 0, but only when called for cluster index 0 */
26461 +       assert("edward-1308", inode != NULL);
26462 +       if (index == 0)
26463 +               toggle_compression(cryptcompress_inode_data(inode), 0);
26464 +       return 0;
26465 +}
26466 +
26467 +/* Check on lattice (COL) of some sparseness factor:
26468 +   a family of adaptive compression modes that define
26469 +   the following behavior:
26470 +
26471 +   Compression is on: try to compress everything and turn
26472 +   it off whenever a cluster turns out to be incompressible.
26473 +
26474 +   Compression is off: try to compress only clusters of indexes
26475 +   k * FACTOR (k = 0, 1, 2, ...) and turn it on if any of
26476 +   them is compressible. */
26477 +
26478 +/* check whether @index belongs to the one-dimensional lattice of sparseness
26479 +   factor @factor; with a zero @factor only index 0 is on the lattice */
26480 +static int check_on_lattice(cloff_t index, int factor)
26481 +{
26482 +       return factor == 0 ? index == 0 : index % factor == 0;
26483 +}
26484 +
26485 +#define DEFINE_CHECK_ON_LATTICE(FACTOR) /* defines check_on_lattice_<FACTOR>() */ \
26486 +       static int check_on_lattice_ ## FACTOR (struct inode * inode,   \
26487 +                                               cloff_t index)          \
26488 +{      /* deflate when compression is on, or when @index is on the lattice */ \
26489 +       return should_deflate_common(inode, index) ||                   \
26490 +               check_on_lattice(index, FACTOR);                        \
26491 +}
26492 +
26493 +#define SUPPORT_COL_COMPRESSION_MODE(FACTOR, LABEL) /* table entry of one COL mode */ \
26494 +[COL_ ## FACTOR ## _COMPRESSION_MODE_ID] = {                            \
26495 +       .h = {                                                          \
26496 +               .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,        \
26497 +               .id = COL_ ## FACTOR ## _COMPRESSION_MODE_ID,           \
26498 +               .pops = NULL,                                           \
26499 +               .label = LABEL,                                         \
26500 +               .desc = LABEL,  /* the label doubles as the description */ \
26501 +               .linkage = {NULL, NULL}                                 \
26502 +       },                                                              \
26503 +       .should_deflate = check_on_lattice_ ## FACTOR,                  \
26504 +       .accept_hook =  turn_on_compression,                            \
26505 +       .discard_hook = turn_off_compression                            \
26506 +}
26507 +
26508 +DEFINE_CHECK_ON_LATTICE(8)     /* check_on_lattice_8(), used by the "col8" entry */
26509 +DEFINE_CHECK_ON_LATTICE(16)    /* check_on_lattice_16(), used by the "col16" entry */
26510 +DEFINE_CHECK_ON_LATTICE(32)    /* check_on_lattice_32(), used by the "col32" entry */
26511 +
26512 +/* table of the compression mode plugins, indexed by compression mode id */
26513 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26514 +       [NONE_COMPRESSION_MODE_ID] = {
26515 +               .h = {
26516 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26517 +                       .id = NONE_COMPRESSION_MODE_ID,
26518 +                       .pops = NULL,
26519 +                       .label = "none",
26520 +                       .desc = "Don't compress",
26521 +                       .linkage = {NULL, NULL}
26522 +               },
26523 +               .should_deflate = should_deflate_none,
26524 +               .accept_hook = NULL,
26525 +               .discard_hook = NULL
26526 +       },
26527 +       /* Check-on-lattice adaptive compression modes */
26528 +       SUPPORT_COL_COMPRESSION_MODE(8, "col8"),
26529 +       SUPPORT_COL_COMPRESSION_MODE(16, "col16"),
26530 +       SUPPORT_COL_COMPRESSION_MODE(32, "col32"),
26531 +       /* Turn off compression if logical cluster of index == 0
26532 +          is incompressible, then don't check anymore */
26533 +       [COZ_COMPRESSION_MODE_ID] = {
26534 +               .h = {
26535 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26536 +                       .id = COZ_COMPRESSION_MODE_ID,
26537 +                       .pops = NULL,
26538 +                       .label = "coz",
26539 +                       .desc = "Check on zero",
26540 +                       .linkage = {NULL, NULL}
26541 +               },
26542 +               .should_deflate = should_deflate_common,
26543 +               .accept_hook = NULL,
26544 +               .discard_hook = turn_off_compression_on_zero
26545 +       },
26546 +       [FORCE_COMPRESSION_MODE_ID] = {
26547 +               .h = {
26548 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26549 +                       .id = FORCE_COMPRESSION_MODE_ID,
26550 +                       .pops = NULL,
26551 +                       .label = "force",
26552 +                       .desc = "Compress everything",
26553 +                       .linkage = {NULL, NULL}
26554 +               },
26555 +               .should_deflate = NULL, /* NOTE(review): presumably NULL means "always deflate" - confirm in callers */
26556 +               .accept_hook = NULL,
26557 +               .discard_hook = NULL
26558 +       },
26559 +       [TEST_COMPRESSION_MODE_ID] = {
26560 +               .h = {
26561 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26562 +                       .id = TEST_COMPRESSION_MODE_ID,
26563 +                       .pops = NULL,
26564 +                       .label = "test", /* This mode is for benchmarks only */
26565 +                       .desc = "Don't compress odd clusters",
26566 +                       .linkage = {NULL, NULL}
26567 +               },
26568 +               .should_deflate = should_deflate_test,
26569 +               .accept_hook = NULL,
26570 +               .discard_hook = NULL
26571 +       }
26572 +};
26573 +
26574 +/*
26575 +  Local variables:
26576 +  c-indentation-style: "K&R"
26577 +  mode-name: "LC"
26578 +  c-basic-offset: 8
26579 +  tab-width: 8
26580 +  fill-column: 120
26581 +  scroll-step: 1
26582 +  End:
26583 +*/
26584 diff --git a/fs/reiser4/plugin/compress/lzoconf.h b/fs/reiser4/plugin/compress/lzoconf.h
26585 new file mode 100644
26586 index 0000000..458423a
26587 --- /dev/null
26588 +++ b/fs/reiser4/plugin/compress/lzoconf.h
26589 @@ -0,0 +1,420 @@
26590 +/* lzoconf.h -- configuration for the LZO real-time data compression library
26591 +   adapted for reiser4 compression transform plugin.
26592 +
26593 +   This file is part of the LZO real-time data compression library
26594 +   and not included in any proprietary licenses of reiser4.
26595 +
26596 +   Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26597 +   Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26598 +   Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26599 +   Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26600 +   Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26601 +   Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26602 +   Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26603 +   All Rights Reserved.
26604 +
26605 +   The LZO library is free software; you can redistribute it and/or
26606 +   modify it under the terms of the GNU General Public License as
26607 +   published by the Free Software Foundation; either version 2 of
26608 +   the License, or (at your option) any later version.
26609 +
26610 +   The LZO library is distributed in the hope that it will be useful,
26611 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
26612 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26613 +   GNU General Public License for more details.
26614 +
26615 +   You should have received a copy of the GNU General Public License
26616 +   along with the LZO library; see the file COPYING.
26617 +   If not, write to the Free Software Foundation, Inc.,
26618 +   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26619 +
26620 +   Markus F.X.J. Oberhumer
26621 +   <markus@oberhumer.com>
26622 +   http://www.oberhumer.com/opensource/lzo/
26623 + */
26624 +
26625 +#include <linux/kernel.h>      /* for UINT_MAX, ULONG_MAX - edward */
26626 +
26627 +#ifndef __LZOCONF_H
26628 +#define __LZOCONF_H
26629 +
26630 +#define LZO_VERSION             0x1080
26631 +#define LZO_VERSION_STRING      "1.08"
26632 +#define LZO_VERSION_DATE        "Jul 12 2002"
26633 +
26634 +/* internal Autoconf configuration file - only used when building LZO */
26635 +#if defined(LZO_HAVE_CONFIG_H)
26636 +#  include <config.h>
26637 +#endif
26638 +#ifdef __cplusplus
26639 +extern "C" {
26640 +#endif
26641 +
26642 +/***********************************************************************
26643 +// LZO requires a conforming <limits.h>
26644 +************************************************************************/
26645 +
26646 +#define CHAR_BIT  8
26647 +#define USHRT_MAX 0xffff
26648 +
26649 +/* workaround a cpp bug under hpux 10.20 */
26650 +#define LZO_0xffffffffL         4294967295ul
26651 +
26652 +/***********************************************************************
26653 +// architecture defines
26654 +************************************************************************/
26655 +
26656 +#if !defined(__LZO_WIN) && !defined(__LZO_DOS) && !defined(__LZO_OS2)
26657 +#  if defined(__WINDOWS__) || defined(_WINDOWS) || defined(_Windows)
26658 +#    define __LZO_WIN
26659 +#  elif defined(__WIN32__) || defined(_WIN32) || defined(WIN32)
26660 +#    define __LZO_WIN
26661 +#  elif defined(__NT__) || defined(__NT_DLL__) || defined(__WINDOWS_386__)
26662 +#    define __LZO_WIN
26663 +#  elif defined(__DOS__) || defined(__MSDOS__) || defined(MSDOS)
26664 +#    define __LZO_DOS
26665 +#  elif defined(__OS2__) || defined(__OS2V2__) || defined(OS2)
26666 +#    define __LZO_OS2
26667 +#  elif defined(__palmos__)
26668 +#    define __LZO_PALMOS
26669 +#  elif defined(__TOS__) || defined(__atarist__)
26670 +#    define __LZO_TOS
26671 +#  endif
26672 +#endif
26673 +
26674 +#if (UINT_MAX < LZO_0xffffffffL)
26675 +#  if defined(__LZO_WIN)
26676 +#    define __LZO_WIN16
26677 +#  elif defined(__LZO_DOS)
26678 +#    define __LZO_DOS16
26679 +#  elif defined(__LZO_PALMOS)
26680 +#    define __LZO_PALMOS16
26681 +#  elif defined(__LZO_TOS)
26682 +#    define __LZO_TOS16
26683 +#  elif defined(__C166__)
26684 +#  else
26685 +       /* porting hint: for pure 16-bit architectures try compiling
26686 +        * everything with -D__LZO_STRICT_16BIT */
26687 +#    error "16-bit target not supported - contact me for porting hints"
26688 +#  endif
26689 +#endif
26690 +
26691 +#if !defined(__LZO_i386)
26692 +#  if defined(__LZO_DOS) || defined(__LZO_WIN16)
26693 +#    define __LZO_i386
26694 +#  elif defined(__i386__) || defined(__386__) || defined(_M_IX86)
26695 +#    define __LZO_i386
26696 +#  endif
26697 +#endif
26698 +
26699 +#if defined(__LZO_STRICT_16BIT)
26700 +#  if (UINT_MAX < LZO_0xffffffffL)
26701 +#    include <lzo16bit.h>
26702 +#  endif
26703 +#endif
26704 +
26705 +/* memory checkers */
26706 +#if !defined(__LZO_CHECKER)
26707 +#  if defined(__BOUNDS_CHECKING_ON)
26708 +#    define __LZO_CHECKER
26709 +#  elif defined(__CHECKER__)
26710 +#    define __LZO_CHECKER
26711 +#  elif defined(__INSURE__)
26712 +#    define __LZO_CHECKER
26713 +#  elif defined(__PURIFY__)
26714 +#    define __LZO_CHECKER
26715 +#  endif
26716 +#endif
26717 +
26718 +/***********************************************************************
26719 +// integral and pointer types
26720 +************************************************************************/
26721 +
26722 +/* Integral types with 32 bits or more */
26723 +#if !defined(LZO_UINT32_MAX)
26724 +#  if (UINT_MAX >= LZO_0xffffffffL)
26725 +       typedef unsigned int lzo_uint32;
26726 +       typedef int lzo_int32;
26727 +#    define LZO_UINT32_MAX      UINT_MAX
26728 +#    define LZO_INT32_MAX       INT_MAX
26729 +#    define LZO_INT32_MIN       INT_MIN
26730 +#  elif (ULONG_MAX >= LZO_0xffffffffL)
26731 +       typedef unsigned long lzo_uint32;
26732 +       typedef long lzo_int32;
26733 +#    define LZO_UINT32_MAX      ULONG_MAX
26734 +#    define LZO_INT32_MAX       LONG_MAX
26735 +#    define LZO_INT32_MIN       LONG_MIN
26736 +#  else
26737 +#    error "lzo_uint32"
26738 +#  endif
26739 +#endif
26740 +
26741 +/* lzo_uint is used like size_t */
26742 +#if !defined(LZO_UINT_MAX)
26743 +#  if (UINT_MAX >= LZO_0xffffffffL)
26744 +       typedef unsigned int lzo_uint;
26745 +       typedef int lzo_int;
26746 +#    define LZO_UINT_MAX        UINT_MAX
26747 +#    define LZO_INT_MAX         INT_MAX
26748 +#    define LZO_INT_MIN         INT_MIN
26749 +#  elif (ULONG_MAX >= LZO_0xffffffffL)
26750 +       typedef unsigned long lzo_uint;
26751 +       typedef long lzo_int;
26752 +#    define LZO_UINT_MAX        ULONG_MAX
26753 +#    define LZO_INT_MAX         LONG_MAX
26754 +#    define LZO_INT_MIN         LONG_MIN
26755 +#  else
26756 +#    error "lzo_uint"
26757 +#  endif
26758 +#endif
26759 +
26760 +       typedef int lzo_bool;
26761 +
26762 +/***********************************************************************
26763 +// memory models
26764 +************************************************************************/
26765 +
26766 +/* Memory model for the public code segment. */
26767 +#if !defined(__LZO_CMODEL)
26768 +#  if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26769 +#    define __LZO_CMODEL        __far
26770 +#  elif defined(__LZO_i386) && defined(__WATCOMC__)
26771 +#    define __LZO_CMODEL        __near
26772 +#  else
26773 +#    define __LZO_CMODEL
26774 +#  endif
26775 +#endif
26776 +
26777 +/* Memory model for the public data segment. */
26778 +#if !defined(__LZO_DMODEL)
26779 +#  if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26780 +#    define __LZO_DMODEL        __far
26781 +#  elif defined(__LZO_i386) && defined(__WATCOMC__)
26782 +#    define __LZO_DMODEL        __near
26783 +#  else
26784 +#    define __LZO_DMODEL
26785 +#  endif
26786 +#endif
26787 +
26788 +/* Memory model that allows accessing memory at offsets of lzo_uint. */
26789 +#if !defined(__LZO_MMODEL)
26790 +#  if (LZO_UINT_MAX <= UINT_MAX)
26791 +#    define __LZO_MMODEL
26792 +#  elif defined(__LZO_DOS16) || defined(__LZO_WIN16)
26793 +#    define __LZO_MMODEL        __huge
26794 +#    define LZO_999_UNSUPPORTED
26795 +#  elif defined(__LZO_PALMOS16) || defined(__LZO_TOS16)
26796 +#    define __LZO_MMODEL
26797 +#  else
26798 +#    error "__LZO_MMODEL"
26799 +#  endif
26800 +#endif
26801 +
26802 +/* no typedef here because of const-pointer issues */
26803 +#define lzo_byte                unsigned char __LZO_MMODEL
26804 +#define lzo_bytep               unsigned char __LZO_MMODEL *
26805 +#define lzo_charp               char __LZO_MMODEL *
26806 +#define lzo_voidp               void __LZO_MMODEL *
26807 +#define lzo_shortp              short __LZO_MMODEL *
26808 +#define lzo_ushortp             unsigned short __LZO_MMODEL *
26809 +#define lzo_uint32p             lzo_uint32 __LZO_MMODEL *
26810 +#define lzo_int32p              lzo_int32 __LZO_MMODEL *
26811 +#define lzo_uintp               lzo_uint __LZO_MMODEL *
26812 +#define lzo_intp                lzo_int __LZO_MMODEL *
26813 +#define lzo_voidpp              lzo_voidp __LZO_MMODEL *
26814 +#define lzo_bytepp              lzo_bytep __LZO_MMODEL *
26815 +
26816 +#ifndef lzo_sizeof_dict_t
26817 +#  define lzo_sizeof_dict_t     sizeof(lzo_bytep)
26818 +#endif
26819 +
26820 +/***********************************************************************
26821 +// calling conventions and function types
26822 +************************************************************************/
26823 +
26824 +/* linkage */
26825 +#if !defined(__LZO_EXTERN_C)
26826 +#  ifdef __cplusplus
26827 +#    define __LZO_EXTERN_C      extern "C"
26828 +#  else
26829 +#    define __LZO_EXTERN_C      extern
26830 +#  endif
26831 +#endif
26832 +
26833 +/* calling convention */
26834 +#if !defined(__LZO_CDECL)
26835 +#  if defined(__LZO_DOS16) || defined(__LZO_WIN16)
26836 +#    define __LZO_CDECL         __LZO_CMODEL __cdecl
26837 +#  elif defined(__LZO_i386) && defined(_MSC_VER)
26838 +#    define __LZO_CDECL         __LZO_CMODEL __cdecl
26839 +#  elif defined(__LZO_i386) && defined(__WATCOMC__)
26840 +#    define __LZO_CDECL         __LZO_CMODEL __cdecl
26841 +#  else
26842 +#    define __LZO_CDECL         __LZO_CMODEL
26843 +#  endif
26844 +#endif
26845 +#if !defined(__LZO_ENTRY)
26846 +#  define __LZO_ENTRY           __LZO_CDECL
26847 +#endif
26848 +
26849 +/* C++ exception specification for extern "C" function types */
26850 +#if !defined(__cplusplus)
26851 +#  undef LZO_NOTHROW
26852 +#  define LZO_NOTHROW
26853 +#elif !defined(LZO_NOTHROW)
26854 +#  define LZO_NOTHROW
26855 +#endif
26856 +
26857 +       typedef int
26858 +        (__LZO_ENTRY * lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26859 +                                        lzo_byte * dst, lzo_uintp dst_len,
26860 +                                        lzo_voidp wrkmem);
26861 +
26862 +       typedef int
26863 +        (__LZO_ENTRY * lzo_decompress_t) (const lzo_byte * src,
26864 +                                          lzo_uint src_len, lzo_byte * dst,
26865 +                                          lzo_uintp dst_len, lzo_voidp wrkmem);
26866 +
26867 +       typedef int
26868 +        (__LZO_ENTRY * lzo_optimize_t) (lzo_byte * src, lzo_uint src_len,
26869 +                                        lzo_byte * dst, lzo_uintp dst_len,
26870 +                                        lzo_voidp wrkmem);
26871 +
26872 +       typedef int
26873 +        (__LZO_ENTRY * lzo_compress_dict_t) (const lzo_byte * src,
26874 +                                             lzo_uint src_len, lzo_byte * dst,
26875 +                                             lzo_uintp dst_len,
26876 +                                             lzo_voidp wrkmem,
26877 +                                             const lzo_byte * dict,
26878 +                                             lzo_uint dict_len);
26879 +
26880 +       typedef int
26881 +        (__LZO_ENTRY * lzo_decompress_dict_t) (const lzo_byte * src,
26882 +                                               lzo_uint src_len,
26883 +                                               lzo_byte * dst,
26884 +                                               lzo_uintp dst_len,
26885 +                                               lzo_voidp wrkmem,
26886 +                                               const lzo_byte * dict,
26887 +                                               lzo_uint dict_len);
26888 +
26889 +/* assembler versions always use __cdecl */
26890 +       typedef int
26891 +        (__LZO_CDECL * lzo_compress_asm_t) (const lzo_byte * src,
26892 +                                            lzo_uint src_len, lzo_byte * dst,
26893 +                                            lzo_uintp dst_len,
26894 +                                            lzo_voidp wrkmem);
26895 +
26896 +       typedef int
26897 +        (__LZO_CDECL * lzo_decompress_asm_t) (const lzo_byte * src,
26898 +                                              lzo_uint src_len, lzo_byte * dst,
26899 +                                              lzo_uintp dst_len,
26900 +                                              lzo_voidp wrkmem);
26901 +
26902 +/* a progress indicator callback function */
26903 +       typedef void (__LZO_ENTRY * lzo_progress_callback_t) (lzo_uint,
26904 +                                                             lzo_uint);
26905 +
26906 +/***********************************************************************
26907 +// export information
26908 +************************************************************************/
26909 +
26910 +/* DLL export information */
26911 +#if !defined(__LZO_EXPORT1)
26912 +#  define __LZO_EXPORT1
26913 +#endif
26914 +#if !defined(__LZO_EXPORT2)
26915 +#  define __LZO_EXPORT2
26916 +#endif
26917 +
26918 +/* exported calling convention for C functions */
26919 +#if !defined(LZO_PUBLIC)
26920 +#  define LZO_PUBLIC(_rettype) \
26921 +                __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_ENTRY
26922 +#endif
26923 +#if !defined(LZO_EXTERN)
26924 +#  define LZO_EXTERN(_rettype)          __LZO_EXTERN_C LZO_PUBLIC(_rettype)
26925 +#endif
26926 +#if !defined(LZO_PRIVATE)
26927 +#  define LZO_PRIVATE(_rettype)         static _rettype __LZO_ENTRY
26928 +#endif
26929 +
26930 +/* exported __cdecl calling convention for assembler functions */
26931 +#if !defined(LZO_PUBLIC_CDECL)
26932 +#  define LZO_PUBLIC_CDECL(_rettype) \
26933 +                __LZO_EXPORT1 _rettype __LZO_EXPORT2 __LZO_CDECL
26934 +#endif
26935 +#if !defined(LZO_EXTERN_CDECL)
26936 +#  define LZO_EXTERN_CDECL(_rettype)    __LZO_EXTERN_C LZO_PUBLIC_CDECL(_rettype)
26937 +#endif
26938 +
26939 +/* exported global variables (LZO currently uses no static variables and
26940 + * is fully thread safe) */
26941 +#if !defined(LZO_PUBLIC_VAR)
26942 +#  define LZO_PUBLIC_VAR(_type) \
26943 +                __LZO_EXPORT1 _type __LZO_EXPORT2 __LZO_DMODEL
26944 +#endif
26945 +#if !defined(LZO_EXTERN_VAR)
26946 +#  define LZO_EXTERN_VAR(_type)         extern LZO_PUBLIC_VAR(_type)
26947 +#endif
26948 +
26949 +/***********************************************************************
26950 +// error codes and prototypes
26951 +************************************************************************/
26952 +
26953 +/* Error codes for the compression/decompression functions. Negative
26954 + * values are errors, positive values will be used for special but
26955 + * normal events.
26956 + */
26957 +#define LZO_E_OK                    0
26958 +#define LZO_E_ERROR                 (-1)
26959 +#define LZO_E_OUT_OF_MEMORY         (-2)       /* not used right now */
26960 +#define LZO_E_NOT_COMPRESSIBLE      (-3)       /* not used right now */
26961 +#define LZO_E_INPUT_OVERRUN         (-4)
26962 +#define LZO_E_OUTPUT_OVERRUN        (-5)
26963 +#define LZO_E_LOOKBEHIND_OVERRUN    (-6)
26964 +#define LZO_E_EOF_NOT_FOUND         (-7)
26965 +#define LZO_E_INPUT_NOT_CONSUMED    (-8)
26966 +
26967 +/* lzo_init() should be the first function you call.
26968 + * Check the return code !
26969 + *
26970 + * lzo_init() is a macro to allow checking that the library and the
26971 + * compiler's view of various types are consistent.
26972 + */
26973 +#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26974 +    (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26975 +    (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26976 +    (int)sizeof(lzo_compress_t))
26977 +        LZO_EXTERN(int) __lzo_init2(unsigned, int, int, int, int, int, int,
26978 +                                    int, int, int);
26979 +
26980 +/* checksum functions */
26981 +        LZO_EXTERN(lzo_uint32)
26982 +        lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf, lzo_uint _len);
26983 +
26984 +/* misc. */
26985 +       typedef union {         /* byte pointer overlaid with an lzo_uint */
26986 +               lzo_bytep p;
26987 +               lzo_uint u;
26988 +       } __lzo_pu_u;
26989 +       typedef union {         /* byte pointer overlaid with an lzo_uint32 */
26990 +               lzo_bytep p;
26991 +               lzo_uint32 u32;
26992 +       } __lzo_pu32_u;
26993 +       typedef union {         /* void pointer, byte pointer, lzo_uint32 and long sharing storage */
26994 +               void *vp;
26995 +               lzo_bytep bp;
26996 +               lzo_uint32 u32;
26997 +               long l;
26998 +       } lzo_align_t;
26999 +
27000 +#define LZO_PTR_ALIGN_UP(_ptr,_size) \
27001 +    ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
27002 +
27003 +/* deprecated - only for backward compatibility */
27004 +#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
27005 +
27006 +#ifdef __cplusplus
27007 +}                              /* extern "C" */
27008 +#endif
27009 +#endif                         /* already included */
27010 diff --git a/fs/reiser4/plugin/compress/minilzo.c b/fs/reiser4/plugin/compress/minilzo.c
27011 new file mode 100644
27012 index 0000000..4d9d887
27013 --- /dev/null
27014 +++ b/fs/reiser4/plugin/compress/minilzo.c
27015 @@ -0,0 +1,2155 @@
27016 +/* minilzo.c -- mini subset of the LZO real-time data compression library
27017 +   adapted for reiser4 compression transform plugin.
27018 +
27019 +   This file is part of the LZO real-time data compression library
27020 +   and not included in any proprietary licenses of reiser4.
27021 +
27022 +   Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
27023 +   Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
27024 +   Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
27025 +   Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
27026 +   Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
27027 +   Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
27028 +   Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
27029 +   All Rights Reserved.
27030 +
27031 +   The LZO library is free software; you can redistribute it and/or
27032 +   modify it under the terms of the GNU General Public License as
27033 +   published by the Free Software Foundation; either version 2 of
27034 +   the License, or (at your option) any later version.
27035 +
27036 +   The LZO library is distributed in the hope that it will be useful,
27037 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
27038 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27039 +   GNU General Public License for more details.
27040 +
27041 +   You should have received a copy of the GNU General Public License
27042 +   along with the LZO library; see the file COPYING.
27043 +   If not, write to the Free Software Foundation, Inc.,
27044 +   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27045 +
27046 +   Markus F.X.J. Oberhumer
27047 +   <markus@oberhumer.com>
27048 +   http://www.oberhumer.com/opensource/lzo/
27049 + */
27050 +
27051 +/*
27052 + * NOTE:
27053 + *   the full LZO package can be found at
27054 + *   http://www.oberhumer.com/opensource/lzo/
27055 + */
27056 +
27057 +#include "../../debug.h"       /* for reiser4 assert macro -edward */
27058 +
27059 +#define __LZO_IN_MINILZO
27060 +#define LZO_BUILD
27061 +
27062 +#ifdef MINILZO_HAVE_CONFIG_H
27063 +#  include <config.h>
27064 +#endif
27065 +
27066 +#undef LZO_HAVE_CONFIG_H
27067 +#include "minilzo.h"
27068 +
27069 +#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
27070 +#  error "version mismatch in miniLZO source files"
27071 +#endif
27072 +
27073 +#ifdef MINILZO_HAVE_CONFIG_H
27074 +#  define LZO_HAVE_CONFIG_H
27075 +#endif
27076 +
27077 +
27078 +#ifndef __LZO_CONF_H
27079 +#define __LZO_CONF_H
27080 +
27081 +#if !defined(__LZO_IN_MINILZO)
27082 +#  ifndef __LZOCONF_H
27083 +#    include <lzoconf.h>
27084 +#  endif
27085 +#endif
27086 +
27087 +#if defined(__BOUNDS_CHECKING_ON)
27088 +#  include <unchecked.h>
27089 +#else
27090 +#  define BOUNDS_CHECKING_OFF_DURING(stmt)      stmt
27091 +#  define BOUNDS_CHECKING_OFF_IN_EXPR(expr)     (expr)
27092 +#endif
27093 +
27094 +#  define HAVE_MEMCMP
27095 +#  define HAVE_MEMCPY
27096 +#  define HAVE_MEMMOVE
27097 +#  define HAVE_MEMSET
27098 +
27099 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27100 +#  define HAVE_MALLOC_H
27101 +#  define HAVE_HALLOC
27102 +#endif
27103 +
27104 +#undef NDEBUG
27105 +#if !defined(LZO_DEBUG)
27106 +#  define NDEBUG
27107 +#endif
27108 +#if defined(LZO_DEBUG) || !defined(NDEBUG)
27109 +#  if !defined(NO_STDIO_H)
27110 +#    include <stdio.h>
27111 +#  endif
27112 +#endif
27113 +# if 0                         /* edward */
27114 +#include <assert.h>
27115 +#endif                         /* edward */
27116 +
27117 +#if !defined(LZO_COMPILE_TIME_ASSERT)
27118 +#  define LZO_COMPILE_TIME_ASSERT(expr) \
27119 +       { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
27120 +#endif
27121 +
27122 +#if !defined(LZO_UNUSED)
27123 +#  if 1
27124 +#    define LZO_UNUSED(var)     ((void)&var)
27125 +#  elif 0
27126 +#    define LZO_UNUSED(var)     { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
27127 +#  else
27128 +#    define LZO_UNUSED(parm)    (parm = parm)
27129 +#  endif
27130 +#endif
27131 +
27132 +#if !defined(__inline__) && !defined(__GNUC__)
27133 +#  if defined(__cplusplus)
27134 +#    define __inline__      inline
27135 +#  else
27136 +#    define __inline__
27137 +#  endif
27138 +#endif
27139 +
27140 +#if defined(NO_MEMCMP)
27141 +#  undef HAVE_MEMCMP
27142 +#endif
27143 +
27144 +#if !defined(HAVE_MEMSET)
27145 +#  undef memset
27146 +#  define memset    lzo_memset
27147 +#endif
27148 +
27149 +#  define LZO_BYTE(x)       ((unsigned char) ((x) & 0xff))
27150 +
27151 +#define LZO_MAX(a,b)        ((a) >= (b) ? (a) : (b))
27152 +#define LZO_MIN(a,b)        ((a) <= (b) ? (a) : (b))
27153 +#define LZO_MAX3(a,b,c)     ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
27154 +#define LZO_MIN3(a,b,c)     ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
27155 +
27156 +#define lzo_sizeof(type)    ((lzo_uint) (sizeof(type)))
27157 +
27158 +#define LZO_HIGH(array)     ((lzo_uint) (sizeof(array)/sizeof(*(array))))
27159 +
27160 +#define LZO_SIZE(bits)      (1u << (bits))
27161 +#define LZO_MASK(bits)      (LZO_SIZE(bits) - 1)
27162 +
27163 +#define LZO_LSIZE(bits)     (1ul << (bits))
27164 +#define LZO_LMASK(bits)     (LZO_LSIZE(bits) - 1)
27165 +
27166 +#define LZO_USIZE(bits)     ((lzo_uint) 1 << (bits))
27167 +#define LZO_UMASK(bits)     (LZO_USIZE(bits) - 1)
27168 +
27169 +#define LZO_STYPE_MAX(b)    (((1l  << (8*(b)-2)) - 1l)  + (1l  << (8*(b)-2)))
27170 +#define LZO_UTYPE_MAX(b)    (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
27171 +
27172 +#if !defined(SIZEOF_UNSIGNED)
27173 +#  if (UINT_MAX == 0xffff)
27174 +#    define SIZEOF_UNSIGNED         2
27175 +#  elif (UINT_MAX == LZO_0xffffffffL)
27176 +#    define SIZEOF_UNSIGNED         4
27177 +#  elif (UINT_MAX >= LZO_0xffffffffL)
27178 +#    define SIZEOF_UNSIGNED         8
27179 +#  else
27180 +#    error "SIZEOF_UNSIGNED"
27181 +#  endif
27182 +#endif
27183 +
27184 +#if !defined(SIZEOF_UNSIGNED_LONG)
27185 +#  if (ULONG_MAX == LZO_0xffffffffL)
27186 +#    define SIZEOF_UNSIGNED_LONG    4
27187 +#  elif (ULONG_MAX >= LZO_0xffffffffL)
27188 +#    define SIZEOF_UNSIGNED_LONG    8
27189 +#  else
27190 +#    error "SIZEOF_UNSIGNED_LONG"
27191 +#  endif
27192 +#endif
27193 +
27194 +#if !defined(SIZEOF_SIZE_T)
27195 +#  define SIZEOF_SIZE_T             SIZEOF_UNSIGNED
27196 +#endif
27197 +#if !defined(SIZE_T_MAX)
27198 +#  define SIZE_T_MAX                LZO_UTYPE_MAX(SIZEOF_SIZE_T)
27199 +#endif
27200 +
27201 +#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
27202 +#  if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
27203 +#    define LZO_UNALIGNED_OK_2
27204 +#  endif
27205 +#  if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
27206 +#    define LZO_UNALIGNED_OK_4
27207 +#  endif
27208 +#endif
27209 +
27210 +#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
27211 +#  if !defined(LZO_UNALIGNED_OK)
27212 +#    define LZO_UNALIGNED_OK
27213 +#  endif
27214 +#endif
27215 +
27216 +#if defined(__LZO_NO_UNALIGNED)
27217 +#  undef LZO_UNALIGNED_OK
27218 +#  undef LZO_UNALIGNED_OK_2
27219 +#  undef LZO_UNALIGNED_OK_4
27220 +#endif
27221 +
27222 +#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
27223 +#  error "LZO_UNALIGNED_OK_2 must not be defined on this system"
27224 +#endif
27225 +#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
27226 +#  error "LZO_UNALIGNED_OK_4 must not be defined on this system"
27227 +#endif
27228 +
27229 +#if defined(__LZO_NO_ALIGNED)
27230 +#  undef LZO_ALIGNED_OK_4
27231 +#endif
27232 +
27233 +#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
27234 +#  error "LZO_ALIGNED_OK_4 must not be defined on this system"
27235 +#endif
27236 +
27237 +#define LZO_LITTLE_ENDIAN       1234
27238 +#define LZO_BIG_ENDIAN          4321
27239 +#define LZO_PDP_ENDIAN          3412
27240 +
27241 +#if !defined(LZO_BYTE_ORDER)
27242 +#  if defined(MFX_BYTE_ORDER)
27243 +#    define LZO_BYTE_ORDER      MFX_BYTE_ORDER
27244 +#  elif defined(__LZO_i386)
27245 +#    define LZO_BYTE_ORDER      LZO_LITTLE_ENDIAN
27246 +#  elif defined(BYTE_ORDER)
27247 +#    define LZO_BYTE_ORDER      BYTE_ORDER
27248 +#  elif defined(__BYTE_ORDER)
27249 +#    define LZO_BYTE_ORDER      __BYTE_ORDER
27250 +#  endif
27251 +#endif
27252 +
27253 +#if defined(LZO_BYTE_ORDER)
27254 +#  if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
27255 +      (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
27256 +#    error "invalid LZO_BYTE_ORDER"
27257 +#  endif
27258 +#endif
27259 +
27260 +#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
27261 +#  error "LZO_BYTE_ORDER is not defined"
27262 +#endif
27263 +
27264 +#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
27265 +
27266 +#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
27267 +#  if defined(__GNUC__) && defined(__i386__)
27268 +#    if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
27269 +#      define LZO_OPTIMIZE_GNUC_i386
27270 +#    endif
27271 +#  endif
27272 +#endif
27273 +
27274 +__LZO_EXTERN_C const lzo_uint32 _lzo_crc32_table[256];
27275 +
27276 +#define _LZO_STRINGIZE(x)           #x
27277 +#define _LZO_MEXPAND(x)             _LZO_STRINGIZE(x)
27278 +
27279 +#define _LZO_CONCAT2(a,b)           a ## b
27280 +#define _LZO_CONCAT3(a,b,c)         a ## b ## c
27281 +#define _LZO_CONCAT4(a,b,c,d)       a ## b ## c ## d
27282 +#define _LZO_CONCAT5(a,b,c,d,e)     a ## b ## c ## d ## e
27283 +
27284 +#define _LZO_ECONCAT2(a,b)          _LZO_CONCAT2(a,b)
27285 +#define _LZO_ECONCAT3(a,b,c)        _LZO_CONCAT3(a,b,c)
27286 +#define _LZO_ECONCAT4(a,b,c,d)      _LZO_CONCAT4(a,b,c,d)
27287 +#define _LZO_ECONCAT5(a,b,c,d,e)    _LZO_CONCAT5(a,b,c,d,e)
27288 +
27289 +#ifndef __LZO_PTR_H
27290 +#define __LZO_PTR_H
27291 +
27292 +#ifdef __cplusplus
27293 +extern "C" {
27294 +#endif
27295 +
27296 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27297 +#  include <dos.h>
27298 +#  if 1 && defined(__WATCOMC__)
27299 +#    include <i86.h>
27300 +       __LZO_EXTERN_C unsigned char _HShift;
27301 +#    define __LZO_HShift    _HShift
27302 +#  elif 1 && defined(_MSC_VER)
27303 +       __LZO_EXTERN_C unsigned short __near _AHSHIFT;
27304 +#    define __LZO_HShift    ((unsigned) &_AHSHIFT)
27305 +#  elif defined(__LZO_WIN16)
27306 +#    define __LZO_HShift    3
27307 +#  else
27308 +#    define __LZO_HShift    12
27309 +#  endif
27310 +#  if !defined(_FP_SEG) && defined(FP_SEG)
27311 +#    define _FP_SEG         FP_SEG
27312 +#  endif
27313 +#  if !defined(_FP_OFF) && defined(FP_OFF)
27314 +#    define _FP_OFF         FP_OFF
27315 +#  endif
27316 +#endif
27317 +
27318 +#if !defined(lzo_ptrdiff_t)
27319 +#  if (UINT_MAX >= LZO_0xffffffffL)
27320 +       typedef ptrdiff_t lzo_ptrdiff_t;
27321 +#  else
27322 +       typedef long lzo_ptrdiff_t;
27323 +#  endif
27324 +#endif
27325 +
27326 +#if !defined(__LZO_HAVE_PTR_T)
27327 +#  if defined(lzo_ptr_t)
27328 +#    define __LZO_HAVE_PTR_T
27329 +#  endif
27330 +#endif
27331 +#if !defined(__LZO_HAVE_PTR_T)
27332 +#  if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
27333 +#    if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
27334 +       typedef unsigned long lzo_ptr_t;
27335 +       typedef long lzo_sptr_t;
27336 +#      define __LZO_HAVE_PTR_T
27337 +#    endif
27338 +#  endif
27339 +#endif
27340 +#if !defined(__LZO_HAVE_PTR_T)
27341 +#  if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
27342 +#    if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
27343 +       typedef unsigned int lzo_ptr_t;
27344 +       typedef int lzo_sptr_t;
27345 +#      define __LZO_HAVE_PTR_T
27346 +#    endif
27347 +#  endif
27348 +#endif
27349 +#if !defined(__LZO_HAVE_PTR_T)
27350 +#  if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
27351 +#    if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
27352 +       typedef unsigned short lzo_ptr_t;
27353 +       typedef short lzo_sptr_t;
27354 +#      define __LZO_HAVE_PTR_T
27355 +#    endif
27356 +#  endif
27357 +#endif
27358 +#if !defined(__LZO_HAVE_PTR_T)
27359 +#  if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
27360 +#    error "no suitable type for lzo_ptr_t"
27361 +#  else
27362 +       typedef unsigned long lzo_ptr_t;
27363 +       typedef long lzo_sptr_t;
27364 +#    define __LZO_HAVE_PTR_T
27365 +#  endif
27366 +#endif
27367 +
27368 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16)
27369 +#define PTR(a)              ((lzo_bytep) (a))
27370 +#define PTR_ALIGNED_4(a)    ((_FP_OFF(a) & 3) == 0)
27371 +#define PTR_ALIGNED2_4(a,b) (((_FP_OFF(a) | _FP_OFF(b)) & 3) == 0)
27372 +#else
27373 +#define PTR(a)              ((lzo_ptr_t) (a))
27374 +#define PTR_LINEAR(a)       PTR(a)
27375 +#define PTR_ALIGNED_4(a)    ((PTR_LINEAR(a) & 3) == 0)
27376 +#define PTR_ALIGNED_8(a)    ((PTR_LINEAR(a) & 7) == 0)
27377 +#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
27378 +#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
27379 +#endif
27380 +
27381 +#define PTR_LT(a,b)         (PTR(a) < PTR(b))
27382 +#define PTR_GE(a,b)         (PTR(a) >= PTR(b))
27383 +#define PTR_DIFF(a,b)       ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
27384 +#define pd(a,b)             ((lzo_uint) ((a)-(b)))
27385 +
27386 +       typedef union {         /* the integer and pointer types listed below, all sharing one storage location */
27387 +               char a_char;
27388 +               unsigned char a_uchar;
27389 +               short a_short;
27390 +               unsigned short a_ushort;
27391 +               int a_int;
27392 +               unsigned int a_uint;
27393 +               long a_long;
27394 +               unsigned long a_ulong;
27395 +               lzo_int a_lzo_int;
27396 +               lzo_uint a_lzo_uint;
27397 +               lzo_int32 a_lzo_int32;
27398 +               lzo_uint32 a_lzo_uint32;
27399 +               ptrdiff_t a_ptrdiff_t;
27400 +               lzo_ptrdiff_t a_lzo_ptrdiff_t;
27401 +               lzo_ptr_t a_lzo_ptr_t;
27402 +               lzo_voidp a_lzo_voidp;
27403 +               void *a_void_p;
27404 +               lzo_bytep a_lzo_bytep;
27405 +               lzo_bytepp a_lzo_bytepp;
27406 +               lzo_uintp a_lzo_uintp;
27407 +               lzo_uint *a_lzo_uint_p;
27408 +               lzo_uint32p a_lzo_uint32p;
27409 +               lzo_uint32 *a_lzo_uint32_p;
27410 +               unsigned char *a_uchar_p;
27411 +               char *a_char_p;
27412 +       } lzo_full_align_t;     /* being a union, at least as large and as strictly aligned as any member */
27413 +
27414 +#ifdef __cplusplus
27415 +}
27416 +#endif
27417 +#endif
27418 +#define LZO_DETERMINISTIC /* note: #undef'd again later in this file, before LZO_CHECK_MPOS is selected */
27419 +#define LZO_DICT_USE_PTR /* default: dictionary slots hold pointers into the input */
27420 +#if defined(__LZO_DOS16) || defined(__LZO_WIN16) || defined(__LZO_STRICT_16BIT)
27421 +#  undef LZO_DICT_USE_PTR
27422 +#endif
27423 +#if defined(LZO_DICT_USE_PTR)
27424 +#  define lzo_dict_t    const lzo_bytep
27425 +#  define lzo_dict_p    lzo_dict_t __LZO_MMODEL *
27426 +#else /* 16-bit targets: dictionary slots hold lzo_uint values instead of pointers */
27427 +#  define lzo_dict_t    lzo_uint
27428 +#  define lzo_dict_p    lzo_dict_t __LZO_MMODEL *
27429 +#endif
27430 +#if !defined(lzo_moff_t) /* type of a match offset; unsigned by default (basic_ptr_check() asserts IS_UNSIGNED) */
27431 +#define lzo_moff_t      lzo_uint
27432 +#endif
27433 +#endif
27434 +static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr) /* ptr as an integer address */
27435 +{
27436 +#if !defined(__LZO_DOS16) && !defined(__LZO_WIN16)
27437 +       /* Not a 16-bit DOS/Windows build: PTR_LINEAR() already yields the integer. */
27438 +       return PTR_LINEAR(ptr);
27439 +#else
27440 +       /* Segmented far pointer: shifted segment plus offset. */
27441 +       const lzo_ptr_t seg = (lzo_ptr_t) (_FP_SEG(ptr));
27442 +       const lzo_ptr_t off = (_FP_OFF(ptr));
27443 +
27444 +       return (seg << (16 - __LZO_HShift)) + off;
27445 +#endif
27446 +}
27447 +/* Return the number of bytes to add to ptr to reach the next multiple of size (0 when ptr is already a multiple). */
27448 +static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
27449 +{
27450 +       lzo_ptr_t p, s, n;
27451 +
27452 +       assert("lzo-01", size > 0); /* size is used as a divisor below */
27453 +
27454 +       p = __lzo_ptr_linear(ptr); /* address of ptr as an integer */
27455 +       s = (lzo_ptr_t) (size - 1); /* upper bound for the gap, checked by "lzo-03" */
27456 +       n = (((p + s) / size) * size) - p; /* p rounded up to a multiple of size, minus p */
27457 +
27458 +       assert("lzo-02", (long)n >= 0);
27459 +       assert("lzo-03", n <= s);
27460 +
27461 +       return (unsigned)n;
27462 +}
27463 +
27464 +#ifndef __LZO_UTIL_H
27465 +#define __LZO_UTIL_H
27466 +
27467 +#ifndef __LZO_CONF_H
27468 +#endif
27469 +
27470 +#ifdef __cplusplus
27471 +extern "C" {
27472 +#endif
27473 +
27474 +#if 1 && defined(HAVE_MEMCPY)
27475 +#if !defined(__LZO_DOS16) && !defined(__LZO_WIN16)
27476 +
27477 +#define MEMCPY8_DS(dest,src,len) \
27478 +    memcpy(dest,src,len); \
27479 +    dest += len; \
27480 +    src += len
27481 +
27482 +#endif
27483 +#endif
27484 +
27485 +#if !defined(MEMCPY8_DS)
27486 +
27487 +#define MEMCPY8_DS(dest,src,len) \
27488 +    { register lzo_uint __l = (len) / 8; \
27489 +    do { \
27490 +       *dest++ = *src++; \
27491 +       *dest++ = *src++; \
27492 +       *dest++ = *src++; \
27493 +       *dest++ = *src++; \
27494 +       *dest++ = *src++; \
27495 +       *dest++ = *src++; \
27496 +       *dest++ = *src++; \
27497 +       *dest++ = *src++; \
27498 +    } while (--__l > 0); }
27499 +
27500 +#endif
27501 +/* Byte-at-a-time forward copy; modifies dest, src and len. The copy happens before len is tested, so at least one byte is always written. */
27502 +#define MEMCPY_DS(dest,src,len) \
27503 +    do *dest++ = *src++; \
27504 +    while (--len > 0)
27505 +/* Same expansion as MEMCPY_DS: a forward byte copy (it does not copy backwards the way memmove() may). */
27506 +#define MEMMOVE_DS(dest,src,len) \
27507 +    do *dest++ = *src++; \
27508 +    while (--len > 0)
27509 +
27510 +/* BZERO8_PTR(s,l,n): zero (l)*(n) bytes starting at s, using memset() when available and lzo_memset() otherwise. */
27511 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
27512 +
27513 +#define BZERO8_PTR(s,l,n)   memset((s),0,(lzo_uint)(l)*(n))
27514 +
27515 +#else
27516 +
27517 +#define BZERO8_PTR(s,l,n) \
27518 +    lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
27519 +
27520 +#endif
27521 +
27522 +#ifdef __cplusplus
27523 +}
27524 +#endif
27525 +
27526 +#endif
27527 +
27528 +/* If you use the LZO library in a product, you *must* keep this
27529 + * copyright string in the executable of your product.
27530 + */
27531 +
27532 +static const lzo_byte __lzo_copyright[] =
27533 +#if !defined(__LZO_IN_MINLZO) /* NOTE(review): spelled MINLZO here, every other guard in this file tests __LZO_IN_MINILZO; unless __LZO_IN_MINLZO is defined elsewhere the short string is always used - confirm intent */
27534 +    LZO_VERSION_STRING;
27535 +#else /* long banner: embeds __DATE__/__TIME__, the compiler id and the build options */
27536 +    "\n\n\n"
27537 +    "LZO real-time data compression library.\n"
27538 +    "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
27539 +    "<markus.oberhumer@jk.uni-linz.ac.at>\n"
27540 +    "http://www.oberhumer.com/opensource/lzo/\n"
27541 +    "\n"
27542 +    "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
27543 +    "LZO build date: " __DATE__ " " __TIME__ "\n\n"
27544 +    "LZO special compilation options:\n"
27545 +#ifdef __cplusplus
27546 +    " __cplusplus\n"
27547 +#endif
27548 +#if defined(__PIC__)
27549 +    " __PIC__\n"
27550 +#elif defined(__pic__)
27551 +    " __pic__\n"
27552 +#endif
27553 +#if (UINT_MAX < LZO_0xffffffffL)
27554 +    " 16BIT\n"
27555 +#endif
27556 +#if defined(__LZO_STRICT_16BIT)
27557 +    " __LZO_STRICT_16BIT\n"
27558 +#endif
27559 +#if (UINT_MAX > LZO_0xffffffffL)
27560 +    " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
27561 +#endif
27562 +#if (ULONG_MAX > LZO_0xffffffffL)
27563 +    " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
27564 +#endif
27565 +#if defined(LZO_BYTE_ORDER)
27566 +    " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
27567 +#endif
27568 +#if defined(LZO_UNALIGNED_OK_2)
27569 +    " LZO_UNALIGNED_OK_2\n"
27570 +#endif
27571 +#if defined(LZO_UNALIGNED_OK_4)
27572 +    " LZO_UNALIGNED_OK_4\n"
27573 +#endif
27574 +#if defined(LZO_ALIGNED_OK_4)
27575 +    " LZO_ALIGNED_OK_4\n"
27576 +#endif
27577 +#if defined(LZO_DICT_USE_PTR)
27578 +    " LZO_DICT_USE_PTR\n"
27579 +#endif
27580 +#if defined(__LZO_QUERY_COMPRESS)
27581 +    " __LZO_QUERY_COMPRESS\n"
27582 +#endif
27583 +#if defined(__LZO_QUERY_DECOMPRESS)
27584 +    " __LZO_QUERY_DECOMPRESS\n"
27585 +#endif
27586 +#if defined(__LZO_IN_MINILZO)
27587 +    " __LZO_IN_MINILZO\n"
27588 +#endif
27589 +    "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
27590 +#if defined(__GNUC__) && defined(__VERSION__)
27591 +    " by gcc " __VERSION__
27592 +#elif defined(__BORLANDC__)
27593 +    " by Borland C " _LZO_MEXPAND(__BORLANDC__)
27594 +#elif defined(_MSC_VER)
27595 +    " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
27596 +#elif defined(__PUREC__)
27597 +    " by Pure C " _LZO_MEXPAND(__PUREC__)
27598 +#elif defined(__SC__)
27599 +    " by Symantec C " _LZO_MEXPAND(__SC__)
27600 +#elif defined(__TURBOC__)
27601 +    " by Turbo C " _LZO_MEXPAND(__TURBOC__)
27602 +#elif defined(__WATCOMC__)
27603 +    " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
27604 +#endif
27605 +    " $\n"
27606 +    "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
27607 +#endif
27608 +
27609 +#define LZO_BASE 65521u /* presumably the Adler-32 modulus; no user is visible in this part of the file - confirm */
27610 +#define LZO_NMAX 5552 /* presumably the Adler-32 block length; no user is visible in this part of the file - confirm */
27611 +/* Running-sum steps: s1 += buf[i]; s2 += s1. Both s1 and s2 must be in scope where the macro is expanded. */
27612 +#define LZO_DO1(buf,i)  {s1 += buf[i]; s2 += s1;}
27613 +#define LZO_DO2(buf,i)  LZO_DO1(buf,i); LZO_DO1(buf,i+1);
27614 +#define LZO_DO4(buf,i)  LZO_DO2(buf,i); LZO_DO2(buf,i+2);
27615 +#define LZO_DO8(buf,i)  LZO_DO4(buf,i); LZO_DO4(buf,i+4);
27616 +#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8); /* NOTE(review): DO2..DO16 expand to several statements; not safe as the body of an unbraced if */
27617 +/* Signedness probes: convert -1 to the type and compare the result with 0. */
27618 +#  define IS_SIGNED(type)       (((type) (-1)) < ((type) 0))
27619 +#  define IS_UNSIGNED(type)     (((type) (-1)) > ((type) 0))
27620 +
27621 +#define IS_POWER_OF_2(x)        (((x) & ((x) - 1)) == 0) /* also true for x == 0 */
27622 +/* Defined after _lzo_config_check(), which calls them. */
27623 +static lzo_bool schedule_insns_bug(void);
27624 +static lzo_bool strength_reduce_bug(int *);
27625 +
27626 +#  define __lzo_assert(x)   ((x) ? 1 : 0) /* normalises to 0/1 so results can be folded with r &= ... */
27627 +
27628 +#undef COMPILE_TIME_ASSERT
27629 +/* Local alias for LZO_COMPILE_TIME_ASSERT (defined outside this section); removed again by an #undef after the check functions. */
27630 +#  define COMPILE_TIME_ASSERT(expr)     LZO_COMPILE_TIME_ASSERT(expr)
27631 +/* Check sizes, signedness and limits of the integer types. All but one check are compile-time assertions; returns 1 when the single run-time check passes, 0 otherwise. */
27632 +static lzo_bool basic_integral_check(void)
27633 +{
27634 +       lzo_bool r = 1;
27635 +
27636 +       COMPILE_TIME_ASSERT(CHAR_BIT == 8);
27637 +       COMPILE_TIME_ASSERT(sizeof(char) == 1);
27638 +       COMPILE_TIME_ASSERT(sizeof(short) >= 2);
27639 +       COMPILE_TIME_ASSERT(sizeof(long) >= 4);
27640 +       COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
27641 +       COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
27642 +
27643 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
27644 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
27645 +
27646 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
27647 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
27648 +#if defined(__LZO_STRICT_16BIT)
27649 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
27650 +#else
27651 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
27652 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
27653 +#endif
27654 +       /* the *_MAX limit macros must agree with sizeof() of the matching type */
27655 +#if (USHRT_MAX == 65535u)
27656 +       COMPILE_TIME_ASSERT(sizeof(short) == 2);
27657 +#elif (USHRT_MAX == LZO_0xffffffffL)
27658 +       COMPILE_TIME_ASSERT(sizeof(short) == 4);
27659 +#elif (USHRT_MAX >= LZO_0xffffffffL)
27660 +       COMPILE_TIME_ASSERT(sizeof(short) > 4);
27661 +#endif
27662 +#if 0                          /* to make gcc happy -edward */
27663 +#if (UINT_MAX == 65535u)
27664 +       COMPILE_TIME_ASSERT(sizeof(int) == 2);
27665 +#elif (UINT_MAX == LZO_0xffffffffL)
27666 +       COMPILE_TIME_ASSERT(sizeof(int) == 4);
27667 +#elif (UINT_MAX >= LZO_0xffffffffL)
27668 +       COMPILE_TIME_ASSERT(sizeof(int) > 4);
27669 +#endif
27670 +#if (ULONG_MAX == 65535ul)
27671 +       COMPILE_TIME_ASSERT(sizeof(long) == 2);
27672 +#elif (ULONG_MAX == LZO_0xffffffffL)
27673 +       COMPILE_TIME_ASSERT(sizeof(long) == 4);
27674 +#elif (ULONG_MAX >= LZO_0xffffffffL)
27675 +       COMPILE_TIME_ASSERT(sizeof(long) > 4);
27676 +#endif
27677 +#if defined(SIZEOF_UNSIGNED)
27678 +       COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED == sizeof(unsigned));
27679 +#endif
27680 +#if defined(SIZEOF_UNSIGNED_LONG)
27681 +       COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_LONG == sizeof(unsigned long));
27682 +#endif
27683 +#if defined(SIZEOF_UNSIGNED_SHORT)
27684 +       COMPILE_TIME_ASSERT(SIZEOF_UNSIGNED_SHORT == sizeof(unsigned short));
27685 +#endif
27686 +#if !defined(__LZO_IN_MINILZO)
27687 +#if defined(SIZEOF_SIZE_T)
27688 +       COMPILE_TIME_ASSERT(SIZEOF_SIZE_T == sizeof(size_t));
27689 +#endif
27690 +#endif
27691 +#endif                         /* -edward */
27692 +       /* signedness of the basic types and of the lzo_* integer types */
27693 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
27694 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
27695 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
27696 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
27697 +       COMPILE_TIME_ASSERT(IS_SIGNED(short));
27698 +       COMPILE_TIME_ASSERT(IS_SIGNED(int));
27699 +       COMPILE_TIME_ASSERT(IS_SIGNED(long));
27700 +
27701 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
27702 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
27703 +       COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
27704 +       COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
27705 +       /* limit macros compared with the maxima LZO_STYPE_MAX/LZO_UTYPE_MAX compute from sizeof() */
27706 +       COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
27707 +       COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
27708 +       COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
27709 +       COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
27710 +       //    COMPILE_TIME_ASSERT(SHRT_MAX   == LZO_STYPE_MAX(sizeof(short))); /* edward */
27711 +       COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
27712 +       COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
27713 +                           LZO_UTYPE_MAX(sizeof(lzo_uint32)));
27714 +       COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
27715 +#if !defined(__LZO_IN_MINILZO)
27716 +       COMPILE_TIME_ASSERT(SIZE_T_MAX == LZO_UTYPE_MAX(sizeof(size_t)));
27717 +#endif
27718 +
27719 +       r &= __lzo_assert(LZO_BYTE(257) == 1); /* the only run-time check in this function */
27720 +
27721 +       return r;
27722 +}
27723 +/* Compile-time checks of pointer and pointer-difference type sizes and signedness. r is never modified, so the function returns 1 whenever it compiles. */
27724 +static lzo_bool basic_ptr_check(void)
27725 +{
27726 +       lzo_bool r = 1;
27727 +
27728 +       COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
27729 +       COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27730 +       /* all LZO pointer types have the same size and can hold an lzo_uint */
27731 +       COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27732 +       COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27733 +       COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27734 +       COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27735 +       /* lzo_ptr_t / lzo_sptr_t are the integer types that mirror a pointer */
27736 +       COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27737 +       COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27738 +       COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27739 +
27740 +       COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27741 +       COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27742 +
27743 +       COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27744 +       COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27745 +
27746 +#if defined(SIZEOF_CHAR_P)
27747 +       COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27748 +#endif
27749 +#if defined(SIZEOF_PTRDIFF_T)
27750 +       COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27751 +#endif
27752 +
27753 +       COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27754 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27755 +       COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27756 +       COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27757 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27758 +       COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27759 +
27760 +       return r;
27761 +}
27762 +/* Run-time probes of pointer alignment, pointer layout in memory and NULL representation on local scratch buffers. Returns 1 when every probe passes, 0 otherwise. */
27763 +static lzo_bool ptr_check(void)
27764 +{
27765 +       lzo_bool r = 1;
27766 +       int i;
27767 +       char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)]; /* room for 10 pointers plus the bytes skipped by the alignment below */
27768 +       lzo_bytep wrkmem;
27769 +       lzo_bytepp dict;
27770 +       unsigned char x[4 * sizeof(lzo_full_align_t)];
27771 +       long d;
27772 +       lzo_full_align_t a;
27773 +       lzo_full_align_t u;
27774 +
27775 +       for (i = 0; i < (int)sizeof(x); i++)
27776 +               x[i] = LZO_BYTE(i);
27777 +
27778 +       wrkmem =
27779 +           LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27780 +       /* reinterpret the byte pointer as a pointer-to-pointer through the union */
27781 +       u.a_lzo_bytep = wrkmem;
27782 +       dict = u.a_lzo_bytepp;
27783 +       /* the aligned pointer must lie 0 .. sizeof(lzo_full_align_t)-1 bytes into _wrkmem */
27784 +       d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27785 +       r &= __lzo_assert(d >= 0);
27786 +       r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27787 +       /* all-zero bytes must read back as a NULL pointer */
27788 +       memset(&a, 0, sizeof(a));
27789 +       r &= __lzo_assert(a.a_lzo_voidp == NULL);
27790 +       /* all-ones bytes must read back as the maximum of each unsigned type */
27791 +       memset(&a, 0xff, sizeof(a));
27792 +       r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27793 +       r &= __lzo_assert(a.a_uint == UINT_MAX);
27794 +       r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27795 +       r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27796 +       r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27797 +
27798 +       if (r == 1) { /* &dict[i] must equal wrkmem + i * sizeof(pointer) */
27799 +               for (i = 0; i < 8; i++)
27800 +                       r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27801 +                                         (const
27802 +                                          lzo_voidp)(&wrkmem[i *
27803 +                                                             sizeof(lzo_byte
27804 +                                                                    *)]));
27805 +       }
27806 +
27807 +       memset(&a, 0, sizeof(a));
27808 +       r &= __lzo_assert(a.a_char_p == NULL);
27809 +       r &= __lzo_assert(a.a_lzo_bytep == NULL);
27810 +       r &= __lzo_assert(NULL == (void *)0);
27811 +       if (r == 1) { /* BZERO8_PTR must clear exactly slots 1..8 and leave slots 0 and 9 alone */
27812 +               for (i = 0; i < 10; i++)
27813 +                       dict[i] = wrkmem;
27814 +               BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27815 +               r &= __lzo_assert(dict[0] == wrkmem);
27816 +               for (i = 1; i < 9; i++)
27817 +                       r &= __lzo_assert(dict[i] == NULL);
27818 +               r &= __lzo_assert(dict[9] == wrkmem);
27819 +       }
27820 +
27821 +       if (r == 1) { /* __lzo_align_gap() must move &x[1] to the next lzo_uint32-aligned address */
27822 +               unsigned k = 1;
27823 +               const unsigned n = (unsigned)sizeof(lzo_uint32);
27824 +               lzo_byte *p0;
27825 +               lzo_byte *p1;
27826 +
27827 +               k += __lzo_align_gap(&x[k], n);
27828 +               p0 = (lzo_bytep) & x[k];
27829 +#if defined(PTR_LINEAR)
27830 +               r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27831 +#else
27832 +               r &= __lzo_assert(n == 4);
27833 +               r &= __lzo_assert(PTR_ALIGNED_4(p0));
27834 +#endif
27835 +               /* the aligned position must satisfy &x[1] <= p0 < &x[1 + n] */
27836 +               r &= __lzo_assert(k >= 1);
27837 +               p1 = (lzo_bytep) & x[1];
27838 +               r &= __lzo_assert(PTR_GE(p0, p1));
27839 +
27840 +               r &= __lzo_assert(k < 1 + n);
27841 +               p1 = (lzo_bytep) & x[1 + n];
27842 +               r &= __lzo_assert(PTR_LT(p0, p1));
27843 +
27844 +               if (r == 1) { /* load two lzo_uint32 values from the aligned positions */
27845 +                       lzo_uint32 v0, v1;
27846 +
27847 +                       u.a_uchar_p = &x[k];
27848 +                       v0 = *u.a_lzo_uint32_p;
27849 +                       u.a_uchar_p = &x[k + n];
27850 +                       v1 = *u.a_lzo_uint32_p;
27851 +
27852 +                       r &= __lzo_assert(v0 > 0);
27853 +                       r &= __lzo_assert(v1 > 0);
27854 +               }
27855 +       }
27856 +
27857 +       return r;
27858 +}
27859 +/* Self-test of the build configuration: type checks, byte order, optional unaligned loads, two compiler-bug probes and ptr_check(). Returns LZO_E_OK or LZO_E_ERROR. */
27860 +static int _lzo_config_check(void)
27861 +{
27862 +       lzo_bool r = 1;
27863 +       int i;
27864 +       union {
27865 +               lzo_uint32 a;
27866 +               unsigned short b;
27867 +               lzo_uint32 aa[4];
27868 +               unsigned char x[4 * sizeof(lzo_full_align_t)];
27869 +       }
27870 +       u;
27871 +
27872 +       COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27873 +       COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27874 +                           < 0);
27875 +
27876 +       r &= basic_integral_check();
27877 +       r &= basic_ptr_check();
27878 +       if (r != 1)
27879 +               return LZO_E_ERROR;
27880 +
27881 +       u.a = 0;
27882 +       u.b = 0;
27883 +       for (i = 0; i < (int)sizeof(u.x); i++)
27884 +               u.x[i] = LZO_BYTE(i); /* bytes 0x00, 0x01, 0x02, ... read back through u.a / u.b below */
27885 +
27886 +#if defined(LZO_BYTE_ORDER)
27887 +       if (r == 1) { /* the configured byte order must match what the union actually holds */
27888 +#  if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27889 +               lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27890 +               unsigned short b = (unsigned short)(u.b & 0xffff);
27891 +               r &= __lzo_assert(a == 0x03020100L);
27892 +               r &= __lzo_assert(b == 0x0100);
27893 +#  elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27894 +               lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27895 +               unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27896 +               r &= __lzo_assert(a == 0x00010203L);
27897 +               r &= __lzo_assert(b == 0x0001);
27898 +#  else
27899 +#    error "invalid LZO_BYTE_ORDER"
27900 +#  endif
27901 +       }
27902 +#endif
27903 +
27904 +#if defined(LZO_UNALIGNED_OK_2)
27905 +       COMPILE_TIME_ASSERT(sizeof(short) == 2);
27906 +       if (r == 1) { /* 16-bit loads at byte offsets 0..3 of u.x */
27907 +               unsigned short b[4];
27908 +
27909 +               for (i = 0; i < 4; i++)
27910 +                       b[i] = *(const unsigned short *)&u.x[i];
27911 +
27912 +#  if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27913 +               r &= __lzo_assert(b[0] == 0x0100);
27914 +               r &= __lzo_assert(b[1] == 0x0201);
27915 +               r &= __lzo_assert(b[2] == 0x0302);
27916 +               r &= __lzo_assert(b[3] == 0x0403);
27917 +#  elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27918 +               r &= __lzo_assert(b[0] == 0x0001);
27919 +               r &= __lzo_assert(b[1] == 0x0102);
27920 +               r &= __lzo_assert(b[2] == 0x0203);
27921 +               r &= __lzo_assert(b[3] == 0x0304);
27922 +#  endif
27923 +       }
27924 +#endif
27925 +
27926 +#if defined(LZO_UNALIGNED_OK_4)
27927 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27928 +       if (r == 1) { /* 32-bit loads at byte offsets 0..3 of u.x */
27929 +               lzo_uint32 a[4];
27930 +
27931 +               for (i = 0; i < 4; i++)
27932 +                       a[i] = *(const lzo_uint32 *)&u.x[i];
27933 +
27934 +#  if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27935 +               r &= __lzo_assert(a[0] == 0x03020100L);
27936 +               r &= __lzo_assert(a[1] == 0x04030201L);
27937 +               r &= __lzo_assert(a[2] == 0x05040302L);
27938 +               r &= __lzo_assert(a[3] == 0x06050403L);
27939 +#  elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27940 +               r &= __lzo_assert(a[0] == 0x00010203L);
27941 +               r &= __lzo_assert(a[1] == 0x01020304L);
27942 +               r &= __lzo_assert(a[2] == 0x02030405L);
27943 +               r &= __lzo_assert(a[3] == 0x03040506L);
27944 +#  endif
27945 +       }
27946 +#endif
27947 +
27948 +#if defined(LZO_ALIGNED_OK_4)
27949 +       COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27950 +#endif
27951 +
27952 +       COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27953 +
27954 +       if (r == 1) {
27955 +               r &= __lzo_assert(!schedule_insns_bug());
27956 +       }
27957 +
27958 +       if (r == 1) { /* fill x[] with -3, -2, -1 in a loop and let strength_reduce_bug() verify the stored values */
27959 +               static int x[3];
27960 +               static unsigned xn = 3;
27961 +               register unsigned j;
27962 +
27963 +               for (j = 0; j < xn; j++)
27964 +                       x[j] = (int)j - 3;
27965 +               r &= __lzo_assert(!strength_reduce_bug(x));
27966 +       }
27967 +
27968 +       if (r == 1) {
27969 +               r &= ptr_check();
27970 +       }
27971 +
27972 +       return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27973 +}
27974 +/* Probe used by _lzo_config_check(): returns 1 if the first element of a freshly initialised local array reads back as 0, otherwise 0. Always 0 under __LZO_CHECKER. */
27975 +static lzo_bool schedule_insns_bug(void)
27976 +{
27977 +#if defined(__LZO_CHECKER)
27978 +       return 0;
27979 +#else
27980 +       const int clone[] = { 1, 2, 0 };
27981 +       const int *q;
27982 +       q = clone; /* read through a pointer rather than by name */
27983 +       return (*q) ? 0 : 1; /* *q is 1 in a correct build, so the result is 0 */
27984 +#endif
27985 +}
27986 +/* Probe used by _lzo_config_check(): returns 0 when x[0..2] hold exactly -3, -2, -1 and non-zero otherwise. */
27987 +static lzo_bool strength_reduce_bug(int *x)
27988 +{
27989 +       return !(x[0] == -3 && x[1] == -2 && x[2] == -1);
27990 +}
27991 +
27992 +#undef COMPILE_TIME_ASSERT
27993 +/* Initialisation entry point: validates the caller-supplied version and type sizes, then runs the configuration self-test. Returns LZO_E_OK or LZO_E_ERROR. */
27994 +LZO_PUBLIC(int)
27995 +    __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27996 +           int s6, int s7, int s8, int s9)
27997 +{
27998 +       int mismatch;
27999 +
28000 +       if (v == 0)
28001 +               return LZO_E_ERROR;
28002 +       /* each sN passes when it is -1 or equals the size this build uses */
28003 +       mismatch = (s1 != -1 && s1 != (int)sizeof(short)) ||
28004 +           (s2 != -1 && s2 != (int)sizeof(int)) ||
28005 +           (s3 != -1 && s3 != (int)sizeof(long)) ||
28006 +           (s4 != -1 && s4 != (int)sizeof(lzo_uint32)) ||
28007 +           (s5 != -1 && s5 != (int)sizeof(lzo_uint)) ||
28008 +           (s6 != -1 && s6 != (int)lzo_sizeof_dict_t) ||
28009 +           (s7 != -1 && s7 != (int)sizeof(char *)) ||
28010 +           (s8 != -1 && s8 != (int)sizeof(lzo_voidp)) ||
28011 +           (s9 != -1 && s9 != (int)sizeof(lzo_compress_t));
28012 +       if (mismatch)
28013 +               return LZO_E_ERROR;
28014 +
28015 +       /*
28016 +        * Sizes agree: the outcome of the configuration self-test is
28017 +        * returned unchanged as the outcome of the initialisation.
28018 +        */
28019 +       return _lzo_config_check();
28020 +}
28021 +
28022 +#if !defined(__LZO_IN_MINILZO)
28023 +/* Seven-size entry point: forwards to __lzo_init2() with s6/s7 in the s8/s9 positions and -1 (no check) for s6 and s7 of __lzo_init2(). */
28024 +LZO_EXTERN(int)
28025 +    __lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7);
28026 +
28027 +LZO_PUBLIC(int)
28028 +__lzo_init(unsigned v, int s1, int s2, int s3, int s4, int s5, int s6, int s7)
28029 +{
28030 +       const int version_ok = (v != 0 && v <= 0x1010);
28031 +       return !version_ok ? LZO_E_ERROR :      /* v must be in 1..0x1010 */
28032 +           __lzo_init2(v, s1, s2, s3, s4, s5, -1, -1, s6, s7);
28033 +}
28034 +
28035 +#endif
28036 +
28037 +#define do_compress         _lzo1x_1_do_compress /* the static do_compress() defined below is emitted under this name */
28038 +/* Dictionary parameters consumed by the __LZO_DICT_H section further down. */
28039 +#define LZO_NEED_DICT_H
28040 +#define D_BITS          14 /* D_SIZE and D_MASK are derived from this bit count */
28041 +#define D_INDEX1(d,p)       d = DM((0x21*DX3(p,5,5,6)) >> 5) /* first index: hash of p[0]..p[3] (via DX3), masked by DM() */
28042 +#define D_INDEX2(d,p)       d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f) /* second index, computed from the previous d only (p is unused) */
28043 +
28044 +#ifndef __LZO_CONFIG1X_H
28045 +#define __LZO_CONFIG1X_H
28046 +
28047 +#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
28048 +#  define LZO1X
28049 +#endif
28050 +
28051 +#if !defined(__LZO_IN_MINILZO)
28052 +#include <lzo1x.h>
28053 +#endif
28054 +
28055 +#define LZO_EOF_CODE
28056 +#undef LZO_DETERMINISTIC /* cancels the earlier #define in this file, so LZO_CHECK_MPOS resolves to the _NON_DET variant */
28057 +/* Largest offset per match class M1..M4 (presumably the four LZO1X match encodings - confirm against the format notes). */
28058 +#define M1_MAX_OFFSET   0x0400
28059 +#ifndef M2_MAX_OFFSET
28060 +#define M2_MAX_OFFSET   0x0800
28061 +#endif
28062 +#define M3_MAX_OFFSET   0x4000
28063 +#define M4_MAX_OFFSET   0xbfff
28064 +
28065 +#define MX_MAX_OFFSET   (M1_MAX_OFFSET + M2_MAX_OFFSET)
28066 +/* Shortest and longest match length per class. */
28067 +#define M1_MIN_LEN      2
28068 +#define M1_MAX_LEN      2
28069 +#define M2_MIN_LEN      3
28070 +#ifndef M2_MAX_LEN
28071 +#define M2_MAX_LEN      8
28072 +#endif
28073 +#define M3_MIN_LEN      3
28074 +#define M3_MAX_LEN      33
28075 +#define M4_MIN_LEN      3
28076 +#define M4_MAX_LEN      9
28077 +/* Marker value per class (presumably the tag bits of the first output byte of a match - confirm). */
28078 +#define M1_MARKER       0
28079 +#define M2_MARKER       64
28080 +#define M3_MARKER       32
28081 +#define M4_MARKER       16
28082 +
28083 +#ifndef MIN_LOOKAHEAD
28084 +#define MIN_LOOKAHEAD       (M2_MAX_LEN + 1)
28085 +#endif
28086 +
28087 +#if defined(LZO_NEED_DICT_H)
28088 +
28089 +#ifndef LZO_HASH
28090 +#define LZO_HASH            LZO_HASH_LZO_INCREMENTAL_B
28091 +#endif
28092 +#define DL_MIN_LEN          M2_MIN_LEN
28093 +
28094 +#ifndef __LZO_DICT_H
28095 +#define __LZO_DICT_H
28096 +
28097 +#ifdef __cplusplus
28098 +extern "C" {
28099 +#endif
28100 +
28101 +#if !defined(D_BITS) && defined(DBITS) /* accept the alternative spelling DBITS */
28102 +#  define D_BITS        DBITS
28103 +#endif
28104 +#if !defined(D_BITS)
28105 +#  error "D_BITS is not defined"
28106 +#endif
28107 +#if (D_BITS < 16) /* LZO_SIZE/LZO_MASK below 16 bits, the LZO_U* variants from 16 bits up (all defined outside this section) */
28108 +#  define D_SIZE        LZO_SIZE(D_BITS)
28109 +#  define D_MASK        LZO_MASK(D_BITS)
28110 +#else
28111 +#  define D_SIZE        LZO_USIZE(D_BITS)
28112 +#  define D_MASK        LZO_UMASK(D_BITS)
28113 +#endif
28114 +#define D_HIGH          ((D_MASK >> 1) + 1) /* top bit of D_MASK, assuming LZO_MASK(n) is an n-bit all-ones mask - confirm */
28115 +/* D_BITS is split into DL_BITS and DD_BITS; the #error checks below enforce D_BITS == DL_BITS + DD_BITS. */
28116 +#if !defined(DD_BITS)
28117 +#  define DD_BITS       0
28118 +#endif
28119 +#define DD_SIZE         LZO_SIZE(DD_BITS)
28120 +#define DD_MASK         LZO_MASK(DD_BITS)
28121 +
28122 +#if !defined(DL_BITS)
28123 +#  define DL_BITS       (D_BITS - DD_BITS)
28124 +#endif
28125 +#if (DL_BITS < 16)
28126 +#  define DL_SIZE       LZO_SIZE(DL_BITS)
28127 +#  define DL_MASK       LZO_MASK(DL_BITS)
28128 +#else
28129 +#  define DL_SIZE       LZO_USIZE(DL_BITS)
28130 +#  define DL_MASK       LZO_UMASK(DL_BITS)
28131 +#endif
28132 +/* Range checks on the configured bit counts. */
28133 +#if (D_BITS != DL_BITS + DD_BITS)
28134 +#  error "D_BITS does not match"
28135 +#endif
28136 +#if (D_BITS < 8 || D_BITS > 18)
28137 +#  error "invalid D_BITS"
28138 +#endif
28139 +#if (DL_BITS < 8 || DL_BITS > 20)
28140 +#  error "invalid DL_BITS"
28141 +#endif
28142 +#if (DD_BITS < 0 || DD_BITS > 6)
28143 +#  error "invalid DD_BITS"
28144 +#endif
28145 +
28146 +#if !defined(DL_MIN_LEN)
28147 +#  define DL_MIN_LEN    3
28148 +#endif
28149 +#if !defined(DL_SHIFT) /* DL_BITS divided by DL_MIN_LEN, rounded up */
28150 +#  define DL_SHIFT      ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
28151 +#endif
28152 +
28153 +#define LZO_HASH_GZIP                   1 /* identifiers for the hashing strategies selectable through LZO_HASH */
28154 +#define LZO_HASH_GZIP_INCREMENTAL       2
28155 +#define LZO_HASH_LZO_INCREMENTAL_A      3
28156 +#define LZO_HASH_LZO_INCREMENTAL_B      4
28157 +
28158 +#if !defined(LZO_HASH)
28159 +#  error "choose a hashing strategy"
28160 +#endif
28161 +/* _DV2_A / _DV2_B mix the leading bytes of p with shifts and XOR; _A starts from p[0], _B from the last byte. */
28162 +#if (DL_MIN_LEN == 3)
28163 +#  define _DV2_A(p,shift1,shift2) \
28164 +       (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
28165 +#  define _DV2_B(p,shift1,shift2) \
28166 +       (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
28167 +#  define _DV3_B(p,shift1,shift2,shift3) \
28168 +       ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
28169 +#elif (DL_MIN_LEN == 2) /* NOTE(review): p is not parenthesised in this branch, and _DV2_B reads p[1] and p[2] - confirm intended */
28170 +#  define _DV2_A(p,shift1,shift2) \
28171 +       (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
28172 +#  define _DV2_B(p,shift1,shift2) \
28173 +       (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
28174 +#else
28175 +#  error "invalid DL_MIN_LEN"
28176 +#endif
28177 +#define _DV_A(p,shift)      _DV2_A(p,shift,shift)
28178 +#define _DV_B(p,shift)      _DV2_B(p,shift,shift)
28179 +#define DA2(p,s1,s2) \
28180 +       (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
28181 +#define DS2(p,s1,s2) \
28182 +       (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
28183 +#define DX2(p,s1,s2) \
28184 +       (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
28185 +#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0]) /* DA/DS/DX combine bytes with +, -, ^; the *3 forms read p[0]..p[3] */
28186 +#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
28187 +#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
28188 +#define DMS(v,s)        ((lzo_uint) (((v) & (D_MASK >> (s))) << (s))) /* mask v with D_MASK >> s, then shift the result left by s */
28189 +#define DM(v)           DMS(v,0) /* v masked with D_MASK */
28190 +
28191 +#if (LZO_HASH == LZO_HASH_GZIP) /* one branch per hashing strategy; each defines _DINDEX and, for incremental hashes, DVAL_FIRST/DVAL_NEXT */
28192 +#  define _DINDEX(dv,p)     (_DV_A((p),DL_SHIFT))
28193 +
28194 +#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
28195 +#  define __LZO_HASH_INCREMENTAL
28196 +#  define DVAL_FIRST(dv,p)  dv = _DV_A((p),DL_SHIFT)
28197 +#  define DVAL_NEXT(dv,p)   dv = (((dv) << DL_SHIFT) ^ p[2])
28198 +#  define _DINDEX(dv,p)     (dv)
28199 +#  define DVAL_LOOKAHEAD    DL_MIN_LEN
28200 +
28201 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
28202 +#  define __LZO_HASH_INCREMENTAL
28203 +#  define DVAL_FIRST(dv,p)  dv = _DV_A((p),5)
28204 +#  define DVAL_NEXT(dv,p) \
28205 +               dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
28206 +#  define _DINDEX(dv,p)     ((0x9f5f * (dv)) >> 5)
28207 +#  define DVAL_LOOKAHEAD    DL_MIN_LEN
28208 +/* NOTE(review): DVAL_NEXT in the INCREMENTAL_A/_B branches expands to two statements; do not use it as the body of an unbraced if. */
28209 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B) /* the value LZO_HASH defaults to earlier in this file */
28210 +#  define __LZO_HASH_INCREMENTAL
28211 +#  define DVAL_FIRST(dv,p)  dv = _DV_B((p),5)
28212 +#  define DVAL_NEXT(dv,p) \
28213 +               dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
28214 +#  define _DINDEX(dv,p)     ((0x9f5f * (dv)) >> 5)
28215 +#  define DVAL_LOOKAHEAD    DL_MIN_LEN
28216 +
28217 +#else
28218 +#  error "choose a hashing strategy"
28219 +#endif
28220 +/* DINDEX: _DINDEX masked with DL_MASK and shifted left by DD_BITS. DINDEX1/DINDEX2 default to the D_INDEX1/D_INDEX2 macros when those exist. */
28221 +#ifndef DINDEX
28222 +#define DINDEX(dv,p)        ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
28223 +#endif
28224 +#if !defined(DINDEX1) && defined(D_INDEX1)
28225 +#define DINDEX1             D_INDEX1
28226 +#endif
28227 +#if !defined(DINDEX2) && defined(D_INDEX2)
28228 +#define DINDEX2             D_INDEX2
28229 +#endif
28230 +
28231 +#if !defined(__LZO_HASH_INCREMENTAL) /* non-incremental hash: the DVAL_* hooks become no-ops */
28232 +#  define DVAL_FIRST(dv,p)  ((void) 0)
28233 +#  define DVAL_NEXT(dv,p)   ((void) 0)
28234 +#  define DVAL_LOOKAHEAD    0
28235 +#endif
28236 +
28237 +#if !defined(DVAL_ASSERT)
28238 +#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG) /* debug builds: check that the incrementally updated dv hashes to the same index as a value recomputed from p */
28239 +       static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p) {
28240 +               lzo_uint32 df;
28241 +                DVAL_FIRST(df, (p));
28242 +                assert(DINDEX(dv, p) == DINDEX(df, p)); /* NOTE(review): one-argument assert(); __lzo_align_gap() above uses assert(label, cond) - confirm this compiles when NDEBUG is not defined */
28243 +       }
28244 +#else
28245 +#  define DVAL_ASSERT(dv,p) ((void) 0)
28246 +#endif
28247 +#endif
28248 +
28249 +#if defined(LZO_DICT_USE_PTR) /* dictionary slots hold the position p itself; GINDEX loads it into m_pos */
28250 +#  define DENTRY(p,in)                          (p)
28251 +#  define GINDEX(m_pos,m_off,dict,dindex,in)    m_pos = dict[dindex]
28252 +#else /* dictionary slots hold the offset of p from in; GINDEX loads it into m_off */
28253 +#  define DENTRY(p,in)                          ((lzo_uint) ((p)-(in)))
28254 +#  define GINDEX(m_pos,m_off,dict,dindex,in)    m_off = dict[dindex]
28255 +#endif
28256 +/* UPDATE_D / UPDATE_I / UPDATE_P store DENTRY(p,in) by hash value, by ready-made index, or through a slot pointer. */
28257 +#if (DD_BITS == 0)
28258 +/* DD_BITS == 0: the drun argument is ignored and the store goes straight to the computed slot. */
28259 +#  define UPDATE_D(dict,drun,dv,p,in)       dict[ DINDEX(dv,p) ] = DENTRY(p,in)
28260 +#  define UPDATE_I(dict,drun,index,p,in)    dict[index] = DENTRY(p,in)
28261 +#  define UPDATE_P(ptr,drun,p,in)           (ptr)[0] = DENTRY(p,in)
28262 +
28263 +#else
28264 +/* DD_BITS > 0: drun is added to the slot index, incremented, then masked with DD_MASK. Each macro is two statements. */
28265 +#  define UPDATE_D(dict,drun,dv,p,in)   \
28266 +       dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
28267 +#  define UPDATE_I(dict,drun,index,p,in)    \
28268 +       dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
28269 +#  define UPDATE_P(ptr,drun,p,in)   \
28270 +       (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
28271 +
28272 +#endif
28273 +
28274 +#if defined(LZO_DICT_USE_PTR) /* both macros evaluate to true when the dictionary candidate must be rejected; as a side effect they assign m_off */
28275 +/* _DET: reject a NULL slot or a distance ip - m_pos greater than max_offset. */
28276 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
28277 +       (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
28278 +/* _NON_DET: reject m_pos below in, a zero distance, or a distance above max_offset (m_off is unsigned by default, so "<= 0" means "== 0"). */
28279 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
28280 +    (BOUNDS_CHECKING_OFF_IN_EXPR( \
28281 +       (PTR_LT(m_pos,in) || \
28282 +        (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
28283 +         m_off > max_offset) ))
28284 +
28285 +#else
28286 +/* Offset dictionary: m_off arrives as a position relative to in and is turned into a distance from ip; m_pos is then set to ip - m_off. */
28287 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
28288 +       (m_off == 0 || \
28289 +        ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
28290 +        (m_pos = (ip) - (m_off), 0) )
28291 +
28292 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
28293 +       ((lzo_moff_t) ((ip)-(in)) <= m_off || \
28294 +        ((m_off = (lzo_moff_t) ((ip)-(in)) - m_off) > max_offset) || \
28295 +        (m_pos = (ip) - (m_off), 0) )
28296 +
28297 +#endif
28298 +
28299 +#if defined(LZO_DETERMINISTIC) /* LZO_DETERMINISTIC is #undef'd earlier in this file, so the #else branch is taken unless it is defined again */
28300 +#  define LZO_CHECK_MPOS    LZO_CHECK_MPOS_DET
28301 +#else
28302 +#  define LZO_CHECK_MPOS    LZO_CHECK_MPOS_NON_DET
28303 +#endif
28304 +
28305 +#ifdef __cplusplus
28306 +}
28307 +#endif
28308 +#endif
28309 +#endif
28310 +#endif
28311 +#define DO_COMPRESS     lzo1x_1_compress
28312 +static /* Core pass of the compressor; called from DO_COMPRESS below. */
28313 +lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
28314 +                    lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28315 +{ /* Writes encoded data to out, stores the byte count in *out_len, and returns pd(in_end, ii): the trailing input bytes not yet emitted. */
28316 +       register const lzo_byte *ip; /* current scan position in the input */
28317 +       lzo_byte *op; /* next write position in out */
28318 +       const lzo_byte *const in_end = in + in_len;
28319 +       const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5; /* scanning stops once ip reaches this point; DO_COMPRESS only calls here when in_len > M2_MAX_LEN + 5 */
28320 +       const lzo_byte *ii; /* start of the literal bytes not yet copied to out */
28321 +       lzo_dict_p const dict = (lzo_dict_p) wrkmem; /* wrkmem serves as the dictionary table */
28322 +
28323 +       op = out;
28324 +       ip = in;
28325 +       ii = ip;
28326 +
28327 +       ip += 4; /* the first position probed is in + 4 */
28328 +       for (;;) {
28329 +               register const lzo_byte *m_pos; /* candidate match position */
28330 +
28331 +               lzo_moff_t m_off; /* distance from ip back to m_pos once a candidate is accepted */
28332 +               lzo_uint m_len;
28333 +               lzo_uint dindex;
28334 +
28335 +               DINDEX1(dindex, ip); /* first dictionary slot for ip (DINDEX1 is defined outside this view) */
28336 +               GINDEX(m_pos, m_off, dict, dindex, in);
28337 +               if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
28338 +                       goto literal; /* no usable candidate in this slot */
28339 +#if 1
28340 +               if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) /* far candidates must also agree on the 4th byte */
28341 +                       goto try_match;
28342 +               DINDEX2(dindex, ip); /* second probe through the slot given by DINDEX2 */
28343 +#endif
28344 +               GINDEX(m_pos, m_off, dict, dindex, in);
28345 +               if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
28346 +                       goto literal;
28347 +               if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
28348 +                       goto try_match;
28349 +               goto literal;
28350 +
28351 +             try_match: /* a match requires the first three bytes to be equal */
28352 +#if 1 && defined(LZO_UNALIGNED_OK_2)
28353 +               if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
28354 +#else
28355 +               if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
28356 +#endif
28357 +                       ; /* first two bytes differ: fall through to literal */
28358 +               } else {
28359 +                       if (m_pos[2] == ip[2]) {
28360 +                               goto match;
28361 +                       } else {
28362 +                               ; /* third byte differs: fall through to literal */
28363 +                       }
28364 +               }
28365 +
28366 +             literal: /* record ip in the dictionary slot and advance by one byte */
28367 +               UPDATE_I(dict, 0, dindex, ip, in);
28368 +               ++ip;
28369 +               if (ip >= ip_end)
28370 +                       break;
28371 +               continue;
28372 +
28373 +             match:
28374 +               UPDATE_I(dict, 0, dindex, ip, in);
28375 +               if (pd(ip, ii) > 0) { /* emit the pending literals ii..ip before the match */
28376 +                       register lzo_uint t = pd(ip, ii);
28377 +
28378 +                       if (t <= 3) { /* 1..3 literals: the count is OR-ed into op[-2], a byte written earlier */
28379 +                               assert("lzo-04", op - 2 > out);
28380 +                               op[-2] |= LZO_BYTE(t);
28381 +                       } else if (t <= 18) /* 4..18 literals: a single byte t - 3 */
28382 +                               *op++ = LZO_BYTE(t - 3);
28383 +                       else { /* longer run: a 0 byte, one more 0 byte per 255 of t - 18, then the remainder */
28384 +                               register lzo_uint tt = t - 18;
28385 +
28386 +                               *op++ = 0;
28387 +                               while (tt > 255) {
28388 +                                       tt -= 255;
28389 +                                       *op++ = 0;
28390 +                               }
28391 +                               assert("lzo-05", tt > 0);
28392 +                               *op++ = LZO_BYTE(tt);
28393 +                       }
28394 +                       do /* copy the literal bytes themselves */
28395 +                               *op++ = *ii++;
28396 +                       while (--t > 0);
28397 +               }
28398 +
28399 +               assert("lzo-06", ii == ip);
28400 +               ip += 3; /* three bytes were already compared at try_match */
28401 +               if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++ /* compare further bytes one at a time; stops at the first mismatch */
28402 +                   || m_pos[6] != *ip++ || m_pos[7] != *ip++
28403 +                   || m_pos[8] != *ip++
28404 +#ifdef LZO1Y
28405 +                   || m_pos[9] != *ip++ || m_pos[10] != *ip++
28406 +                   || m_pos[11] != *ip++ || m_pos[12] != *ip++
28407 +                   || m_pos[13] != *ip++ || m_pos[14] != *ip++
28408 +#endif
28409 +                   ) {
28410 +                       --ip; /* step back over the byte that failed to match */
28411 +                       m_len = ip - ii;
28412 +                       assert("lzo-07", m_len >= 3);
28413 +                       assert("lzo-08", m_len <= M2_MAX_LEN);
28414 +
28415 +                       if (m_off <= M2_MAX_OFFSET) { /* near match: two-byte encoding, length and offset share the first byte */
28416 +                               m_off -= 1;
28417 +#if defined(LZO1X)
28418 +                               *op++ =
28419 +                                   LZO_BYTE(((m_len -
28420 +                                              1) << 5) | ((m_off & 7) << 2));
28421 +                               *op++ = LZO_BYTE(m_off >> 3);
28422 +#elif defined(LZO1Y)
28423 +                               *op++ =
28424 +                                   LZO_BYTE(((m_len +
28425 +                                              1) << 4) | ((m_off & 3) << 2));
28426 +                               *op++ = LZO_BYTE(m_off >> 2);
28427 +#endif
28428 +                       } else if (m_off <= M3_MAX_OFFSET) { /* medium distance: M3_MARKER byte, offset bytes written at m3_m4_offset */
28429 +                               m_off -= 1;
28430 +                               *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28431 +                               goto m3_m4_offset;
28432 +                       } else
28433 +#if defined(LZO1X)
28434 +                       { /* far match: M4_MARKER byte carries bit 0x4000 of (m_off - 0x4000) shifted down by 11 */
28435 +                               m_off -= 0x4000;
28436 +                               assert("lzo-09", m_off > 0);
28437 +                               assert("lzo-10", m_off <= 0x7fff);
28438 +                               *op++ = LZO_BYTE(M4_MARKER |
28439 +                                                ((m_off & 0x4000) >> 11) |
28440 +                                                (m_len - 2));
28441 +                               goto m3_m4_offset;
28442 +                       }
28443 +#elif defined(LZO1Y)
28444 +                               goto m4_match;
28445 +#endif
28446 +               } else { /* every byte compared above matched: keep extending until in_end or a mismatch */
28447 +                       {
28448 +                               const lzo_byte *end = in_end;
28449 +                               const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
28450 +                               while (ip < end && *m == *ip)
28451 +                                       m++, ip++;
28452 +                               m_len = (ip - ii);
28453 +                       }
28454 +                       assert("lzo-11", m_len > M2_MAX_LEN);
28455 +
28456 +                       if (m_off <= M3_MAX_OFFSET) {
28457 +                               m_off -= 1;
28458 +                               if (m_len <= 33) /* length fits in the marker byte */
28459 +                                       *op++ =
28460 +                                           LZO_BYTE(M3_MARKER | (m_len - 2));
28461 +                               else { /* length field 0 in the marker; m_len - 33 is written at m3_m4_len */
28462 +                                       m_len -= 33;
28463 +                                       *op++ = M3_MARKER | 0;
28464 +                                       goto m3_m4_len;
28465 +                               }
28466 +                       } else {
28467 +#if defined(LZO1Y)
28468 +                             m4_match:
28469 +#endif
28470 +                               m_off -= 0x4000;
28471 +                               assert("lzo-12", m_off > 0);
28472 +                               assert("lzo-13", m_off <= 0x7fff);
28473 +                               if (m_len <= M4_MAX_LEN)
28474 +                                       *op++ = LZO_BYTE(M4_MARKER |
28475 +                                                        ((m_off & 0x4000) >>
28476 +                                                         11) | (m_len - 2));
28477 +                               else {
28478 +                                       m_len -= M4_MAX_LEN;
28479 +                                       *op++ =
28480 +                                           LZO_BYTE(M4_MARKER |
28481 +                                                    ((m_off & 0x4000) >> 11));
28482 +                                     m3_m4_len: /* extended length: one 0 byte per 255, then the remainder (never 0) */
28483 +                                       while (m_len > 255) {
28484 +                                               m_len -= 255;
28485 +                                               *op++ = 0;
28486 +                                       }
28487 +                                       assert("lzo-14", m_len > 0);
28488 +                                       *op++ = LZO_BYTE(m_len);
28489 +                               }
28490 +                       }
28491 +
28492 +                     m3_m4_offset: /* two offset bytes; the low two bits of the first stay 0 so a later literal count can be OR-ed in via op[-2] */
28493 +                       *op++ = LZO_BYTE((m_off & 63) << 2);
28494 +                       *op++ = LZO_BYTE(m_off >> 6);
28495 +               }
28496 +
28497 +               ii = ip; /* match consumed: the next literal run starts here */
28498 +               if (ip >= ip_end)
28499 +                       break;
28500 +       }
28501 +
28502 +       *out_len = op - out;
28503 +       return pd(in_end, ii); /* bytes from ii to in_end are left for the caller to emit */
28504 +}
28505 +
28506 +LZO_PUBLIC(int) /* Compression entry point; DO_COMPRESS is defined above as lzo1x_1_compress. */
28507 +    DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
28508 +           lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28509 +{ /* Compresses in[0..in_len) into out, stores the output size in *out_len and always returns LZO_E_OK. The capacity of out is not checked here. */
28510 +       lzo_byte *op = out;
28511 +       lzo_uint t; /* number of trailing input bytes still to be emitted as literals */
28512 +
28513 +#if defined(__LZO_QUERY_COMPRESS)
28514 +       if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28515 +               return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
28516 +                                           D_SIZE, lzo_sizeof(lzo_dict_t));
28517 +#endif
28518 +
28519 +       if (in_len <= M2_MAX_LEN + 5) /* too short for do_compress: the whole input becomes one literal run */
28520 +               t = in_len;
28521 +       else {
28522 +               t = do_compress(in, in_len, op, out_len, wrkmem);
28523 +               op += *out_len; /* skip over what do_compress already wrote */
28524 +       }
28525 +
28526 +       if (t > 0) {
28527 +               const lzo_byte *ii = in + in_len - t; /* first of the remaining literal bytes */
28528 +
28529 +               if (op == out && t <= 238) /* nothing written yet: the opening byte is 17 + t (the decompressor treats a first byte above 17 this way) */
28530 +                       *op++ = LZO_BYTE(17 + t);
28531 +               else if (t <= 3) /* only reached with op != out in practice: op == out implies t = in_len > 3 here */
28532 +                       op[-2] |= LZO_BYTE(t);
28533 +               else if (t <= 18)
28534 +                       *op++ = LZO_BYTE(t - 3);
28535 +               else { /* long run: a 0 byte, one more 0 byte per 255 of t - 18, then the remainder */
28536 +                       lzo_uint tt = t - 18;
28537 +
28538 +                       *op++ = 0;
28539 +                       while (tt > 255) {
28540 +                               tt -= 255;
28541 +                               *op++ = 0;
28542 +                       }
28543 +                       assert("lzo-15", tt > 0);
28544 +                       *op++ = LZO_BYTE(tt);
28545 +               }
28546 +               do /* copy the literal bytes */
28547 +                       *op++ = *ii++;
28548 +               while (--t > 0);
28549 +       }
28550 +
28551 +       *op++ = M4_MARKER | 1; /* end-of-stream marker: this byte followed by two zero offset bytes */
28552 +       *op++ = 0;
28553 +       *op++ = 0;
28554 +
28555 +       *out_len = op - out;
28556 +       return LZO_E_OK;
28557 +}
28558 +
28559 +#undef do_compress
28560 +#undef DO_COMPRESS
28561 +#undef LZO_HASH
28562 + /* Preprocessor setup for the decompressor body that follows. The overrun-test switches are cleared first. */
28563 +#undef LZO_TEST_DECOMPRESS_OVERRUN
28564 +#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
28565 +#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
28566 +#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28567 +#undef DO_DECOMPRESS
28568 +#define DO_DECOMPRESS       lzo1x_decompress
28569 +
28570 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN) /* false at this point: the macro was #undef'ed just above */
28571 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28572 +#    define LZO_TEST_DECOMPRESS_OVERRUN_INPUT       2
28573 +#  endif
28574 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28575 +#    define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT      2
28576 +#  endif
28577 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28578 +#    define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28579 +#  endif
28580 +#endif
28581 + /* Start from a clean slate so the check macros can be rebuilt for this instantiation. */
28582 +#undef TEST_IP
28583 +#undef TEST_OP
28584 +#undef TEST_LOOKBEHIND
28585 +#undef NEED_IP
28586 +#undef NEED_OP
28587 +#undef HAVE_TEST_IP
28588 +#undef HAVE_TEST_OP
28589 +#undef HAVE_NEED_IP
28590 +#undef HAVE_NEED_OP
28591 +#undef HAVE_ANY_IP
28592 +#undef HAVE_ANY_OP
28593 + /* Input checks: level >= 1 defines the loop condition TEST_IP, level >= 2 adds NEED_IP(x). */
28594 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28595 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28596 +#    define TEST_IP             (ip < ip_end)
28597 +#  endif
28598 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2) /* NEED_IP expands to a bare `if (...) goto input_overrun`: use it only as a full statement */
28599 +#    define NEED_IP(x) \
28600 +           if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x))  goto input_overrun
28601 +#  endif
28602 +#endif
28603 + /* Output checks: level 1 defines TEST_OP; level >= 2 removes it again and uses NEED_OP(x) instead. */
28604 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28605 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28606 +#    define TEST_OP             (op <= op_end)
28607 +#  endif
28608 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28609 +#    undef TEST_OP
28610 +#    define NEED_OP(x) \
28611 +           if ((lzo_uint)(op_end - op) < (lzo_uint)(x))  goto output_overrun
28612 +#  endif
28613 +#endif
28614 + /* Lookbehind check: jump to lookbehind_overrun when a match would start before the output buffer; otherwise a no-op. */
28615 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28616 +#  define TEST_LOOKBEHIND(m_pos,out)    if (m_pos < out) goto lookbehind_overrun
28617 +#else
28618 +#  define TEST_LOOKBEHIND(m_pos,op)     ((void) 0)
28619 +#endif
28620 + /* Without LZO_EOF_CODE the loops still need an input bound, so TEST_IP is forced on (LZO_EOF_CODE is set or not outside this view). */
28621 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28622 +#  define TEST_IP               (ip < ip_end)
28623 +#endif
28624 + /* Record which checks are real (HAVE_xx) and give the missing ones trivial definitions. */
28625 +#if defined(TEST_IP)
28626 +#  define HAVE_TEST_IP
28627 +#else
28628 +#  define TEST_IP               1
28629 +#endif
28630 +#if defined(TEST_OP)
28631 +#  define HAVE_TEST_OP
28632 +#else
28633 +#  define TEST_OP               1
28634 +#endif
28635 +
28636 +#if defined(NEED_IP)
28637 +#  define HAVE_NEED_IP
28638 +#else
28639 +#  define NEED_IP(x)            ((void) 0)
28640 +#endif
28641 +#if defined(NEED_OP)
28642 +#  define HAVE_NEED_OP
28643 +#else
28644 +#  define NEED_OP(x)            ((void) 0)
28645 +#endif
28646 + /* HAVE_ANY_OP decides whether the decompressor declares op_end at all. */
28647 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28648 +#  define HAVE_ANY_IP
28649 +#endif
28650 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28651 +#  define HAVE_ANY_OP
28652 +#endif
28653 + /* __COPY4 assigns through lzo_uint32p casts of dst and src; presumably one 4-byte load and store -- confirm against the lzo_uint32p typedef. */
28654 +#undef __COPY4
28655 +#define __COPY4(dst,src)    * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28656 + /* COPY4 exists only when one of the two alignment switches is set; the decompressor guards its uses with the same conditions. */
28657 +#undef COPY4
28658 +#if defined(LZO_UNALIGNED_OK_4)
28659 +#  define COPY4(dst,src)    __COPY4(dst,src)
28660 +#elif defined(LZO_ALIGNED_OK_4)
28661 +#  define COPY4(dst,src)    __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28662 +#endif
28663 +
28664 +#if defined(DO_DECOMPRESS)
28665 +LZO_PUBLIC(int) /* Decompressor body; for this instantiation DO_DECOMPRESS is lzo1x_decompress. */
28666 +    DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
28667 +             lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28668 +#endif
28669 +{ /* Decodes in[0..in_len) into out. *out_len is set to the bytes written. Returns LZO_E_OK when the end marker is hit with ip == ip_end, another LZO_E_xx code otherwise. */
28670 +       register lzo_byte *op; /* next write position in out */
28671 +       register const lzo_byte *ip; /* next read position in in */
28672 +       register lzo_uint t; /* current token byte, then a literal or match length */
28673 +#if defined(COPY_DICT)
28674 +       lzo_uint m_off;
28675 +       const lzo_byte *dict_end;
28676 +#else
28677 +       register const lzo_byte *m_pos; /* source of the match copy, an earlier position in out */
28678 +#endif
28679 +
28680 +       const lzo_byte *const ip_end = in + in_len;
28681 +#if defined(HAVE_ANY_OP)
28682 +       lzo_byte *const op_end = out + *out_len; /* with output checks compiled in, the incoming *out_len is the capacity of out */
28683 +#endif
28684 +#if defined(LZO1Z)
28685 +       lzo_uint last_m_off = 0;
28686 +#endif
28687 + /* NOTE(review): with the macro setup preceding this function, NEED_IP, NEED_OP and TEST_LOOKBEHIND expand to ((void) 0) for lzo1x_decompress, so no buffer bounds are enforced here; presumably only for trusted input -- confirm callers. */
28688 +       LZO_UNUSED(wrkmem);
28689 +
28690 +#if defined(__LZO_QUERY_DECOMPRESS)
28691 +       if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28692 +               return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
28693 +                                             0, 0);
28694 +#endif
28695 +
28696 +#if defined(COPY_DICT)
28697 +       if (dict) {
28698 +               if (dict_len > M4_MAX_OFFSET) { /* keep only the last M4_MAX_OFFSET bytes of the dictionary */
28699 +                       dict += dict_len - M4_MAX_OFFSET;
28700 +                       dict_len = M4_MAX_OFFSET;
28701 +               }
28702 +               dict_end = dict + dict_len;
28703 +       } else {
28704 +               dict_len = 0;
28705 +               dict_end = NULL;
28706 +       }
28707 +#endif
28708 +
28709 +       *out_len = 0;
28710 +
28711 +       op = out;
28712 +       ip = in;
28713 +
28714 +       if (*ip > 17) { /* a first byte above 17 encodes an opening literal run of (byte - 17) bytes */
28715 +               t = *ip++ - 17;
28716 +               if (t < 4)
28717 +                       goto match_next; /* short run: handled by the literal copy after a match */
28718 +               assert("lzo-16", t > 0);
28719 +               NEED_OP(t);
28720 +               NEED_IP(t + 1);
28721 +               do
28722 +                       *op++ = *ip++;
28723 +               while (--t > 0);
28724 +               goto first_literal_run;
28725 +       }
28726 +
28727 +       while (TEST_IP && TEST_OP) {
28728 +               t = *ip++;
28729 +               if (t >= 16)
28730 +                       goto match; /* token values of 16 and above are matches */
28731 +               if (t == 0) { /* extended literal length: each 0 byte adds 255, the final byte adds 15 + its value */
28732 +                       NEED_IP(1);
28733 +                       while (*ip == 0) {
28734 +                               t += 255;
28735 +                               ip++;
28736 +                               NEED_IP(1);
28737 +                       }
28738 +                       t += 15 + *ip++;
28739 +               }
28740 +               assert("lzo-17", t > 0);
28741 +               NEED_OP(t + 3); /* the literal run is t + 3 bytes long */
28742 +               NEED_IP(t + 4);
28743 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28744 +#if !defined(LZO_UNALIGNED_OK_4)
28745 +               if (PTR_ALIGNED2_4(op, ip)) {
28746 +#endif
28747 +                       COPY4(op, ip); /* first 4 bytes at once; t - 1 bytes then remain */
28748 +                       op += 4;
28749 +                       ip += 4;
28750 +                       if (--t > 0) {
28751 +                               if (t >= 4) {
28752 +                                       do {
28753 +                                               COPY4(op, ip);
28754 +                                               op += 4;
28755 +                                               ip += 4;
28756 +                                               t -= 4;
28757 +                                       } while (t >= 4);
28758 +                                       if (t > 0)
28759 +                                               do
28760 +                                                       *op++ = *ip++;
28761 +                                               while (--t > 0);
28762 +                               } else
28763 +                                       do
28764 +                                               *op++ = *ip++;
28765 +                                       while (--t > 0);
28766 +                       }
28767 +#if !defined(LZO_UNALIGNED_OK_4)
28768 +               } else
28769 +#endif
28770 +#endif
28771 +#if !defined(LZO_UNALIGNED_OK_4)
28772 +               { /* byte-wise copy: 3 bytes, then t more */
28773 +                       *op++ = *ip++;
28774 +                       *op++ = *ip++;
28775 +                       *op++ = *ip++;
28776 +                       do
28777 +                               *op++ = *ip++;
28778 +                       while (--t > 0);
28779 +               }
28780 +#endif
28781 +
28782 +             first_literal_run: /* token that directly follows a literal run */
28783 +
28784 +               t = *ip++;
28785 +               if (t >= 16)
28786 +                       goto match;
28787 +#if defined(COPY_DICT)
28788 +#if defined(LZO1Z)
28789 +               m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28790 +               last_m_off = m_off;
28791 +#else
28792 +               m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28793 +#endif
28794 +               NEED_OP(3);
28795 +               t = 3;
28796 +               COPY_DICT(t, m_off)
28797 +#else
28798 +#if defined(LZO1Z)
28799 +               t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28800 +               m_pos = op - t;
28801 +               last_m_off = t;
28802 +#else
28803 +               m_pos = op - (1 + M2_MAX_OFFSET); /* t < 16 after literals: 3-byte match at distance (1 + M2_MAX_OFFSET) + (t >> 2) + (next byte << 2) */
28804 +               m_pos -= t >> 2;
28805 +               m_pos -= *ip++ << 2;
28806 +#endif
28807 +               TEST_LOOKBEHIND(m_pos, out);
28808 +               NEED_OP(3);
28809 +               *op++ = *m_pos++;
28810 +               *op++ = *m_pos++;
28811 +               *op++ = *m_pos;
28812 +#endif
28813 +               goto match_done;
28814 +
28815 +               while (TEST_IP && TEST_OP) { /* first entered by jumping to the match label inside, not through this condition */
28816 +                     match:
28817 +                       if (t >= 64) { /* short-distance match: length and low offset bits in t, remaining offset bits in the next byte */
28818 +#if defined(COPY_DICT)
28819 +#if defined(LZO1X)
28820 +                               m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28821 +                               t = (t >> 5) - 1;
28822 +#elif defined(LZO1Y)
28823 +                               m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28824 +                               t = (t >> 4) - 3;
28825 +#elif defined(LZO1Z)
28826 +                               m_off = t & 0x1f;
28827 +                               if (m_off >= 0x1c)
28828 +                                       m_off = last_m_off;
28829 +                               else {
28830 +                                       m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28831 +                                       last_m_off = m_off;
28832 +                               }
28833 +                               t = (t >> 5) - 1;
28834 +#endif
28835 +#else
28836 +#if defined(LZO1X)
28837 +                               m_pos = op - 1; /* mirrors the two-byte near-match encoding written by do_compress for LZO1X */
28838 +                               m_pos -= (t >> 2) & 7;
28839 +                               m_pos -= *ip++ << 3;
28840 +                               t = (t >> 5) - 1;
28841 +#elif defined(LZO1Y)
28842 +                               m_pos = op - 1;
28843 +                               m_pos -= (t >> 2) & 3;
28844 +                               m_pos -= *ip++ << 2;
28845 +                               t = (t >> 4) - 3;
28846 +#elif defined(LZO1Z)
28847 +                               {
28848 +                                       lzo_uint off = t & 0x1f;
28849 +                                       m_pos = op;
28850 +                                       if (off >= 0x1c) {
28851 +                                               assert(last_m_off > 0); /* NOTE(review): one-argument assert, unlike the labelled asserts elsewhere in this function */
28852 +                                               m_pos -= last_m_off;
28853 +                                       } else {
28854 +                                               off =
28855 +                                                   1 + (off << 6) +
28856 +                                                   (*ip++ >> 2);
28857 +                                               m_pos -= off;
28858 +                                               last_m_off = off;
28859 +                                       }
28860 +                               }
28861 +                               t = (t >> 5) - 1;
28862 +#endif
28863 +                               TEST_LOOKBEHIND(m_pos, out);
28864 +                               assert("lzo-18", t > 0);
28865 +                               NEED_OP(t + 3 - 1);
28866 +                               goto copy_match; /* copies t + 2 bytes byte by byte */
28867 +#endif
28868 +                       } else if (t >= 32) { /* medium-distance match: length in the low 5 bits, extended when they are 0 */
28869 +                               t &= 31;
28870 +                               if (t == 0) {
28871 +                                       NEED_IP(1);
28872 +                                       while (*ip == 0) {
28873 +                                               t += 255;
28874 +                                               ip++;
28875 +                                               NEED_IP(1);
28876 +                                       }
28877 +                                       t += 31 + *ip++;
28878 +                               }
28879 +#if defined(COPY_DICT)
28880 +#if defined(LZO1Z)
28881 +                               m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28882 +                               last_m_off = m_off;
28883 +#else
28884 +                               m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28885 +#endif
28886 +#else
28887 +#if defined(LZO1Z)
28888 +                               {
28889 +                                       lzo_uint off =
28890 +                                           1 + (ip[0] << 6) + (ip[1] >> 2);
28891 +                                       m_pos = op - off;
28892 +                                       last_m_off = off;
28893 +                               }
28894 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28895 +                               m_pos = op - 1;
28896 +                               m_pos -= (*(const lzo_ushortp)ip) >> 2;
28897 +#else
28898 +                               m_pos = op - 1; /* distance is 1 + (ip[0] >> 2) + (ip[1] << 6) */
28899 +                               m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28900 +#endif
28901 +#endif
28902 +                               ip += 2;
28903 +                       } else if (t >= 16) { /* far match or end marker: bit 8 of t contributes 0x4000 to the distance */
28904 +#if defined(COPY_DICT)
28905 +                               m_off = (t & 8) << 11;
28906 +#else
28907 +                               m_pos = op;
28908 +                               m_pos -= (t & 8) << 11;
28909 +#endif
28910 +                               t &= 7;
28911 +                               if (t == 0) { /* extended length: each 0 byte adds 255, the final byte adds 7 + its value */
28912 +                                       NEED_IP(1);
28913 +                                       while (*ip == 0) {
28914 +                                               t += 255;
28915 +                                               ip++;
28916 +                                               NEED_IP(1);
28917 +                                       }
28918 +                                       t += 7 + *ip++;
28919 +                               }
28920 +#if defined(COPY_DICT)
28921 +#if defined(LZO1Z)
28922 +                               m_off += (ip[0] << 6) + (ip[1] >> 2);
28923 +#else
28924 +                               m_off += (ip[0] >> 2) + (ip[1] << 6);
28925 +#endif
28926 +                               ip += 2;
28927 +                               if (m_off == 0)
28928 +                                       goto eof_found;
28929 +                               m_off += 0x4000;
28930 +#if defined(LZO1Z)
28931 +                               last_m_off = m_off;
28932 +#endif
28933 +#else
28934 +#if defined(LZO1Z)
28935 +                               m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28936 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28937 +                               m_pos -= (*(const lzo_ushortp)ip) >> 2;
28938 +#else
28939 +                               m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28940 +#endif
28941 +                               ip += 2;
28942 +                               if (m_pos == op) /* nothing was subtracted: this is the end-of-stream marker */
28943 +                                       goto eof_found;
28944 +                               m_pos -= 0x4000;
28945 +#if defined(LZO1Z)
28946 +                               last_m_off = op - m_pos;
28947 +#endif
28948 +#endif
28949 +                       } else { /* t < 16 directly after a match: 2-byte match at a short distance */
28950 +#if defined(COPY_DICT)
28951 +#if defined(LZO1Z)
28952 +                               m_off = 1 + (t << 6) + (*ip++ >> 2);
28953 +                               last_m_off = m_off;
28954 +#else
28955 +                               m_off = 1 + (t >> 2) + (*ip++ << 2);
28956 +#endif
28957 +                               NEED_OP(2);
28958 +                               t = 2;
28959 +                               COPY_DICT(t, m_off)
28960 +#else
28961 +#if defined(LZO1Z)
28962 +                               t = 1 + (t << 6) + (*ip++ >> 2);
28963 +                               m_pos = op - t;
28964 +                               last_m_off = t;
28965 +#else
28966 +                               m_pos = op - 1;
28967 +                               m_pos -= t >> 2;
28968 +                               m_pos -= *ip++ << 2;
28969 +#endif
28970 +                               TEST_LOOKBEHIND(m_pos, out);
28971 +                               NEED_OP(2);
28972 +                               *op++ = *m_pos++;
28973 +                               *op++ = *m_pos;
28974 +#endif
28975 +                               goto match_done;
28976 +                       }
28977 +
28978 +#if defined(COPY_DICT)
28979 +
28980 +                       NEED_OP(t + 3 - 1);
28981 +                       t += 3 - 1;
28982 +                       COPY_DICT(t, m_off)
28983 +#else
28984 + /* Copy t + 2 bytes from m_pos to op; the byte-wise path is used when source and destination are closer than 4 bytes. */
28985 +                       TEST_LOOKBEHIND(m_pos, out);
28986 +                       assert("lzo-19", t > 0);
28987 +                       NEED_OP(t + 3 - 1);
28988 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28989 +#if !defined(LZO_UNALIGNED_OK_4)
28990 +                       if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28991 +                               assert((op - m_pos) >= 4);
28992 +#else
28993 +                       if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28994 +#endif
28995 +                               COPY4(op, m_pos);
28996 +                               op += 4;
28997 +                               m_pos += 4;
28998 +                               t -= 4 - (3 - 1); /* 4 bytes copied, 2 of them were the implicit extra bytes */
28999 +                               do {
29000 +                                       COPY4(op, m_pos);
29001 +                                       op += 4;
29002 +                                       m_pos += 4;
29003 +                                       t -= 4;
29004 +                               } while (t >= 4);
29005 +                               if (t > 0)
29006 +                                       do
29007 +                                               *op++ = *m_pos++;
29008 +                                       while (--t > 0);
29009 +                       } else
29010 +#endif
29011 +                       {
29012 +                             copy_match: /* 2 bytes, then t more, one at a time */
29013 +                               *op++ = *m_pos++;
29014 +                               *op++ = *m_pos++;
29015 +                               do
29016 +                                       *op++ = *m_pos++;
29017 +                               while (--t > 0);
29018 +                       }
29019 +
29020 +#endif
29021 +
29022 +                     match_done: /* the low two bits of an already consumed byte give the number of literals that follow this match */
29023 +#if defined(LZO1Z)
29024 +                       t = ip[-1] & 3;
29025 +#else
29026 +                       t = ip[-2] & 3;
29027 +#endif
29028 +                       if (t == 0)
29029 +                               break; /* none: back to the outer loop for the next token */
29030 +
29031 +                     match_next: /* copy t literal bytes, then read the next token and stay in the match loop */
29032 +                       assert("lzo-20", t > 0);
29033 +                       NEED_OP(t);
29034 +                       NEED_IP(t + 1);
29035 +                       do
29036 +                               *op++ = *ip++;
29037 +                       while (--t > 0);
29038 +                       t = *ip++;
29039 +               }
29040 +       }
29041 +
29042 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
29043 +       *out_len = op - out; /* a loop condition failed before any end marker was seen */
29044 +       return LZO_E_EOF_NOT_FOUND;
29045 +#endif
29046 +
29047 +      eof_found:
29048 +       assert("lzo-21", t == 1);
29049 +       *out_len = op - out;
29050 +       return (ip == ip_end ? LZO_E_OK : /* OK only when the marker ends exactly at ip_end */
29051 +               (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
29052 +
29053 +#if defined(HAVE_NEED_IP)
29054 +      input_overrun: /* target of NEED_IP */
29055 +       *out_len = op - out;
29056 +       return LZO_E_INPUT_OVERRUN;
29057 +#endif
29058 +
29059 +#if defined(HAVE_NEED_OP)
29060 +      output_overrun: /* target of NEED_OP */
29061 +       *out_len = op - out;
29062 +       return LZO_E_OUTPUT_OVERRUN;
29063 +#endif
29064 +
29065 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
29066 +      lookbehind_overrun: /* target of TEST_LOOKBEHIND */
29067 +       *out_len = op - out;
29068 +       return LZO_E_LOOKBEHIND_OVERRUN;
29069 +#endif
29070 +}
29071 +
29072 +#define LZO_TEST_DECOMPRESS_OVERRUN
29073 +#undef DO_DECOMPRESS
29074 +#define DO_DECOMPRESS       lzo1x_decompress_safe
29075 +
29076 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
29077 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
29078 +#    define LZO_TEST_DECOMPRESS_OVERRUN_INPUT       2
29079 +#  endif
29080 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
29081 +#    define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT      2
29082 +#  endif
29083 +#  if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
29084 +#    define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
29085 +#  endif
29086 +#endif
29087 +
29088 +#undef TEST_IP
29089 +#undef TEST_OP
29090 +#undef TEST_LOOKBEHIND
29091 +#undef NEED_IP
29092 +#undef NEED_OP
29093 +#undef HAVE_TEST_IP
29094 +#undef HAVE_TEST_OP
29095 +#undef HAVE_NEED_IP
29096 +#undef HAVE_NEED_OP
29097 +#undef HAVE_ANY_IP
29098 +#undef HAVE_ANY_OP
29099 +
29100 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
29101 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
29102 +#    define TEST_IP             (ip < ip_end)
29103 +#  endif
29104 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
29105 +#    define NEED_IP(x) \
29106 +           if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x))  goto input_overrun
29107 +#  endif
29108 +#endif
29109 +
29110 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
29111 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
29112 +#    define TEST_OP             (op <= op_end)
29113 +#  endif
29114 +#  if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
29115 +#    undef TEST_OP
29116 +#    define NEED_OP(x) \
29117 +           if ((lzo_uint)(op_end - op) < (lzo_uint)(x))  goto output_overrun
29118 +#  endif
29119 +#endif
29120 +
29121 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
29122 +#  define TEST_LOOKBEHIND(m_pos,out)    if (m_pos < out) goto lookbehind_overrun
29123 +#else
29124 +#  define TEST_LOOKBEHIND(m_pos,op)     ((void) 0)
29125 +#endif
29126 +
29127 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
29128 +#  define TEST_IP               (ip < ip_end)
29129 +#endif
29130 +
29131 +#if defined(TEST_IP)
29132 +#  define HAVE_TEST_IP
29133 +#else
29134 +#  define TEST_IP               1
29135 +#endif
29136 +#if defined(TEST_OP)
29137 +#  define HAVE_TEST_OP
29138 +#else
29139 +#  define TEST_OP               1
29140 +#endif
29141 +
29142 +#if defined(NEED_IP)
29143 +#  define HAVE_NEED_IP
29144 +#else
29145 +#  define NEED_IP(x)            ((void) 0)
29146 +#endif
29147 +#if defined(NEED_OP)
29148 +#  define HAVE_NEED_OP
29149 +#else
29150 +#  define NEED_OP(x)            ((void) 0)
29151 +#endif
29152 +
29153 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
29154 +#  define HAVE_ANY_IP
29155 +#endif
29156 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
29157 +#  define HAVE_ANY_OP
29158 +#endif
29159 +
29160 +#undef __COPY4
29161 +#define __COPY4(dst,src)    * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
29162 +
29163 +#undef COPY4
29164 +#if defined(LZO_UNALIGNED_OK_4)
29165 +#  define COPY4(dst,src)    __COPY4(dst,src)
29166 +#elif defined(LZO_ALIGNED_OK_4)
29167 +#  define COPY4(dst,src)    __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
29168 +#endif
29169 +
29170 +/***** End of minilzo.c *****/
29171 diff --git a/fs/reiser4/plugin/compress/minilzo.h b/fs/reiser4/plugin/compress/minilzo.h
29172 new file mode 100644
29173 index 0000000..75d9893
29174 --- /dev/null
29175 +++ b/fs/reiser4/plugin/compress/minilzo.h
29176 @@ -0,0 +1,94 @@
29177 +/* minilzo.h -- mini subset of the LZO real-time data compression library
29178 +   adapted for reiser4 compression transform plugin.
29179 +
29180 +   This file is part of the LZO real-time data compression library
29181 +   and not included in any proprietary licenses of reiser4.
29182 +
29183 +   Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
29184 +   Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
29185 +   Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
29186 +   Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
29187 +   Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
29188 +   Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
29189 +   Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
29190 +   All Rights Reserved.
29191 +
29192 +   The LZO library is free software; you can redistribute it and/or
29193 +   modify it under the terms of the GNU General Public License as
29194 +   published by the Free Software Foundation; either version 2 of
29195 +   the License, or (at your option) any later version.
29196 +
29197 +   The LZO library is distributed in the hope that it will be useful,
29198 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
29199 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29200 +   GNU General Public License for more details.
29201 +
29202 +   You should have received a copy of the GNU General Public License
29203 +   along with the LZO library; see the file COPYING.
29204 +   If not, write to the Free Software Foundation, Inc.,
29205 +   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
29206 +
29207 +   Markus F.X.J. Oberhumer
29208 +   <markus@oberhumer.com>
29209 +   http://www.oberhumer.com/opensource/lzo/
29210 + */
29211 +
29212 +/*
29213 + * NOTE:
29214 + *   the full LZO package can be found at
29215 + *   http://www.oberhumer.com/opensource/lzo/
29216 + */
29217 +
29218 +#ifndef __MINILZO_H
29219 +#define __MINILZO_H
29220 +
29221 +#define MINILZO_VERSION         0x1080
29222 +
29223 +#ifdef __LZOCONF_H
29224 +#  error "you cannot use both LZO and miniLZO"
29225 +#endif
29226 +
29227 +#undef LZO_HAVE_CONFIG_H
29228 +#include "lzoconf.h"
29229 +
29230 +#if !defined(LZO_VERSION) || (LZO_VERSION != MINILZO_VERSION)
29231 +#  error "version mismatch in header files"
29232 +#endif
29233 +
29234 +#ifdef __cplusplus
29235 +extern "C" {
29236 +#endif
29237 +
29238 +/***********************************************************************
29239 +//
29240 +************************************************************************/
29241 +
29242 +/* Memory required for the wrkmem parameter.
29243 + * When the required size is 0, you can also pass a NULL pointer.
29244 + */
29245 +
29246 +#define LZO1X_MEM_COMPRESS      LZO1X_1_MEM_COMPRESS
29247 +#define LZO1X_1_MEM_COMPRESS    ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
29248 +#define LZO1X_MEM_DECOMPRESS    (0)
29249 +
29250 +/* compression */
29251 +       LZO_EXTERN(int)
29252 +        lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
29253 +                         lzo_byte * dst, lzo_uintp dst_len, lzo_voidp wrkmem);
29254 +
29255 +/* decompression */
29256 +        LZO_EXTERN(int)
29257 +        lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
29258 +                         lzo_byte * dst, lzo_uintp dst_len,
29259 +                         lzo_voidp wrkmem /* NOT USED */ );
29260 +
29261 +/* safe decompression with overrun testing */
29262 +        LZO_EXTERN(int)
29263 +        lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
29264 +                              lzo_byte * dst, lzo_uintp dst_len,
29265 +                              lzo_voidp wrkmem /* NOT USED */ );
29266 +
29267 +#ifdef __cplusplus
29268 +}                              /* extern "C" */
29269 +#endif
29270 +#endif                         /* already included */
29271 diff --git a/fs/reiser4/plugin/crypto/cipher.c b/fs/reiser4/plugin/crypto/cipher.c
29272 new file mode 100644
29273 index 0000000..aa438c6
29274 --- /dev/null
29275 +++ b/fs/reiser4/plugin/crypto/cipher.c
29276 @@ -0,0 +1,116 @@
29277 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
29278 +   licensing governed by reiser4/README */
29279 +/* Reiser4 cipher transform plugins */
29280 +
29281 +#include "../../debug.h"
29282 +#include "../plugin.h"
29283 +#include "../file/cryptcompress.h"
29284 +#include <linux/types.h>
29285 +#include <linux/random.h>
29286 +
29287 +#define MIN_CIPHER_BLOCKSIZE 8
29288 +#define MAX_CIPHER_BLOCKSIZE 128
29289 +
29290 +/*
29291 +  Default align() method of the cipher plugin (look for description of this
29292 +  method in plugin/plugin.h)
29293 +
29294 +  1) creates the aligning armored format of the input flow before encryption.
29295 +     "armored" means that padding is filled by private data (for example,
29296 +     pseudo-random sequence of bytes is not private data).
29297 +  2) returns length of appended padding
29298 +
29299 +   [ flow | aligning_padding ]
29300 +            ^
29301 +            |
29302 +         @pad
29303 +*/
29304 +static int align_stream_common(__u8 * pad,
29305 +                              int flow_size /* size of non-aligned flow */,
29306 +                              int blocksize /* cipher block size */)
29307 +{
29308 +       int pad_size;
29309 +
29310 +       assert("edward-01", pad != NULL);
29311 +       assert("edward-02", flow_size != 0);
29312 +       assert("edward-03", blocksize != 0
29313 +              && blocksize <= MAX_CIPHER_BLOCKSIZE);
29314 +       /* an already aligned flow still gets one full block of padding */
29315 +       pad_size = blocksize - (flow_size % blocksize);
29316 +       get_random_bytes(pad, pad_size);
29317 +       return pad_size;
29318 +}
29319 +
29320 +/* This is used for all the cipher algorithms which do not inflate
29321 +   block-aligned data */
29322 +static loff_t scale_common(struct inode *inode, size_t blocksize,
29323 +                          loff_t src_off /* offset to scale */ )
29324 +{
29325 +       return src_off;
29326 +}
29327 +
29328 +static void free_aes (struct crypto_tfm * tfm)
29329 +{
29330 +#if REISER4_AES
29331 +       crypto_free_tfm(tfm);
29332 +#endif
29333 +       return;
29334 +}
29335 +
29336 +static struct crypto_tfm * alloc_aes (void)
29337 +{
29338 +#if REISER4_AES
29339 +       return crypto_alloc_tfm ("aes", 0);
29340 +#else
29341 +       warning("edward-1417", "aes unsupported");
29342 +       return ERR_PTR(-EINVAL);
29343 +#endif /* REISER4_AES */
29344 +}
29345 +
29346 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
29347 +       [NONE_CIPHER_ID] = {
29348 +               .h = {
29349 +                       .type_id = REISER4_CIPHER_PLUGIN_TYPE,
29350 +                       .id = NONE_CIPHER_ID,
29351 +                       .pops = NULL,
29352 +                       .label = "none",
29353 +                       .desc = "no cipher transform",
29354 +                       .linkage = {NULL, NULL}
29355 +               },
29356 +               .alloc = NULL,
29357 +               .free = NULL,
29358 +               .scale = NULL,
29359 +               .align_stream = NULL,
29360 +               .setkey = NULL,
29361 +               .encrypt = NULL,
29362 +               .decrypt = NULL
29363 +       },
29364 +       [AES_CIPHER_ID] = {
29365 +               .h = {
29366 +                       .type_id = REISER4_CIPHER_PLUGIN_TYPE,
29367 +                       .id = AES_CIPHER_ID,
29368 +                       .pops = NULL,
29369 +                       .label = "aes",
29370 +                       .desc = "aes cipher transform",
29371 +                       .linkage = {NULL, NULL}
29372 +               },
29373 +               .alloc = alloc_aes,
29374 +               .free = free_aes,
29375 +               .scale = scale_common,
29376 +               .align_stream = align_stream_common,
29377 +               .setkey = NULL,
29378 +               .encrypt = NULL,
29379 +               .decrypt = NULL
29380 +       }
29381 +};
29382 +
29383 +/* Make Linus happy.
29384 +   Local variables:
29385 +   c-indentation-style: "K&R"
29386 +   mode-name: "LC"
29387 +   c-basic-offset: 8
29388 +   tab-width: 8
29389 +   fill-column: 120
29390 +   scroll-step: 1
29391 +   End:
29392 +*/
29393 diff --git a/fs/reiser4/plugin/crypto/cipher.h b/fs/reiser4/plugin/crypto/cipher.h
29394 new file mode 100644
29395 index 0000000..29979c6
29396 --- /dev/null
29397 +++ b/fs/reiser4/plugin/crypto/cipher.h
29398 @@ -0,0 +1,67 @@
29399 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29400 +/* This file contains definitions for the objects operated
29401 +   by reiser4 key manager, which is something like keyring
29402 +   wrapped by appropriate reiser4 plugin */
29403 +
29404 +#if !defined( __FS_REISER4_CRYPT_H__ )
29405 +#define __FS_REISER4_CRYPT_H__
29406 +
29407 +#include <linux/crypto.h>
29408 +
29409 +
29410 +/* Transform actions involved in ciphering process and
29411 +   supported by reiser4 via appropriate transform plugins */
29412 +typedef enum {
29413 +       CIPHER_TFM,       /* cipher transform */
29414 +       DIGEST_TFM,       /* digest transform */
29415 +       LAST_TFM
29416 +} reiser4_tfm;
29417 +
29418 +/* This represents a transform action in reiser4 */
29419 +typedef struct reiser4_tfma {
29420 +       reiser4_plugin * plug;     /* transform plugin */
29421 +       struct crypto_tfm * tfm;   /* low-level info, operated by
29422 +                                     linux crypto-api (see linux/crypto) */
29423 +} reiser4_tfma_t;
29424 +
29425 +/* key info imported from user space */
29426 +typedef struct crypto_data {
29427 +       int keysize;    /* uninstantiated key size */
29428 +       __u8 * key;     /* uninstantiated key */
29429 +       int keyid_size; /* size of passphrase */
29430 +       __u8 * keyid;   /* passphrase */
29431 +} crypto_data_t;
29432 +
29433 +/* This object contains all needed infrastructure to implement
29434 +   cipher transform. This is operated (allocating, inheriting,
29435 +   validating, binding to host inode, etc..) by reiser4 key manager.
29436 +
29437 +   This info can be allocated in two cases:
29438 +   1. importing a key from user space.
29439 +   2. reading inode from disk */
29440 +typedef struct crypto_stat {
29441 +       reiser4_tfma_t tfma[LAST_TFM];
29442 +//      cipher_key_plugin * kplug; /* key manager */
29443 +       __u8 * keyid;              /* key fingerprint, created by digest plugin,
29444 +                                     using uninstantiated key and passphrase.
29445 +                                     supposed to be stored in disk stat-data */
29446 +       int inst;                  /* this indicates if the cipher key is
29447 +                                     instantiated (case 1 above) */
29448 +       int keysize;               /* uninstantiated key size (bytes), supposed
29449 +                                     to be stored in disk stat-data */
29450 +       int keyload_count;         /* number of the objects which have this
29451 +                                     crypto-stat attached */
29452 +} crypto_stat_t;
29453 +
29454 +#endif /* __FS_REISER4_CRYPT_H__ */
29455 +
29456 +/*
29457 +   Local variables:
29458 +   c-indentation-style: "K&R"
29459 +   mode-name: "LC"
29460 +   c-basic-offset: 8
29461 +   tab-width: 8
29462 +   fill-column: 120
29463 +   scroll-step: 1
29464 +   End:
29465 +*/
29466 diff --git a/fs/reiser4/plugin/crypto/digest.c b/fs/reiser4/plugin/crypto/digest.c
29467 new file mode 100644
29468 index 0000000..07be88b
29469 --- /dev/null
29470 +++ b/fs/reiser4/plugin/crypto/digest.c
29471 @@ -0,0 +1,58 @@
29472 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29473 +
29474 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
29475 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
29476 +#include "../../debug.h"
29477 +#include "../plugin_header.h"
29478 +#include "../plugin.h"
29479 +#include "../file/cryptcompress.h"
29480 +
29481 +#include <linux/types.h>
29482 +
29483 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
29484 +
29485 +static struct crypto_tfm * alloc_sha256 (void)
29486 +{
29487 +#if REISER4_SHA256
29488 +       return crypto_alloc_tfm ("sha256", 0);
29489 +#else
29490 +       warning("edward-1418", "sha256 unsupported");
29491 +       return ERR_PTR(-EINVAL);
29492 +#endif
29493 +}
29494 +
29495 +static void free_sha256 (struct crypto_tfm * tfm)
29496 +{
29497 +#if REISER4_SHA256
29498 +       crypto_free_tfm(tfm);
29499 +#endif
29500 +       return;
29501 +}
29502 +
29503 +/* digest plugins */
29504 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
29505 +       [SHA256_32_DIGEST_ID] = {
29506 +               .h = {
29507 +                       .type_id = REISER4_DIGEST_PLUGIN_TYPE,
29508 +                       .id = SHA256_32_DIGEST_ID,
29509 +                       .pops = NULL,
29510 +                       .label = "sha256_32",
29511 +                       .desc = "sha256_32 digest transform",
29512 +                       .linkage = {NULL, NULL}
29513 +               },
29514 +               .fipsize = sizeof(__u32),
29515 +               .alloc = alloc_sha256,
29516 +               .free = free_sha256
29517 +       }
29518 +};
29519 +
29520 +/*
29521 +  Local variables:
29522 +  c-indentation-style: "K&R"
29523 +  mode-name: "LC"
29524 +  c-basic-offset: 8
29525 +  tab-width: 8
29526 +  fill-column: 120
29527 +  scroll-step: 1
29528 +  End:
29529 +*/
29530 diff --git a/fs/reiser4/plugin/dir/Makefile b/fs/reiser4/plugin/dir/Makefile
29531 new file mode 100644
29532 index 0000000..ed370b1
29533 --- /dev/null
29534 +++ b/fs/reiser4/plugin/dir/Makefile
29535 @@ -0,0 +1,5 @@
29536 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
29537 +
29538 +dir_plugins-objs :=    \
29539 +       hashed_dir.o    \
29540 +       seekable_dir.o
29541 diff --git a/fs/reiser4/plugin/dir/dir.h b/fs/reiser4/plugin/dir/dir.h
29542 new file mode 100644
29543 index 0000000..4a91ebe
29544 --- /dev/null
29545 +++ b/fs/reiser4/plugin/dir/dir.h
29546 @@ -0,0 +1,36 @@
29547 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29548 + * reiser4/README */
29549 +
29550 +/* this file contains declarations of methods implementing directory plugins */
29551 +
29552 +#if !defined( __REISER4_DIR_H__ )
29553 +#define __REISER4_DIR_H__
29554 +
29555 +/*#include "../../key.h"
29556 +
29557 +#include <linux/fs.h>*/
29558 +
29559 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
29560 +
29561 +/* "hashed" directory methods of dir plugin */
29562 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
29563 +                           reiser4_key *);
29564 +
29565 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
29566 +
29567 +/* "seekable" directory methods of dir plugin */
29568 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
29569 +                             reiser4_key *);
29570 +
29571 +/* __REISER4_DIR_H__ */
29572 +#endif
29573 +
29574 +/*
29575 +   Local variables:
29576 +   c-indentation-style: "K&R"
29577 +   mode-name: "LC"
29578 +   c-basic-offset: 8
29579 +   tab-width: 8
29580 +   fill-column: 120
29581 +   End:
29582 +*/
29583 diff --git a/fs/reiser4/plugin/dir/hashed_dir.c b/fs/reiser4/plugin/dir/hashed_dir.c
29584 new file mode 100644
29585 index 0000000..0f34824
29586 --- /dev/null
29587 +++ b/fs/reiser4/plugin/dir/hashed_dir.c
29588 @@ -0,0 +1,81 @@
29589 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29590 + * reiser4/README */
29591 +
29592 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
29593 +   names to the files. */
29594 +
29595 +/*
29596 + * Hashed directory logically consists of persistent directory
29597 + * entries. Directory entry is a pair of a file name and a key of stat-data of
29598 + * a file that has this name in the given directory.
29599 + *
29600 + * Directory entries are stored in the tree in the form of directory
29601 + * items. Directory item should implement dir_entry_ops portion of item plugin
29602 + * interface (see plugin/item/item.h). Hashed directory interacts with
29603 + * directory item plugin exclusively through dir_entry_ops operations.
29604 + *
29605 + * Currently there are two implementations of directory items: "simple
29606 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
29607 + * (plugin/item/cde.[ch]) with the latter being the default.
29608 + *
29609 + * There is, however, some delicate way through which directory code interferes
29610 + * with item plugin: key assignment policy. A key for a directory item is
29611 + * chosen by directory code, and as described in kassign.c, this key contains
29612 + * a portion of file name. Directory item uses this knowledge to avoid storing
29613 + * this portion of file name twice: in the key and in the directory item body.
29614 + *
29615 + */
29616 +
29617 +#include "../../inode.h"
29618 +
29619 +void complete_entry_key(const struct inode *, const char *name,
29620 +                       int len, reiser4_key * result);
29621 +
29622 +/* this is implementation of build_entry_key method of dir
29623 +   plugin for HASHED_DIR_PLUGIN_ID
29624 + */
29625 +void build_entry_key_hashed(const struct inode *dir,   /* directory where entry is
29626 +                                                        * (or will be) in.*/
29627 +                           const struct qstr *qname,   /* name of file referenced
29628 +                                                        * by this entry */
29629 +                           reiser4_key * result        /* resulting key of directory
29630 +                                                        * entry */ )
29631 +{
29632 +       const char *name;
29633 +       int len;
29634 +
29635 +       assert("nikita-1139", dir != NULL);
29636 +       assert("nikita-1140", qname != NULL);
29637 +       assert("nikita-1141", qname->name != NULL);
29638 +       assert("nikita-1142", result != NULL);
29639 +
29640 +       name = qname->name;
29641 +       len = qname->len;
29642 +
29643 +       assert("nikita-2867", strlen(name) == len);
29644 +
29645 +       reiser4_key_init(result);
29646 +       /* locality of directory entry's key is objectid of parent
29647 +          directory */
29648 +       set_key_locality(result, get_inode_oid(dir));
29649 +       /* minor packing locality is constant */
29650 +       set_key_type(result, KEY_FILE_NAME_MINOR);
29651 +       /* dot is special case---we always want it to be first entry in
29652 +          a directory. Actually, we just want to have smallest
29653 +          directory entry.
29654 +        */
29655 +       if (len == 1 && name[0] == '.')
29656 +               return;
29657 +
29658 +       /* initialize part of entry key which depends on file name */
29659 +       complete_entry_key(dir, name, len, result);
29660 +}
29661 +
29662 +/* Local variables:
29663 +   c-indentation-style: "K&R"
29664 +   mode-name: "LC"
29665 +   c-basic-offset: 8
29666 +   tab-width: 8
29667 +   fill-column: 120
29668 +   End:
29669 +*/
29670 diff --git a/fs/reiser4/plugin/dir/seekable_dir.c b/fs/reiser4/plugin/dir/seekable_dir.c
29671 new file mode 100644
29672 index 0000000..c1c6c4c
29673 --- /dev/null
29674 +++ b/fs/reiser4/plugin/dir/seekable_dir.c
29675 @@ -0,0 +1,46 @@
29676 +/* Copyright 2005 by Hans Reiser, licensing governed by
29677 + * reiser4/README */
29678 +
29679 +#include "../../inode.h"
29680 +
29681 +/* this is implementation of build_entry_key method of dir
29682 +   plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
29683 +   This is for directories where we want repeatable and restartable readdir()
29684 +   even in case 32bit user level struct dirent (readdir(3)).
29685 +*/
29686 +void
29687 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
29688 +                        reiser4_key * result)
29689 +{
29690 +       oid_t objectid;
29691 +
29692 +       assert("nikita-2283", dir != NULL);
29693 +       assert("nikita-2284", name != NULL);
29694 +       assert("nikita-2285", name->name != NULL);
29695 +       assert("nikita-2286", result != NULL);
29696 +
29697 +       reiser4_key_init(result);
29698 +       /* locality of directory entry's key is objectid of parent
29699 +          directory */
29700 +       set_key_locality(result, get_inode_oid(dir));
29701 +       /* minor packing locality is constant */
29702 +       set_key_type(result, KEY_FILE_NAME_MINOR);
29703 +       /* dot is special case---we always want it to be first entry in
29704 +          a directory. Actually, we just want to have smallest
29705 +          directory entry.
29706 +        */
29707 +       if ((name->len == 1) && (name->name[0] == '.'))
29708 +               return;
29709 +
29710 +       /* objectid of key is 31 lowest bits of hash. */
29711 +       objectid =
29712 +           inode_hash_plugin(dir)->hash(name->name,
29713 +                                        (int)name->len) & 0x7fffffff;
29714 +
29715 +       assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
29716 +       set_key_objectid(result, objectid);
29717 +
29718 +       /* offset is always 0. */
29719 +       set_key_offset(result, (__u64) 0);
29720 +       return;
29721 +}
29722 diff --git a/fs/reiser4/plugin/dir_plugin_common.c b/fs/reiser4/plugin/dir_plugin_common.c
29723 new file mode 100644
29724 index 0000000..dd4b736
29725 --- /dev/null
29726 +++ b/fs/reiser4/plugin/dir_plugin_common.c
29727 @@ -0,0 +1,864 @@
29728 +/* Copyright 2005 by Hans Reiser, licensing governed by
29729 +   reiser4/README */
29730 +
29731 +/* this file contains typical implementations for most of methods of
29732 +   directory plugin
29733 +*/
29734 +
29735 +#include "../inode.h"
29736 +
29737 +int find_entry(struct inode *dir, struct dentry *name,
29738 +              lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
29739 +int lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
29740 +void check_light_weight(struct inode *inode, struct inode *parent);
29741 +
29742 +/* this is common implementation of get_parent method of dir plugin
29743 +   this is used by NFS kernel server to "climb" up directory tree to
29744 +   check permissions
29745 + */
29746 +struct dentry *get_parent_common(struct inode *child)
29747 +{
29748 +       struct super_block *s;
29749 +       struct inode *parent;
29750 +       struct dentry dotdot;
29751 +       struct dentry *dentry;
29752 +       reiser4_key key;
29753 +       int result;
29754 +
29755 +       /*
29756 +        * lookup dotdot entry.
29757 +        */
29758 +
29759 +       s = child->i_sb;
29760 +       memset(&dotdot, 0, sizeof(dotdot));
29761 +       dotdot.d_name.name = "..";
29762 +       dotdot.d_name.len = 2;
29763 +       dotdot.d_op = &get_super_private(s)->ops.dentry;
29764 +
29765 +       result = lookup_name(child, &dotdot, &key);
29766 +       if (result != 0)
29767 +               return ERR_PTR(result);
29768 +
29769 +       parent = reiser4_iget(s, &key, 1);
29770 +       if (!IS_ERR(parent)) {
29771 +               /*
29772 +                * FIXME-NIKITA dubious: attributes are inherited from @child
29773 +                * to @parent. But:
29774 +                *
29775 +                *     (*) this is the only thing we can do
29776 +                *
29777 +                *     (*) attributes of light-weight object are inherited
29778 +                *     from a parent through which object was looked up first,
29779 +                *     so it is ambiguous anyway.
29780 +                *
29781 +                */
29782 +               check_light_weight(parent, child);
29783 +               reiser4_iget_complete(parent);
29784 +               dentry = d_alloc_anon(parent);
29785 +               if (dentry == NULL) {
29786 +                       iput(parent);
29787 +                       dentry = ERR_PTR(RETERR(-ENOMEM));
29788 +               } else
29789 +                       dentry->d_op = &get_super_private(s)->ops.dentry;
29790 +       } else if (PTR_ERR(parent) == -ENOENT)
29791 +               dentry = ERR_PTR(RETERR(-ESTALE));
29792 +       else
29793 +               dentry = (void *)parent;
29794 +       return dentry;
29795 +}
29796 +
29797 +/* this is common implementation of is_name_acceptable method of dir
29798 +   plugin
29799 + */
29800 +int is_name_acceptable_common(const struct inode *inode,       /* directory to check */
29801 +                             const char *name UNUSED_ARG,      /* name to check */
29802 +                             int len /* @name's length */ )
29803 +{
29804 +       assert("nikita-733", inode != NULL);
29805 +       assert("nikita-734", name != NULL);
29806 +       assert("nikita-735", len > 0);
29807 +
29808 +       return len <= reiser4_max_filename_len(inode);
29809 +}
29810 +
29811 +/* there is no common implementation of build_entry_key method of dir
29812 +   plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
29813 +   plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
29814 +*/
29815 +
29816 +/* this is common implementation of build_readdir_key method of dir
29817 +   plugin
29818 +   see readdir_common for more details
29819 +*/
29820 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
29821 +                            reiser4_key * result /* where to store key */ )
29822 +{
29823 +       reiser4_file_fsdata *fdata;
29824 +       struct inode *inode;
29825 +
29826 +       assert("nikita-1361", dir != NULL);
29827 +       assert("nikita-1362", result != NULL);
29828 +       assert("nikita-1363", dir->f_dentry != NULL);
29829 +       inode = dir->f_dentry->d_inode;
29830 +       assert("nikita-1373", inode != NULL);
29831 +
29832 +       fdata = reiser4_get_file_fsdata(dir);
29833 +       if (IS_ERR(fdata))
29834 +               return PTR_ERR(fdata);
29835 +       assert("nikita-1364", fdata != NULL);
29836 +       return extract_key_from_de_id(get_inode_oid(inode),
29837 +                                     &fdata->dir.readdir.position.
29838 +                                     dir_entry_key, result);
29839 +
29840 +}
29841 +
29842 +void adjust_dir_file(struct inode *, const struct dentry *, int offset,
29843 +                    int adj);
29844 +
29845 +/* Common implementation of the add_entry method of the dir plugin: adds
29846 +   name @where to directory @object; -EEXIST if the name already exists. */
29847 +int add_entry_common(struct inode *object,     /* directory to add new name
29848 +                                                * in */
29849 +                    struct dentry *where,      /* new name */
29850 +                    reiser4_object_create_data * data UNUSED_ARG,      /* parameters
29851 +                                                                        * of new
29852 +                                                                        * object */
29853 +                    reiser4_dir_entry_desc * entry     /* parameters of new
29854 +                                                        * directory entry */ )
29855 +{
29856 +       int result;
29857 +       coord_t *coord;
29858 +       lock_handle lh;
29859 +       reiser4_dentry_fsdata *fsdata;
29860 +       reiser4_block_nr reserve;
29861 +
29862 +       assert("nikita-1114", object != NULL);
29863 +       assert("nikita-1250", where != NULL);
29864 +
29865 +       fsdata = reiser4_get_dentry_fsdata(where);
29866 +       if (unlikely(IS_ERR(fsdata)))
29867 +               return PTR_ERR(fsdata);
29868 +
29869 +       reserve = inode_dir_plugin(object)->estimate.add_entry(object);
29870 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29871 +               return RETERR(-ENOSPC); /* any grab failure => -ENOSPC */
29872 +
29873 +       init_lh(&lh);
29874 +       coord = &fsdata->dec.entry_coord;
29875 +       coord_clear_iplug(coord);
29876 +
29877 +       /* check for this entry in a directory; -ENOENT is the expected case. */
29878 +       result = find_entry(object, where, &lh, ZNODE_WRITE_LOCK, entry);
29879 +       if (likely(result == -ENOENT)) {
29880 +               /* add new entry. Just pass control to the directory
29881 +                  item plugin. */
29882 +               assert("nikita-1709", inode_dir_item_plugin(object));
29883 +               assert("nikita-2230", coord->node == lh.node);
29884 +               seal_done(&fsdata->dec.entry_seal);
29885 +               result =
29886 +                   inode_dir_item_plugin(object)->s.dir.add_entry(object,
29887 +                                                                  coord, &lh,
29888 +                                                                  where,
29889 +                                                                  entry);
29890 +               if (result == 0) {
29891 +                       adjust_dir_file(object, where, fsdata->dec.pos + 1, +1);
29892 +                       INODE_INC_FIELD(object, i_size);
29893 +               }
29894 +       } else if (result == 0) {       /* the name is already there */
29895 +               assert("nikita-2232", coord->node == lh.node);
29896 +               result = RETERR(-EEXIST);
29897 +       }
29898 +       done_lh(&lh);
29899 +
29900 +       return result;
29901 +}
29902 +
29903 +/**
29904 + * rem_entry - remove entry from directory item
29905 + * @dir: directory the entry is removed from
29906 + * @dentry: name being removed; @dentry->d_inode must not be NULL
29907 + * @entry: description of the entry, passed through to the item plugin
29908 + * @coord: coordinate of the entry to cut
29909 + * @lh: lock handle, passed through to the item plugin
29910 + *
29911 + * If REISER4_DEBUG is set, first checks that @coord holds the key of
29912 + * @dentry->d_inode (-EIO if not). Then calls item plugin method to cut entry.
29913 + */
29914 +static int
29915 +rem_entry(struct inode *dir, struct dentry *dentry,
29916 +         reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29917 +{
29918 +       item_plugin *iplug;
29919 +       struct inode *child;
29920 +
29921 +       iplug = inode_dir_item_plugin(dir);
29922 +       child = dentry->d_inode;
29923 +       assert("nikita-3399", child != NULL);
29924 +
29925 +       /* check that we are really destroying an entry for @child */
29926 +       if (REISER4_DEBUG) {
29927 +               int result;
29928 +               reiser4_key key;
29929 +
29930 +               result = iplug->s.dir.extract_key(coord, &key);
29931 +               if (result != 0)
29932 +                       return result;
29933 +               if (get_key_objectid(&key) != get_inode_oid(child)) {
29934 +                       warning("nikita-3397",
29935 +                               "rem_entry: %#llx != %#llx\n",
29936 +                               (unsigned long long)get_key_objectid(&key),
29937 +                               (unsigned long long)get_inode_oid(child));
29938 +                       return RETERR(-EIO);
29939 +               }
29940 +       }
29941 +       return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29942 +}
29943 +
29944 +/**
29945 + * rem_entry_common - remove entry from a directory
29946 + * @dir: directory to remove entry from
29947 + * @dentry: name that is being removed
29948 + * @entry: description of entry being removed
29949 + *
29950 + * This is the common implementation of the rem_entry method of the dir plugin.
29951 + */
29952 +int rem_entry_common(struct inode *dir,
29953 +                    struct dentry *dentry,
29954 +                    reiser4_dir_entry_desc *entry)
29955 +{
29956 +       int result;
29957 +       coord_t *coord;
29958 +       lock_handle lh;
29959 +       reiser4_dentry_fsdata *fsdata;
29960 +       __u64 tograb;
29961 +
29962 +       assert("nikita-1124", dir != NULL);
29963 +       assert("nikita-1125", dentry != NULL);
29964 +
29965 +       tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29966 +       result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29967 +       if (result != 0)
29968 +               return RETERR(-ENOSPC); /* any grab failure => -ENOSPC */
29969 +
29970 +       init_lh(&lh);
29971 +
29972 +       /* check for this entry in a directory; 0 means it was found. */
29973 +       result = find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29974 +       fsdata = reiser4_get_dentry_fsdata(dentry);
29975 +       if (IS_ERR(fsdata)) {
29976 +               done_lh(&lh);
29977 +               return PTR_ERR(fsdata);
29978 +       }
29979 +
29980 +       coord = &fsdata->dec.entry_coord;
29981 +
29982 +       assert("nikita-3404",
29983 +              get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29984 +              dir->i_size <= 1);
29985 +
29986 +       coord_clear_iplug(coord);
29987 +       if (result == 0) {
29988 +               /* remove entry. Just pass control to the directory item
29989 +                  plugin. Note: adjust_dir_file() runs before the removal. */
29990 +               assert("vs-542", inode_dir_item_plugin(dir));
29991 +               seal_done(&fsdata->dec.entry_seal);
29992 +               adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29993 +               result =
29994 +                   WITH_COORD(coord,
29995 +                              rem_entry(dir, dentry, entry, coord, &lh));
29996 +               if (result == 0) {
29997 +                       if (dir->i_size >= 1)
29998 +                               INODE_DEC_FIELD(dir, i_size);
29999 +                       else {  /* size is already 0: report -EIO */
30000 +                               warning("nikita-2509", "Dir %llu is runt",
30001 +                                       (unsigned long long)
30002 +                                       get_inode_oid(dir));
30003 +                               result = RETERR(-EIO);
30004 +                       }
30005 +
30006 +                       assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
30007 +                              dentry->d_inode->i_size != 2 ||
30008 +                              inode_dir_plugin(dentry->d_inode) == NULL);
30009 +               }
30010 +       }
30011 +       done_lh(&lh);
30012 +
30013 +       return result;
30014 +}
30015 +
30016 +static reiser4_block_nr estimate_init(struct inode *parent,
30017 +                                     struct inode *object);
30018 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
30019 +
30020 +/* Common implementation of the init method of the dir plugin: reserves
30021 +   space and creates the "." and ".." entries of new directory @object
30022 +   under @parent. */
30023 +int init_common(struct inode *object,  /* new directory */
30024 +               struct inode *parent,   /* parent directory */
30025 +               reiser4_object_create_data * data UNUSED_ARG    /* info passed
30026 +                                                                * to us; here
30027 +                                                                * it is only
30028 +                                                                * looked at by
30029 +                                                                * the
30030 +                                                                * assertions */ )
30031 +{
30032 +       int result;
30033 +
30034 +       assert("nikita-680", object != NULL);
30035 +       assert("nikita-681", S_ISDIR(object->i_mode));
30036 +       assert("nikita-682", parent != NULL);
30037 +       assert("nikita-684", data != NULL);
30038 +       assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
30039 +       assert("nikita-687", object->i_mode & S_IFDIR);
30040 +
30041 +       if (reiser4_grab_space(estimate_init(parent, object), BA_CAN_COMMIT))
30042 +               result = RETERR(-ENOSPC);
30043 +       else
30044 +               result = create_dot_dotdot(object, parent);
30045 +       return result;
30046 +}
30047 +
30048 +/* Common implementation of the done method of the dir plugin: removes the
30049 +   "." entry of @object. A failed removal only produces a warning; the
30050 +   result is -ENOSPC if space cannot be grabbed and 0 otherwise. */
30051 +int done_common(struct inode *object /* object being deleted */ )
30052 +{
30053 +       int result;
30054 +       reiser4_block_nr reserve;
30055 +       struct dentry goodby_dots;
30056 +       reiser4_dir_entry_desc entry;
30057 +
30058 +       assert("nikita-1449", object != NULL);
30059 +
30060 +       if (inode_get_flag(object, REISER4_NO_SD))
30061 +               return 0;
30062 +
30063 +       /* of course, this can be rewritten to sweep everything in one
30064 +          cut_tree(). */
30065 +       memset(&entry, 0, sizeof entry);
30066 +
30067 +       /* FIXME: this done method is called from delete_directory_common which
30068 +        * reserved space already */
30069 +       reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
30070 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
30071 +               return RETERR(-ENOSPC);
30072 +
30073 +       memset(&goodby_dots, 0, sizeof goodby_dots);
30074 +       entry.obj = goodby_dots.d_inode = object;
30075 +       goodby_dots.d_name.name = ".";
30076 +       goodby_dots.d_name.len = 1;
30077 +       result = rem_entry_common(object, &goodby_dots, &entry);
30078 +       reiser4_free_dentry_fsdata(&goodby_dots);
30079 +       if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
30080 +               /* only worth a warning
30081 +
30082 +                  "values of \ eB\ f will give rise to dom!\n"
30083 +                  -- v6src/s2/mv.c:89
30084 +                */
30085 +               warning("nikita-2252", "Cannot remove dot of %llu: %i",
30086 +                       (unsigned long long)get_inode_oid(object), result);
30087 +       return 0;
30088 +}
30089 +
30090 +/* Common implementation of the attach method of the dir plugin. Does
30091 +   nothing besides asserting that both inodes are given; returns 0. */
30092 +int attach_common(struct inode *child UNUSED_ARG,
30093 +                 struct inode *parent UNUSED_ARG)
30094 +{
30095 +       assert("nikita-2647", child != NULL);
30096 +       assert("nikita-2648", parent != NULL);
30097 +
30098 +       return 0;
30099 +}
30100 +
30101 +/* Common implementation of the detach method of the dir plugin: removes
30102 +   the ".." entry from @object and, if that succeeded, drops one link on
30103 +   @parent. Returns the result of rem_entry_common(). */
30104 +int detach_common(struct inode *object, struct inode *parent)
30105 +{
30106 +       int result;
30107 +       struct dentry goodby_dots;
30108 +       reiser4_dir_entry_desc entry;
30109 +
30110 +       assert("nikita-2885", object != NULL);
30111 +       assert("nikita-2886", !inode_get_flag(object, REISER4_NO_SD));
30112 +
30113 +       memset(&entry, 0, sizeof entry);
30114 +
30115 +       /* NOTE-NIKITA this only works if @parent is -the- parent of
30116 +          @object, viz. object whose key is stored in dotdot
30117 +          entry. Wouldn't work with hard-links on directories. */
30118 +       memset(&goodby_dots, 0, sizeof goodby_dots);
30119 +       entry.obj = goodby_dots.d_inode = parent;
30120 +       goodby_dots.d_name.name = "..";
30121 +       goodby_dots.d_name.len = 2;
30122 +       result = rem_entry_common(object, &goodby_dots, &entry);
30123 +       reiser4_free_dentry_fsdata(&goodby_dots);
30124 +       if (result == 0) {
30125 +               /* the dot should be the only entry remaining at this time... */
30126 +               assert("nikita-3400", object->i_size == 1 &&
30127 +                      (object->i_nlink >= 0 && object->i_nlink <= 2));
30128 +#if 0
30129 +               /* and, together with the only name directory can have, they
30130 +                * provides for the last 2 remaining references. If we get
30131 +                * here as part of error handling during mkdir, @object
30132 +                * possibly has no name yet, so its nlink == 1. If we get here
30133 +                * from rename (targeting empty directory), it has no name
30134 +                * already, so its nlink == 1. */
30135 +               assert("nikita-3401",
30136 +                      object->i_nlink == 2 || object->i_nlink == 1);
30137 +#endif
30138 +
30139 +               /* decrement nlink of the directory that the removed ".."
30140 +                  pointed to */
30141 +               reiser4_del_nlink(parent, NULL, 0);
30142 +       }
30143 +       return result;
30144 +}
30145 +
30146 +/* Common implementation of the estimate.add_entry method of the dir plugin.
30147 +   The estimate assumes that adding an entry inserts one unit into an item. */
30148 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
30149 +{
30150 +       reiser4_tree *tree;
30151 +
30152 +       tree = tree_by_inode(inode);
30153 +       return estimate_one_insert_into_item(tree);
30154 +}
30155 +
30156 +/* Common estimate.rem_entry of the dir plugin: cost of one item removal. */
30157 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
30158 +{
30159 +       reiser4_tree *tree = tree_by_inode(inode);
30160 +
30161 +       return estimate_one_item_removal(tree);
30162 +}
30163 +
30164 +/* Common implementation of the estimate.unlink method of the dir plugin:
30165 +   rem_entry estimate of @object plus twice the update estimate of @parent. */
30166 +reiser4_block_nr dir_estimate_unlink_common(const struct inode * parent,
30167 +                                           const struct inode * object)
30168 +{
30169 +       reiser4_block_nr entry_cost;
30170 +       reiser4_block_nr update_cost;
30171 +
30172 +       /* hashed_rem_entry(object) */
30173 +       entry_cost = inode_dir_plugin(object)->estimate.rem_entry(object);
30174 +       /* del_nlink(parent) */
30175 +       update_cost = inode_file_plugin(parent)->estimate.update(parent);
30176 +
30177 +       /* the update estimate is counted twice */
30178 +       return entry_cost + 2 * update_cost;
30179 +}
30180 +
30181 +/*
30182 + * Helper for inode_ops ->lookup() and the dir plugin's ->get_parent(): a
30183 + * light-weight @inode does not keep its credentials in the stat-data, so
30184 + * its uid and gid are taken from @parent.
30185 + */
30186 +void check_light_weight(struct inode *inode, struct inode *parent)
30187 +{
30188 +       if (!inode_get_flag(inode, REISER4_LIGHT_WEIGHT))
30189 +               return;
30190 +
30191 +       inode->i_uid = parent->i_uid;
30192 +       inode->i_gid = parent->i_gid;
30193 +       /* drop the flag: a later read by another name keeps these [ug]ids */
30194 +       inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
30195 +}
30196 +
30197 +/* looks for name specified in @dentry in directory @parent and if name is
30198 +   found - key of the object that entry points to is stored in @key */
30199 +int lookup_name(struct inode *parent,  /* inode of directory to lookup for
30200 +                                        * name in */
30201 +               struct dentry *dentry,  /* name to look for */
30202 +               reiser4_key * key /* place to store key */ )
30203 +{
30204 +       int result;
30205 +       coord_t *coord;
30206 +       lock_handle lh;
30207 +       const char *name;
30208 +       int len;
30209 +       reiser4_dir_entry_desc entry;   /* find_entry() builds its key here */
30210 +       reiser4_dentry_fsdata *fsdata;
30211 +
30212 +       assert("nikita-1247", parent != NULL);
30213 +       assert("nikita-1248", dentry != NULL);
30214 +       assert("nikita-1123", dentry->d_name.name != NULL);
30215 +       assert("vs-1486",
30216 +              dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
30217 +
30218 +       name = dentry->d_name.name;
30219 +       len = dentry->d_name.len;
30220 +
30221 +       if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
30222 +               /* some arbitrary error code to return */
30223 +               return RETERR(-ENAMETOOLONG);
30224 +
30225 +       fsdata = reiser4_get_dentry_fsdata(dentry);
30226 +       if (IS_ERR(fsdata))
30227 +               return PTR_ERR(fsdata);
30228 +
30229 +       coord = &fsdata->dec.entry_coord;
30230 +       coord_clear_iplug(coord);
30231 +       init_lh(&lh);
30232 +
30233 +       /* find entry in a directory; a read lock is enough for lookup. */
30234 +       result = find_entry(parent, dentry, &lh, ZNODE_READ_LOCK, &entry);
30235 +       if (result == 0) {
30236 +               /* entry was found, extract object key from it. */
30237 +               result =
30238 +                   WITH_COORD(coord,
30239 +                              item_plugin_by_coord(coord)->s.dir.
30240 +                              extract_key(coord, key));
30241 +       }
30242 +       done_lh(&lh);
30243 +       return result;
30244 +
30245 +}
30246 +
30247 +/* helper for init_common(): blocks to reserve for creating "." and ".." */
30248 +static reiser4_block_nr
30249 +estimate_init(struct inode *parent, struct inode *object)
30250 +{
30251 +       reiser4_block_nr res = 0;
30252 +
30253 +       assert("vpf-321", parent != NULL);
30254 +       assert("vpf-322", object != NULL);
30255 +
30256 +       /* add_entry_common(object) for "." */
30257 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
30258 +       /* reiser4_add_nlink(object) */
30259 +       res += inode_file_plugin(object)->estimate.update(object);
30260 +       /* add_entry_common(object) for ".." */
30261 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
30262 +       /* reiser4_add_nlink(parent) */
30263 +       res += inode_file_plugin(parent)->estimate.update(parent);
30264 +
30265 +       return res;
30266 +}
30267 +
30268 +/* helper function for init_common(): create "." and ".." in @object */
30269 +static int create_dot_dotdot(struct inode *object      /* object to create dot and
30270 +                                                        * dotdot for */ ,
30271 +                            struct inode *parent /* parent of @object */ )
30272 +{
30273 +       int result;
30274 +       struct dentry dots_entry;
30275 +       reiser4_dir_entry_desc entry;
30276 +
30277 +       assert("nikita-688", object != NULL);
30278 +       assert("nikita-689", S_ISDIR(object->i_mode));
30279 +       assert("nikita-691", parent != NULL);
30280 +
30281 +       /* We store dot and dotdot as normal directory entries. This is
30282 +          not necessary, because almost all information stored in them
30283 +          is already in the stat-data of directory, the only thing
30284 +          being missed is objectid of grand-parent directory that can
30285 +          easily be added there as extension.
30286 +
30287 +          But it is done the way it is done, because not storing dot
30288 +          and dotdot will lead to the following complications:
30289 +
30290 +          . special case handling in ->lookup().
30291 +          . addition of another extension to the sd.
30292 +          . dependency on key allocation policy for stat data.
30293 +
30294 +        */
30295 +
30296 +       memset(&entry, 0, sizeof entry);
30297 +       memset(&dots_entry, 0, sizeof dots_entry);
30298 +       entry.obj = dots_entry.d_inode = object;
30299 +       dots_entry.d_name.name = ".";
30300 +       dots_entry.d_name.len = 1;
30301 +       result = add_entry_common(object, &dots_entry, NULL, &entry);
30302 +       reiser4_free_dentry_fsdata(&dots_entry);
30303 +
30304 +       if (result == 0) {
30305 +               result = reiser4_add_nlink(object, object, 0);
30306 +               if (result == 0) {
30307 +                       entry.obj = dots_entry.d_inode = parent;
30308 +                       dots_entry.d_name.name = "..";
30309 +                       dots_entry.d_name.len = 2;
30310 +                       result = add_entry_common(object,
30311 +                                                 &dots_entry, NULL, &entry);
30312 +                       reiser4_free_dentry_fsdata(&dots_entry);
30313 +                       /* if creation of ".." failed, iput() will delete
30314 +                          object with ".". */
30315 +                       if (result == 0) {
30316 +                               result = reiser4_add_nlink(parent, object, 0);
30317 +                               if (result != 0)
30318 +                                       /*
30319 +                                        * if we failed to bump i_nlink, try
30320 +                                        * to remove ".."
30321 +                                        */
30322 +                                       detach_common(object, parent);
30323 +                       }
30324 +               }
30325 +       }
30326 +
30327 +       if (result != 0) {
30328 +               /*
30329 +                * in the case of error, at least update stat-data so that
30330 +                * ->i_nlink updates are not lingering.
30331 +                */
30332 +               reiser4_update_sd(object);
30333 +               reiser4_update_sd(parent);
30334 +       }
30335 +
30336 +       return result;
30337 +}
30338 +
30339 +/*
30340 + * Returns 0 iff @coord holds a directory entry whose name is @name, 1 if the
30341 + * name differs, and -EIO if the item at @coord has no plugin or is not of
30342 + * the item type that @dir is built of.
30343 + */
30344 +static int check_item(const struct inode *dir, const coord_t * coord,
30345 +                     const char *name)
30346 +{
30347 +       char buf[DE_NAME_BUF_LEN];
30348 +       const char *stored;
30349 +       item_plugin *iplug = item_plugin_by_coord(coord);
30350 +
30351 +       if (iplug == NULL) {
30352 +               warning("nikita-1135", "Cannot get item plugin");
30353 +               print_coord("coord", coord, 1);
30354 +               return RETERR(-EIO);
30355 +       }
30356 +       if (item_id_by_coord(coord) !=
30357 +           item_id_by_plugin(inode_dir_item_plugin(dir))) {
30358 +               /* the item is not of the kind @dir is built of */
30359 +               warning("nikita-1136", "Wrong item plugin");
30360 +               print_coord("coord", coord, 1);
30361 +               return RETERR(-EIO);
30362 +       }
30363 +       assert("nikita-1137", iplug->s.dir.extract_name);
30364 +
30365 +       /* Compare name stored in this entry with name we are looking for.
30366 +          NOTE-NIKITA Here should go code for support of something like
30367 +          unicode, code tables, etc. */
30368 +       stored = iplug->s.dir.extract_name(coord, buf);
30369 +       return strcmp(name, stored) != 0;
30370 +}
30371 +
30372 +static int      /* result of check_item(), evaluated via WITH_COORD() */
30373 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
30374 +{
30375 +       return WITH_COORD(coord, check_item(dir, coord, name->name));
30376 +}
30377 +
30378 +/*
30379 + * argument package used by entry_actor to scan entries with identical keys.
30380 + */
30381 +typedef struct entry_actor_args {
30382 +       /* name we are looking for */
30383 +       const char *name;
30384 +       /* key of directory entry. entry_actor() scans through sequence of
30385 +        * items/units having the same key */
30386 +       reiser4_key *key;
30387 +       /* how many entries with duplicate key were scanned so far. */
30388 +       int non_uniq;
30389 +#if REISER4_USE_COLLISION_LIMIT
30390 +       /* scan limit: exceeding it ends the scan with -EBUSY */
30391 +       int max_non_uniq;
30392 +#endif
30393 +       /* return parameter: set to true, if ->name wasn't found */
30394 +       int not_found;
30395 +       /* what type of lock to take when moving to the next node during
30396 +        * scan */
30397 +       znode_lock_mode mode;
30398 +
30399 +       /* last coord that was visited during scan */
30400 +       coord_t last_coord;
30401 +       /* last node locked during scan */
30402 +       lock_handle last_lh;
30403 +       /* inode of directory */
30404 +       const struct inode *inode;
30405 +} entry_actor_args;
30406 +
30407 +/* Passed by find_entry() to iterate_tree() to look for a name in directory. */
30408 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
30409 +                      coord_t * coord /* current coord */ ,
30410 +                      lock_handle * lh /* current lock handle */ ,
30411 +                      void *entry_actor_arg /* argument to scan */ )
30412 +{
30413 +       reiser4_key unit_key;
30414 +       entry_actor_args *args;
30415 +
30416 +       assert("nikita-1131", tree != NULL);
30417 +       assert("nikita-1132", coord != NULL);
30418 +       assert("nikita-1133", entry_actor_arg != NULL);
30419 +
30420 +       args = entry_actor_arg;
30421 +       ++args->non_uniq;
30422 +#if REISER4_USE_COLLISION_LIMIT
30423 +       if (args->non_uniq > args->max_non_uniq) {
30424 +               args->not_found = 1;
30425 +               /* hash collision overflow. */
30426 +               return RETERR(-EBUSY);
30427 +       }
30428 +#endif
30429 +
30430 +       /*
30431 +        * did we just reach the end of the sequence of items/units with
30432 +        * identical keys?
30433 +        */
30434 +       if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
30435 +               assert("nikita-1791",
30436 +                      keylt(args->key, unit_key_by_coord(coord, &unit_key)));
30437 +               args->not_found = 1;
30438 +               args->last_coord.between = AFTER_UNIT;
30439 +               return 0;
30440 +       }
30441 +
30442 +       coord_dup(&args->last_coord, coord);
30443 +       /*
30444 +        * did the scan just move to the next node?
30445 +        */
30446 +       if (args->last_lh.node != lh->node) {
30447 +               int lock_result;
30448 +
30449 +               /*
30450 +                * if so, lock new node with the mode requested by the caller
30451 +                */
30452 +               done_lh(&args->last_lh);
30453 +               assert("nikita-1896", znode_is_any_locked(lh->node));
30454 +               lock_result = longterm_lock_znode(&args->last_lh, lh->node,
30455 +                                                 args->mode, ZNODE_LOCK_HIPRI);
30456 +               if (lock_result != 0)
30457 +                       return lock_result;
30458 +       }
30459 +       return check_item(args->inode, coord, args->name);
30460 +}
30461 +
30462 +/* Look for the name of dentry @de within directory @dir.
30463 +
30464 +   This is called during lookup, creation and removal of directory
30465 +   entries and on rename_common
30466 +
30467 +   First calculate key that directory entry for the name would have. Search
30468 +   for this key in the tree. If such key is found, scan all items with
30469 +   the same key, checking name in each directory entry along the way.
30470 +*/
30471 +int find_entry(struct inode *dir,      /* directory to scan */
30472 +              struct dentry *de,       /* name to search for */
30473 +              lock_handle * lh,        /* resulting lock handle */
30474 +              znode_lock_mode mode,    /* required lock mode */
30475 +              reiser4_dir_entry_desc * entry   /* parameters of found directory
30476 +                                                * entry */ )
30477 +{
30478 +       const struct qstr *name;
30479 +       seal_t *seal;
30480 +       coord_t *coord;
30481 +       int result;
30482 +       __u32 flags;
30483 +       de_location *dec;
30484 +       reiser4_dentry_fsdata *fsdata;
30485 +
30486 +       assert("nikita-1130", lh != NULL);
30487 +       assert("nikita-1128", dir != NULL);
30488 +
30489 +       name = &de->d_name;
30490 +       assert("nikita-1129", name != NULL);
30491 +
30492 +       /* dentry private data don't require lock, because dentry
30493 +          manipulations are protected by i_mutex on parent.
30494 +
30495 +          This is not so for inodes, because there is no -the- parent in
30496 +          inode case.
30497 +        */
30498 +       fsdata = reiser4_get_dentry_fsdata(de);
30499 +       if (IS_ERR(fsdata))
30500 +               return PTR_ERR(fsdata);
30501 +       dec = &fsdata->dec;
30502 +
30503 +       coord = &dec->entry_coord;
30504 +       coord_clear_iplug(coord);
30505 +       seal = &dec->entry_seal;
30506 +       /* compose key of directory entry for @name */
30507 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
30508 +
30509 +       if (seal_is_set(seal)) {
30510 +               /* check seal */
30511 +               result = seal_validate(seal, coord, &entry->key,
30512 +                                      lh, mode, ZNODE_LOCK_LOPRI);
30513 +               if (result == 0) {
30514 +                       /* key was found. Check that it is really item we are
30515 +                          looking for. */
30516 +                       result = check_entry(dir, coord, name);
30517 +                       if (result == 0)
30518 +                               return 0;
30519 +               }
30520 +       }
30521 +       flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
30522 +       /*
30523 +        * find place in the tree where directory item should be located.
30524 +        */
30525 +       result = object_lookup(dir, &entry->key, coord, lh, mode,
30526 +                              FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags,
30527 +                              NULL /*ra_info */ );
30528 +       if (result == CBK_COORD_FOUND) {
30529 +               entry_actor_args arg;
30530 +
30531 +               /* fast path: no hash collisions */
30532 +               result = check_entry(dir, coord, name);
30533 +               if (result == 0) {
30534 +                       seal_init(seal, coord, &entry->key);
30535 +                       dec->pos = 0;
30536 +               } else if (result > 0) {
30537 +                       /* Iterate through all units with the same keys. */
30538 +                       arg.name = name->name;
30539 +                       arg.key = &entry->key;
30540 +                       arg.not_found = 0;
30541 +                       arg.non_uniq = 0;
30542 +#if REISER4_USE_COLLISION_LIMIT
30543 +                       arg.max_non_uniq = max_hash_collisions(dir);
30544 +                       assert("nikita-2851", arg.max_non_uniq > 1);
30545 +#endif
30546 +                       arg.mode = mode;
30547 +                       arg.inode = dir;
30548 +                       coord_init_zero(&arg.last_coord);
30549 +                       init_lh(&arg.last_lh);
30550 +
30551 +                       result = iterate_tree(tree_by_inode(dir), coord, lh,
30552 +                                             entry_actor, &arg, mode, 1);
30553 +                       /* if end of the tree or extent was reached during
30554 +                          scanning. */
30555 +                       if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
30556 +                               /* step back to the last visited coord */
30557 +                               done_lh(lh);
30558 +
30559 +                               result = zload(arg.last_coord.node);
30560 +                               if (result == 0) {
30561 +                                       coord_clear_iplug(&arg.last_coord);
30562 +                                       coord_dup(coord, &arg.last_coord);
30563 +                                       move_lh(lh, &arg.last_lh);
30564 +                                       result = RETERR(-ENOENT);
30565 +                                       zrelse(arg.last_coord.node);
30566 +                                       --arg.non_uniq;
30567 +                               }
30568 +                       }
30569 +
30570 +                       done_lh(&arg.last_lh);
30571 +                       if (result == 0)
30572 +                               seal_init(seal, coord, &entry->key);
30573 +
30574 +                       if (result == 0 || result == -ENOENT) {
30575 +                               assert("nikita-2580", arg.non_uniq > 0);
30576 +                               dec->pos = arg.non_uniq - 1;
30577 +                       }
30578 +               }
30579 +       } else  /* object_lookup() did not return CBK_COORD_FOUND */
30580 +               dec->pos = -1;
30581 +       return result;
30582 +}
30583 +
30584 +/* Local variables:
30585 +   c-indentation-style: "K&R"
30586 +   mode-name: "LC"
30587 +   c-basic-offset: 8
30588 +   tab-width: 8
30589 +   fill-column: 120
30590 +   End:
30591 +*/
30592 diff --git a/fs/reiser4/plugin/disk_format/Makefile b/fs/reiser4/plugin/disk_format/Makefile
30593 new file mode 100644
30594 index 0000000..e4e9e54
30595 --- /dev/null
30596 +++ b/fs/reiser4/plugin/disk_format/Makefile
30597 @@ -0,0 +1,5 @@
30598 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
30599 +# objects that df_plugins.o is composed of
30600 +df_plugins-objs :=     \
30601 +       disk_format40.o \
30602 +       disk_format.o
30603 diff --git a/fs/reiser4/plugin/disk_format/disk_format.c b/fs/reiser4/plugin/disk_format/disk_format.c
30604 new file mode 100644
30605 index 0000000..e324787
30606 --- /dev/null
30607 +++ b/fs/reiser4/plugin/disk_format/disk_format.c
30608 @@ -0,0 +1,37 @@
30609 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30610 +
30611 +#include "../../debug.h"
30612 +#include "../plugin_header.h"
30613 +#include "disk_format40.h"
30614 +#include "disk_format.h"
30615 +#include "../plugin.h"
30616 +
30617 +/* table of disk layout plugins, indexed by disk_format_id */
30618 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30619 +       [FORMAT40_ID] = {
30620 +               .h = {
30621 +                       .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30622 +                       .id = FORMAT40_ID,
30623 +                       .pops = NULL,
30624 +                       .label = "reiser40",
30625 +                       .desc = "standard disk layout for reiser40",
30626 +                       .linkage = {NULL, NULL}
30627 +               },
30628 +               .init_format = init_format_format40,
30629 +               .root_dir_key = root_dir_key_format40,
30630 +               .release = release_format40,
30631 +               .log_super = log_super_format40,
30632 +               .check_open = check_open_format40
30633 +       }
30634 +};
30635 +
30636 +/* Make Linus happy.
30637 +   Local variables:
30638 +   c-indentation-style: "K&R"
30639 +   mode-name: "LC"
30640 +   c-basic-offset: 8
30641 +   tab-width: 8
30642 +   fill-column: 120
30643 +   scroll-step: 1
30644 +   End:
30645 +*/
30646 diff --git a/fs/reiser4/plugin/disk_format/disk_format.h b/fs/reiser4/plugin/disk_format/disk_format.h
30647 new file mode 100644
30648 index 0000000..b9c53ac
30649 --- /dev/null
30650 +++ b/fs/reiser4/plugin/disk_format/disk_format.h
30651 @@ -0,0 +1,27 @@
30652 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30653 +
30654 +/* identifiers for disk layouts, they are also used as indexes in array of disk
30655 +   plugins */
30656 +
30657 +#if !defined( __REISER4_DISK_FORMAT_H__ )
30658 +#define __REISER4_DISK_FORMAT_H__
30659 +
30660 +typedef enum {
30661 +       /* standard reiser4 disk layout plugin id */
30662 +       FORMAT40_ID,
30663 +       LAST_FORMAT_ID  /* number of ids above; dimension of format_plugins[] */
30664 +} disk_format_id;
30665 +
30666 +/* __REISER4_DISK_FORMAT_H__ */
30667 +#endif
30668 +
30669 +/* Make Linus happy.
30670 +   Local variables:
30671 +   c-indentation-style: "K&R"
30672 +   mode-name: "LC"
30673 +   c-basic-offset: 8
30674 +   tab-width: 8
30675 +   fill-column: 120
30676 +   scroll-step: 1
30677 +   End:
30678 +*/
30679 diff --git a/fs/reiser4/plugin/disk_format/disk_format40.c b/fs/reiser4/plugin/disk_format/disk_format40.c
30680 new file mode 100644
30681 index 0000000..0ace020
30682 --- /dev/null
30683 +++ b/fs/reiser4/plugin/disk_format/disk_format40.c
30684 @@ -0,0 +1,556 @@
30685 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30686 +
30687 +#include "../../debug.h"
30688 +#include "../../dformat.h"
30689 +#include "../../key.h"
30690 +#include "../node/node.h"
30691 +#include "../space/space_allocator.h"
30692 +#include "disk_format40.h"
30693 +#include "../plugin.h"
30694 +#include "../../txnmgr.h"
30695 +#include "../../jnode.h"
30696 +#include "../../tree.h"
30697 +#include "../../super.h"
30698 +#include "../../wander.h"
30699 +#include "../../inode.h"
30700 +#include "../../ktxnmgrd.h"
30701 +#include "../../status_flags.h"
30702 +
30703 +#include <linux/types.h>       /* for __u??  */
30704 +#include <linux/fs.h>          /* for struct super_block  */
30705 +#include <linux/buffer_head.h>
30706 +
30707 +/* reiser 4.0 default disk layout */
30708 +
30709 +/* Amount of free blocks needed to perform release_format40 when fs gets
30710 +   mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
30711 +   & tx record. */
30712 +#define RELEASE_RESERVED 4
30713 +
30714 +/* read fields of format40_disk_super_block: unaligned little-endian on disk, CPU byte order on return */
30715 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
30716 +{
30717 +       return le64_to_cpu(get_unaligned(&sb->block_count));
30718 +}
30719 +
30720 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
30721 +{
30722 +       return le64_to_cpu(get_unaligned(&sb->free_blocks));
30723 +}
30724 +
30725 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
30726 +{
30727 +       return le64_to_cpu(get_unaligned(&sb->root_block));
30728 +}
30729 +
30730 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
30731 +{
30732 +       return le16_to_cpu(get_unaligned(&sb->tree_height));
30733 +}
30734 +
30735 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
30736 +{
30737 +       return le64_to_cpu(get_unaligned(&sb->file_count));
30738 +}
30739 +
30740 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
30741 +{
30742 +       return le64_to_cpu(get_unaligned(&sb->oid));
30743 +}
30744 +
30745 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
30746 +{
30747 +       return le32_to_cpu(get_unaligned(&sb->mkfs_id));
30748 +}
30749 +
30750 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
30751 +{
30752 +       return le64_to_cpu(get_unaligned(&sb->flags));
30753 +}
30754 +/* format40 part (u.format40) of the private data attached to @super */
30755 +static format40_super_info *get_sb_info(struct super_block *super)
30756 +{
30757 +       return &get_super_private(super)->u.format40;
30758 +}
30759 +
30760 +static int consult_diskmap(struct super_block *s)
30761 +{
30762 +       format40_super_info *info;
30763 +       journal_location *jloc;
30764 +
30765 +       info = get_sb_info(s);
30766 +       jloc = &get_super_private(s)->jloc;
30767 +       /* Default format-specific locations, if there is nothing in
30768 +        * diskmap */
30769 +       jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
30770 +       jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
30771 +       info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
30772 +#ifdef CONFIG_REISER4_BADBLOCKS
30773 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
30774 +                                 &jloc->footer);
30775 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
30776 +                                 &jloc->header);
30777 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
30778 +                                 &info->loc.super);
30779 +#endif
30780 +       return 0;
30781 +}
30782 +
30783 +/* read the block at loc.super, check the format40 magic and seed the in-memory
30784 +   block counters from it. Only this one location is tried. Returns the buffer
30785 +   head (released by the caller with brelse()) or ERR_PTR(-EIO / -EINVAL) */
30786 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
30787 +                                                           *s)
30788 +{
30789 +       const format40_disk_super_block *on_disk;
30790 +       struct buffer_head *bh;
30791 +       __u64 total, unused;
30792 +
30793 +       assert("umka-487", s != NULL);
30794 +
30795 +       bh = sb_bread(s, get_sb_info(s)->loc.super);
30796 +       if (bh == NULL)
30797 +               return ERR_PTR(RETERR(-EIO));
30798 +
30799 +       on_disk = (const format40_disk_super_block *) bh->b_data;
30800 +       if (strncmp(on_disk->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC)) != 0) {
30801 +               brelse(bh);
30802 +               return ERR_PTR(RETERR(-EINVAL));
30803 +       }
30804 +
30805 +       /* data blocks are whatever the on-disk counters do not report as free */
30806 +       total = get_format40_block_count(on_disk);
30807 +       unused = get_format40_free_blocks(on_disk);
30808 +       reiser4_set_block_count(s, total);
30809 +       reiser4_set_data_blocks(s, total - unused);
30810 +       reiser4_set_free_blocks(s, unused);
30811 +
30812 +       return bh;
30813 +}
30814 +
30815 +/* read the super block after the journal has been replayed; the result is that
30816 +   of find_a_disk_format40_super_block(): a buffer head or an ERR_PTR() */
30817 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
30818 +{
30819 +       /* Here the most recent superblock copy has to be read. However, as
30820 +          journal replay isn't complete, we are using
30821 +          find_a_disk_format40_super_block(). Note: @s is used despite UNUSED_ARG. */
30822 +       return find_a_disk_format40_super_block(s);
30823 +}
30824 +
30825 +static int get_super_jnode(struct super_block *s)
30826 +{
30827 +       reiser4_super_info_data *sbinfo = get_super_private(s);
30828 +       jnode *sb_jnode;
30829 +       int ret;
30830 +
30831 +       sb_jnode = alloc_io_head(&get_sb_info(s)->loc.super);
30832 +
30833 +       ret = jload(sb_jnode);
30834 +
30835 +       if (ret) {
30836 +               drop_io_head(sb_jnode);
30837 +               return ret;
30838 +       }
30839 +
30840 +       pin_jnode_data(sb_jnode);
30841 +       jrelse(sb_jnode);
30842 +
30843 +       sbinfo->u.format40.sb_jnode = sb_jnode;
30844 +
30845 +       return 0;
30846 +}
30847 +/* counterpart of get_super_jnode(): unpin and drop the super block jnode, if one is set */
30848 +static void done_super_jnode(struct super_block *s)
30849 +{
30850 +       jnode *head = get_super_private(s)->u.format40.sb_jnode;
30851 +
30852 +       if (head == NULL)
30853 +               return;
30854 +       unpin_jnode_data(head);
30855 +       drop_io_head(head);
30856 +}
30857 +
30858 +typedef enum format40_init_stage {     /* steps of try_init_format40(), in completion order */
30859 +       NONE_DONE = 0,
30860 +       CONSULT_DISKMAP,
30861 +       FIND_A_SUPER,
30862 +       INIT_JOURNAL_INFO,
30863 +       INIT_STATUS,
30864 +       JOURNAL_REPLAY,
30865 +       READ_SUPER,
30866 +       KEY_CHECK,
30867 +       INIT_OID,
30868 +       INIT_TREE,
30869 +       JOURNAL_RECOVER,
30870 +       INIT_SA,
30871 +       INIT_JNODE,     /* never stored by try_init_format40(): it goes from INIT_SA to ALL_DONE */
30872 +       ALL_DONE
30873 +} format40_init_stage;
30874 +/* kmalloc()ed copy of the super block held in @super_bh, or ERR_PTR(-ENOMEM); the caller kfree()s it */
30875 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
30876 +{
30877 +       const size_t size = sizeof(format40_disk_super_block);
30878 +       format40_disk_super_block *dup = kmalloc(size, get_gfp_mask());
30879 +
30880 +       if (dup == NULL)
30881 +               return ERR_PTR(RETERR(-ENOMEM));
30882 +
30883 +       memcpy(dup, super_bh->b_data, size);
30884 +       return dup;
30885 +}
30886 +/* 0 if the FORMAT40_LARGE_KEYS flag on disk agrees with REISER4_LARGE_KEY, else -EINVAL */
30887 +static int check_key_format(const format40_disk_super_block *sb_copy)
30888 +{
30889 +       const __u64 large = get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS);
30890 +
30891 +       if (equi(REISER4_LARGE_KEY, large))
30892 +               return 0;
30893 +
30894 +       warning("nikita-3228", "Key format mismatch. " "Only %s keys are supported.",
30895 +               REISER4_LARGE_KEY ? "large" : "small");
30896 +       return RETERR(-EINVAL);
30897 +}
30898 +
30899 +/**
30900 + * try_init_format40 - run the format40 initialization steps in order
30901 + * @super: super block being initialized
30902 + * @stage: out: last step completed, so that the caller can undo what was done
30903 + * Returns 0 with *@stage == ALL_DONE, or the result of the step that failed.
30904 + */
30905 +static int try_init_format40(struct super_block *super,
30906 +                            format40_init_stage *stage)
30907 +{
30908 +       int result;
30909 +       struct buffer_head *super_bh;
30910 +       reiser4_super_info_data *sbinfo;
30911 +       format40_disk_super_block *sb_copy;
30912 +       tree_level height;
30913 +       reiser4_block_nr root_block;
30914 +       node_plugin *nplug;
30915 +
30916 +       assert("vs-475", super != NULL);
30917 +       assert("vs-474", get_super_private(super));
30918 +
30919 +       *stage = NONE_DONE;
30920 +
30921 +       result = consult_diskmap(super);
30922 +       if (result)
30923 +               return result;
30924 +       *stage = CONSULT_DISKMAP;
30925 +
30926 +       super_bh = find_a_disk_format40_super_block(super);
30927 +       if (IS_ERR(super_bh))
30928 +               return PTR_ERR(super_bh);
30929 +       brelse(super_bh);
30930 +       *stage = FIND_A_SUPER;
30931 +
30932 +       /* map jnodes for journal control blocks (header, footer) to disk  */
30933 +       result = init_journal_info(super);
30934 +       if (result)
30935 +               return result;
30936 +       *stage = INIT_JOURNAL_INFO;
30937 +
30938 +       /* ok, we are sure that filesystem format is a format40 format */
30939 +       /* Now check its state */
30940 +       result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
30941 +       if (result != 0 && result != -EINVAL)
30942 +               /* -EINVAL means there is no magic, so probably just old
30943 +                * fs. */
30944 +               return result;
30945 +       *stage = INIT_STATUS;
30946 +
30947 +       result = reiser4_status_query(NULL, NULL);
30948 +       if (result == REISER4_STATUS_MOUNT_WARN)
30949 +               printk("Warning, mounting filesystem with errors\n");
30950 +       if (result == REISER4_STATUS_MOUNT_RO) {
30951 +               printk
30952 +                   ("Warning, mounting filesystem with fatal errors, forcing read-only mount\n");
30953 +               /* FIXME: here we should actually enforce read-only mount,
30954 +                * only it is unsupported yet. */
30955 +       }
30956 +
30957 +       result = reiser4_journal_replay(super);
30958 +       if (result)
30959 +               return result;
30960 +       *stage = JOURNAL_REPLAY;
30961 +
30962 +       super_bh = read_super_block(super);
30963 +       if (IS_ERR(super_bh))
30964 +               return PTR_ERR(super_bh);
30965 +       *stage = READ_SUPER;
30966 +
30967 +       /* allocate and make a copy of format40_disk_super_block */
30968 +       sb_copy = copy_sb(super_bh);
30969 +       brelse(super_bh);
30970 +       if (IS_ERR(sb_copy))
30971 +               return PTR_ERR(sb_copy);
30972 +
30973 +       /* make sure that key format of kernel and filesystem match */
30974 +       result = check_key_format(sb_copy);
30975 +       if (result) {
30976 +               kfree(sb_copy);
30977 +               return result;
30978 +       }
30979 +       *stage = KEY_CHECK;
30980 +
30981 +       result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30982 +                                   get_format40_oid(sb_copy));
30983 +       if (result) {
30984 +               kfree(sb_copy);
30985 +               return result;
30986 +       }
30987 +       *stage = INIT_OID;
30988 +
30989 +       /* get things necessary to init reiser4_tree */
30990 +       root_block = get_format40_root_block(sb_copy);
30991 +       height = get_format40_tree_height(sb_copy);
30992 +       nplug = node_plugin_by_id(NODE40_ID);
30993 +
30994 +
30995 +       /* initialize reiser4_super_info_data */
30996 +       sbinfo = get_super_private(super);
30997 +       assert("", sbinfo->tree.super == super);
30998 +       /* init reiser4_tree for the filesystem */
30999 +       result = init_tree(&sbinfo->tree, &root_block, height, nplug);
31000 +       if (result) {
31001 +               kfree(sb_copy);
31002 +               return result;
31003 +       }
31004 +       *stage = INIT_TREE;
31005 +
31006 +       /*
31007 +        * initialize reiser4_super_info_data with data from format40 super
31008 +        * block
31009 +        */
31010 +       sbinfo->default_uid = 0;
31011 +       sbinfo->default_gid = 0;
31012 +       sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
31013 +       /* number of blocks in filesystem and reserved space */
31014 +       reiser4_set_block_count(super, get_format40_block_count(sb_copy));
31015 +       sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
31016 +       kfree(sb_copy);
31017 +
31018 +       sbinfo->fsuid = 0;
31019 +       sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
31020 +                                                * are not supported */
31021 +       sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN);     /* all nodes in
31022 +                                                                * layout 40 are
31023 +                                                                * of one
31024 +                                                                * plugin */
31025 +       /* sbinfo->tmgr is initialized already */
31026 +
31027 +       /* recover sb data which were logged separately from sb block */
31028 +
31029 +       /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
31030 +        * oid_init_allocator() and reiser4_set_free_blocks() with new
31031 +        * data. What's the reason to call them above? */
31032 +       result = reiser4_journal_recover_sb_data(super);
31033 +       if (result != 0)
31034 +               return result;
31035 +       *stage = JOURNAL_RECOVER;
31036 +
31037 +       /*
31038 +        * Set number of used blocks.  The number of used blocks is not stored
31039 +        * neither in on-disk super block nor in the journal footer blocks.  At
31040 +        * this moment actual values of total blocks and free block counters
31041 +        * are set in the reiser4 super block (in-memory structure) and we can
31042 +        * calculate number of used blocks from them.
31043 +        */
31044 +       reiser4_set_data_blocks(super,
31045 +                               reiser4_block_count(super) -
31046 +                               reiser4_free_blocks(super));
31047 +
31048 +#if REISER4_DEBUG
31049 +       sbinfo->min_blocks_used = 16 /* reserved area */  +
31050 +               2 /* super blocks */  +
31051 +               2 /* journal footer and header */ ;
31052 +#endif
31053 +
31054 +       /* init disk space allocator */
31055 +       result = sa_init_allocator(get_space_allocator(super), super, NULL);
31056 +       if (result)
31057 +               return result;
31058 +       *stage = INIT_SA;
31059 +
31060 +       result = get_super_jnode(super);
31061 +       if (result == 0)
31062 +               *stage = ALL_DONE;
31063 +       return result;
31064 +}
31065 +
31066 +/* plugin->u.format.get_ready: initialize format40, undoing the completed steps if one fails */
31067 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
31068 +{
31069 +       int result;
31070 +       format40_init_stage stage;
31071 +
31072 +       result = try_init_format40(s, &stage);
31073 +       switch (stage) {        /* stage is the last completed step: unwind from there */
31074 +       case ALL_DONE:
31075 +               assert("nikita-3458", result == 0);
31076 +               break;
31077 +       case INIT_JNODE:
31078 +               done_super_jnode(s);    /* fall through */
31079 +       case INIT_SA:
31080 +               sa_destroy_allocator(get_space_allocator(s), s);        /* fall through */
31081 +       case JOURNAL_RECOVER:
31082 +       case INIT_TREE:
31083 +               done_tree(&get_super_private(s)->tree); /* fall through */
31084 +       case INIT_OID:
31085 +       case KEY_CHECK:
31086 +       case READ_SUPER:
31087 +       case JOURNAL_REPLAY:
31088 +       case INIT_STATUS:
31089 +               reiser4_status_finish();        /* fall through */
31090 +       case INIT_JOURNAL_INFO:
31091 +               done_journal_info(s);   /* fall through */
31092 +       case FIND_A_SUPER:
31093 +       case CONSULT_DISKMAP:
31094 +       case NONE_DONE:
31095 +               break;
31096 +       default:
31097 +               impossible("nikita-3457", "init stage: %i", stage);
31098 +       }
31099 +       /* only a successful init is checked for the blocks release_format40() needs; a failure keeps its own error */
31100 +       if (result == 0 && !rofs_super(s) &&
31101 +           reiser4_free_blocks(s) < RELEASE_RESERVED)
31102 +               return RETERR(-ENOSPC);
31103 +       return result;
31104 +}
31105 +/* store current free blocks, tree root/height and oid counters, little-endian, into the image at @data */
31106 +static void pack_format40_super(const struct super_block *s, char *data)
31107 +{
31108 +       reiser4_super_info_data *sbi = get_super_private(s);
31109 +       format40_disk_super_block *dsb = (format40_disk_super_block *) data;
31110 +
31111 +       assert("zam-591", data != NULL);
31112 +
31113 +       /* space accounting and location of the tree root */
31114 +       put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
31115 +                     &dsb->free_blocks);
31116 +       put_unaligned(cpu_to_le64(sbi->tree.root_block), &dsb->root_block);
31117 +       /* oid_next() goes to the oid field, oids_used() to file_count */
31118 +       put_unaligned(cpu_to_le64(oid_next(s)), &dsb->oid);
31119 +       put_unaligned(cpu_to_le64(oids_used(s)), &dsb->file_count);
31120 +       /* the height is the only 16-bit field written here */
31121 +       put_unaligned(cpu_to_le16(sbi->tree.height), &dsb->tree_height);
31122 +}
31123 +
31124 +/* plugin->u.format.log_super
31125 +   refresh the super block image held by the format40 super jnode and return
31126 +   that jnode, which should be added to transaction when the super block gets
31127 +   logged */
31128 +jnode *log_super_format40(struct super_block *s)
31129 +{
31130 +       jnode *node = get_super_private(s)->u.format40.sb_jnode;
31131 +
31132 +       /* NOTE(review): the result of jload() is not checked here */
31133 +       jload(node);
31134 +       /* write the current in-memory values into the loaded data */
31135 +       pack_format40_super(s, jdata(node));
31136 +       /* balance the jload() above */
31137 +       jrelse(node);
31138 +
31139 +       return node;
31140 +}
31141 +
31142 +/* plugin->u.format.release: commit everything unless read-only, then tear down format40 state; always returns 0 */
31143 +int release_format40(struct super_block *s)
31144 +{
31145 +       int ret;
31146 +       reiser4_super_info_data *sbinfo;
31147 +
31148 +       sbinfo = get_super_private(s);
31149 +       assert("zam-579", sbinfo != NULL);
31150 +
31151 +       if (!rofs_super(s)) {
31152 +               ret = capture_super_block(s);
31153 +               if (ret != 0)
31154 +                       warning("vs-898", "capture_super_block failed: %d",
31155 +                               ret);
31156 +
31157 +               ret = txnmgr_force_commit_all(s, 1);
31158 +               if (ret != 0)
31159 +                       warning("jmacd-74438", "txn_force failed: %d", ret);
31160 +
31161 +               all_grabbed2free();
31162 +       }
31163 +
31164 +       sa_destroy_allocator(&sbinfo->space_allocator, s);
31165 +       done_journal_info(s);
31166 +       done_super_jnode(s);
31167 +
31168 +       rcu_barrier();
31169 +       done_tree(&sbinfo->tree);
31170 +       /* second rcu_barrier(): wait for the RCU callbacks queued when
31171 +        * done_tree() "released" some znodes. */
31172 +       rcu_barrier();
31173 +
31174 +       return 0;
31175 +}
31176 +
31177 +#define FORMAT40_ROOT_LOCALITY 41
31178 +#define FORMAT40_ROOT_OBJECTID 42
31179 +
31180 +/* plugin->u.format.root_dir_key: constant key built from locality 41, objectid 42 and KEY_SD_MINOR; @super is ignored */
31181 +const reiser4_key *root_dir_key_format40(const struct super_block *super
31182 +                                        UNUSED_ARG)
31183 +{
31184 +       static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
31185 +               .el = {
31186 +                       __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
31187 +#if REISER4_LARGE_KEY
31188 +                       ON_LARGE_KEY(0ull,)
31189 +#endif
31190 +                       __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
31191 +                       0ull
31192 +               }
31193 +       };
31194 +
31195 +       return &FORMAT40_ROOT_DIR_KEY;
31196 +}
31197 +
31198 +/* plugin->u.format.check_open.
31199 +   Sanity-check an object being opened: neither its oid nor its locality id may
31200 +   exceed the max used oid (oid_next() - 1). Returns 0 when both are in range
31201 +   and -EIO otherwise. */
31202 +int check_open_format40(const struct inode *object)
31203 +{
31204 +       const oid_t last_used = oid_next(object->i_sb) - 1;
31205 +       oid_t objectid;
31206 +       oid_t locality;
31207 +
31208 +       /* the object id comes first */
31209 +       objectid = get_inode_oid(object);
31210 +       if (objectid > last_used) {
31211 +               warning("vpf-1360", "The object with the oid %llu "
31212 +                       "greater then the max used oid %llu found.",
31213 +                       (unsigned long long)objectid, (unsigned long long)last_used);
31214 +
31215 +               return RETERR(-EIO);
31216 +       }
31217 +
31218 +       /* the locality is only looked at once the oid has passed */
31219 +       locality = reiser4_inode_data(object)->locality_id;
31220 +       if (locality > last_used) {
31221 +               warning("vpf-1360", "The object with the locality %llu "
31222 +                       "greater then the max used oid %llu found.",
31223 +                       (unsigned long long)locality, (unsigned long long)last_used);
31224 +
31225 +               return RETERR(-EIO);
31226 +       }
31227 +
31228 +       return 0;
31229 +}
31230 +
31231 +/* Make Linus happy.
31232 +   Local variables:
31233 +   c-indentation-style: "K&R"
31234 +   mode-name: "LC"
31235 +   c-basic-offset: 8
31236 +   tab-width: 8
31237 +   fill-column: 120
31238 +   scroll-step: 1
31239 +   End:
31240 +*/
31241 diff --git a/fs/reiser4/plugin/disk_format/disk_format40.h b/fs/reiser4/plugin/disk_format/disk_format40.h
31242 new file mode 100644
31243 index 0000000..a34302d
31244 --- /dev/null
31245 +++ b/fs/reiser4/plugin/disk_format/disk_format40.h
31246 @@ -0,0 +1,99 @@
31247 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
31248 +
31249 +/* this file contains:
31250 +   - definition of ondisk super block of standard disk layout for
31251 +     reiser 4.0 (layout 40)
31252 +   - definition of layout 40 specific portion of in-core super block
31253 +   - declarations of functions implementing methods of layout plugin
31254 +     for layout 40
31255 +   - declarations of functions used to get/set fields in layout 40 super block
31256 +*/
31257 +
31258 +#ifndef __DISK_FORMAT40_H__
31259 +#define __DISK_FORMAT40_H__
31260 +
31261 +/* magic for default reiser4 layout */
31262 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
31263 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
31264 +
31265 +#include "../../dformat.h"
31266 +
31267 +#include <linux/fs.h>          /* for struct super_block  */
31268 +
31269 +typedef enum {
31270 +       FORMAT40_LARGE_KEYS     /* bit number within format40_disk_super_block.flags */
31271 +} format40_flags;
31272 +
31273 +/* ondisk super block for format 40. It is 512 bytes long */
31274 +typedef struct format40_disk_super_block {
31275 +       /*   0 */ d64 block_count;
31276 +       /* number of block in a filesystem */
31277 +       /*   8 */ d64 free_blocks;
31278 +       /* number of free blocks */
31279 +       /*  16 */ d64 root_block;
31280 +       /* filesystem tree root block */
31281 +       /*  24 */ d64 oid;
31282 +       /* smallest free objectid */
31283 +       /*  32 */ d64 file_count;
31284 +       /* number of files in a filesystem */
31285 +       /*  40 */ d64 flushes;
31286 +       /* number of times super block was
31287 +          flushed. Needed if format 40
31288 +          will have few super blocks */
31289 +       /*  48 */ d32 mkfs_id;
31290 +       /* unique identifier of fs */
31291 +       /*  52 */ char magic[16];
31292 +       /* magic string ReIsEr40FoRmAt */
31293 +       /*  68 */ d16 tree_height;
31294 +       /* height of filesystem tree */
31295 +       /*  70 */ d16 formatting_policy;
31296 +       /*  72 */ d64 flags;    /* bit numbers are format40_flags values */
31297 +       /*  80 */ char not_used[432];
31298 +} format40_disk_super_block;
31299 +
31300 +/* format 40 specific part of reiser4_super_info_data */
31301 +typedef struct format40_super_info {
31302 +/*     format40_disk_super_block actual_sb; */
31303 +       jnode *sb_jnode;        /* jnode of the format40 super block */
31304 +       struct {
31305 +               reiser4_block_nr super; /* block number of the format40 super block */
31306 +       } loc;
31307 +} format40_super_info;
31308 +
31309 +/* Defines for journal header and footer respectively. */
31310 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
31311 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
31312 +
31313 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
31314 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
31315 +
31316 +#define FORMAT40_STATUS_BLOCKNR \
31317 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
31318 +
31319 +/* Diskmap declarations */
31320 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
31321 +#define FORMAT40_SUPER 1
31322 +#define FORMAT40_JH 2
31323 +#define FORMAT40_JF 3
31324 +
31325 +/* declarations of functions implementing methods of layout plugin for
31326 +   format 40. The functions themselves are in disk_format40.c */
31327 +int init_format_format40(struct super_block *, void *data);
31328 +const reiser4_key *root_dir_key_format40(const struct super_block *);
31329 +int release_format40(struct super_block *s);
31330 +jnode *log_super_format40(struct super_block *s);
31331 +int check_open_format40(const struct inode *object);
31332 +
31333 +/* __DISK_FORMAT40_H__ */
31334 +#endif
31335 +
31336 +/* Make Linus happy.
31337 +   Local variables:
31338 +   c-indentation-style: "K&R"
31339 +   mode-name: "LC"
31340 +   c-basic-offset: 8
31341 +   tab-width: 8
31342 +   fill-column: 120
31343 +   scroll-step: 1
31344 +   End:
31345 +*/
31346 diff --git a/fs/reiser4/plugin/fibration.c b/fs/reiser4/plugin/fibration.c
31347 new file mode 100644
31348 index 0000000..dcd13c8
31349 --- /dev/null
31350 +++ b/fs/reiser4/plugin/fibration.c
31351 @@ -0,0 +1,174 @@
31352 +/* Copyright 2004 by Hans Reiser, licensing governed by
31353 + * reiser4/README */
31354 +
31355 +/* Directory fibrations */
31356 +
31357 +/*
31358 + * Suppose we have a directory tree with sources of some project. During
31359 + * compilation .o files are created within this tree. This makes access
31360 + * to the original source files less efficient, because source files are
31361 + * now "diluted" by object files: default directory plugin uses prefix
31362 + * of a file name as a part of the key for directory entry (and this
31363 + * part is also inherited by the key of file body). This means that
31364 + * foo.o will be located close to foo.c and foo.h in the tree.
31365 + *
31366 + * To avoid this effect the directory plugin fills the highest 7
31367 + * (originally unused) bits of the second component of the directory
31368 + * entry key with a bit-pattern depending on the file name (see
31369 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
31370 + * the "fibre". The fibre of the file name key is inherited by the key of
31371 + * stat data and the keys of file body (in the case of REISER4_LARGE_KEY).
31372 + *
31373 + * Fibre for a given file is chosen by per-directory fibration
31374 + * plugin. Names within given fibre are ordered lexicographically.
31375 + */
31376 +
31377 +#include "../debug.h"
31378 +#include "plugin_header.h"
31379 +#include "plugin.h"
31380 +#include "../super.h"
31381 +#include "../inode.h"
31382 +
31383 +#include <linux/types.h>
31384 +
31385 +static const int fibre_shift = 57;
31386 +
31387 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
31388 +
31389 +/*
31390 + * Trivial fibration: every name gets fibre 0 (the arguments are ignored), so
31391 + * all files of a directory are just ordered lexicographically.
31392 + */
31393 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
31394 +{
31395 +       return FIBRE_NO(0);
31396 +}
31397 +
31398 +/*
31399 + * dot-o fibration: names ending in ".o" get fibre 1 and are thereby placed
31400 + * after all others, which get fibre 0.
31401 + */
31402 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
31403 +{
31404 +       /* len must exceed 2, so a name that is just ".o" stays in fibre 0 */
31405 +       const int dot_o = len > 2 && name[len - 2] == '.' && name[len - 1] == 'o';
31406 +
31407 +       return FIBRE_NO(dot_o ? 1 : 0);
31408 +}
31409 +
31410 +/*
31411 + * ext.1 fibration: a name with a one-character extension (file "foo.h")
31412 + * goes into the fibre numbered by that character ("h"); every other name
31413 + * goes into the default fibre 0.
31414 + */
31415 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
31416 +{
31417 +       if (len <= 2 || name[len - 2] != '.')
31418 +               return FIBRE_NO(0);
31419 +       /* the shift keeps only the low 7 bits of the character */
31420 +       return FIBRE_NO(name[len - 1]);
31421 +}
31422 +
31423 +/* ext.3 fibration: a name with a 3-character extension gets a fibre made from
31424 + * the sum of those characters (low 7 bits survive the shift); others get 0. */
31425 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
31426 +{
31427 +       int sum;
31428 +
31429 +       if (len <= 4 || name[len - 4] != '.')
31430 +               return FIBRE_NO(0);
31431 +       sum = name[len - 3] + name[len - 2] + name[len - 1];
31432 +       return FIBRE_NO(sum);
31433 +}
31434 +/* ->change of the fibration plugins: install @plugin as the fibration of @inode */
31435 +static int change_fibration(struct inode *inode, reiser4_plugin * plugin)
31436 +{
31437 +       assert("nikita-3503", inode != NULL);
31438 +       assert("nikita-3504", plugin != NULL);
31439 +
31440 +       assert("nikita-3505", is_reiser4_inode(inode));
31441 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
31442 +       assert("nikita-3507",
31443 +              plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
31444 +
31445 +       /* a fibration with the same id is already installed: nothing to change */
31446 +       if (inode_fibration_plugin(inode) != NULL &&
31447 +           inode_fibration_plugin(inode)->h.id == plugin->h.id)
31448 +               return 0;
31449 +
31450 +       /* the fibration is only switched while is_dir_empty() returns 0 */
31451 +       if (is_dir_empty(inode) != 0)
31452 +               return RETERR(-ENOTEMPTY);
31453 +
31454 +       /* the result of plugin_set_fibration() is handed back unchanged */
31455 +       return plugin_set_fibration(&reiser4_inode_data(inode)->pset,
31456 +                                   &plugin->fibration);
31457 +}
31458 +
31459 +
31460 +
31461 +static reiser4_plugin_ops fibration_plugin_ops = {
31462 +       .init = NULL,
31463 +       .load = NULL,
31464 +       .save_len = NULL,
31465 +       .save = NULL,
31466 +       .change = change_fibration      /* the only operation provided */
31467 +};
31468 +
31469 +/* fibration plugins, one entry per reiser4_fibration_id value */
31470 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
31471 +       [FIBRATION_LEXICOGRAPHIC] = {
31472 +               .h = {
31473 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31474 +                       .id = FIBRATION_LEXICOGRAPHIC,
31475 +                       .pops = &fibration_plugin_ops,
31476 +                       .label = "lexicographic",
31477 +                       .desc = "no fibration",
31478 +                       .linkage = {NULL, NULL}
31479 +               },
31480 +               .fibre = fibre_trivial
31481 +       },
31482 +       [FIBRATION_DOT_O] = {
31483 +               .h = {
31484 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31485 +                       .id = FIBRATION_DOT_O,
31486 +                       .pops = &fibration_plugin_ops,
31487 +                       .label = "dot-o",
31488 +                       .desc = "fibrate .o files separately",
31489 +                       .linkage = {NULL, NULL}
31490 +               },
31491 +               .fibre = fibre_dot_o
31492 +       },
31493 +       [FIBRATION_EXT_1] = {
31494 +               .h = {
31495 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31496 +                       .id = FIBRATION_EXT_1,
31497 +                       .pops = &fibration_plugin_ops,
31498 +                       .label = "ext-1",
31499 +                       .desc = "fibrate file by single character extension",
31500 +                       .linkage = {NULL, NULL}
31501 +               },
31502 +               .fibre = fibre_ext_1
31503 +       },
31504 +       [FIBRATION_EXT_3] = {
31505 +               .h = {
31506 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31507 +                       .id = FIBRATION_EXT_3,
31508 +                       .pops = &fibration_plugin_ops,
31509 +                       .label = "ext-3",
31510 +                       .desc = "fibrate file by three character extension",
31511 +                       .linkage = {NULL, NULL}
31512 +               },
31513 +               .fibre = fibre_ext_3
31514 +       }
31515 +};
31516 +
31517 +/*
31518 + * Local variables:
31519 + * c-indentation-style: "K&R"
31520 + * mode-name: "LC"
31521 + * c-basic-offset: 8
31522 + * tab-width: 8
31523 + * fill-column: 79
31524 + * End:
31525 + */
31526 diff --git a/fs/reiser4/plugin/fibration.h b/fs/reiser4/plugin/fibration.h
31527 new file mode 100644
31528 index 0000000..0723cad
31529 --- /dev/null
31530 +++ b/fs/reiser4/plugin/fibration.h
31531 @@ -0,0 +1,37 @@
31532 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
31533 +
31534 +/* Fibration plugin used by hashed directory plugin to segment content
31535 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
31536 +
31537 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
31538 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
31539 +
31540 +#include "plugin_header.h"
31541 +
31542 +typedef struct fibration_plugin {
31543 +       /* generic fields */
31544 +       plugin_header h;
31545 +       /* returns the fibre of directory entry @name (@len chars) in @dir */
31546 +        __u64(*fibre) (const struct inode * dir, const char *name, int len);
31547 +} fibration_plugin;
31548 +
31549 +typedef enum {
31550 +       FIBRATION_LEXICOGRAPHIC,        /* "lexicographic": no fibration */
31551 +       FIBRATION_DOT_O,                /* "dot-o": .o files separately */
31552 +       FIBRATION_EXT_1,                /* "ext-1": by 1-character extension */
31553 +       FIBRATION_EXT_3,                /* "ext-3": by 3-character extension */
31554 +       LAST_FIBRATION_ID               /* size of fibration_plugins[] */
31555 +} reiser4_fibration_id;
31556 +
31557 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
31558 +#endif
31559 +
31560 +/* Make Linus happy.
31561 +   Local variables:
31562 +   c-indentation-style: "K&R"
31563 +   mode-name: "LC"
31564 +   c-basic-offset: 8
31565 +   tab-width: 8
31566 +   fill-column: 120
31567 +   End:
31568 +*/
31569 diff --git a/fs/reiser4/plugin/file/Makefile b/fs/reiser4/plugin/file/Makefile
31570 new file mode 100644
31571 index 0000000..134fa7a
31572 --- /dev/null
31573 +++ b/fs/reiser4/plugin/file/Makefile
31574 @@ -0,0 +1,7 @@
31575 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
31576 +
31577 +file_plugins-objs :=           \
31578 +       file.o                  \
31579 +       tail_conversion.o       \
31580 +       symlink.o               \
31581 +       cryptcompress.o
31582 diff --git a/fs/reiser4/plugin/file/cryptcompress.c b/fs/reiser4/plugin/file/cryptcompress.c
31583 new file mode 100644
31584 index 0000000..64137dc
31585 --- /dev/null
31586 +++ b/fs/reiser4/plugin/file/cryptcompress.c
31587 @@ -0,0 +1,3819 @@
31588 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
31589 +   reiser4/README */
31590 +
31591 +/* This file contains implementations of inode, file, address_space and file
31592 + * plugin operations specific to the cryptcompress file plugin, which manages
31593 + * files with compressed and encrypted bodies. A "cryptcompress file" is built
31594 + * of items of CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html).
31595 + */
31596 +
31597 +#include "../../page_cache.h"
31598 +#include "../../inode.h"
31599 +#include "../cluster.h"
31600 +#include "../object.h"
31601 +#include "../../tree_walk.h"
31602 +#include "cryptcompress.h"
31603 +
31604 +#include <asm/scatterlist.h>
31605 +#include <linux/pagevec.h>
31606 +#include <asm/uaccess.h>
31607 +#include <linux/swap.h>
31608 +#include <linux/writeback.h>
31609 +#include <linux/random.h>
31610 +
31611 +/* get cryptcompress specific portion of the reiser4 inode data of @inode */
31612 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
31613 +{
31614 +       return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
31615 +}
31616 +
31617 +/* plugin->u.file.init_inode_data: reset cryptcompress part of @inode */
31618 +void
31619 +init_inode_data_cryptcompress(struct inode *inode,
31620 +                             reiser4_object_create_data * crd, int create)
31621 +{
31622 +       cryptcompress_info_t *data = cryptcompress_inode_data(inode);
31623 +
31624 +       assert("edward-685", data != NULL);
31625 +
31626 +       /* zero everything, then set up the lock and compression toggle */
31627 +       memset(data, 0, sizeof(*data));
31628 +       init_rwsem(&data->lock);
31629 +       toggle_compression(data, 1);
31630 +
31631 +       init_inode_ordering(inode, crd, create);
31632 +}
31633 +
31634 +#if REISER4_DEBUG
31635 +int crc_inode_ok(struct inode *inode)
31636 +{
31637 +       if (cluster_shift_ok(inode_cluster_shift(inode)))
31638 +               return 1;
31639 +       assert("edward-686", 0);        /* cluster shift is not ok */
31640 +       return 0;
31641 +}
31642 +#endif
31643 +
31644 +static int check_cryptcompress(struct inode *inode)
31645 +{
31646 +       assert("edward-1307", inode_compression_plugin(inode) != NULL);
31647 +
31648 +       /* logical clusters smaller than a page are refused */
31649 +       if (inode_cluster_size(inode) < PAGE_CACHE_SIZE) {
31650 +               warning("edward-1331",
31651 +                       "%s clusters are unsupported",
31652 +                       inode_cluster_plugin(inode)->h.label);
31653 +               return RETERR(-EINVAL);
31654 +       }
31655 +
31656 +       /* FIXME-EDWARD: init? or check? */
31657 +       if (inode_compression_plugin(inode)->init == NULL)
31658 +               return 0;
31659 +       return inode_compression_plugin(inode)->init();
31660 +}
31661 +
31662 +/* The following is a part of reiser4 cipher key manager
31663 +   which is called when opening/creating a cryptcompress file */
31664 +
31665 +/* get/set cipher key info (crypto-stat) kept in the cryptcompress inode data */
31666 +crypto_stat_t * inode_crypto_stat (struct inode * inode)
31667 +{
31668 +       assert("edward-90", inode != NULL);
31669 +       assert("edward-91", reiser4_inode_data(inode) != NULL);
31670 +       return cryptcompress_inode_data(inode)->crypt;
31671 +}
31672 +
31673 +static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
31674 +{
31675 +       cryptcompress_inode_data(inode)->crypt = stat; /* @stat may be NULL */
31676 +}
31677 +
31678 +/* allocate a cipher key info; returns ERR_PTR(-ENOMEM) on failure */
31679 +crypto_stat_t * alloc_crypto_stat (struct inode * inode)
31680 +{
31681 +       crypto_stat_t * stat;
31682 +       int keyid_len;
31683 +
31684 +       assert("edward-1421", 0);
31685 +       stat = kmalloc(sizeof(*stat), GFP_KERNEL);
31686 +       if (stat == NULL)
31687 +               return ERR_PTR(-ENOMEM);
31688 +       memset(stat, 0, sizeof(*stat));
31689 +       keyid_len = inode_digest_plugin(inode)->fipsize;
31690 +       stat->keyid = kmalloc(keyid_len, GFP_KERNEL);
31691 +       if (stat->keyid != NULL)
31692 +               return stat;
31693 +       /* no memory for the key id buffer: undo the first allocation */
31694 +       kfree(stat);
31695 +       return ERR_PTR(-ENOMEM);
31696 +}
31697 +
31698 +#if 0
31699 +/* allocate/free low-level info for cipher and digest transforms;
31700 +   allocation returns -EINVAL when a plugin ->alloc() method fails */
31701 +static int
31702 +alloc_crypto_tfms(plugin_set * pset, crypto_stat_t * info)
31703 +{
31704 +       struct crypto_tfm * ret = NULL;
31705 +       cipher_plugin * cplug = pset->cipher;
31706 +       digest_plugin * dplug = pset->digest;
31707 +
31708 +       assert("edward-1363", info != NULL);
31709 +       assert("edward-414", cplug != NULL);
31710 +       assert("edward-415", dplug != NULL);
31711 +
31712 +       if (cplug->alloc) {
31713 +               ret = cplug->alloc();
31714 +               if (ret == NULL) {
31715 +                       warning("edward-1364",
31716 +                               "Can not allocate info for %s\n",
31717 +                               cplug->h.desc);
31718 +                       return RETERR(-EINVAL);
31719 +               }
31720 +       }
31721 +       info_set_tfm(info, CIPHER_TFM, ret);
31722 +       if (dplug->alloc) {
31723 +               ret = dplug->alloc();
31724 +               if (ret == NULL) {
31725 +                       warning("edward-1365",
31726 +                               "Can not allocate info for %s\n",
31727 +                               dplug->h.desc);
31728 +                       goto err;
31729 +               }
31730 +       }
31731 +       info_set_tfm(info, DIGEST_TFM, ret);
31732 +       return 0;
31733 + err:
31734 +       if (cplug->free) {
31735 +               cplug->free(info->tfma[CIPHER_TFM].tfm);
31736 +               info_set_tfm(info, CIPHER_TFM, NULL);
31737 +       }
31738 +       return RETERR(-EINVAL);
31739 +}
31740 +#endif
31741 +
31742 +static void
31743 +free_crypto_tfms(crypto_stat_t * info)
31744 +{
31745 +       assert("edward-1366", info != NULL);
31746 +       /* nothing is freed unless the cipher transform is set */
31747 +       if (info_cipher_tfm(info)) {
31748 +               info_cipher_plugin(info)->free(info_cipher_tfm(info));
31749 +               info_set_tfm(info, CIPHER_TFM, NULL);
31750 +               info_digest_plugin(info)->free(info_digest_tfm(info));
31751 +               info_set_tfm(info, DIGEST_TFM, NULL);
31752 +       }
31753 +}
31754 +
31755 +#if 0
31756 +/* create a key fingerprint (digest of the encrypted key id) for disk stat-data */
31757 +static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
31758 +{
31759 +       int ret = -ENOMEM;
31760 +       size_t blk, pad;
31761 +       __u8 * dmem;
31762 +       __u8 * cmem;
31763 +       struct crypto_tfm * dtfm;
31764 +       struct crypto_tfm * ctfm;
31765 +       struct scatterlist sg;
31766 +
31767 +       assert("edward-1422", 0);
31768 +       assert("edward-1367", info != NULL);
31769 +       assert("edward-1368", info->keyid != NULL);
31770 +
31771 +       dtfm = info_digest_tfm(info);
31772 +       ctfm = info_cipher_tfm(info);
31773 +
31774 +       dmem = kmalloc((size_t)crypto_tfm_alg_digestsize(dtfm),
31775 +                              GFP_KERNEL);
31776 +       if (!dmem)
31777 +               goto exit1;
31778 +
31779 +       blk = crypto_tfm_alg_blocksize(ctfm);
31780 +
31781 +       pad = data->keyid_size % blk;
31782 +       pad = (pad ? blk - pad : 0);
31783 +
31784 +       cmem = kmalloc((size_t)data->keyid_size + pad, GFP_KERNEL);
31785 +       if (!cmem)
31786 +               goto exit2;
31787 +       memcpy(cmem, data->keyid, data->keyid_size);
31788 +       memset(cmem + data->keyid_size, 0, pad);
31789 +
31790 +       sg.page = virt_to_page(cmem);
31791 +       sg.offset = offset_in_page(cmem);
31792 +       sg.length = data->keyid_size + pad;
31793 +
31794 +       ret = crypto_cipher_encrypt(ctfm, &sg, &sg, data->keyid_size + pad);
31795 +       if (ret) {
31796 +               warning("edward-1369",
31797 +                       "encryption failed flags=%x\n", ctfm->crt_flags);
31798 +               goto exit3;
31799 +       }
31800 +       crypto_digest_init (dtfm);
31801 +       crypto_digest_update (dtfm, &sg, 1);
31802 +       crypto_digest_final (dtfm, dmem);
31803 +       memcpy(info->keyid, dmem, info_digest_plugin(info)->fipsize);
31804 + exit3:
31805 +       kfree(cmem);
31806 + exit2:
31807 +       kfree(dmem);
31808 + exit1:
31809 +       return ret;
31810 +}
31811 +#endif
31812 +
31813 +static void destroy_keyid(crypto_stat_t * info)
31814 +{
31815 +       /* release the key id buffer of @info */
31816 +       assert("edward-1370", info != NULL);
31817 +       assert("edward-1371", info->keyid != NULL);
31818 +       kfree(info->keyid);
31819 +}
31820 +
31821 +static void free_crypto_stat (crypto_stat_t * info)
31822 +{
31823 +       assert("edward-1372", info != NULL);
31824 +       /* release transforms and key id before @info itself */
31825 +       free_crypto_tfms(info);
31826 +       destroy_keyid(info);
31827 +       kfree(info);
31828 +}
31829 +
31830 +#if 0
31831 +static void instantiate_crypto_stat(crypto_stat_t * info)
31832 +{
31833 +       assert("edward-1373", info != NULL);
31834 +       assert("edward-1374", info->inst == 0);
31835 +       info->inst = 1; /* read back by crypto_stat_instantiated() */
31836 +}
31837 +#endif
31838 +
31839 +static void uninstantiate_crypto_stat(crypto_stat_t * info)
31840 +{
31841 +       assert("edward-1375", info != NULL);
31842 +       info->inst = 0; /* crypto_stat_instantiated() returns 0 from now on */
31843 +}
31844 +
31845 +static int crypto_stat_instantiated(crypto_stat_t * info)
31846 +{
31847 +       return info->inst;      /* cleared by uninstantiate_crypto_stat() */
31848 +}
31849 +
31850 +static int inode_has_cipher_key(struct inode * inode)
31851 +{
31852 +       assert("edward-1376", inode != NULL);
31853 +       return inode_crypto_stat(inode) &&      /* attached and instantiated */
31854 +               crypto_stat_instantiated(inode_crypto_stat(inode));
31855 +}
31856 +
31857 +static void inode_free_crypto_stat (struct inode * inode)
31858 +{
31859 +       uninstantiate_crypto_stat(inode_crypto_stat(inode));
31860 +       free_crypto_stat(inode_crypto_stat(inode)); /* pointer in @inode is kept */
31861 +}
31862 +
31863 +static int need_cipher(struct inode * inode)
31864 +{
31865 +       return inode_cipher_plugin(inode) !=    /* any but the "none" cipher */
31866 +               cipher_plugin_by_id(NONE_CIPHER_ID);
31867 +}
31868 +
31869 +/* Create a crypto-stat for @object and return it, or ERR_PTR() on failure.
31870 +   If success is returned, then low-level cipher info contains
31871 +   an instantiated key */
31872 +#if 0
31873 +crypto_stat_t *
31874 +create_crypto_stat(struct inode * object,
31875 +                  crypto_data_t * data /* this contains a (uninstantiated)
31876 +                                          cipher key imported from user
31877 +                                          space */)
31878 +{
31879 +       int ret;
31880 +       crypto_stat_t * info;
31881 +
31882 +       assert("edward-1377", data != NULL);
31883 +       assert("edward-1378", need_cipher(object));
31884 +
31885 +       if (inode_file_plugin(object) !=
31886 +           file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31887 +               return ERR_PTR(-EINVAL);
31888 +
31889 +       info = alloc_crypto_stat(object);
31890 +       if (IS_ERR(info))
31891 +               return info;
31892 +       ret = alloc_crypto_tfms(reiser4_inode_data(object)->pset, info);
31893 +       if (ret)
31894 +               goto err;
31895 +       /* Someone can change plugins of the host (for example if
31896 +          the host is a directory), so we keep the original ones
31897 +          in the crypto-stat. */
31898 +       info_set_cipher_plugin(info, inode_cipher_plugin(object));
31899 +       info_set_digest_plugin(info, inode_digest_plugin(object));
31900 +       /* instantiating a key */
31901 +       ret = crypto_cipher_setkey(info_cipher_tfm(info),
31902 +                                  data->key,
31903 +                                  data->keysize);
31904 +       if (ret) {
31905 +               warning("edward-1379",
31906 +                       "setkey failed flags=%x\n",
31907 +                       info_cipher_tfm(info)->crt_flags);
31908 +               goto err;
31909 +       }
31910 +       info->keysize = data->keysize;
31911 +       ret = create_keyid(info, data);
31912 +       if (ret)
31913 +               goto err;
31914 +       instantiate_crypto_stat(info);
31915 +       return info;
31916 + err:
31917 +       free_crypto_stat(info);
31918 +       return ERR_PTR(ret);
31919 +}
31920 +#endif
31921 +
31922 +/* increment/decrement a load counter when attaching/detaching
31923 +   the crypto-stat to any object; it is freed when the counter drops to 0 */
31924 +static void load_crypto_stat(crypto_stat_t * info)
31925 +{
31926 +       assert("edward-1380", info != NULL);
31927 +       inc_keyload_count(info);
31928 +}
31929 +
31930 +static void unload_crypto_stat(struct inode * inode)
31931 +{
31932 +       crypto_stat_t * info = inode_crypto_stat(inode);
31933 +
31934 +       assert("edward-1381", info->keyload_count > 0);
31935 +       dec_keyload_count(info);
31936 +       /* final release */
31937 +       if (info->keyload_count == 0)
31938 +               inode_free_crypto_stat(inode);
31939 +}
31940 +
31941 +/* attach/detach an existing crypto-stat; attaching bumps its load counter */
31942 +void attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31943 +{
31944 +       assert("edward-1382", inode != NULL);
31945 +       assert("edward-1383", info != NULL);
31946 +       assert("edward-1384", inode_crypto_stat(inode) == NULL);
31947 +
31948 +       set_inode_crypto_stat(inode, info);
31949 +       load_crypto_stat(info);
31950 +}
31951 +
31952 +/* returns true, if crypto stat can be attached to the @host */
31953 +#if REISER4_DEBUG
31954 +static int host_allows_crypto_stat(struct inode * host)
31955 +{
31956 +       file_plugin * fplug = inode_file_plugin(host);
31957 +
31958 +       /* only hosts managed by the cryptcompress file plugin qualify;
31959 +          every other file plugin id is refused */
31960 +       switch (fplug->h.id) {
31961 +       case CRC_FILE_PLUGIN_ID:
31962 +               return 1;
31963 +       default:
31964 +               break;
31965 +       }
31966 +       return 0;
31967 +}
31968 +#endif  /*  REISER4_DEBUG  */
31969 +
31970 +static void detach_crypto_stat(struct inode * inode)
31971 +{
31972 +       assert("edward-1385", inode != NULL);
31973 +       assert("edward-1386", host_allows_crypto_stat(inode));
31974 +       /* drop the load reference, if any, then forget the crypto-stat */
31975 +       if (inode_crypto_stat(inode))
31976 +               unload_crypto_stat(inode);
31977 +       set_inode_crypto_stat(inode, NULL);
31978 +}
31979 +
31980 +#if 0
31981 +
31982 +/* compare fingerprints of @child and @parent; non-zero if they are equal */
31983 +static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31984 +{
31985 +       return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31986 +}
31987 +
31988 +/* check if a crypto-stat (which is bound to @parent) can be inherited by @child */
31989 +int can_inherit_crypto_crc(struct inode *child, struct inode *parent)
31990 +{
31991 +       if (!need_cipher(child))
31992 +               return 0;
31993 +       /* the child is created */
31994 +       if (!inode_crypto_stat(child))
31995 +               return 1;
31996 +       /* the child is looked up */
31997 +       if (!inode_crypto_stat(parent))
31998 +               return 0;
31999 +       return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
32000 +               inode_digest_plugin(child) == inode_digest_plugin(parent) &&
32001 +               inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
32002 +               keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
32003 +}
32004 +#endif
32005 +
32006 +/* helper functions for ->create() method of the cryptcompress plugin */
32007 +static int inode_set_crypto(struct inode * object)
32008 +{
32009 +       reiser4_inode * info;
32010 +       if (inode_crypto_stat(object) == NULL) {
32011 +               /* no key info: fine only if the file is not to be encrypted */
32012 +               if (!need_cipher(object))
32013 +                       return 0;
32014 +               return RETERR(-EINVAL);
32015 +       }
32016 +       info = reiser4_inode_data(object);
32017 +       info->extmask |= (1 << CRYPTO_STAT);
32018 +       info->plugin_mask |= (1 << PSET_CIPHER) | (1 << PSET_DIGEST);
32019 +       return 0;
32020 +}
32021 +
32022 +static int
32023 +inode_set_compression(struct inode * object)
32024 +{
32025 +       reiser4_inode * info = reiser4_inode_data(object);
32026 +       compression_plugin * cplug = inode_compression_plugin(object);
32027 +
32028 +       /* the compression plugin may supply an ->init() hook; its failure
32029 +          is passed to the caller and the plugin mask stays untouched */
32030 +       if (cplug->init != NULL) {
32031 +               int err = cplug->init();
32032 +
32033 +               if (err)
32034 +                       return err;
32035 +       }
32036 +
32037 +       info->plugin_mask |= (1 << PSET_COMPRESSION);
32038 +       return 0;
32039 +}
32040 +
32041 +static void
32042 +inode_set_compression_mode(struct inode * object)
32043 +{
32044 +       /* flag the compression mode plugin in the plugin mask of @object */
32045 +       reiser4_inode * info = reiser4_inode_data(object);
32046 +
32047 +       info->plugin_mask |= (1 << PSET_COMPRESSION_MODE);
32048 +}
32049 +
32050 +static int inode_set_cluster(struct inode *object)
32051 +{
32052 +       reiser4_inode *info;
32053 +       cluster_plugin *cplug;
32054 +
32055 +       assert("edward-696", object != NULL);
32056 +
32057 +       info = reiser4_inode_data(object);
32058 +       cplug = inode_cluster_plugin(object);
32059 +       /* clusters below page size are refused; the label is a string (%s) */
32060 +       if (cplug->shift < PAGE_CACHE_SHIFT) {
32061 +               warning("edward-1320",
32062 +                       "Can not support %s clusters (less than page size)",
32063 +                       cplug->h.label);
32064 +               return RETERR(-EINVAL);
32065 +       }
32066 +       info->plugin_mask |= (1 << PSET_CLUSTER);
32067 +       return 0;
32068 +}
32069 +
32070 +/* ->destroy_inode() method of the cryptcompress plugin: detach crypto-stat */
32071 +void destroy_inode_cryptcompress(struct inode * inode)
32072 +{
32073 +       assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
32074 +       /* NOTE(review): label "edward-23" is reused in create_cryptcompress() */
32075 +       detach_crypto_stat(inode);
32076 +}
32077 +
32078 +/* ->create() method of the cryptcompress plugin
32079 +
32080 +. install plugins
32081 +. attach crypto info if specified
32082 +. attach compression info if specified
32083 +. attach cluster info and save everything in disk stat-data
32084 +On failure the crypto-stat is detached and the error is returned. */
32085 +int
32086 +create_cryptcompress(struct inode *object, struct inode *parent,
32087 +                    reiser4_object_create_data * data)
32088 +{
32089 +       int result;
32090 +       reiser4_inode *info;
32091 +
32092 +       assert("edward-23", object != NULL);
32093 +       assert("edward-24", parent != NULL);
32094 +       assert("edward-30", data != NULL);
32095 +       assert("edward-26", inode_get_flag(object, REISER4_NO_SD));
32096 +       assert("edward-27", data->id == CRC_FILE_PLUGIN_ID);
32097 +
32098 +       info = reiser4_inode_data(object);
32099 +
32100 +       assert("edward-29", info != NULL);
32101 +
32102 +       /* set file bit */
32103 +       info->plugin_mask |= (1 << PSET_FILE);
32104 +
32105 +       /* set crypto */
32106 +       result = inode_set_crypto(object);
32107 +       if (result)
32108 +               goto error;
32109 +       /* set compression */
32110 +       result = inode_set_compression(object);
32111 +       if (result)
32112 +               goto error;
32113 +       inode_set_compression_mode(object);
32114 +
32115 +       /* set cluster info */
32116 +       result = inode_set_cluster(object);
32117 +       if (result)
32118 +               goto error;
32119 +       /* set plugin mask */
32120 +       info->extmask |= (1 << PLUGIN_STAT);
32121 +
32122 +       /* save everything in disk stat-data */
32123 +       result = write_sd_by_inode_common(object);
32124 +       if (!result)
32125 +               return 0;
32126 + error:
32127 +       detach_crypto_stat(object);
32128 +       return result;
32129 +}
32130 +
32131 +/* ->open() method of the cryptcompress plugin */
32132 +int open_cryptcompress(struct inode * inode, struct file * file)
32133 +{
32134 +       /* Returns 0, or -EINVAL if @inode needs a cipher but has no
32135 +          instantiated cipher key. */
32136 +       assert("edward-1394", inode != NULL);
32137 +       assert("edward-1395", file != NULL);
32138 +       assert("edward-1396", file->f_dentry != NULL);
32139 +       assert("edward-1397", file->f_dentry->d_inode == inode);
32140 +       assert("edward-1398", file->f_dentry->d_parent != NULL);
32141 +       assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
32142 +       assert("edward-698",
32143 +              inode_file_plugin(inode) ==
32144 +              file_plugin_by_id(CRC_FILE_PLUGIN_ID));
32145 +
32146 +       if (!need_cipher(inode))
32147 +               /* the file is not to be ciphered */
32148 +               return 0;
32149 +       /* only the key of @inode itself is checked, not the parent's */
32150 +       if (!inode_has_cipher_key(inode))
32151 +               return RETERR(-EINVAL);
32152 +       return 0;
32153 +}
32154 +
32155 +/* returns the blocksize of the cipher transform attached to @inode */
32156 +static unsigned int
32157 +cipher_blocksize(struct inode * inode)
32158 +{
32159 +       assert("edward-758", need_cipher(inode));
32160 +       assert("edward-1400", inode_crypto_stat(inode) != NULL);
32161 +       return crypto_tfm_alg_blocksize
32162 +               (info_cipher_tfm(inode_crypto_stat(inode)));
32163 +}
32164 +
32165 +/* returns @src_off scaled by the cipher plugin (extreme key offsets as is) */
32166 +static loff_t inode_scaled_offset (struct inode * inode,
32167 +                                  const loff_t src_off /* input offset */)
32168 +{
32169 +       assert("edward-97", inode != NULL);
32170 +
32171 +       if (!need_cipher(inode))
32172 +               return src_off;
32173 +       if (src_off == get_key_offset(min_key()) ||
32174 +           src_off == get_key_offset(max_key()))
32175 +               return src_off;
32176 +       return inode_cipher_plugin(inode)->scale(inode,
32177 +                                                cipher_blocksize(inode),
32178 +                                                src_off);
32179 +}
32180 +
32181 +/* returns disk cluster size: inode_cluster_size() put through inode_scaled_offset() */
32182 +size_t inode_scaled_cluster_size(struct inode * inode)
32183 +{
32184 +       assert("edward-110", inode != NULL);
32185 +
32186 +       return inode_scaled_offset(inode, inode_cluster_size(inode));
32187 +}
32188 +
32189 +static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
32190 +{
32191 +       return (clust_to_off(clust->index, inode) >= inode->i_size); /* at or past i_size */
32192 +}
32193 +
32194 +/* set number of cluster pages (clust->nr_pages) */
32195 +static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
32196 +{
32197 +       reiser4_slide_t *win;
32198 +
32199 +       assert("edward-180", clust != NULL);
32200 +       assert("edward-1040", inode != NULL);
32201 +
32202 +       win = clust->win;
32203 +       if (win == NULL) {
32204 +               /* NOTE-EDWARD: i_size should be protected */
32205 +               clust->nr_pages =
32206 +                   count_to_nrpages(fsize_to_count(clust, inode));
32207 +               return;
32208 +       }
32209 +       assert("edward-1176", clust->op != PCL_UNKNOWN);
32210 +       assert("edward-1064", win->off + win->count + win->delta != 0);
32211 +
32212 +       if (win->stat == HOLE_WINDOW &&
32213 +           win->off == 0 && win->count == inode_cluster_size(inode)) {
32214 +               /* special case: we start write hole from fake cluster */
32215 +               clust->nr_pages = 0;
32216 +               return;
32217 +       }
32218 +       /* pages for the larger of the window end and fsize_to_count() */
32219 +       clust->nr_pages =
32220 +           count_to_nrpages(max_count(win->off + win->count + win->delta,
32221 +                                      fsize_to_count(clust, inode)));
32222 +}
32223 +
32224 +/* ->key_by_inode() method of the cryptcompress plugin; always returns 0 */
32225 +/* see plugin/plugin.h for details */
32226 +int
32227 +key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
32228 +{
32229 +       loff_t clust_off;
32230 +
32231 +       assert("edward-64", inode != 0);
32232 +       //      assert("edward-112", ergo(off != get_key_offset(max_key()), !off_to_cloff(off, inode)));
32233 +       /* don't come here with other offsets */
32234 +
32235 +       if (off == get_key_offset(max_key()))
32236 +               clust_off = get_key_offset(max_key());
32237 +       else
32238 +               clust_off = off_to_clust_to_off(off, inode);
32239 +
32240 +       key_by_inode_and_offset_common(inode, 0, key);
32241 +       if (inode_crypto_stat(inode))
32242 +               clust_off = inode_scaled_offset(inode, clust_off);
32243 +       set_key_offset(key, (__u64) clust_off);
32244 +       return 0;
32245 +}
32246 +
32247 +/* plugin->flow_by_inode: fill @f; the key is not set for user-space writes */
32248 +int
32249 +flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
32250 +                           const char __user *buf /* user level buffer */ ,
32251 +                           int user    /* 1 if @buf is of user space, 0 - if it is
32252 +                                          kernel space */ ,
32253 +                           loff_t size /* buffer size */ ,
32254 +                           loff_t off /* offset to start io from */ ,
32255 +                           rw_op op /* READ or WRITE */ ,
32256 +                           flow_t * f /* resulting flow */ )
32257 +{
32258 +       assert("edward-436", f != NULL);
32259 +       assert("edward-149", inode != NULL);
32260 +       assert("edward-150", inode_file_plugin(inode) != NULL);
32261 +       assert("edward-151",
32262 +              inode_file_plugin(inode)->key_by_inode ==
32263 +              key_by_inode_cryptcompress);
32264 +
32265 +       f->length = size;
32266 +       memcpy(&f->data, &buf, sizeof(buf));
32267 +       f->user = user;
32268 +       f->op = op;
32269 +
32270 +       if (op == WRITE_OP && user == 1)
32271 +               return 0;
32272 +       return key_by_inode_cryptcompress(inode, off, &f->key);
32273 +}
32274 +
32275 +static int
32276 +crc_hint_validate(hint_t * hint, const reiser4_key * key,
32277 +                 znode_lock_mode lock_mode)
32278 +{
32279 +       /* Validate @hint for @key and @lock_mode: -E_REPEAT if it cannot be
32280 +          used, otherwise the result of seal_validate(). */
32281 +       assert("edward-704", hint != NULL);
32282 +       assert("edward-1089", !hint->ext_coord.valid);
32283 +       assert("edward-706", hint->lh.owner == NULL);
32284 +
32285 +       /* !hint repeats "edward-704" at run time, presumably for builds
32286 +          without assertions; nothing touches @hint before this test */
32287 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
32288 +               /* hint either not set or set by different operation */
32289 +               return RETERR(-E_REPEAT);
32290 +
32291 +       if (get_key_offset(key) != hint->offset)
32292 +               /* hint is set for different key */
32293 +               return RETERR(-E_REPEAT);
32294 +
32295 +       assert("edward-707", schedulable());
32296 +
32297 +       return seal_validate(&hint->seal, &hint->ext_coord.coord,
32298 +                            key, &hint->lh, lock_mode, ZNODE_LOCK_LOPRI);
32299 +}
32300 +
32301 +/* reserve disk space when writing a logical cluster; returns 0 or the grab error */
32302 +static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
32303 +{
32304 +       int result = 0;
32305 +
32306 +       assert("edward-965", schedulable());
32307 +       assert("edward-439", inode != NULL);
32308 +       assert("edward-440", clust != NULL);
32309 +       assert("edward-441", clust->pages != NULL);
32310 +       assert("edward-1261", get_current_context()->grabbed_blocks == 0);
32311 +
32312 +       if (clust->nr_pages == 0) {
32313 +               assert("edward-1152", clust->win != NULL);
32314 +               assert("edward-1153", clust->win->stat == HOLE_WINDOW);
32315 +               /* don't reserve space for fake disk cluster */
32316 +               return 0;
32317 +       }
32318 +       assert("edward-442", jprivate(clust->pages[0]) != NULL);
32319 +
32320 +       result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
32321 +                                         estimate_update_cluster(inode),
32322 +                                         BA_CAN_COMMIT);
32323 +       if (result)
32324 +               return result;
32325 +       clust->reserved = 1;
32326 +       grabbed2cluster_reserved(estimate_insert_cluster(inode) +
32327 +                                estimate_update_cluster(inode));
32328 +#if REISER4_DEBUG
32329 +       clust->reserved_prepped = estimate_update_cluster(inode);
32330 +       clust->reserved_unprepped = estimate_insert_cluster(inode);
32331 +#endif
32332 +       /* there can be space grabbed by txnmgr_force_commit_all */
32333 +       all_grabbed2free();
32334 +       return 0;
32335 +}
32336 +
32337 +/* Give @count reserved blocks back via cluster_reserved2free() and clear @clust->reserved; @inode is not used */
32338 +static void
32339 +free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
32340 +{
32341 +       assert("edward-967", clust->reserved == 1); /* only valid while a reservation is held */
32342 +
32343 +       cluster_reserved2free(count);
32344 +       clust->reserved = 0;
32345 +}
32346 +
32347 +/* The core search procedure of the cryptcompress plugin: try the hint first, else do a tree lookup.
32348 +   If returned value is not cbk_errored, then current znode is locked */
32349 +static int find_cluster_item(hint_t * hint,
32350 +                            const reiser4_key * key, /* key of the item we are
32351 +                                                        looking for */
32352 +                            znode_lock_mode lock_mode /* which lock */ ,
32353 +                            ra_info_t * ra_info, lookup_bias bias, __u32 flags)
32354 +{
32355 +       int result;
32356 +       reiser4_key ikey;
32357 +       coord_t *coord = &hint->ext_coord.coord;
32358 +       coord_t orig = *coord; /* snapshot, restored at not_found */
32359 +
32360 +       assert("edward-152", hint != NULL); /* NOTE(review): @hint is already dereferenced by the initializers above */
32361 +
32362 +       if (hint->ext_coord.valid == 0) {
32363 +               result = crc_hint_validate(hint, key, lock_mode);
32364 +               if (result == -E_REPEAT) /* hint cannot be used: full lookup */
32365 +                       goto traverse_tree;
32366 +               else if (result) {
32367 +                       assert("edward-1216", 0);
32368 +                       return result;
32369 +               }
32370 +               hint->ext_coord.valid = 1;
32371 +       }
32372 +       assert("edward-709", znode_is_any_locked(coord->node));
32373 +
32374 +       /* In-place lookup is going here, it means we just need to
32375 +          check if the item following @coord matches @key */
32376 +
32377 +       if (equal_to_rdk(coord->node, key)) {
32378 +               result = goto_right_neighbor(coord, &hint->lh);
32379 +               if (result == -E_NO_NEIGHBOR) {
32380 +                       assert("edward-1217", 0);
32381 +                       return RETERR(-EIO);
32382 +               }
32383 +               if (result)
32384 +                       return result;
32385 +               assert("edward-1218", equal_to_ldk(coord->node, key));
32386 +       } else {
32387 +               coord->item_pos++; /* step to the first unit of the next item in this node */
32388 +               coord->unit_pos = 0;
32389 +               coord->between = AT_UNIT;
32390 +       }
32391 +       result = zload(coord->node);
32392 +       if (result)
32393 +               return result;
32394 +       assert("edward-1219", !node_is_empty(coord->node));
32395 +
32396 +       if (!coord_is_existing_item(coord)) {
32397 +               zrelse(coord->node);
32398 +               goto not_found;
32399 +       }
32400 +       item_key_by_coord(coord, &ikey);
32401 +       zrelse(coord->node); /* node data no longer needed once the item key is copied to @ikey */
32402 +       if (!keyeq(key, &ikey))
32403 +               goto not_found;
32404 +       return CBK_COORD_FOUND;
32405 +
32406 +      not_found:
32407 +       assert("edward-1220", coord->item_pos > 0);
32408 +       //coord->item_pos--;
32409 +       /* roll back */
32410 +       *coord = orig;
32411 +       ON_DEBUG(coord_update_v(coord));
32412 +       return CBK_COORD_NOTFOUND;
32413 +
32414 +      traverse_tree:
32415 +       assert("edward-713", hint->lh.owner == NULL);
32416 +       assert("edward-714", schedulable());
32417 +
32418 +       unset_hint(hint);
32419 +       coord_init_zero(coord);
32420 +       result = coord_by_key(current_tree, key, coord, &hint->lh,
32421 +                             lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
32422 +                             CBK_UNIQUE | flags, ra_info);
32423 +       if (cbk_errored(result))
32424 +               return result;
32425 +       hint->ext_coord.valid = 1; /* lookup did not error: @coord is usable */
32426 +       return result;
32427 +}
32428 +
32429 +/* Called by the deflate/inflate manager to check whether overhead
32430 +   must be created (WRITE_OP) or cut (READ_OP). On WRITE_OP @oh is
32431 +   tc->len modulo the cipher block size; on READ_OP @oh is the last
32432 +   byte of the output stream. Returns non-zero if work is needed.
32433 + */
32434 +static int
32435 +need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
32436 +                 rw_op rw, int * oh)
32437 +{
32438 +       tfm_cluster_t * tc = &clust->tc;
32439 +       switch (rw) {
32440 +       case WRITE_OP: /* estimate align */
32441 +               *oh = tc->len % cipher_blocksize(inode);
32442 +               if (*oh != 0)
32443 +                       return 1;
32444 +               break; /* already a multiple of the block size: decided by the length test below */
32445 +       case READ_OP:  /* estimate cut */
32446 +               *oh = *(tfm_output_data(clust) + tc->len - 1);
32447 +               break;
32448 +       default:
32449 +               impossible("edward-1401", "bad option");
32450 +       }
32451 +       return (tc->len != tc->lsize);
32452 +}
32453 +
32454 +/* Create (WRITE_OP) or cut (READ_OP) the cipher overhead of @clust's stream, if need_cut_or_align() asks for it */
32455 +static void
32456 +align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
32457 +{
32458 +       int oh;
32459 +       cipher_plugin * cplug = inode_cipher_plugin(inode);
32460 +
32461 +       assert("edward-1402", need_cipher(inode));
32462 +
32463 +       if (!need_cut_or_align(inode, clust, rw, &oh))
32464 +               return;
32465 +       switch (rw) {
32466 +       case WRITE_OP: /* do align */
32467 +               clust->tc.len += /* grow by whatever ->align_stream() returns */
32468 +                       cplug->align_stream(tfm_input_data(clust) +
32469 +                                           clust->tc.len, clust->tc.len,
32470 +                                           cipher_blocksize(inode));
32471 +               *(tfm_input_data(clust) + clust->tc.len - 1) = /* last byte records the overhead size */
32472 +                       cipher_blocksize(inode) - oh;
32473 +               break;
32474 +       case READ_OP: /* do cut */
32475 +               assert("edward-1403", oh <= cipher_blocksize(inode)); /* NOTE(review): @oh is read from the stream; only this debug assert bounds it */
32476 +               clust->tc.len -= oh;
32477 +               break;
32478 +       default:
32479 +               impossible("edward-1404", "bad option");
32480 +       }
32481 +       return;
32482 +}
32483 +
32484 +/* Helpers for evaluating results of the compression transform.
32485 +   This one: cipher block size if @inode needs a cipher whose plugin has ->align_stream, else 0 */
32486 +static unsigned
32487 +max_cipher_overhead(struct inode * inode)
32488 +{
32489 +       if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
32490 +               return 0;
32491 +       return cipher_blocksize(inode);
32492 +}
32493 +
32494 +static int deflate_overhead(struct inode *inode) /* DC_CHECKSUM_SIZE if the compression plugin has ->checksum, else 0 */
32495 +{
32496 +       return (inode_compression_plugin(inode)->
32497 +               checksum ? DC_CHECKSUM_SIZE : 0);
32498 +}
32499 +
32500 +static unsigned deflate_overrun(struct inode * inode, int ilen) /* coa_overrun() of the inode's compression plugin for input length @ilen */
32501 +{
32502 +       return coa_overrun(inode_compression_plugin(inode), ilen);
32503 +}
32504 +
32505 +/* Estimate compressibility of a logical cluster: by the plugin's
32506 +   minimal deflate size (if it defines one) and by the compression
32507 +   mode plugin's ->should_deflate (if it defines one).
32508 +   If this returns false, the compressor won't be called for the cluster of index @index.
32509 +*/
32510 +static int should_compress(tfm_cluster_t * tc, cloff_t index,
32511 +                          struct inode *inode)
32512 +{
32513 +       compression_plugin *cplug = inode_compression_plugin(inode);
32514 +       compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
32515 +
32516 +       assert("edward-1321", tc->len != 0);
32517 +       assert("edward-1322", cplug != NULL);
32518 +       assert("edward-1323", mplug != NULL);
32519 +
32520 +       return /* estimate by size */
32521 +               (cplug->min_size_deflate ?
32522 +                tc->len >= cplug->min_size_deflate() :
32523 +                1) && /* a missing method counts as "yes" */
32524 +               /* estimate by compression mode plugin */
32525 +               (mplug->should_deflate ?
32526 +                mplug->should_deflate(inode, index) :
32527 +                1);
32528 +}
32529 +
32530 +/* Evaluate results of the compression transform: compare @size_after plus checksum and cipher overheads with @size_before.
32531 +   Returns true if the compressed result should be accepted */
32532 +static int
32533 +save_compressed(int size_before, int size_after, struct inode * inode)
32534 +{
32535 +       return (size_after + deflate_overhead(inode) +
32536 +               max_cipher_overhead(inode) < size_before);
32537 +}
32538 +
32539 +/* Guess the result of the evaluation above: true if tc->len is below the (scaled, when @encrypted) logical size */
32540 +static int
32541 +need_inflate(reiser4_cluster_t * clust, struct inode *inode,
32542 +            int encrypted /* is cluster encrypted */ )
32543 +{
32544 +       tfm_cluster_t *tc = &clust->tc;
32545 +
32546 +       assert("edward-142", tc != 0);
32547 +       assert("edward-143", inode != NULL);
32548 +
32549 +       return tc->len <
32550 +           (encrypted ?
32551 +            inode_scaled_offset(inode, tc->lsize) :
32552 +            tc->lsize);
32553 +}
32554 +
32555 +/* If results of compression were accepted, then we add
32556 +   a checksum to catch possible disk cluster corruption.
32557 +   The following is a format of the data stored in disk clusters:
32558 +
32559 +                  data                   This is (transformed) logical cluster.
32560 +                  cipher_overhead        This is created by ->align_stream()
32561 +                                          of cipher plugin. May be absent.
32562 +                  checksum          (4)  This is created by ->checksum method
32563 +                                          of compression plugin to check
32564 +                                          integrity. May be absent.
32565 +
32566 +                  Crypto overhead format:
32567 +
32568 +                  data
32569 +                  control_byte      (1)   contains aligned overhead size:
32570 +                                          1 <= overhead <= cipher_blksize
32571 +*/
32572 +/* Append a little-endian 32-bit checksum to the output stream and grow tc->len by DC_CHECKSUM_SIZE */
32573 +static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32574 +{
32575 +       __u32 checksum;
32576 +
32577 +       assert("edward-1309", tc != NULL);
32578 +       assert("edward-1310", tc->len > 0);
32579 +       assert("edward-1311", cplug->checksum != NULL);
32580 +
32581 +       checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
32582 +       put_unaligned(cpu_to_le32(checksum), /* stored right after the tc->len data bytes */
32583 +                (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
32584 +       tc->len += (int)DC_CHECKSUM_SIZE;
32585 +}
32586 +
32587 +/* Check the checksum stored in the last DC_CHECKSUM_SIZE bytes of the input stream.
32588 +   Returns 0 if it is correct (tc->len is then reduced by DC_CHECKSUM_SIZE), otherwise warns and returns 1 */
32589 +static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32590 +{
32591 +       assert("edward-1312", tc != NULL);
32592 +       assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
32593 +       assert("edward-1314", cplug->checksum != NULL);
32594 +
32595 +       if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM), /* recomputed over the data part only */
32596 +                           tc->len - (int)DC_CHECKSUM_SIZE) !=
32597 +           le32_to_cpu(get_unaligned((d32 *)
32598 +                                     (tfm_stream_data(tc, INPUT_STREAM)
32599 +                                      + tc->len - (int)DC_CHECKSUM_SIZE)))) {
32600 +               warning("edward-156",
32601 +                       "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
32602 +                       (int)le32_to_cpu
32603 +                       (get_unaligned((d32 *)
32604 +                                      (tfm_stream_data(tc, INPUT_STREAM) +
32605 +                                       tc->len - (int)DC_CHECKSUM_SIZE))),
32606 +                       (int)cplug->checksum
32607 +                       (tfm_stream_data(tc, INPUT_STREAM),
32608 +                        tc->len - (int)DC_CHECKSUM_SIZE));
32609 +               return 1;
32610 +       }
32611 +       tc->len -= (int)DC_CHECKSUM_SIZE; /* strip the verified checksum */
32612 +       return 0;
32613 +}
32614 +
32615 +/* Get the input/output stream @id of @tc for a transform action, allocating or growing it as needed; returns 0 or the (re)alloc result */
32616 +int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
32617 +                   tfm_stream_id id)
32618 +{
32619 +       size_t size = inode_scaled_cluster_size(inode);
32620 +
32621 +       assert("edward-901", tc != NULL);
32622 +       assert("edward-1027", inode_compression_plugin(inode) != NULL);
32623 +
32624 +       if (tc->act == TFM_WRITE_ACT)
32625 +               size += deflate_overrun(inode, inode_cluster_size(inode)); /* extra room for the write action */
32626 +
32627 +       if (!tfm_stream(tc, id) && id == INPUT_STREAM)
32628 +               alternate_streams(tc); /* presumably swaps input and output so an existing stream is reused - confirm */
32629 +       if (!tfm_stream(tc, id))
32630 +               return alloc_tfm_stream(tc, size, id);
32631 +
32632 +       assert("edward-902", tfm_stream_is_set(tc, id));
32633 +
32634 +       if (tfm_stream_size(tc, id) < size)
32635 +               return realloc_tfm_stream(tc, size, id);
32636 +       return 0;
32637 +}
32638 +
32639 +/* Common deflate manager: try compression, then encryption (if @inode needs a cipher), then checksum accepted compressed data */
32640 +int deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32641 +{
32642 +       int result = 0;
32643 +       int compressed = 0;
32644 +       int encrypted = 0;
32645 +       tfm_cluster_t * tc = &clust->tc;
32646 +       compression_plugin * coplug;
32647 +
32648 +       assert("edward-401", inode != NULL);
32649 +       assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
32650 +       assert("edward-1348", tc->act == TFM_WRITE_ACT);
32651 +       assert("edward-498", !tfm_cluster_is_uptodate(tc));
32652 +
32653 +       coplug = inode_compression_plugin(inode);
32654 +       if (should_compress(tc, clust->index, inode)) {
32655 +               /* try to compress, discard bad results */
32656 +               __u32 dst_len;
32657 +               compression_mode_plugin * mplug =
32658 +                       inode_compression_mode_plugin(inode);
32659 +               assert("edward-602", coplug != NULL);
32660 +               assert("edward-1423", coplug->compress != NULL);
32661 +
32662 +               result = grab_coa(tc, coplug);
32663 +               if (result) {
32664 +                   warning("edward-1424",
32665 +                           "alloc_coa failed with ret=%d, skipped compression",
32666 +                           result);
32667 +                   goto cipher; /* NOTE(review): result stays non-zero and is returned unless the cipher branch overwrites it */
32668 +               }
32669 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32670 +               if (result) {
32671 +                   warning("edward-1425",
32672 +                        "alloc stream failed with ret=%d, skipped compression",
32673 +                           result);
32674 +                   goto cipher;
32675 +               }
32676 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM); /* in: output capacity; out: compressed length */
32677 +               coplug->compress(get_coa(tc, coplug->h.id, tc->act),
32678 +                                tfm_input_data(clust), tc->len,
32679 +                                tfm_output_data(clust), &dst_len);
32680 +               /* make sure we didn't overwrite extra bytes */
32681 +               assert("edward-603",
32682 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
32683 +
32684 +               /* evaluate results of compression transform */
32685 +               if (save_compressed(tc->len, dst_len, inode)) {
32686 +                       /* good result, accept */
32687 +                       tc->len = dst_len;
32688 +                       if (mplug->accept_hook != NULL) {
32689 +                              result = mplug->accept_hook(inode, clust->index);
32690 +                              if (result)
32691 +                                      warning("edward-1426",
32692 +                                              "accept_hook failed with ret=%d",
32693 +                                              result);
32694 +                       }
32695 +                       compressed = 1;
32696 +               }
32697 +               else {
32698 +                       /* bad result, discard */
32699 +#if REISER4_DEBUG
32700 +                       if (cluster_is_complete(clust, inode))
32701 +                             warning("edward-1338",
32702 +                                     "incompressible cluster %lu (inode %llu)",
32703 +                                     clust->index,
32704 +                                     (unsigned long long)get_inode_oid(inode));
32705 +#endif
32706 +                       if (mplug->discard_hook != NULL &&
32707 +                           cluster_is_complete(clust, inode)) {
32708 +                               result = mplug->discard_hook(inode,
32709 +                                                            clust->index);
32710 +                               if (result)
32711 +                                     warning("edward-1427",
32712 +                                             "discard_hook failed with ret=%d",
32713 +                                             result);
32714 +                       }
32715 +               }
32716 +       }
32717 + cipher:
32718 +       if (need_cipher(inode)) {
32719 +               cipher_plugin * ciplug;
32720 +               struct crypto_tfm * tfm;
32721 +               struct scatterlist src;
32722 +               struct scatterlist dst;
32723 +
32724 +               ciplug = inode_cipher_plugin(inode); /* assigned but not read in this branch */
32725 +               tfm = info_cipher_tfm(inode_crypto_stat(inode));
32726 +               if (compressed)
32727 +                       alternate_streams(tc); /* presumably makes the compressed output the cipher input - confirm */
32728 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32729 +               if (result)
32730 +                       return result;
32731 +
32732 +               align_or_cut_overhead(inode, clust, WRITE_OP);
32733 +               src.page = virt_to_page(tfm_input_data(clust));
32734 +               src.offset = offset_in_page(tfm_input_data(clust));
32735 +               src.length = tc->len;
32736 +
32737 +               dst.page = virt_to_page(tfm_output_data(clust));
32738 +               dst.offset = offset_in_page(tfm_output_data(clust));
32739 +               dst.length = tc->len;
32740 +
32741 +               result = crypto_cipher_encrypt(tfm, &dst, &src, tc->len);
32742 +               if (result) {
32743 +                       warning("edward-1405",
32744 +                               "encryption failed flags=%x\n", tfm->crt_flags);
32745 +                       return result;
32746 +               }
32747 +               encrypted = 1;
32748 +       }
32749 +       if (compressed && coplug->checksum != NULL)
32750 +               dc_set_checksum(coplug, tc);
32751 +       if (!compressed && !encrypted)
32752 +               alternate_streams(tc); /* nothing was transformed */
32753 +       return result;
32754 +}
32755 +
32756 +/* Common inflate manager: verify the checksum (if any), decrypt (if @inode needs a cipher), then decompress (if needed). */
32757 +int inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32758 +{
32759 +       int result = 0;
32760 +       int transformed = 0;
32761 +       tfm_cluster_t * tc = &clust->tc;
32762 +       compression_plugin * coplug;
32763 +
32764 +       assert("edward-905", inode != NULL);
32765 +       assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
32766 +       assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
32767 +       assert("edward-1349", tc->act == TFM_READ_ACT);
32768 +       assert("edward-907", !tfm_cluster_is_uptodate(tc));
32769 +
32770 +       /* Handle a checksum (if any) */
32771 +       coplug = inode_compression_plugin(inode);
32772 +       if (need_inflate(clust, inode, need_cipher(inode)) &&
32773 +           coplug->checksum != NULL) {
32774 +               result = dc_check_checksum(coplug, tc);
32775 +               if (result)
32776 +                       return RETERR(-EIO); /* checksum mismatch is reported as an I/O error */
32777 +       }
32778 +       if (need_cipher(inode)) {
32779 +               cipher_plugin * ciplug;
32780 +               struct crypto_tfm * tfm;
32781 +               struct scatterlist src;
32782 +               struct scatterlist dst;
32783 +
32784 +               ciplug = inode_cipher_plugin(inode); /* assigned but not read in this branch */
32785 +               tfm = info_cipher_tfm(inode_crypto_stat(inode));
32786 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32787 +               if (result)
32788 +                       return result;
32789 +               assert("edward-909", tfm_cluster_is_set(tc));
32790 +
32791 +               src.page   =   virt_to_page(tfm_input_data(clust));
32792 +               src.offset = offset_in_page(tfm_input_data(clust));
32793 +               src.length = tc->len;
32794 +
32795 +               dst.page   =   virt_to_page(tfm_output_data(clust));
32796 +               dst.offset = offset_in_page(tfm_output_data(clust));
32797 +               dst.length = tc->len;
32798 +
32799 +               result = crypto_cipher_decrypt(tfm, &dst, &src, tc->len);
32800 +               if (result)
32801 +                       return result;
32802 +               align_or_cut_overhead(inode, clust, READ_OP); /* drop the cipher padding from tc->len */
32803 +               transformed = 1;
32804 +       }
32805 +       if (need_inflate(clust, inode, 0)) {
32806 +               unsigned dst_len = inode_cluster_size(inode);
32807 +               if(transformed)
32808 +                       alternate_streams(tc); /* presumably makes the decrypted output the decompressor input - confirm */
32809 +
32810 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32811 +               if (result)
32812 +                       return result;
32813 +               assert("edward-1305", coplug->decompress != NULL);
32814 +               assert("edward-910", tfm_cluster_is_set(tc));
32815 +
32816 +               coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
32817 +                                  tfm_input_data(clust), tc->len,
32818 +                                  tfm_output_data(clust), &dst_len);
32819 +               /* check length; NOTE(review): a mismatch is only caught by the debug assert below */
32820 +               tc->len = dst_len;
32821 +               assert("edward-157", dst_len == tc->lsize);
32822 +               transformed = 1;
32823 +       }
32824 +       if (!transformed)
32825 +               alternate_streams(tc); /* nothing was transformed */
32826 +       return result;
32827 +}
32828 +
32829 +/* ->readpage() of struct address_space_operations for the cryptcompress
32830 +   plugin. @page comes in locked; it is unlocked here on every early exit. */
32831 +int readpage_cryptcompress(struct file *file, struct page *page)
32832 +{
32833 +       reiser4_context *ctx;
32834 +       reiser4_cluster_t clust;
32835 +       item_plugin *iplug;
32836 +       int result;
32837 +
32838 +       assert("edward-88", PageLocked(page));
32839 +       assert("vs-976", !PageUptodate(page));
32840 +       assert("edward-89", page->mapping && page->mapping->host);
32841 +       ctx = init_context(page->mapping->host->i_sb);
32842 +       if (IS_ERR(ctx)) { /* don't hand a still-locked page back */
32843 +               unlock_page(page);
32844 +               return PTR_ERR(ctx);
32845 +       }
32846 +       result = check_cryptcompress(page->mapping->host);
32847 +       if (result) {
32848 +               unlock_page(page);
32849 +               reiser4_exit_context(ctx);
32850 +               return result;
32851 +       }
32852 +       assert("edward-113", ergo(file != NULL,
32853 +              page->mapping == file->f_dentry->d_inode->i_mapping));
32854 +       if (PageUptodate(page)) {
32855 +               warning("edward-1338", "page is already uptodate\n");
32856 +               unlock_page(page); /* nothing to read, but we still own the lock */
32857 +               reiser4_exit_context(ctx);
32858 +               return 0;
32859 +       }
32860 +       cluster_init_read(&clust, NULL);
32861 +       clust.file = file;
32862 +       iplug = item_plugin_by_id(CTAIL_ID);
32863 +       if (!iplug->s.file.readpage) {
32864 +               unlock_page(page);
32865 +               put_cluster_handle(&clust);
32866 +               reiser4_exit_context(ctx);
32867 +               return -EINVAL;
32868 +       }
32869 +       result = iplug->s.file.readpage(&clust, page);
32870 +       if (result)
32871 +               unlock_page(page);
32872 +       assert("edward-64",
32873 +              ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
32874 +       put_cluster_handle(&clust);
32875 +       reiser4_exit_context(ctx);
32876 +       return result;
32877 +}
32878 +
32879 +/* How many pages of @clust will be captured; depends on clust->op */
32880 +static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
32881 +{
32882 +       switch (clust->op) {
32883 +       case PCL_APPEND:
32884 +               return clust->nr_pages;
32885 +       case PCL_TRUNCATE:
32886 +               assert("edward-1179", clust->win != NULL);
32887 +               return count_to_nrpages(clust->win->off + clust->win->count); /* pages covering the window end */
32888 +       default:
32889 +               impossible("edward-1180", "bad page cluster option");
32890 +               return 0;
32891 +       }
32892 +}
32893 +
32894 +static void set_cluster_pages_dirty(reiser4_cluster_t * clust) /* dirty and mark accessed each page that will be captured */
32895 +{
32896 +       int i;
32897 +       struct page *pg;
32898 +       int nrpages = cluster_nrpages_to_capture(clust);
32899 +
32900 +       for (i = 0; i < nrpages; i++) {
32901 +
32902 +               pg = clust->pages[i];
32903 +               assert("edward-968", pg != NULL);
32904 +               lock_page(pg); /* the dirty bit is set under the page lock */
32905 +               assert("edward-1065", PageUptodate(pg));
32906 +               set_page_dirty_internal(pg);
32907 +               unlock_page(pg);
32908 +               mark_page_accessed(pg);
32909 +       }
32910 +}
32911 +
32912 +static void clear_cluster_pages_dirty(reiser4_cluster_t * clust) /* clear the dirty bit of every page of @clust, under the page lock */
32913 +{
32914 +       int i;
32915 +       assert("edward-1275", clust != NULL);
32916 +
32917 +       for (i = 0; i < clust->nr_pages; i++) {
32918 +               assert("edward-1276", clust->pages[i] != NULL);
32919 +
32920 +               lock_page(clust->pages[i]);
32921 +               if (PageDirty(clust->pages[i])) {
32922 +                       assert("edward-1277", PageUptodate(clust->pages[i]));
32923 +                       clear_page_dirty_for_io(clust->pages[i]);
32924 +               }
32925 +#if REISER4_DEBUG
32926 +               else
32927 +                       /* Race between flush and write:
32928 +                          some pages became clean when write() (or another
32929 +                          process which modifies data) captured the cluster. */
32930 +                       warning("edward-985", "Page of index %lu (inode %llu)"
32931 +                               " is not dirty\n", clust->pages[i]->index,
32932 +                               (unsigned long long)get_inode_oid(clust->
32933 +                                                                 pages[i]->
32934 +                                                                 mapping->
32935 +                                                                 host));
32936 +#endif
32937 +               unlock_page(clust->pages[i]);
32938 +       }
32939 +}
32940 +
32941 +/* Update inode->i_size from the window of @clust: window end for PCL_APPEND (only if it grows the file), window start for PCL_TRUNCATE */
32942 +static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
32943 +{
32944 +       loff_t size;
32945 +       reiser4_slide_t *win;
32946 +
32947 +       assert("edward-1181", clust != NULL);
32948 +       assert("edward-1182", inode != NULL);
32949 +
32950 +       win = clust->win;
32951 +       assert("edward-1183", win != NULL);
32952 +
32953 +       size = clust_to_off(clust->index, inode) + win->off; /* file offset of the window start */
32954 +
32955 +       switch (clust->op) {
32956 +       case PCL_APPEND:
32957 +               if (size + win->count <= inode->i_size)
32958 +                       /* overwrite only */
32959 +                       return;
32960 +               size += win->count;
32961 +               break;
32962 +       case PCL_TRUNCATE:
32963 +               break;
32964 +       default:
32965 +               impossible("edward-1184", "bad page cluster option");
32966 +               break;
32967 +       }
32968 +       inode_check_scale_nolock(inode, inode->i_size, size);
32969 +       inode->i_size = size;
32970 +       return;
32971 +}
32972 +
32973 +/* Check in page cluster modifications (called with the jnode spinlock held).
32974 +   . Make jnode dirty, if it wasn't;
32975 +   . Free the update reservation if the jnode was already dirty, otherwise keep it and clear clust->reserved;
32976 +   . Clean up old references (if any).
32977 +   . Put pages (grabbed in this thread) which will be truncated
32978 +*/
32979 +static void
32980 +make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32981 +                               loff_t * old_isize, struct inode *inode)
32982 +{
32983 +       int i;
32984 +       int old_nrpages;
32985 +       int new_nrpages = cluster_nrpages_to_capture(clust);
32986 +
32987 +       assert("edward-973", new_nrpages > 0);
32988 +       assert("edward-221", node != NULL);
32989 +       assert("edward-971", clust->reserved == 1);
32990 +       assert_spin_locked(&(node->guard));
32991 +       assert("edward-972", node->page_count < cluster_nrpages(inode));
32992 +       assert("edward-1263",
32993 +              clust->reserved_prepped == estimate_update_cluster(inode));
32994 +       assert("edward-1264", clust->reserved_unprepped == 0);
32995 +
32996 +       if (JF_ISSET(node, JNODE_DIRTY)) {
32997 +               /* someone has modified this cluster, but
32998 +                  the modifications are not committed yet */
32999 +               old_nrpages = /* page count of this cluster at the old file size */
33000 +                       count_to_nrpages(cnt_to_clcnt(*old_isize,
33001 +                                                     clust->index, inode));
33002 +               /* free space which is already reserved */
33003 +               free_reserved4cluster(inode, clust,
33004 +                                     estimate_update_cluster(inode));
33005 +               /* put old references */
33006 +               for (i = 0; i < old_nrpages; i++) {
33007 +                       assert("edward-975", clust->pages[i]);
33008 +                       assert("edward-1185", PageUptodate(clust->pages[i]));
33009 +
33010 +                       page_cache_release(clust->pages[i]);
33011 +#if REISER4_DEBUG
33012 +                       cryptcompress_inode_data(inode)->pgcount --;
33013 +#endif
33014 +               }
33015 +       } else {
33016 +               /* no captured pages */
33017 +               assert("edward-1043", node->page_count == 0);
33018 +               jnode_make_dirty_locked(node);
33019 +               clust->reserved = 0; /* flag cleared without calling free_reserved4cluster() */
33020 +       }
33021 +       /* put pages that will be truncated (if any) */
33022 +       for (i = new_nrpages; i < clust->nr_pages; i++) {
33023 +               assert("edward-1433", clust->pages[i]);
33024 +               assert("edward-1434", PageUptodate(clust->pages[i]));
33025 +               page_cache_release(clust->pages[i]);
33026 +#if REISER4_DEBUG
33027 +               cryptcompress_inode_data(inode)->pgcount --;
33028 +#endif
33029 +       }
33030 +#if REISER4_DEBUG
33031 +       clust->reserved_prepped -= estimate_update_cluster(inode);
33032 +       node->page_count = new_nrpages - 1;
33033 +#endif
33034 +       return;
33035 +}
33036 +
33037 +/* Last step of a page cluster modification, callable from any thread:
33038 +   capture the cluster's jnode (attached to the first page) by try_capture()
33039 +   and, if that succeeded, make it dirty.  Returns try_capture() result. */
33040 +static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
33041 +{
33042 +       jnode *node;
33043 +       loff_t isize_before;
33044 +       int result;
33045 +
33046 +       assert("edward-1029", clust != NULL);
33047 +       assert("edward-1030", clust->reserved == 1);
33048 +       assert("edward-1031", clust->nr_pages != 0);
33049 +       assert("edward-1032", clust->pages != NULL);
33050 +       assert("edward-1033", clust->pages[0] != NULL);
33051 +
33052 +       node = jprivate(clust->pages[0]);
33053 +
33054 +       assert("edward-1035", node != NULL);
33055 +
33056 +       spin_lock_jnode(node);
33057 +       /* sample i_size before inode_set_new_size() is called for a window */
33058 +       isize_before = inode->i_size;
33059 +       if (clust->win != NULL)
33060 +               inode_set_new_size(clust, inode);
33061 +
33062 +       result = try_capture(node, ZNODE_WRITE_LOCK, 0);
33063 +       if (result == 0)
33064 +               make_cluster_jnode_dirty_locked(clust, node,
33065 +                                               &isize_before, inode);
33066 +       assert("edward-1034", !result);
33067 +       spin_unlock_jnode(node);
33068 +       jput(node);
33069 +       return result;
33070 +}
33071 +
33072 +/* Collect unlocked cluster pages for any modifications and attach a jnode.
33073 +   We allocate only one jnode per cluster, this jnode is binded to the first
33074 +   page of this cluster, so we have an extra-reference that will exist with
33075 +   this jnode, other references will be cleaned up in flush time.
33076 +*/
33077 +static int
33078 +grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
33079 +{
33080 +       int i;
33081 +       int result = 0;
33082 +       jnode *node = NULL;
33083 +
33084 +       assert("edward-182", clust != NULL);
33085 +       assert("edward-183", clust->pages != NULL);
33086 +       assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
33087 +
33088 +       if (clust->nr_pages == 0)
33089 +               return 0;
33090 +
33091 +       for (i = 0; i < clust->nr_pages; i++) {
33092 +
33093 +               assert("edward-1044", clust->pages[i] == NULL);
33094 +
33095 +               clust->pages[i] =
33096 +                   grab_cache_page(inode->i_mapping,
33097 +                                   clust_to_pg(clust->index, inode) + i);
33098 +               if (!clust->pages[i]) {
33099 +                       result = RETERR(-ENOMEM);
33100 +                       break;
33101 +               }
33102 +               if (i == 0) {
33103 +                       node = jnode_of_page(clust->pages[i]);
33104 +                       if (IS_ERR(node)) {
33105 +                               result = PTR_ERR(node);
33106 +                               unlock_page(clust->pages[i]);
33107 +                               break;
33108 +                       }
33109 +                       JF_SET(node, JNODE_CLUSTER_PAGE);
33110 +                       unlock_page(clust->pages[i]);
33111 +                       assert("edward-919", node);
33112 +                       continue;
33113 +               }
33114 +               unlock_page(clust->pages[i]);
33115 +       }
33116 +       if (result) {
33117 +               while (i)
33118 +                       page_cache_release(clust->pages[--i]);
33119 +               if (node && !IS_ERR(node))
33120 +                       jput(node);
33121 +               return result;
33122 +       }
33123 +       assert("edward-920", jprivate(clust->pages[0]));
33124 +#if REISER4_DEBUG
33125 +       cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
33126 +#endif
33127 +       return 0;
33128 +}
33129 +
33130 +/* Collect unlocked cluster pages only for read; puts them all on failure */
33131 +static int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
33132 +{
33133 +       int idx;
33134 +       struct page *pg;
33135 +
33136 +       assert("edward-1428", inode != NULL);
33137 +       assert("edward-1429", inode->i_mapping != NULL);
33138 +       assert("edward-787", clust != NULL);
33139 +       assert("edward-788", clust->pages != NULL);
33140 +       assert("edward-789", clust->nr_pages != 0);
33141 +       assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
33142 +
33143 +       for (idx = 0; idx < clust->nr_pages; idx++) {
33144 +               pg = grab_cache_page(inode->i_mapping,
33145 +                                    clust_to_pg(clust->index, inode) + idx);
33146 +               clust->pages[idx] = pg;
33147 +               if (pg == NULL) {
33148 +                       int err = RETERR(-ENOMEM);
33149 +
33150 +                       while (idx > 0)
33151 +                               page_cache_release(clust->pages[--idx]);
33152 +                       return err;
33153 +               }
33154 +               unlock_page(pg);
33155 +       }
33156 +       return 0;
33157 +}
33158 +
33159 +/* @node might be attached by reiser4_writepage(), not by
33160 +   cryptcompress plugin code, but emergency flush should
33161 +   understand that pages of cryptcompress files are not
33162 +   flushable.  Currently compiled out: answers 1 when @page belongs
33163 +   to a file of the CRC_FILE_PLUGIN_ID file plugin, 0 otherwise. */
33164 +#if 0
33165 +int jnode_of_cluster(const jnode * node, struct page * page)
33166 +{
33167 +       assert("edward-1339", node != NULL);
33168 +       assert("edward-1340", page != NULL);
33169 +       assert("edward-1341", page->mapping != NULL);
33170 +       assert("edward-1342", page->mapping->host != NULL);
33171 +       assert("edward-1343",
33172 +              ergo(jnode_is_unformatted(node),
33173 +                   get_inode_oid(page->mapping->host) ==
33174 +                   node->key.j.objectid));
33175 +       if (inode_file_plugin(page->mapping->host) ==
33176 +           file_plugin_by_id(CRC_FILE_PLUGIN_ID)) {
33177 +#if REISER4_DEBUG
33178 +               if (!jnode_is_cluster_page(node))
33179 +                       warning("edward-1345",
33180 +                       "inode %llu: cluster page of index %lu became private",
33181 +                       (unsigned long long)get_inode_oid(page->mapping->host),
33182 +                       page->index);
33183 +#endif
33184 +               return 1;
33185 +       }
33186 +       return 0;
33187 +}
33188 +#endif  /*  0  */
33189 +
33190 +/* Drop one page cache reference on every page of the cluster */
33191 +void release_cluster_pages(reiser4_cluster_t * clust)
33192 +{
33193 +       int i = 0;
33194 +
33195 +       assert("edward-447", clust != NULL);
33196 +
33197 +       while (i < clust->nr_pages) {
33198 +               assert("edward-449", clust->pages[i] != NULL);
33199 +               page_cache_release(clust->pages[i]);
33200 +               i++;
33201 +       }
33202 +}
33203 +
33204 +/* Called when something has failed: put all cluster pages and the jnode */
33205 +static void release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
33206 +{
33207 +       jnode *node;
33208 +
33209 +       assert("edward-445", clust != NULL);
33210 +       assert("edward-922", clust->pages != NULL);
33211 +       assert("edward-446", clust->pages[0] != NULL);
33212 +
33213 +       node = jprivate(clust->pages[0]);
33214 +
33215 +       assert("edward-447", node != NULL);
33216 +       /* the jnode was looked up above, before the page references are put */
33217 +       release_cluster_pages(clust);
33218 +       jput(node);
33219 +}
33220 +
33221 +#if REISER4_DEBUG
33222 +static int window_ok(reiser4_slide_t * win, struct inode *inode)
33223 +{
33224 +       assert("edward-1115", win != NULL);
33225 +       assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
33226 +       /* bad if it starts at the cluster size or runs past the cluster */
33227 +       return !(win->off == inode_cluster_size(inode) ||
33228 +           win->off + win->count + win->delta > inode_cluster_size(inode));
33229 +}
33230 +
33231 +static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
33232 +{
33233 +       assert("edward-279", clust != NULL);
33234 +       /* needs a page array; a window, when present, must pass window_ok() */
33235 +       if (clust->pages == NULL)
33236 +               return 0;
33237 +       return clust->win == NULL || window_ok(clust->win, inode);
33238 +}
33239 +#endif
33240 +
33241 +/* Next window stat: a hole window with zero delta stays a hole, else data */
33242 +static inline window_stat next_window_stat(reiser4_slide_t * win)
33243 +{
33244 +       assert("edward-1130", win != NULL);
33245 +       return (win->stat != HOLE_WINDOW || win->delta != 0) ?
33246 +           DATA_WINDOW : HOLE_WINDOW;
33247 +}
33248 +
33249 +/* guess next cluster index and window params; no-op if there is no window */
33250 +static void
33251 +update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
33252 +              loff_t to_file)
33253 +{
33254 +       reiser4_slide_t *win;
33255 +
33256 +       assert("edward-185", clust != NULL);
33257 +       assert("edward-438", clust->pages != NULL);
33258 +       assert("edward-281", cluster_ok(clust, inode));
33259 +
33260 +       win = clust->win;
33261 +       if (!win)
33262 +               return;
33263 +
33264 +       switch (win->stat) {
33265 +       case DATA_WINDOW:
33266 +               /* go to offset 0 of the next cluster, window stays DATA */
33267 +               clust->index++;
33268 +               win->stat = DATA_WINDOW;
33269 +               win->off = 0;
33270 +               win->count = min_count(inode_cluster_size(inode), to_file);
33271 +               break;
33272 +       case HOLE_WINDOW:
33273 +               switch (next_window_stat(win)) {
33274 +               case HOLE_WINDOW:
33275 +                       /* hole up to @file_off in its cluster, then delta */
33276 +                       clust->index = off_to_clust(file_off, inode);
33277 +                       win->stat = HOLE_WINDOW;
33278 +                       win->off = 0;
33279 +                       win->count = off_to_cloff(file_off, inode);
33280 +                       win->delta =
33281 +                           min_count(inode_cluster_size(inode) - win->count,
33282 +                                     to_file);
33283 +                       break;
33284 +               case DATA_WINDOW:
33285 +                       /* do not move the window, just change its state:
33286 +                          the sum off+count+delta is kept invariant */
33287 +                       win->stat = DATA_WINDOW;
33288 +                       win->off = win->off + win->count;
33289 +                       win->count = win->delta;
33290 +                       win->delta = 0;
33291 +                       break;
33292 +               default:
33293 +                       impossible("edward-282", "wrong next window state");
33294 +               }
33295 +               break;
33296 +       default:
33297 +               impossible("edward-283", "wrong current window state");
33298 +       }
33299 +       assert("edward-1068", cluster_ok(clust, inode));
33300 +}
33301 +
33302 +static int update_sd_cryptcompress(struct inode *inode)
33303 +{
33304 +       int result;
33305 +
33306 +       assert("edward-978", schedulable());
33307 +       assert("edward-1265", get_current_context()->grabbed_blocks == 0);
33308 +
33309 +       /* grab space for the stat data update */
33310 +       result = reiser4_grab_space_force(estimate_update_common(inode),
33311 +                                         BA_CAN_COMMIT);
33312 +       assert("edward-979", !result);
33313 +       if (result == 0) {
33314 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33315 +               result = reiser4_update_sd(inode);
33316 +               /* whatever the update returned, free all grabbed space */
33317 +               all_grabbed2free();
33318 +       }
33319 +       return result;
33320 +}
33321 +
33322 +
33323 +/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
33324 +static void uncapture_cluster_jnode(jnode * node)
33325 +{
33326 +       txn_atom *atom;
33327 +       /* must be entered with the jnode spinlock (node->guard) held */
33328 +       assert_spin_locked(&(node->guard));
33329 +
33330 +       /* jnode_make_clean(node) is disabled here */
33331 +       atom = jnode_get_atom(node);
33332 +       if (atom == NULL) {
33333 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
33334 +               spin_unlock_jnode(node);
33335 +               return;
33336 +       }
33337 +       /* NOTE(review): presumably uncapture_block() drops the jnode lock */
33338 +       uncapture_block(node);
33339 +       spin_unlock_atom(atom);
33340 +       jput(node);
33341 +}
33342 +
33343 +static void forget_cluster_pages(struct page **pages, int nr)
33344 +{
33345 +       int i = 0;
33346 +       while (i < nr) {        /* put one reference per page */
33347 +               assert("edward-1045", pages[i] != NULL);
33348 +               page_cache_release(pages[i]);
33349 +               i++;
33350 +       }
33351 +}
33352 +
33353 +/* Check out last modifications we are about to commit: uncapture @node and
33354 +   copy the cluster's pages into the INPUT_STREAM of @clust->tc, preparing
33355 +   it for transform operations.  Returns grab_tfm_stream() result. */
33356 +int
33357 +flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
33358 +                   struct inode *inode)
33359 +{
33360 +       int result = 0;
33361 +       int i;
33362 +       int nr_pages = 0;
33363 +       tfm_cluster_t *tc = &clust->tc;
33364 +
33365 +       assert("edward-980", node != NULL);
33366 +       assert("edward-236", inode != NULL);
33367 +       assert("edward-237", clust != NULL);
33368 +       assert("edward-240", !clust->win);
33369 +       assert("edward-241", schedulable());
33370 +       assert("edward-718", crc_inode_ok(inode));
33371 +
33372 +       result = grab_tfm_stream(inode, tc, INPUT_STREAM);
33373 +       if (result) {
33374 +               warning("edward-1430",
33375 +                       "alloc stream failed with ret=%d", result);
33376 +               return result;
33377 +       }
33378 +       spin_lock_jnode(node);
33379 +       assert("edward-1435", JF_ISSET(node, JNODE_DIRTY));
33380 +
33381 +       /* Check out a size of logical cluster and
33382 +          set a number of cluster pages to commit. */
33383 +       tc->len = tc->lsize = fsize_to_count(clust, inode);
33384 +       clust->nr_pages = count_to_nrpages(tc->len);
33385 +
33386 +       assert("edward-983", clust->nr_pages == node->page_count + 1);
33387 +#if REISER4_DEBUG
33388 +       node->page_count = 0;
33389 +#endif
33390 +       cluster_reserved2grabbed(estimate_update_cluster(inode));
33391 +       uncapture_cluster_jnode(node);
33392 +       /* the jnode lock is expected to be dropped by the call above */
33393 +       assert("edward-1224", schedulable());
33394 +       /* Check out cluster pages to commit */
33395 +       nr_pages =
33396 +             find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
33397 +                            clust->nr_pages, clust->pages);
33398 +       /* NOTE(review): a short lookup result is caught only by this assert */
33399 +       assert("edward-1280", nr_pages == clust->nr_pages);
33400 +       /* Construct input stream from the checked out pages */
33401 +       for (i = 0; i < clust->nr_pages; i++) {
33402 +               char *data;
33403 +
33404 +               assert("edward-242", clust->pages[i] != NULL);
33405 +               assert("edward-1436", clust->pages[i]->index ==
33406 +                      clust_to_pg(clust->index, inode) + i);
33407 +               assert("edward-1437", PageUptodate(clust->pages[i]));
33408 +               /* flush the page into the input stream */
33409 +               lock_page(clust->pages[i]);
33410 +               data = kmap(clust->pages[i]);
33411 +
33412 +               assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
33413 +
33414 +               memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
33415 +                      data, cnt_to_pgcnt(tc->len, i));
33416 +               kunmap(clust->pages[i]);
33417 +               unlock_page(clust->pages[i]);
33418 +       }
33419 +       clear_cluster_pages_dirty(clust);
33420 +       release_cluster_pages(clust);   /* presumably refs kept since capture */
33421 +#if REISER4_DEBUG
33422 +       cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
33423 +#endif
33424 +       /* second put: the references taken by find_get_pages() above */
33425 +       release_cluster_pages(clust);
33426 +       return result;
33427 +}
33428 +
33429 +/* Set up @hint (seal, offset, mode) for the cluster of the index @index */
33430 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
33431 +                            cloff_t index, znode_lock_mode mode)
33432 +{
33433 +       reiser4_key key;
33434 +       file_plugin *fplug;
33435 +
33436 +       assert("edward-722", crc_inode_ok(inode));
33437 +       assert("edward-723",
33438 +              inode_file_plugin(inode) == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
33439 +
33440 +       fplug = inode_file_plugin(inode);
33441 +       fplug->key_by_inode(inode, clust_to_off(index, inode), &key);
33442 +
33443 +       seal_init(&hint->seal, &hint->ext_coord.coord, &key);
33444 +       hint->offset = get_key_offset(&key);
33445 +       hint->mode = mode;
33446 +}
33447 +
33448 +void invalidate_hint_cluster(reiser4_cluster_t * clust)
33449 +{
33450 +       assert("edward-1291", clust != NULL);
33451 +       assert("edward-1292", clust->hint != NULL);
33452 +       /* finish with the hint's lock handle, mark its coord as not valid */
33453 +       done_lh(&clust->hint->lh);
33454 +       clust->hint->ext_coord.valid = 0;
33455 +}
33456 +
33457 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
33458 +                znode_lock_mode mode)
33459 +{
33460 +       assert("edward-1286", clust != NULL);
33461 +       assert("edward-1287", clust->hint != NULL);
33462 +       /* aim the hint at the next cluster (index + 1), then invalidate it */
33463 +       set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
33464 +       invalidate_hint_cluster(clust);
33465 +}
33466 +
33467 +static int
33468 +balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
33469 +                          loff_t off, loff_t to_file)
33470 +{
33471 +       int result;
33472 +
33473 +       assert("edward-724", inode != NULL);
33474 +       assert("edward-725", crc_inode_ok(inode));
33475 +       assert("edward-1272", get_current_context()->grabbed_blocks == 0);
33476 +
33477 +       /* set next window params, then bring stat data up to date */
33478 +       update_cluster(inode, clust, off, to_file);
33479 +       result = update_sd_cryptcompress(inode);
33480 +       assert("edward-988", !result);
33481 +
33482 +       if (result == 0) {
33483 +               assert("edward-726", clust->hint->lh.owner == NULL);
33484 +               /* throttle the writer, then free all grabbed space */
33485 +               reiser4_throttle_write(inode);
33486 +               all_grabbed2free();
33487 +       }
33488 +       return result;
33489 +}
33490 +
33491 +/* set zeroes to the cluster, update it, and maybe, try to capture its pages */
33492 +static int
33493 +write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
33494 +          loff_t to_file)
33495 +{
33496 +       char *data;
33497 +       int result = 0;
33498 +       unsigned cl_off, cl_count = 0;
33499 +       unsigned to_pg, pg_off;
33500 +       reiser4_slide_t *win;
33501 +
33502 +       assert("edward-190", clust != NULL);
33503 +       assert("edward-1069", clust->win != NULL);
33504 +       assert("edward-191", inode != NULL);
33505 +       assert("edward-727", crc_inode_ok(inode));
33506 +       assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
33507 +       assert("edward-1154",
33508 +              ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
33509 +
33510 +       win = clust->win;
33511 +
33512 +       assert("edward-1070", win != NULL);
33513 +       assert("edward-201", win->stat == HOLE_WINDOW);
33514 +       assert("edward-192", cluster_ok(clust, inode));
33515 +
33516 +       if (win->off == 0 && win->count == inode_cluster_size(inode)) {
33517 +               /* the hole covers the whole cluster, so it will be
                   represented by fake disk cluster: no pages are zeroed */
33519 +               return 0;
33520 +       }
33521 +       cl_count = win->count;  /* number of zeroes to write */
33522 +       cl_off = win->off;
33523 +       pg_off = off_to_pgoff(win->off);
33524 +
33525 +       while (cl_count) {
33526 +               struct page *page;
33527 +               page = clust->pages[off_to_pg(cl_off)];
33528 +
33529 +               assert("edward-284", page != NULL);
33530 +
33531 +               to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
33532 +               lock_page(page);
33533 +               data = kmap_atomic(page, KM_USER0);
33534 +               memset(data + pg_off, 0, to_pg);
33535 +               flush_dcache_page(page);
33536 +               kunmap_atomic(data, KM_USER0);
33537 +               SetPageUptodate(page);
33538 +               unlock_page(page);
33539 +
33540 +               cl_off += to_pg;
33541 +               cl_count -= to_pg;
33542 +               pg_off = 0;
33543 +       }
33544 +       if (!win->delta) {
33545 +               /* only zeroes, try to capture */
33546 +
33547 +               set_cluster_pages_dirty(clust);
33548 +               result = try_capture_cluster(clust, inode);
33549 +               if (result)
33550 +                       return result;
33551 +               put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
33552 +               result =
33553 +                   balance_dirty_page_cluster(clust, inode, file_off, to_file);
33554 +       } else
33555 +               update_cluster(inode, clust, file_off, to_file);
33556 +       return result;
33557 +}
33558 +
33559 +/*
33560 +  The main disk search procedure for cryptcompress plugins, which
33561 +  . scans all items of disk cluster
33562 +  . maybe reads each one (if @read != 0)
33563 +  . maybe makes its znode dirty  (if @write != 0)
33564 +  . sets @clust->dstat, and @clust->tc len/lsize if items were found
33565 +  NOTE-EDWARD: Callers should handle the case when disk cluster
33566 +  is incomplete (-EIO)
33567 +*/
33568 +int
33569 +find_cluster(reiser4_cluster_t * clust,
33570 +            struct inode *inode, int read, int write)
33571 +{
33572 +       flow_t f;
33573 +       hint_t *hint;
33574 +       int result = 0;
33575 +       unsigned long cl_idx;
33576 +       ra_info_t ra_info;
33577 +       file_plugin *fplug;
33578 +       item_plugin *iplug;
33579 +       tfm_cluster_t *tc;
33580 +       int was_grabbed;
33581 +
33582 +       assert("edward-138", clust != NULL);
33583 +       assert("edward-728", clust->hint != NULL);
33584 +       assert("edward-225", read || write);
33585 +       assert("edward-226", schedulable());
33586 +       assert("edward-137", inode != NULL);
33587 +       assert("edward-729", crc_inode_ok(inode));
33588 +
33589 +       hint = clust->hint;
33590 +       cl_idx = clust->index;
33591 +       fplug = inode_file_plugin(inode);
33592 +       was_grabbed = get_current_context()->grabbed_blocks;
33593 +       tc = &clust->tc;
33594 +
33595 +       assert("edward-462", !tfm_cluster_is_uptodate(tc));
33596 +       assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
33597 +
33598 +       /* set key of the first disk cluster item */
33599 +       fplug->flow_by_inode(inode,
33600 +                            (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
33601 +                            0 /* kernel space */ ,
33602 +                            inode_scaled_cluster_size(inode),
33603 +                            clust_to_off(cl_idx, inode), READ_OP, &f);
33604 +       if (write) {
33605 +               /* reserve for flush to make dirty all the leaf nodes
33606 +                  which contain disk cluster */
33607 +               result =
33608 +                   reiser4_grab_space_force(estimate_dirty_cluster(inode),
33609 +                                            BA_CAN_COMMIT);
33610 +               assert("edward-990", !result);
33611 +               if (result)
33612 +                       goto out;
33613 +       }
33614 +       /* stop key for zload_ra(): the same key with the maximal offset */
33615 +       ra_info.key_to_stop = f.key;
33616 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(max_key()));
33617 +
33618 +       while (f.length) {
33619 +               result = find_cluster_item(hint,
33620 +                                          &f.key,
33621 +                                          (write ? ZNODE_WRITE_LOCK :
33622 +                                           ZNODE_READ_LOCK), NULL, FIND_EXACT,
33623 +                                          (write ? CBK_FOR_INSERT : 0));
33624 +               switch (result) {
33625 +               case CBK_COORD_NOTFOUND:
33626 +                       result = 0;
33627 +                       if (inode_scaled_offset
33628 +                           (inode,
33629 +                            clust_to_off(cl_idx,
33630 +                                         inode)) == get_key_offset(&f.key)) {
33631 +                               /* first item not found, this is treated
33632 +                                  as disk cluster is absent */
33633 +                               clust->dstat = FAKE_DISK_CLUSTER;
33634 +                               goto out;
33635 +                       }
33636 +                       /* we are outside the cluster, stop search here */
33637 +                       assert("edward-146",
33638 +                              f.length != inode_scaled_cluster_size(inode));
33639 +                       goto ok;
33640 +               case CBK_COORD_FOUND:
33641 +                       assert("edward-148",
33642 +                              hint->ext_coord.coord.between == AT_UNIT);
33643 +                       assert("edward-460",
33644 +                              hint->ext_coord.coord.unit_pos == 0);
33645 +
33646 +                       coord_clear_iplug(&hint->ext_coord.coord);
33647 +                       result = zload_ra(hint->ext_coord.coord.node, &ra_info);
33648 +                       if (unlikely(result))
33649 +                               goto out;
33650 +                       iplug = item_plugin_by_coord(&hint->ext_coord.coord);
33651 +                       assert("edward-147",
33652 +                              item_id_by_coord(&hint->ext_coord.coord) ==
33653 +                              CTAIL_ID);
33654 +
33655 +                       result = iplug->s.file.read(NULL, &f, hint);
33656 +                       if (result) {
33657 +                               zrelse(hint->ext_coord.coord.node);
33658 +                               goto out;
33659 +                       }
33660 +                       if (write) {
33661 +                               znode_make_dirty(hint->ext_coord.coord.node);
33662 +                               znode_set_convertible(hint->ext_coord.coord.
33663 +                                                     node);
33664 +                       }
33665 +                       zrelse(hint->ext_coord.coord.node);
33666 +                       break;
33667 +               default:
33668 +                       goto out;
33669 +               }
33670 +       }
33671 + ok:
33672 +       /* at least one item was found  */
33673 +       /* NOTE-EDWARD: Callers should handle the case
33674 +          when disk cluster is incomplete (-EIO) */
33675 +       tc->len = inode_scaled_cluster_size(inode) - f.length;
33676 +       tc->lsize = fsize_to_count(clust, inode);
33677 +       assert("edward-1196", tc->len > 0);
33678 +       assert("edward-1406", tc->lsize > 0);
33679 +
33680 +       if (hint_is_unprepped_dclust(clust->hint))
33681 +               clust->dstat = UNPR_DISK_CLUSTER;
33682 +       else
33683 +               clust->dstat = PREP_DISK_CLUSTER;
33684 + out:
33685 +       assert("edward-1339",
33686 +              get_current_context()->grabbed_blocks >= was_grabbed);
33687 +       grabbed2free(get_current_context(),
33688 +                    get_current_super_private(),
33689 +                    get_current_context()->grabbed_blocks - was_grabbed);
33690 +       return result;
33691 +}
33692 +
33693 +/* Get a locked tree position for the disk cluster of @clust.  If the hint
33694 +   already holds a valid position, answer from the known disk cluster state;
33695 +   otherwise return what find_cluster_item() finds for the cluster's key. */
33696 +int
33697 +get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
33698 +                       znode_lock_mode lock_mode)
33699 +{
33700 +       reiser4_key key;
33701 +
33702 +       assert("edward-730", schedulable());
33703 +       assert("edward-731", clust != NULL);
33704 +       assert("edward-732", inode != NULL);
33705 +
33706 +       if (clust->hint->ext_coord.valid) {
33707 +               assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33708 +               assert("edward-1294",
33709 +                      znode_is_write_locked(clust->hint->lh.node));
33710 +               /* already have a valid locked position */
33711 +               return (clust->dstat ==
33712 +                       FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33713 +                       CBK_COORD_FOUND);
33714 +       }
33715 +       key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33716 +                                  &key);
33717 +
33718 +       return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33719 +                                CBK_FOR_INSERT);
33720 +}
33721 +
33722 +/* Read needed cluster pages before modifying.
33723 +   If success, @clust->hint contains locked position in the tree.
33724 +   Also:
33725 +   . find and set disk cluster state
33726 +   . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33727 +*/
33728 +static int
33729 +read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
33730 +{
33731 +       int i;
33732 +       int result = 0;
33733 +       item_plugin *iplug;
33734 +       reiser4_slide_t *win = clust->win;
33735 +
33736 +       iplug = item_plugin_by_id(CTAIL_ID);
33737 +
33738 +       assert("edward-733", get_current_context()->grabbed_blocks == 0);
33739 +       assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33740 +
33741 +#if REISER4_DEBUG
33742 +       if (clust->nr_pages == 0) {
33743 +               /* start write hole from fake disk cluster */
33744 +               assert("edward-1117", win != NULL);
33745 +               assert("edward-1118", win->stat == HOLE_WINDOW);
33746 +               assert("edward-1119", new_cluster(clust, inode));
33747 +       }
33748 +#endif
33749 +       if (new_cluster(clust, inode)) {
33750 +               /*
33751 +                  new page cluster is about to be written, nothing to read,
33752 +                */
33753 +               assert("edward-734", schedulable());
33754 +               assert("edward-735", clust->hint->lh.owner == NULL);
33755 +
33756 +               if (clust->nr_pages) {
33757 +                       int off;
33758 +                       char *data;
33759 +                       struct page * pg;
33760 +                       assert("edward-1419", clust->pages != NULL);
33761 +                       pg = clust->pages[clust->nr_pages - 1];
33762 +                       assert("edward-1420", pg != NULL);
33763 +                       off = off_to_pgoff(win->off+win->count+win->delta);
33764 +                       if (off) {
33765 +                               lock_page(pg);
33766 +                               data = kmap_atomic(pg, KM_USER0);
33767 +                               memset(data + off, 0, PAGE_CACHE_SIZE - off);
33768 +                               flush_dcache_page(pg);
33769 +                               kunmap_atomic(data, KM_USER0);
33770 +                               unlock_page(pg);
33771 +                       }
33772 +               }
33773 +               clust->dstat = FAKE_DISK_CLUSTER;
33774 +               return 0;
33775 +       }
33776 +       /*
33777 +          Here we should search for disk cluster to figure out its real state.
33778 +          Also there is one more important reason to do disk search: we need
33779 +          to make disk cluster _dirty_ if it exists
33780 +        */
33781 +
33782 +       /* if windows is specified, read the only pages
33783 +          that will be modified partially */
33784 +
33785 +       for (i = 0; i < clust->nr_pages; i++) {
33786 +               struct page *pg = clust->pages[i];
33787 +
33788 +               lock_page(pg);
33789 +               if (PageUptodate(pg)) {
33790 +                       unlock_page(pg);
33791 +                       continue;
33792 +               }
33793 +               unlock_page(pg);
33794 +
33795 +               if (win &&
33796 +                   i >= count_to_nrpages(win->off) &&
33797 +                   i < off_to_pg(win->off + win->count + win->delta))
33798 +                       /* page will be completely overwritten */
33799 +                       continue;
33800 +
33801 +               if (win && (i == clust->nr_pages - 1) &&
33802 +                   /* the last page is
33803 +                      partially modified,
33804 +                      not uptodate .. */
33805 +                   (count_to_nrpages(inode->i_size) <= pg->index)) {
33806 +                       /* .. and appended,
33807 +                          so set zeroes to the rest */
33808 +                       char *data;
33809 +                       int offset;
33810 +                       lock_page(pg);
33811 +                       data = kmap_atomic(pg, KM_USER0);
33812 +
33813 +                       assert("edward-1260",
33814 +                              count_to_nrpages(win->off + win->count +
33815 +                                               win->delta) - 1 == i);
33816 +
33817 +                       offset =
33818 +                           off_to_pgoff(win->off + win->count + win->delta);
33819 +                       memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
33820 +                       flush_dcache_page(pg);
33821 +                       kunmap_atomic(data, KM_USER0);
33822 +                       unlock_page(pg);
33823 +                       /* still not uptodate */
33824 +                       break;
33825 +               }
33826 +               if (!tfm_cluster_is_uptodate(&clust->tc)) {
33827 +                       result = ctail_read_disk_cluster(clust, inode, 1);
33828 +                       assert("edward-992", !result);
33829 +                       if (result)
33830 +                               goto out;
33831 +                       assert("edward-925",
33832 +                              tfm_cluster_is_uptodate(&clust->tc));
33833 +               }
33834 +               lock_page(pg);
33835 +               result = do_readpage_ctail(inode, clust, pg);
33836 +               unlock_page(pg);
33837 +               assert("edward-993", !result);
33838 +               if (result) {
33839 +                       impossible("edward-219",
33840 +                                  "do_readpage_ctail returned crap");
33841 +                       goto out;
33842 +               }
33843 +       }
33844 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
33845 +               /* disk cluster unclaimed, but we need to make its znodes dirty
33846 +                  to make flush update convert its content */
33847 +               result =
33848 +                   find_cluster(clust, inode, 0 /* do not read */ ,
33849 +                                1 /* write */ );
33850 +               assert("edward-994", !result);
33851 +       }
33852 + out:
33853 +       tfm_cluster_clr_uptodate(&clust->tc);
33854 +       return result;
33855 +}
33856 +/* Return 1 if an unprepped disk cluster should be created for @clust, 0 otherwise */
33857 +static int
33858 +should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33859 +{
33860 +       assert("edward-737", clust != NULL);
33861 +
33862 +       switch (clust->dstat) {
33863 +       case PREP_DISK_CLUSTER:
33864 +       case UNPR_DISK_CLUSTER:
33865 +               return 0;
33866 +       case FAKE_DISK_CLUSTER:
33867 +               if (clust->win &&
33868 +                   clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33869 +                       assert("edward-1172", new_cluster(clust, inode));
33870 +                       return 0;       /* hole window with no pages in the cluster */
33871 +               }
33872 +               return 1;
33873 +       default:
33874 +               impossible("edward-1173", "bad disk cluster state");
33875 +               return 0;
33876 +       }
33877 +}
33878 +/* Insert an unprepped disk cluster for @clust if should_create_unprepped_cluster() asks for it */
33879 +static int
33880 +crc_make_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33881 +{
33882 +       int result;
33883 +
33884 +       assert("edward-1123", schedulable());
33885 +       assert("edward-737", clust != NULL);
33886 +       assert("edward-738", inode != NULL);
33887 +       assert("edward-739", crc_inode_ok(inode));
33888 +       assert("edward-1053", clust->hint != NULL);
33889 +       assert("edward-1266", get_current_context()->grabbed_blocks == 0);
33890 +
33891 +       if (clust->reserved) {
33892 +               cluster_reserved2grabbed(estimate_insert_cluster(inode));
33893 +#if REISER4_DEBUG
33894 +               assert("edward-1267",
33895 +                      clust->reserved_unprepped ==
33896 +                      estimate_insert_cluster(inode));
33897 +               clust->reserved_unprepped -= estimate_insert_cluster(inode);
33898 +#endif
33899 +       }
33900 +       if (!should_create_unprepped_cluster(clust, inode)) {
33901 +               all_grabbed2free();     /* nothing to insert */
33902 +               return 0;
33903 +       } else {
33904 +               assert("edward-1268", clust->reserved == 1);
33905 +       }
33906 +       result = ctail_insert_unprepped_cluster(clust, inode);
33907 +       all_grabbed2free();     /* called whether or not the insertion succeeded */
33908 +       if (result)
33909 +               return result;
33910 +
33911 +       assert("edward-743", crc_inode_ok(inode));
33912 +       assert("edward-1269", get_current_context()->grabbed_blocks == 0);
33913 +       assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33914 +
33915 +       clust->dstat = UNPR_DISK_CLUSTER;
33916 +       return 0;
33917 +}
33918 +/* Debug check used by truncate_page_cluster(): returns 1 if jlookup() finds no jnode for cluster @index */
33919 +#if REISER4_DEBUG
33920 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
33921 +{
33922 +       jnode *node;
33923 +       node =
33924 +           jlookup(current_tree, get_inode_oid(inode),
33925 +                   clust_to_pg(index, inode));
33926 +       if (likely(!node))
33927 +               return 1;
33928 +       /* someone got this jnode: warn and report its x_count */
33929 +       warning("edward-1315", "jnode %p is untruncated\n", node);
33930 +       jput(node);
33931 +       return (atomic_read(&node->x_count));   /* NOTE(review): @node is read after jput() - confirm it is still pinned here */
33932 +}
33933 +#endif
33934 +
33935 +/* Collect unlocked cluster pages; if @capture is set, also grab the jnode
33936 +   (the case when the page cluster will be modified and captured) */
33937 +int
33938 +prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33939 +                    int capture)
33940 +{
33941 +       assert("edward-177", inode != NULL);
33942 +       assert("edward-741", crc_inode_ok(inode));
33943 +       assert("edward-740", clust->pages != NULL);
33944 +
33945 +       set_cluster_nrpages(clust, inode);
33946 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
33947 +       return (capture ?
33948 +               grab_cluster_pages_jnode(inode, clust) :
33949 +               grab_cluster_pages(inode, clust));
33950 +}
33951 +
33952 +/* Truncate all pages of the page cluster with index @index.
33953 +   This is called by ->kill_hook() method of item plugin */
33954 +void truncate_page_cluster(struct inode *inode, cloff_t index)
33955 +{
33956 +       int i;
33957 +       int found = 0;
33958 +       int nr_pages;
33959 +       jnode *node;
33960 +       struct page *pages[MAX_CLUSTER_NRPAGES];
33961 +
33962 +       node =
33963 +           jlookup(current_tree, get_inode_oid(inode),
33964 +                   clust_to_pg(index, inode));
33965 +       /* jnode is absent, just drop pages which can not
33966 +          acquire jnode because of exclusive access */
33967 +       if (!node) {
33968 +               truncate_inode_pages_range(inode->i_mapping,
33969 +                                          clust_to_off(index, inode),
33970 +                                          clust_to_off(index,
33971 +                                                       inode) +
33972 +                                          inode_cluster_size(inode) - 1);
33973 +               return;
33974 +       }
33975 +       /* jnode is present and may be dirty */
33976 +       nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33977 +
33978 +       found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33979 +                              nr_pages, pages);
33980 +       spin_lock_jnode(node);
33981 +       if (JF_ISSET(node, JNODE_DIRTY)) {
33982 +               /* someone has done modifications which are not
33983 +                  yet committed, so we need to release some resources */
33984 +
33985 +               /* free disk space grabbed for disk cluster converting */
33986 +               cluster_reserved2grabbed(estimate_update_cluster(inode));
33987 +               grabbed2free(get_current_context(),
33988 +                            get_current_super_private(),
33989 +                            estimate_update_cluster(inode));
33990 +
33991 +               assert("edward-1198", found == nr_pages);
33992 +               assert("edward-1199", node->page_count + 1 == nr_pages);
33993 +#if REISER4_DEBUG
33994 +               node->page_count = 0;
33995 +#endif
33996 +               /* This will clear dirty bit. NOTE(review): no explicit spin_unlock_jnode() on this branch - presumably done by the callee; confirm */
33997 +               uncapture_cluster_jnode(node);
33998 +
33999 +               /* put pages grabbed for last uncommitted modifications */
34000 +               for (i = 0; i < nr_pages; i++) {
34001 +                       assert("edward-1200", PageUptodate(pages[i]));
34002 +                       page_cache_release(pages[i]);
34003 +#if REISER4_DEBUG
34004 +                       cryptcompress_inode_data(inode)->pgcount --;
34005 +#endif
34006 +               }
34007 +       } else
34008 +               spin_unlock_jnode(node);
34009 +       /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
34010 +
34011 +       jput(node);
34012 +       /* put the references taken by find_get_pages() above */
34013 +       forget_cluster_pages(pages, found);
34014 +       truncate_inode_pages_range(inode->i_mapping,
34015 +                                  clust_to_off(index, inode),
34016 +                                  clust_to_off(index,
34017 +                                               inode) +
34018 +                                  inode_cluster_size(inode) - 1);
34019 +       assert("edward-1201", jnode_truncate_ok(inode, index));
34020 +       return;
34021 +}
34022 +
34023 +/* Prepare cluster handle before(after) modifications
34024 +   which are supposed to be committed.
34025 +
34026 +   . grab cluster pages;
34027 +   . reserve disk space;
34028 +   . maybe read pages from disk and set the disk cluster dirty;
34029 +   . maybe create 'unprepped' disk cluster if the last one is fake
34030 +     (i.e. is not represented by any items);
34031 +   . maybe write hole
34032 +*/
34033 +
34034 +static int
34035 +prepare_cluster(struct inode *inode,
34036 +               loff_t file_off /* write position in the file */ ,
34037 +               loff_t to_file, /* bytes of users data to write to the file */
34038 +               reiser4_cluster_t * clust, page_cluster_op op)
34039 +{
34040 +       int result = 0;
34041 +       reiser4_slide_t *win = clust->win;
34042 +
34043 +       assert("edward-1273", get_current_context()->grabbed_blocks == 0);
34044 +       reset_cluster_params(clust);
34045 +#if REISER4_DEBUG
34046 +       clust->ctx = get_current_context();
34047 +#endif
34048 +       assert("edward-1190", op != PCL_UNKNOWN);
34049 +
34050 +       clust->op = op;
34051 +
34052 +       result = prepare_page_cluster(inode, clust, 1);
34053 +       if (result)
34054 +               return result;
34055 +       result = reserve4cluster(inode, clust);
34056 +       if (result)
34057 +               goto err1;
34058 +       result = read_some_cluster_pages(inode, clust);
34059 +       if (result) {
34060 +               free_reserved4cluster(inode,
34061 +                                     clust,
34062 +                                     estimate_update_cluster(inode) +
34063 +                                     estimate_insert_cluster(inode));
34064 +               goto err1;
34065 +       }
34066 +       assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
34067 +
34068 +       result = crc_make_unprepped_cluster(clust, inode);
34069 +       if (result)
34070 +               goto err2;
34071 +       if (win && win->stat == HOLE_WINDOW) {
34072 +               result = write_hole(inode, clust, file_off, to_file);
34073 +               if (result)
34074 +                       goto err2;
34075 +       }
34076 +       return 0;
34077 +      err2:    /* give back the estimate_update_cluster() part of the reservation */
34078 +       free_reserved4cluster(inode, clust,
34079 +                             estimate_update_cluster(inode));
34080 +      err1:    /* drop what prepare_page_cluster() grabbed */
34081 +       release_cluster_pages_and_jnode(clust);
34082 +       assert("edward-1125", result == -ENOSPC);
34083 +       return result;
34084 +}
34085 +
34086 +/* set window @win of @clust by two file offsets @o1 <= @o2; the cluster index and the window start are taken from @o1 */
34087 +static void
34088 +set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
34089 +          struct inode *inode, loff_t o1, loff_t o2)
34090 +{
34091 +       assert("edward-295", clust != NULL);
34092 +       assert("edward-296", inode != NULL);
34093 +       assert("edward-1071", win != NULL);
34094 +       assert("edward-297", o1 <= o2);
34095 +
34096 +       clust->index = off_to_clust(o1, inode);
34097 +
34098 +       win->off = off_to_cloff(o1, inode);
34099 +       win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);  /* bounded by the room left in the cluster */
34100 +       win->delta = 0;
34101 +
34102 +       clust->win = win;
34103 +}
34104 +/* Allocate the page set of @clust and set up @win for writing flow @f at @file_off (hole or data window) */
34105 +static int
34106 +set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
34107 +                     reiser4_slide_t * win, flow_t * f, loff_t file_off)
34108 +{
34109 +       int result;
34110 +
34111 +       assert("edward-197", clust != NULL);
34112 +       assert("edward-1072", win != NULL);
34113 +       assert("edward-198", inode != NULL);
34114 +
34115 +       result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
34116 +       if (result)
34117 +               return result;
34118 +
34119 +       if (file_off > inode->i_size) {
34120 +               /* Uhmm, hole in cryptcompress file... */
34121 +               loff_t hole_size;
34122 +               hole_size = file_off - inode->i_size;
34123 +
34124 +               set_window(clust, win, inode, inode->i_size, file_off);
34125 +               win->stat = HOLE_WINDOW;
34126 +               if (win->off + hole_size < inode_cluster_size(inode))
34127 +                       /* there is also user's data to append to the hole */
34128 +                       win->delta =
34129 +                           min_count(inode_cluster_size(inode) -
34130 +                                     (win->off + win->count), f->length);
34131 +               return 0;
34132 +       }
34133 +       set_window(clust, win, inode, file_off, file_off + f->length);  /* no hole: the window covers user's data */
34134 +       win->stat = DATA_WINDOW;
34135 +       return 0;
34136 +}
34137 +/* Reset (or allocate, if absent) the page set of @clust for @count pages and set the cluster index by @page */
34138 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
34139 +                       int count)
34140 +{
34141 +       int result = 0;
34142 +       int (*setting_actor)(reiser4_cluster_t * clust, int count);
34143 +
34144 +       assert("edward-1358", clust != NULL);
34145 +       assert("edward-1359", page != NULL);
34146 +       assert("edward-1360", page->mapping != NULL);
34147 +       assert("edward-1361", page->mapping->host != NULL);
34148 +
34149 +       setting_actor  = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
34150 +       result = setting_actor(clust, count);
34151 +       clust->index = pg_to_clust(page->index, page->mapping->host);   /* set even if the actor failed */
34152 +       return result;
34153 +}
34154 +
34155 +/* reset the params that do not get updated otherwise: disk cluster state, and uptodate flag and length of @clust->tc */
34156 +void reset_cluster_params(reiser4_cluster_t * clust)
34157 +{
34158 +       assert("edward-197", clust != NULL);
34159 +
34160 +       clust->dstat = INVAL_DISK_CLUSTER;
34161 +       clust->tc.uptodate = 0;
34162 +       clust->tc.len = 0;
34163 +}
34164 +
34165 +/* Core write procedure of cryptcompress plugin, which slices user's
34166 +   flow into logical clusters, maps the last ones to the appropriate
34167 +   page clusters, and tries to capture them.
34168 +   If @buf != NULL, returns number of successfully written bytes (or the
34169 +   error if nothing was written), otherwise returns error
34170 +*/
34171 +static loff_t
34172 +write_cryptcompress_flow(struct file *file, struct inode *inode,
34173 +                        const char __user *buf, size_t count, loff_t pos)
34174 +{
34175 +       int i;
34176 +       flow_t f;
34177 +       hint_t *hint;
34178 +       int result = 0;
34179 +       size_t to_write = 0;
34180 +       loff_t file_off;
34181 +       reiser4_slide_t win;
34182 +       reiser4_cluster_t clust;
34183 +
34184 +       assert("edward-161", schedulable());
34185 +       assert("edward-748", crc_inode_ok(inode));
34186 +       assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
34187 +       assert("edward-1274", get_current_context()->grabbed_blocks == 0);
34188 +
34189 +       result = check_cryptcompress(inode);
34190 +       if (result)
34191 +               return result;
34192 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34193 +       if (hint == NULL)
34194 +               return RETERR(-ENOMEM);
34195 +
34196 +       result = load_file_hint(file, hint);
34197 +       if (result) {
34198 +               kfree(hint);
34199 +               return result;
34200 +       }
34201 +
34202 +       /* init @win and @clust first: every "goto out" puts the cluster handle */
34203 +       reiser4_slide_init(&win);
34204 +       cluster_init_read(&clust, &win);
34205 +       clust.hint = hint;
34206 +       result =
34207 +           flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
34208 +                                       count, pos, WRITE_OP, &f);
34209 +       if (result) {
34210 +               f.length = 0;   /* nothing written: keep the return value at out: defined */
34211 +               goto out;
34212 +       }
34213 +       to_write = f.length;
34214 +       file_off = pos; /* current write position in file */
34215 +       result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
34216 +       if (result)
34217 +               goto out;
34218 +
34219 +       if (next_window_stat(&win) == HOLE_WINDOW) {
34220 +               result =
34221 +                   prepare_cluster(inode, file_off, f.length, &clust,
34222 +                                   PCL_APPEND);
34223 +               if (result)
34224 +                       goto out;
34225 +       }
34226 +       do {
34227 +               char *src;
34228 +               unsigned page_off, page_count;
34229 +
34230 +               assert("edward-750", schedulable());
34231 +
34232 +               result =
34233 +                   prepare_cluster(inode, file_off, f.length, &clust,
34234 +                                   PCL_APPEND);
34235 +               if (result)
34236 +                       goto out;
34237 +
34238 +               assert("edward-751", crc_inode_ok(inode));
34239 +               assert("edward-204", win.stat == DATA_WINDOW);
34240 +               assert("edward-1288", clust.hint->ext_coord.valid);
34241 +               assert("edward-752",
34242 +                      znode_is_write_locked(hint->ext_coord.coord.node));
34243 +
34244 +               put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
34245 +
34246 +               /* set write position in page */
34247 +               page_off = off_to_pgoff(win.off);
34248 +
34249 +               /* copy user's data to cluster pages */
34250 +               for (i = off_to_pg(win.off), src = f.data;
34251 +                    i < count_to_nrpages(win.off + win.count);
34252 +                    i++, src += page_count) {
34253 +                       page_count =
34254 +                           cnt_to_pgcnt(win.off + win.count, i) - page_off;
34255 +
34256 +                       assert("edward-1039",
34257 +                              page_off + page_count <= PAGE_CACHE_SIZE);
34258 +                       assert("edward-287", clust.pages[i] != NULL);
34259 +
34260 +                       lock_page(clust.pages[i]);
34261 +                       result =
34262 +                           __copy_from_user((char *)kmap(clust.pages[i]) +
34263 +                                            page_off, (char __user *)src, page_count);
34264 +                       kunmap(clust.pages[i]);
34265 +                       if (unlikely(result)) {
34266 +                               unlock_page(clust.pages[i]);
34267 +                               result = -EFAULT;
34268 +                               goto err2;
34269 +                       }
34270 +                       SetPageUptodate(clust.pages[i]);
34271 +                       unlock_page(clust.pages[i]);
34272 +                       page_off = 0;
34273 +               }
34274 +               assert("edward-753", crc_inode_ok(inode));
34275 +
34276 +               set_cluster_pages_dirty(&clust);
34277 +
34278 +               result = try_capture_cluster(&clust, inode);
34279 +               if (result)
34280 +                       goto err2;
34281 +
34282 +               assert("edward-998", f.user == 1);
34283 +
34284 +               move_flow_forward(&f, win.count);
34285 +
34286 +               /* disk cluster may be already clean at this point */
34287 +
34288 +               /* . update cluster
34289 +                  . set hint for new offset
34290 +                  . unlock znode
34291 +                  . update inode
34292 +                  . balance dirty pages
34293 +                */
34294 +               result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
34295 +               if (result)
34296 +                       goto err1;
34297 +               assert("edward-755", hint->lh.owner == NULL);
34298 +               reset_cluster_params(&clust);
34299 +               continue;
34300 +             err2:
34301 +               release_cluster_pages_and_jnode(&clust);
34302 +             err1:
34303 +               if (clust.reserved)
34304 +                       free_reserved4cluster(inode,
34305 +                                             &clust,
34306 +                                             estimate_update_cluster(inode));
34307 +               break;
34308 +       } while (f.length);
34309 +      out:
34310 +       done_lh(&hint->lh);
34311 +       if (result == -EEXIST)
34312 +               warning("edward-1407", "write returns EEXIST!\n");
34313 +
34314 +       put_cluster_handle(&clust);
34315 +       save_file_hint(file, hint);
34316 +       kfree(hint);
34317 +       if (buf) {
34318 +               /* if nothing were written - there must be an error */
34319 +               assert("edward-195", ergo((to_write == f.length), result < 0));
34320 +               return (to_write - f.length) ? (to_write - f.length) : result;
34321 +       }
34322 +       return result;
34323 +}
34324 +/* Write @count bytes from @buf at *@off while holding the cryptcompress lock for write; advance *@off on success */
34325 +static ssize_t write_crc_file(struct file *file,       /* file to write to */
34326 +                             struct inode *inode,      /* inode */
34327 +                             const char __user *buf,   /* address of user-space buffer */
34328 +                             size_t count,     /* number of bytes to write */
34329 +                             loff_t * off /* position to write which */ )
34330 +{
34331 +
34332 +       int result;
34333 +       loff_t pos;
34334 +       ssize_t written;
34335 +       cryptcompress_info_t *info = cryptcompress_inode_data(inode);
34336 +
34337 +       assert("edward-196", crc_inode_ok(inode));
34338 +
34339 +       result = generic_write_checks(file, off, &count, 0);
34340 +       if (unlikely(result != 0))
34341 +               return result;
34342 +
34343 +       if (unlikely(count == 0))
34344 +               return 0;
34345 +
34346 +       down_write(&info->lock);
34347 +       LOCK_CNT_INC(inode_sem_w);
34348 +
34349 +       pos = *off;
34350 +       written =
34351 +           write_cryptcompress_flow(file, inode, buf, count, pos);
34352 +
34353 +       up_write(&info->lock);
34354 +       LOCK_CNT_DEC(inode_sem_w);
34355 +
34356 +       if (written < 0) {
34357 +               if (written == -EEXIST)
34358 +                       printk("write_crc_file returns EEXIST!\n");
34359 +               return written; /* *@off is left untouched on error */
34360 +       }
34361 +       /* update position in a file */
34362 +       *off = pos + written;
34363 +       /* return number of written bytes */
34364 +       return written;
34365 +}
34366 +
34367 +/**
34368 + * write_cryptcompress - write of struct file_operations
34369 + * @file: file to write to
34370 + * @buf: address of user-space buffer
34371 + * @count: number of bytes to write
34372 + * @off: position in file to write to
34373 + *
34374 + * This is implementation of vfs's write method of struct file_operations for
34375 + * cryptcompress plugin. Runs write_crc_file() under inode->i_mutex.
34376 + */
34377 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
34378 +                           size_t count, loff_t *off)
34379 +{
34380 +       ssize_t result;
34381 +       struct inode *inode;
34382 +       reiser4_context *ctx;
34383 +
34384 +       inode = file->f_dentry->d_inode;
34385 +
34386 +       ctx = init_context(inode->i_sb);
34387 +       if (IS_ERR(ctx))
34388 +               return PTR_ERR(ctx);
34389 +
34390 +       mutex_lock(&inode->i_mutex);
34391 +
34392 +       result = write_crc_file(file, inode, buf, count, off);
34393 +
34394 +       mutex_unlock(&inode->i_mutex);
34395 +
34396 +       context_set_commit_async(ctx);
34397 +       reiser4_exit_context(ctx);
34398 +       return result;
34399 +}
34400 +/* Installed as fsdata->ra2.readpages by read_cryptcompress(): hands @pages to ->readpages() of the ctail item plugin */
34401 +static void
34402 +readpages_crc(struct address_space *mapping, struct list_head *pages,
34403 +             void *data)
34404 +{
34405 +       file_plugin *fplug;
34406 +       item_plugin *iplug;
34407 +
34408 +       assert("edward-1112", mapping != NULL);
34409 +       assert("edward-1113", mapping->host != NULL);
34410 +
34411 +       fplug = inode_file_plugin(mapping->host);       /* only checked by the assertion below */
34412 +       assert("edward-1114", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
34413 +       iplug = item_plugin_by_id(CTAIL_ID);
34414 +
34415 +       iplug->s.file.readpages(data, mapping, pages);
34416 +
34417 +       return;
34418 +}
34419 +/* Number of blocks read_cryptcompress() grabs before reading: the estimate_update_common() value */
34420 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
34421 +{
34422 +       /* reserve one block to update stat data item */
34423 +       assert("edward-1193",
34424 +              inode_file_plugin(inode)->estimate.update ==
34425 +              estimate_update_common);
34426 +       return estimate_update_common(inode);
34427 +}
34428 +
34429 +/**
34430 + * read_cryptcompress - read of struct file_operations
34431 + * @file: file to read from
34432 + * @buf: address of user-space buffer
34433 + * @size: number of bytes to read
34434 + * @off: position in file to read from
34435 + *
34436 + * This is implementation of vfs's read method of struct file_operations for
34437 + * cryptcompress plugin. Calls do_sync_read() under the cryptcompress read lock.
34438 + */
34439 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
34440 +                          loff_t * off)
34441 +{
34442 +       ssize_t result;
34443 +       struct inode *inode;
34444 +       reiser4_context *ctx;
34445 +       reiser4_file_fsdata *fsdata;
34446 +       cryptcompress_info_t *info;
34447 +       reiser4_block_nr needed;
34448 +
34449 +       inode = file->f_dentry->d_inode;
34450 +       assert("edward-1194", !inode_get_flag(inode, REISER4_NO_SD));
34451 +
34452 +       ctx = init_context(inode->i_sb);
34453 +       if (IS_ERR(ctx))
34454 +               return PTR_ERR(ctx);
34455 +
34456 +       info = cryptcompress_inode_data(inode);
34457 +       needed = cryptcompress_estimate_read(inode);
34458 +
34459 +       /* FIXME-EDWARD:
34460 +          Grab space for sd_update so find_cluster will be happy */
34461 +       result = reiser4_grab_space(needed, BA_CAN_COMMIT);
34462 +       if (result != 0) {
34463 +               reiser4_exit_context(ctx);
34464 +               return result;
34465 +       }
34466 +       fsdata = reiser4_get_file_fsdata(file); /* NOTE(review): used unchecked below - confirm it can be neither NULL nor an error pointer */
34467 +       fsdata->ra2.data = file;
34468 +       fsdata->ra2.readpages = readpages_crc;
34469 +
34470 +       down_read(&info->lock);
34471 +       LOCK_CNT_INC(inode_sem_r);
34472 +
34473 +       result = do_sync_read(file, buf, size, off);
34474 +
34475 +       up_read(&info->lock);
34476 +       LOCK_CNT_DEC(inode_sem_r);
34477 +
34478 +       context_set_commit_async(ctx);
34479 +       reiser4_exit_context(ctx);
34480 +
34481 +       return result;
34482 +}
34483 +
34484 +/* If @index > 0, find real disk cluster of the index (@index - 1),
34485 +   If @index == 0 find the real disk cluster of the object of maximal index.
34486 +   Keep incremented index of the result in @found.
34487 +   If success was returned:
34488 +   (@index == 0 && @found == 0) means that the object doesn't have real disk
34489 +   clusters.
34490 +   (@index != 0 && @found == 0) means that disk cluster of (@index -1) doesn't
34491 +   exist.
34492 +*/
34493 +static int
34494 +find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
34495 +{
34496 +       int result;
34497 +       reiser4_key key;
34498 +       loff_t offset;
34499 +       hint_t *hint;
34500 +       lock_handle *lh;
34501 +       lookup_bias bias;
34502 +       coord_t *coord;
34503 +       item_plugin *iplug;
34504 +
34505 +       assert("edward-1131", inode != NULL);
34506 +       assert("edward-95", crc_inode_ok(inode));
34507 +
34508 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34509 +       if (hint == NULL)
34510 +               return RETERR(-ENOMEM);
34511 +       hint_init_zero(hint);
34512 +       lh = &hint->lh;
34513 +
34514 +       bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
34515 +       offset =
34516 +           (index ? clust_to_off(index, inode) -
34517 +            1 : get_key_offset(max_key()));
34518 +
34519 +       key_by_inode_cryptcompress(inode, offset, &key);
34520 +
34521 +       /* look up the item by @key: exact match for @index != 0, else the last item of this object */
34522 +       result =
34523 +           find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
34524 +                             bias, 0);
34525 +       if (cbk_errored(result)) {
34526 +               done_lh(lh);
34527 +               kfree(hint);
34528 +               return result;
34529 +       }
34530 +       if (result == CBK_COORD_NOTFOUND) {
34531 +               /* no real disk clusters */
34532 +               done_lh(lh);
34533 +               kfree(hint);
34534 +               *found = 0;
34535 +               return 0;
34536 +       }
34537 +       /* disk cluster is found */
34538 +       coord = &hint->ext_coord.coord;
34539 +       coord_clear_iplug(coord);
34540 +       result = zload(coord->node);
34541 +       if (unlikely(result)) {
34542 +               done_lh(lh);
34543 +               kfree(hint);
34544 +               return result;
34545 +       }
34546 +       iplug = item_plugin_by_coord(coord);    /* only checked by the assertion below */
34547 +       assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
34548 +       assert("edward-1202", ctail_ok(coord));
34549 +
34550 +       item_key_by_coord(coord, &key);
34551 +       *found = off_to_clust(get_key_offset(&key), inode) + 1;
34552 +
34553 +       assert("edward-1132", ergo(index, index == *found));
34554 +
34555 +       zrelse(coord->node);
34556 +       done_lh(lh);
34557 +       kfree(hint);
34558 +       return 0;
34559 +}
34560 +/* Store in @index the incremented index of the last real disk cluster of @inode (0 if it has none) */
34561 +static int find_fake_appended(struct inode *inode, cloff_t * index)
34562 +{
34563 +       return find_real_disk_cluster(inode, index,
34564 +                                     0 /* find last real one */ );
34565 +}
34566 +
34567 +/* Set left coord when unit is not found after node_lookup()
34568 +   This takes into account that there can be holes in a sequence
34569 +   of disk clusters: AFTER_UNIT is turned into AFTER_ITEM */
34570 +
34571 +static void adjust_left_coord(coord_t * left_coord)
34572 +{
34573 +       switch (left_coord->between) {
34574 +       case AFTER_UNIT:
34575 +               left_coord->between = AFTER_ITEM;       /* fall through */
34576 +       case AFTER_ITEM:
34577 +       case BEFORE_UNIT:
34578 +               break;
34579 +       default:
34580 +               impossible("edward-1204", "bad left coord to cut");
34581 +       }
34582 +       return;
34583 +}
34584 +
34585 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
34586 +int
34587 +cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
34588 +                             const reiser4_key * to_key,
34589 +                             reiser4_key * smallest_removed,
34590 +                             struct inode *object, int truncate, int *progress)
34591 +{
34592 +       lock_handle next_node_lock;
34593 +       coord_t left_coord;
34594 +       int result;
34595 +
34596 +       assert("edward-1158", tap->coord->node != NULL);
34597 +       assert("edward-1159", znode_is_write_locked(tap->coord->node));
34598 +       assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
34599 +
34600 +       *progress = 0;
34601 +       init_lh(&next_node_lock);
34602 +
34603 +       while (1) {
34604 +               znode *node;    /* node from which items are cut */
34605 +               node_plugin *nplug;     /* node plugin for @node */
34606 +
34607 +               node = tap->coord->node;
34608 +
34609 +               /* Move next_node_lock to the next node on the left. */
34610 +               result =
34611 +                   reiser4_get_left_neighbor(&next_node_lock, node,
34612 +                                             ZNODE_WRITE_LOCK,
34613 +                                             GN_CAN_USE_UPPER_LEVELS);
34614 +               if (result != 0 && result != -E_NO_NEIGHBOR)
34615 +                       break;
34616 +               /* FIXME-EDWARD: Check can we delete the node as a whole. */
34617 +               result = tap_load(tap);
34618 +               if (result)
34619 +                       return result;
34620 +
34621 +               /* Prepare the second (right) point for cut_node() */
34622 +               if (*progress)
34623 +                       coord_init_last_unit(tap->coord, node);
34624 +
34625 +               else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
34626 +                       /* set rightmost unit for the items without lookup method */
34627 +                       tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
34628 +
34629 +               nplug = node->nplug;
34630 +
34631 +               assert("edward-1161", nplug);
34632 +               assert("edward-1162", nplug->lookup);
34633 +
34634 +               /* left_coord is leftmost unit cut from @node */
34635 +               result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
34636 +
34637 +               if (IS_CBKERR(result))
34638 +                       break;
34639 +
34640 +               if (result == CBK_COORD_NOTFOUND)
34641 +                       adjust_left_coord(&left_coord);
34642 +
34643 +               /* adjust coordinates so that they are set to existing units */
34644 +               if (coord_set_to_right(&left_coord)
34645 +                   || coord_set_to_left(tap->coord)) {
34646 +                       result = 0;
34647 +                       break;
34648 +               }
34649 +
34650 +               if (coord_compare(&left_coord, tap->coord) ==
34651 +                   COORD_CMP_ON_RIGHT) {
34652 +                       /* keys from @from_key to @to_key are not in the tree */
34653 +                       result = 0;
34654 +                       break;
34655 +               }
34656 +
34657 +               /* cut data from one node */
34658 +               *smallest_removed = *min_key();
34659 +               result = kill_node_content(&left_coord,
34660 +                                          tap->coord,
34661 +                                          from_key,
34662 +                                          to_key,
34663 +                                          smallest_removed,
34664 +                                          next_node_lock.node,
34665 +                                          object, truncate);
34666 +#if REISER4_DEBUG
34667 +               /*node_check(node, ~0U); */
34668 +#endif
34669 +               tap_relse(tap);
34670 +
34671 +               if (result)
34672 +                       break;
34673 +
34674 +               ++(*progress);
34675 +
34676 +               /* Check whether all items with keys >= from_key were removed
34677 +                * from the tree. */
34678 +               if (keyle(smallest_removed, from_key))
34679 +                       /* result = 0; */
34680 +                       break;
34681 +
34682 +               if (next_node_lock.node == NULL)
34683 +                       break;
34684 +
34685 +               result = tap_move(tap, &next_node_lock);
34686 +               done_lh(&next_node_lock);
34687 +               if (result)
34688 +                       break;
34689 +
34690 +               /* Break long cut_tree operation (deletion of a large file) if
34691 +                * atom requires commit. */
34692 +               if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
34693 +                   && current_atom_should_commit()) {
34694 +                       result = -E_REPEAT;
34695 +                       break;
34696 +               }
34697 +       }
34698 +       done_lh(&next_node_lock);
34699 +       return result;
34700 +}
34701 +
34702 +/* Append or expand hole in two steps (exclusive access should be acquired!)
34703 +   1) write zeroes to the current real cluster,
34704 +   2) expand hole via fake clusters (just increase i_size) */
34705 +static int
34706 +cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
34707 +                         loff_t new_size)
34708 +{
34709 +       int result = 0;
34710 +       hint_t *hint;
34711 +       lock_handle *lh;
34712 +       loff_t hole_size;
34713 +       int nr_zeroes;
34714 +       reiser4_slide_t win;
34715 +       reiser4_cluster_t clust;
34716 +
34717 +       assert("edward-1133", inode->i_size < new_size);
34718 +       assert("edward-1134", schedulable());
34719 +       assert("edward-1135", crc_inode_ok(inode));
34720 +       assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
34721 +       assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
34722 +
34723 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34724 +       if (hint == NULL)
34725 +               return RETERR(-ENOMEM);
34726 +       hint_init_zero(hint);
34727 +       lh = &hint->lh;
34728 +
34729 +       reiser4_slide_init(&win);
34730 +       cluster_init_read(&clust, &win);
34731 +       clust.hint = hint;
34732 +
34733 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34734 +       if (result)
34735 +               goto out;
34736 +       if (off_to_cloff(inode->i_size, inode) == 0)
34737 +               goto fake_append;
34738 +       hole_size = new_size - inode->i_size;
34739 +       nr_zeroes =
34740 +               inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
34741 +       if (hole_size < nr_zeroes)
34742 +               nr_zeroes = hole_size;
34743 +       set_window(&clust, &win, inode, inode->i_size,
34744 +                  inode->i_size + nr_zeroes);
34745 +       win.stat = HOLE_WINDOW;
34746 +
34747 +       assert("edward-1137",
34748 +              clust.index == off_to_clust(inode->i_size, inode));
34749 +
34750 +       result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
34751 +
34752 +       assert("edward-1271", !result || result == -ENOSPC);
34753 +       if (result)
34754 +               goto out;
34755 +       assert("edward-1139",
34756 +              clust.dstat == PREP_DISK_CLUSTER ||
34757 +              clust.dstat == UNPR_DISK_CLUSTER);
34758 +
34759 +       assert("edward-1431", hole_size >= nr_zeroes);
34760 +       if (hole_size == nr_zeroes)
34761 +       /* nothing to append anymore */
34762 +               goto out;
34763 +      fake_append:
34764 +       INODE_SET_FIELD(inode, i_size, new_size);
34765 +      out:
34766 +       done_lh(lh);
34767 +       kfree(hint);
34768 +       put_cluster_handle(&clust);
34769 +       return result;
34770 +}
34771 +
34772 +#if REISER4_DEBUG
34773 +static int
34774 +pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
34775 +{
34776 +       struct pagevec pvec;
34777 +       int i;
34778 +       int count;
34779 +       int rest;
34780 +
34781 +       rest = count_to_nrpages(old_size) - start;
34782 +
34783 +       pagevec_init(&pvec, 0);
34784 +
34785 +       while (rest) {
34786 +               count = min_count(pagevec_space(&pvec), rest);
34787 +               pvec.nr = find_get_pages(inode->i_mapping, start,
34788 +                                        count, pvec.pages);
34789 +               for (i = 0; i < pagevec_count(&pvec); i++) {
34790 +                       if (PageUptodate(pvec.pages[i])) {
34791 +                               warning("edward-1205",
34792 +                                       "truncated page of index %lu is uptodate",
34793 +                                       pvec.pages[i]->index);
34794 +                               pagevec_release(&pvec); /* drop page refs */
34795 +                               return 0;
34796 +                       }
34797 +               }
34798 +               start += count;
34799 +               rest -= count;
34800 +               pagevec_release(&pvec);
34801 +       }
34802 +       return 1;
34803 +}
34804 +
34805 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
34806 +{
34807 +       int result;
34808 +       cloff_t raidx;
34809 +
34810 +       result = find_fake_appended(inode, &raidx);
34811 +       return !result && (aidx == raidx);
34812 +}
34813 +#endif
34814 +
34815 +static int
34816 +update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34817 +{
34818 +       return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34819 +               ? 0 : update_file_size(inode, key, update_sd));
34820 +}
34821 +
34822 +/* prune cryptcompress file in two steps (exclusive access should be acquired!)
34823 +   1) cut all disk clusters but the last one partially truncated,
34824 +   2) set zeroes and capture last partially truncated page cluster if the last
34825 +      one exists, otherwise truncate via prune fake cluster (just decrease i_size)
34826 +*/
34827 +static int
34828 +prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
34829 +                   cloff_t aidx)
34830 +{
34831 +       int result = 0;
34832 +       unsigned nr_zeroes;
34833 +       loff_t to_prune;
34834 +       loff_t old_size;
34835 +       cloff_t ridx;
34836 +
34837 +       hint_t *hint;
34838 +       lock_handle *lh;
34839 +       reiser4_slide_t win;
34840 +       reiser4_cluster_t clust;
34841 +
34842 +       assert("edward-1140", inode->i_size >= new_size);
34843 +       assert("edward-1141", schedulable());
34844 +       assert("edward-1142", crc_inode_ok(inode));
34845 +       assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34846 +
34847 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
34848 +       if (hint == NULL)
34849 +               return RETERR(-ENOMEM);
34850 +       hint_init_zero(hint);
34851 +       lh = &hint->lh;
34852 +
34853 +       reiser4_slide_init(&win);
34854 +       cluster_init_read(&clust, &win);
34855 +       clust.hint = hint;
34856 +
34857 +       /* rightmost completely truncated cluster */
34858 +       ridx = count_to_nrclust(new_size, inode);
34859 +
34860 +       assert("edward-1174", ridx <= aidx);
34861 +       old_size = inode->i_size;
34862 +       if (ridx != aidx) {
34863 +               result = cut_file_items(inode,
34864 +                                       clust_to_off(ridx, inode),
34865 +                                       update_sd,
34866 +                                       clust_to_off(aidx, inode),
34867 +                                       update_cryptcompress_size);
34868 +               if (result)
34869 +                       goto out;
34870 +       }
34871 +       if (!off_to_cloff(new_size, inode)) {
34872 +               /* no partially truncated clusters */
34873 +               assert("edward-1145", inode->i_size == new_size);
34874 +               goto finish;
34875 +       }
34876 +       assert("edward-1146", new_size < inode->i_size);
34877 +
34878 +       to_prune = inode->i_size - new_size;
34879 +
34880 +       /* partial truncate of leftmost cluster,
34881 +          first check if it is fake */
34882 +       result = find_real_disk_cluster(inode, &aidx, ridx);
34883 +       if (result)
34884 +               goto out;
34885 +       if (!aidx)
34886 +               /* yup, this is fake one */
34887 +               goto finish;
34888 +
34889 +       assert("edward-1148", aidx == ridx);
34890 +
34891 +       /* do partial truncate of the leftmost page cluster,
34892 +          then try to capture this one */
34893 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34894 +       if (result)
34895 +               goto out;
34896 +       nr_zeroes = (off_to_pgoff(new_size) ?
34897 +                    PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34898 +       set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34899 +       win.stat = HOLE_WINDOW;
34900 +
34901 +       assert("edward-1149", clust.index == ridx - 1);
34902 +
34903 +       result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
34904 +       if (result)
34905 +               goto out;
34906 +       assert("edward-1151",
34907 +              clust.dstat == PREP_DISK_CLUSTER ||
34908 +              clust.dstat == UNPR_DISK_CLUSTER);
34909 +
34910 +       assert("edward-1191", inode->i_size == new_size);
34911 +       assert("edward-1206", body_truncate_ok(inode, ridx));
34912 +      finish:
34913 +       /* drop all the pages that don't have jnodes (i.e. pages
34914 +          which can not be truncated by cut_file_items() because
34915 +          of holes represented by fake disk clusters) including
34916 +          the pages of partially truncated cluster which was
34917 +          released by prepare_cluster() */
34918 +       truncate_inode_pages(inode->i_mapping, new_size);
34919 +       INODE_SET_FIELD(inode, i_size, new_size);
34920 +      out:
34921 +       assert("edward-1334", !result || result == -ENOSPC);
34922 +       assert("edward-1209",
34923 +              pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
34924 +       done_lh(lh);
34925 +       kfree(hint);
34926 +       put_cluster_handle(&clust);
34927 +       return result;
34928 +}
34929 +
34930 +/* Prepare cryptcompress file for truncate:
34931 +   prune or append rightmost fake logical clusters (if any)
34932 +*/
34933 +static int
34934 +start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34935 +                   int update_sd)
34936 +{
34937 +       int result = 0;
34938 +       loff_t bytes;   /* difference of file sizes: int would truncate */
34939 +
34940 +       if (new_size > inode->i_size) {
34941 +               /* append */
34942 +               if (inode->i_size < clust_to_off(aidx, inode))
34943 +                       /* no fake bytes */
34944 +                       return 0;
34945 +               bytes = new_size - inode->i_size;
34946 +               INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34947 +       } else {
34948 +               /* prune */
34949 +               if (inode->i_size <= clust_to_off(aidx, inode))
34950 +                       /* no fake bytes */
34951 +                       return 0;
34952 +               bytes =
34953 +                   inode->i_size - max_count(new_size,
34954 +                                             clust_to_off(aidx, inode));
34955 +               if (!bytes)
34956 +                       return 0;
34957 +               INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34958 +               /* In the case of fake prune we need to drop page cluster.
34959 +                  There are only 2 cases for partially truncated page:
34960 +                  1. If it is dirty, therefore it is anonymous
34961 +                  (was dirtied via mmap), and will be captured
34962 +                  later via ->capture().
34963 +                  2. If it is clean, therefore it is filled by zeroes.
34964 +                  In both cases we don't need to make it dirty and
34965 +                  capture here.
34966 +                */
34967 +               truncate_inode_pages(inode->i_mapping, inode->i_size);
34968 +       }
34969 +       if (update_sd)
34970 +               result = update_sd_cryptcompress(inode);
34971 +       return result;
34972 +}
34973 +
34974 +/* This is called in setattr_cryptcompress when it is used to truncate,
34975 +   and in delete_cryptcompress */
34976 +static int cryptcompress_truncate(struct inode *inode, /* old size */
34977 +                                 loff_t new_size,      /* new size */
34978 +                                 int update_sd)
34979 +{
34980 +       int result;
34981 +       cloff_t aidx;
34982 +
34983 +       result = find_fake_appended(inode, &aidx);
34984 +       if (result)
34985 +               return result;
34986 +       assert("edward-1208",
34987 +              ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34988 +
34989 +       result = start_truncate_fake(inode, aidx, new_size, update_sd);
34990 +       if (result)
34991 +               return result;
34992 +       if (inode->i_size == new_size)
34993 +               /* nothing to truncate anymore */
34994 +               return 0;
34995 +       return (inode->i_size < new_size ?
34996 +               cryptcompress_append_hole(inode, new_size) :
34997 +               prune_cryptcompress(inode, new_size, update_sd, aidx));
34998 +}
34999 +
35000 +static void clear_moved_tag_cluster(struct address_space * mapping,
35001 +                                   reiser4_cluster_t * clust)
35002 +{
35003 +       int i;
35004 +       void * ret;
35005 +       write_lock_irq(&mapping->tree_lock);    /* tag clear modifies the tree */
35006 +       for (i = 0; i < clust->nr_pages; i++) {
35007 +               assert("edward-1438", clust->pages[i] != NULL);
35008 +               ret = radix_tree_tag_clear(&mapping->page_tree,
35009 +                                          clust->pages[i]->index,
35010 +                                          PAGECACHE_TAG_REISER4_MOVED);
35011 +               assert("edward-1439", ret == clust->pages[i]);
35012 +       }
35013 +       write_unlock_irq(&mapping->tree_lock);
35014 +}
35015 +
35016 +/* Capture an anonymous page cluster. (A page cluster is
35017 +   anonymous if it contains at least one anonymous page.) */
35018 +static int
35019 +capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
35020 +{
35021 +       int result;
35022 +
35023 +       assert("edward-1073", clust != NULL);
35024 +       assert("edward-1074", inode != NULL);
35025 +       assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
35026 +
35027 +       result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
35028 +       if (result)
35029 +               return result;
35030 +       set_cluster_pages_dirty(clust);
35031 +       clear_moved_tag_cluster(inode->i_mapping, clust);
35032 +
35033 +       result = try_capture_cluster(clust, inode);
35034 +       put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
35035 +       if (unlikely(result)) {
35036 +               /* set cleared tag back (tagging modifies the radix tree, hence
35037 +                  the write lock), so it can be captured again later */
35038 +               write_lock_irq(&inode->i_mapping->tree_lock);
35039 +               radix_tree_tag_set(&inode->i_mapping->page_tree,
35040 +                                  clust_to_pg(clust->index, inode),
35041 +                                  PAGECACHE_TAG_REISER4_MOVED);
35042 +               write_unlock_irq(&inode->i_mapping->tree_lock);
35043 +
35044 +               release_cluster_pages_and_jnode(clust);
35045 +       }
35046 +       return result;
35047 +}
35048 +
35049 +#define MAX_CLUSTERS_TO_CAPTURE(inode)    (1024 >> cluster_nrpages_shift(inode))
35050 +
35051 +/* read lock should be acquired */
35052 +static int
35053 +capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
35054 +                          int to_capture)
35055 +{
35056 +       int result = 0;
35057 +       int found;
35058 +       int progress = 0;
35059 +       struct page *page = NULL;
35060 +       hint_t *hint;
35061 +       lock_handle *lh;
35062 +       reiser4_cluster_t clust;
35063 +
35064 +       assert("edward-1127", mapping != NULL);
35065 +       assert("edward-1128", mapping->host != NULL);
35066 +       assert("edward-1440",  mapping->host->i_mapping == mapping);
35067 +
35068 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
35069 +       if (hint == NULL)
35070 +               return RETERR(-ENOMEM);
35071 +       hint_init_zero(hint);
35072 +       lh = &hint->lh;
35073 +
35074 +       cluster_init_read(&clust, NULL);
35075 +       clust.hint = hint;
35076 +
35077 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
35078 +       if (result)
35079 +               goto out;
35080 +
35081 +       while (to_capture > 0) {
35082 +               found =
35083 +                   find_get_pages_tag(mapping, index,
35084 +                                      PAGECACHE_TAG_REISER4_MOVED, 1, &page);
35085 +               if (!found) {
35086 +                       *index = (pgoff_t) - 1;
35087 +                       break;
35088 +               }
35089 +               assert("edward-1109", page != NULL);
35090 +
35091 +               move_cluster_forward(&clust, mapping->host, page->index,
35092 +                                    &progress);
35093 +               result = capture_page_cluster(&clust, mapping->host);
35094 +               page_cache_release(page);
35095 +               if (result)
35096 +                       break;
35097 +               to_capture--;
35098 +       }
35099 +       if (result) {
35100 +               warning("edward-1077",
35101 +                       "Cannot capture anon pages: result=%i (captured=%d)\n",
35102 +                       result,
35103 +                       ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
35104 +                       to_capture);
35105 +       } else {
35106 +               /* something had to be found */
35107 +               assert("edward-1078",
35108 +                      to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
35109 +               if (to_capture <= 0)
35110 +                       /* there may be left more pages */
35111 +                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
35112 +       }
35113 +      out:
35114 +       done_lh(lh);
35115 +       kfree(hint);
35116 +       put_cluster_handle(&clust);
35117 +       return result;
35118 +}
35119 +
35120 +/* Check mapping for existence of not captured dirty pages.
35121 +   This returns !0 if either page tree contains pages tagged
35122 +   PAGECACHE_TAG_REISER4_MOVED */
35123 +static int crc_inode_has_anon_pages(struct inode *inode)
35124 +{
35125 +       return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
35126 +}
35127 +
35128 +/* This is the implementation of the VFS ->writepages() method of struct
35129 +   address_space_operations: it captures anonymous page clusters of @mapping */
35130 +int
35131 +writepages_cryptcompress(struct address_space *mapping,
35132 +                        struct writeback_control *wbc)
35133 +{
35134 +       int result;
35135 +       int to_capture;
35136 +       pgoff_t nrpages;
35137 +       pgoff_t index = 0;
35138 +       cryptcompress_info_t *info;
35139 +       struct inode *inode;
35140 +
35141 +       inode = mapping->host;
35142 +       if (!crc_inode_has_anon_pages(inode)) {
35143 +               result = 0;
35144 +               goto end;
35145 +       }
35146 +
35147 +       info = cryptcompress_inode_data(inode);
35148 +       nrpages = count_to_nrpages(i_size_read(inode));
35149 +
35150 +       if (wbc->sync_mode != WB_SYNC_ALL)
35151 +               to_capture =
35152 +                   min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
35153 +       else
35154 +               to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
35155 +       do {
35156 +               reiser4_context *ctx;
35157 +
35158 +               if (is_in_reiser4_context()) {
35159 +                       /* FIXME-EDWARD: REMOVEME */
35160 +                       all_grabbed2free();
35161 +
35162 +                       /* It can be in the context of write system call from
35163 +                          balance_dirty_pages() */
35164 +                       if (down_read_trylock(&info->lock) == 0) {
35165 +                               result = RETERR(-EBUSY);
35166 +                               break;
35167 +                       }
35168 +               } else
35169 +                       down_read(&info->lock);
35170 +               ctx = init_context(inode->i_sb);
35171 +               if (IS_ERR(ctx)) {
35172 +                       up_read(&info->lock);   /* do not leak the lock taken above */
35173 +                       result = PTR_ERR(ctx);
35174 +                       break;
35175 +               }
35176 +               ctx->nobalance = 1;
35177 +
35178 +               assert("edward-1079",
35179 +                      lock_stack_isclean(get_current_lock_stack()));
35180 +
35181 +               LOCK_CNT_INC(inode_sem_r);
35182 +
35183 +               result =
35184 +                   capture_anonymous_clusters(inode->i_mapping, &index,
35185 +                                              to_capture);
35186 +
35187 +               up_read(&info->lock);
35188 +
35189 +               LOCK_CNT_DEC(inode_sem_r);
35190 +
35191 +               if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
35192 +                       reiser4_exit_context(ctx);
35193 +                       break;
35194 +               }
35195 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
35196 +               reiser4_exit_context(ctx);
35197 +       } while (result == 0 && index < nrpages);
35198 +
35199 +      end:
35200 +       if (is_in_reiser4_context()) {
35201 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
35202 +                       /* there are already pages to flush, flush them out, do
35203 +                          not delay until end of reiser4_sync_inodes */
35204 +                       writeout(inode->i_sb, wbc);
35205 +                       get_current_context()->nr_captured = 0;
35206 +               }
35207 +       }
35208 +       return result;
35209 +}
35210 +
35211 +/* plugin->u.file.mmap */
35212 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
35213 +{
35214 +       //return -ENOSYS;
35215 +       return generic_file_mmap(file, vma);
35216 +}
35217 +
35218 +/* plugin->u.file.release */
35219 +/* plugin->u.file.get_block */
35220 +
35221 +/* this is implementation of delete method of file plugin for
35222 +   cryptcompress objects */
35223 +int delete_cryptcompress(struct inode *inode)
35224 +{
35225 +       int result;
35226 +
35227 +       assert("edward-429", inode->i_nlink == 0);
35228 +
35229 +       if (inode->i_size) {
35230 +               result = cryptcompress_truncate(inode, 0, 0);
35231 +               if (result) {
35232 +                       warning("edward-430",
35233 +                               "cannot truncate cryptcompress file  %lli: %i",
35234 +                               (unsigned long long)get_inode_oid(inode),
35235 +                               result);
35236 +                       return result;
35237 +               }
35238 +       }
35239 +       /* and remove stat data */
35240 +       return delete_object_common(inode);
35241 +}
35242 +
35243 +/* plugin->u.file.setattr method
35244 +   see plugin.h for description */
35245 +int setattr_cryptcompress(struct dentry *dentry,       /* Object to change attributes */
35246 +                         struct iattr *attr /* change description */ )
35247 +{
35248 +       int result;
35249 +       struct inode *inode;
35250 +
35251 +       inode = dentry->d_inode;
35252 +       result = check_cryptcompress(inode);
35253 +       if (result)
35254 +               return result;
35255 +       if (attr->ia_valid & ATTR_SIZE) {
35256 +               /* EDWARD-FIXME-HANS: VS-FIXME-HANS:
35257 +                  Q: this case occurs when? truncate?
35258 +                  A: yes
35259 +
35260 +                  Q: If so, why isn't this code in truncate itself instead of here?
35261 +
35262 +                  A: because vfs calls fs's truncate after it has called truncate_inode_pages to get rid of pages
35263 +                  corresponding to part of file being truncated. In reiser4 it may cause existence of unallocated
35264 +                  extents which do not have jnodes. Flush code does not expect that. Solution of this problem is
35265 +                  straightforward. As vfs's truncate is implemented using setattr operation (common implementation of
35266 +                  which calls truncate_inode_pages and fs's truncate in case when size of file changes) - it seems
35267 +                  reasonable to have reiser4_setattr which will take care of removing pages, jnodes and extents
35268 +                  simultaneously in case of truncate.
35269 +                  Q: do you think implementing truncate using setattr is ugly,
35270 +                  and vfs needs improving, or is there some sense in which this is a good design?
35271 +
35272 +                  A: VS-FIXME-HANS:
35273 +                */
35274 +
35275 +               /* truncate does reservation itself and requires exclusive access obtained */
35276 +               if (inode->i_size != attr->ia_size) {
35277 +                       reiser4_context *ctx;
35278 +                       loff_t old_size;
35279 +                       cryptcompress_info_t *info =
35280 +                           cryptcompress_inode_data(inode);
35281 +
35282 +                       ctx = init_context(dentry->d_inode->i_sb);
35283 +                       if (IS_ERR(ctx))
35284 +                               return PTR_ERR(ctx);
35285 +
35286 +                       down_write(&info->lock);
35287 +                       LOCK_CNT_INC(inode_sem_w);
35288 +
35289 +                       inode_check_scale(inode, inode->i_size, attr->ia_size);
35290 +
35291 +                       old_size = inode->i_size;
35292 +
35293 +                       result =
35294 +                           cryptcompress_truncate(inode, attr->ia_size,
35295 +                                                  1 /* update stat data */ );
35296 +                       if (result) {
35297 +                               warning("edward-1192",
35298 +                                       "truncate_cryptcompress failed: oid %lli, "
35299 +                                       "old size %lld, new size %lld, retval %d",
35300 +                                       (unsigned long long)
35301 +                                       get_inode_oid(inode), old_size,
35302 +                                       attr->ia_size, result);
35303 +                       }
35304 +                       up_write(&info->lock);
35305 +                       LOCK_CNT_DEC(inode_sem_w);
35306 +                       context_set_commit_async(ctx);
35307 +                       reiser4_exit_context(ctx);
35308 +               } else
35309 +                       result = 0;
35310 +       } else
35311 +               result = setattr_common(dentry, attr);
35312 +       return result;
35313 +}
35314 +
35315 +/* sendfile_cryptcompress - sendfile of struct file_operations */
35316 +ssize_t
35317 +sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
35318 +                      read_actor_t actor, void *target)
35319 +{
35320 +       struct inode *object = file->f_dentry->d_inode;
35321 +       reiser4_context *ctx = init_context(object->i_sb);
35322 +       ssize_t ret;
35323 +
35324 +       if (IS_ERR(ctx))
35325 +               return PTR_ERR(ctx);
35326 +
35327 +       /*
35328 +        * generic_file_sendfile() may want to call update_atime, so space for
35329 +        * the stat data update is grabbed before the data is sent
35330 +        */
35331 +       ret = reiser4_grab_space(estimate_update_common(object),
35332 +                                BA_CAN_COMMIT);
35333 +       if (ret == 0) {
35334 +               cryptcompress_info_t *info;
35335 +
35336 +               /* the copy itself runs under the read side of info->lock */
35337 +               info = cryptcompress_inode_data(object);
35338 +               down_read(&info->lock);
35339 +               ret = generic_file_sendfile(file, ppos, count, actor, target);
35340 +               up_read(&info->lock);
35341 +       }
35342 +       reiser4_exit_context(ctx);
35343 +       return ret;
35344 +}
35345 +
35346 +/* release_cryptcompress - release of struct file_operations
35347 + * @inode: inode of released file
35348 + * @file: file to release; reiser4_free_file_fsdata() is called on it */
35349 +int release_cryptcompress(struct inode *inode, struct file *file)
35350 +{
35351 +       reiser4_context *ctx;
35352 +
35353 +       ctx = init_context(inode->i_sb);
35354 +       if (!IS_ERR(ctx)) {
35355 +               reiser4_free_file_fsdata(file);
35356 +               reiser4_exit_context(ctx);
35357 +               return 0;
35358 +       }
35359 +       return PTR_ERR(ctx);
35360 +}
35361 +
35362 +static int
35363 +save_len_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin)
35364 +{
35365 +       assert("edward-457", inode != NULL);
35366 +       assert("edward-458", plugin != NULL);
35367 +       assert("edward-459", plugin->h.id == CRC_FILE_PLUGIN_ID);
35368 +       return 0;
35369 +}
35370 +
35371 +static int
35372 +load_cryptcompress_plugin(struct inode *inode, reiser4_plugin * plugin,
35373 +                         char **area, int *len)
35374 +{
35375 +       assert("edward-455", inode != NULL);
35376 +       assert("edward-456", (reiser4_inode_data(inode)->pset != NULL));
35377 +
35378 +       plugin_set_file(&reiser4_inode_data(inode)->pset,
35379 +                       file_plugin_by_id(CRC_FILE_PLUGIN_ID));
35380 +       return 0;
35381 +}
35382 +
35383 +static int change_cryptcompress(struct inode *inode, reiser4_plugin * plugin)
35384 +{
35385 +       /* cannot change object plugin of already existing object */
35386 +       return RETERR(-EINVAL);
35387 +}
35388 +
35389 +struct reiser4_plugin_ops cryptcompress_plugin_ops = {
35390 +       .load = load_cryptcompress_plugin,
35391 +       .save_len = save_len_cryptcompress_plugin,
35392 +       .save = NULL,
35393 +       .alignment = 8,
35394 +       .change = change_cryptcompress
35395 +};
35396 +
35397 +/*
35398 +  Local variables:
35399 +  c-indentation-style: "K&R"
35400 +  mode-name: "LC"
35401 +  c-basic-offset: 8
35402 +  tab-width: 8
35403 +  fill-column: 80
35404 +  scroll-step: 1
35405 +  End:
35406 +*/
35407 diff --git a/fs/reiser4/plugin/file/cryptcompress.h b/fs/reiser4/plugin/file/cryptcompress.h
35408 new file mode 100644
35409 index 0000000..b573631
35410 --- /dev/null
35411 +++ b/fs/reiser4/plugin/file/cryptcompress.h
35412 @@ -0,0 +1,549 @@
35413 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
35414 +/* See http://www.namesys.com/cryptcompress_design.html */
35415 +
35416 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
35417 +#define __FS_REISER4_CRYPTCOMPRESS_H__
35418 +
35419 +#include "../compress/compress.h"
35420 +#include "../crypto/cipher.h"
35421 +
35422 +#include <linux/pagemap.h>
35423 +#include <linux/vmalloc.h>
35424 +
35425 +#define MIN_CLUSTER_SIZE PAGE_CACHE_SIZE
35426 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
35427 +#define MAX_CLUSTER_SHIFT 16
35428 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
35429 +#define DC_CHECKSUM_SIZE 4
35430 +
35431 +static inline loff_t min_count(loff_t a, loff_t b)
35432 +{
35433 +       return (a < b ? a : b);
35434 +}
35435 +
35436 +static inline loff_t max_count(loff_t a, loff_t b)
35437 +{
35438 +       return (a > b ? a : b);
35439 +}
35440 +
35441 +#if REISER4_DEBUG
35442 +static inline int cluster_shift_ok(int shift)
35443 +{
35444 +       return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
35445 +}
35446 +#endif
35447 +
35448 +typedef struct tfm_stream {
35449 +       __u8 *data;
35450 +       size_t size;
35451 +} tfm_stream_t;
35452 +
35453 +typedef enum {
35454 +       INPUT_STREAM,
35455 +       OUTPUT_STREAM,
35456 +       LAST_STREAM
35457 +} tfm_stream_id;
35458 +
35459 +typedef tfm_stream_t *tfm_unit[LAST_STREAM];
35460 +
35461 +static inline __u8 *ts_data(tfm_stream_t * stm)
35462 +{
35463 +       assert("edward-928", stm != NULL);
35464 +       return stm->data;
35465 +}
35466 +
35467 +static inline size_t ts_size(tfm_stream_t * stm)
35468 +{
35469 +       assert("edward-929", stm != NULL);
35470 +       return stm->size;
35471 +}
35472 +
35473 +static inline void set_ts_size(tfm_stream_t * stm, size_t size)
35474 +{
35475 +       assert("edward-930", stm != NULL);
35476 +
35477 +       stm->size = size;
35478 +}
35479 +
35480 +/* Allocate a zero-filled stream descriptor into the empty slot *@stm. */
35481 +static inline int alloc_ts(tfm_stream_t ** stm)
35482 +{
35483 +       assert("edward-931", stm);
35484 +       assert("edward-932", *stm == NULL);
35485 +       /* kzalloc() hands back zeroed memory, so no memset() is needed */
35486 +       *stm = kzalloc(sizeof **stm, GFP_KERNEL);
35487 +       if (*stm == NULL)
35488 +               return -ENOMEM;
35489 +       return 0;
35490 +}
35491 +
35492 +static inline void free_ts(tfm_stream_t * stm)
35493 +{
35494 +       assert("edward-933", !ts_data(stm));
35495 +       assert("edward-934", !ts_size(stm));
35496 +
35497 +       kfree(stm);
35498 +}
35499 +
35500 +static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
35501 +{
35502 +       assert("edward-935", !ts_data(stm));
35503 +       assert("edward-936", !ts_size(stm));
35504 +       assert("edward-937", size != 0);
35505 +
35506 +       stm->data = vmalloc(size);
35507 +       if (!stm->data)
35508 +               return -ENOMEM;
35509 +       set_ts_size(stm, size);
35510 +       return 0;
35511 +}
35512 +
35513 +/* Release the buffer of @stm (if any) and reset the descriptor to zero. */
35514 +static inline void free_ts_data(tfm_stream_t * stm)
35515 +{
35516 +       assert("edward-938", equi(ts_data(stm), ts_size(stm)));
35517 +
35518 +       vfree(ts_data(stm));    /* vfree(NULL) is a no-op, no guard needed */
35519 +       memset(stm, 0, sizeof *stm);
35520 +}
35521 +
35522 +/* Write modes for item conversion in flush convert phase */
35523 +typedef enum {
35524 +       CRC_APPEND_ITEM = 1,
35525 +       CRC_OVERWRITE_ITEM = 2,
35526 +       CRC_CUT_ITEM = 3
35527 +} crc_write_mode_t;
35528 +
35529 +typedef enum {
35530 +       PCL_UNKNOWN = 0,        /* invalid option */
35531 +       PCL_APPEND = 1,         /* append and/or overwrite */
35532 +       PCL_TRUNCATE = 2        /* truncate */
35533 +} page_cluster_op;
35534 +
35535 +/* Reiser4 file write/read transforms page cluster into disk cluster (and back)
35536 +   using crypto/compression transforms implemented by reiser4 transform plugins.
35537 +   Before each transform we allocate a pair of streams (tfm_unit) and assemble
35538 +   page cluster into the input one. After transform we split output stream into
35539 +   a set of items (disk cluster).
35540 +*/
35541 +typedef struct tfm_cluster {
35542 +       coa_set coa;
35543 +       tfm_unit tun;
35544 +       tfm_action act;
35545 +       int uptodate;
35546 +       int lsize;        /* size of the logical cluster */
35547 +       int len;          /* length of the transform stream */
35548 +} tfm_cluster_t;
35549 +
35550 +static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
35551 +{
35552 +       return tc->coa[id][act];
35553 +}
35554 +
35555 +static inline void
35556 +set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
35557 +{
35558 +       tc->coa[id][act] = coa;
35559 +}
35560 +
35561 +static inline int
35562 +alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35563 +{
35564 +       coa_t coa;
35565 +
35566 +       coa = cplug->alloc(tc->act);
35567 +       if (IS_ERR(coa))
35568 +               return PTR_ERR(coa);
35569 +       set_coa(tc, cplug->h.id, tc->act, coa);
35570 +       return 0;
35571 +}
35572 +
35573 +static inline int grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35574 +{
35575 +       if (cplug->alloc == NULL || get_coa(tc, cplug->h.id, tc->act))
35576 +               return 0;       /* no allocator, or coa is already set */
35577 +       return alloc_coa(tc, cplug);
35578 +}
35579 +
35580 +/* Release each coa held in @tc via its plugin's ->free(), zero the slot. */
35581 +static inline void free_coa_set(tfm_cluster_t * tc)
35582 +{
35583 +       tfm_action act;
35584 +       reiser4_compression_id id;
35585 +       compression_plugin *cplug;
35586 +
35587 +       assert("edward-810", tc != NULL);
35588 +
35589 +       for (act = 0; act < LAST_TFM; act++)
35590 +               for (id = 0; id < LAST_COMPRESSION_ID; id++) {
35591 +                       if (get_coa(tc, id, act)) {
35592 +                               cplug = compression_plugin_by_id(id);
35593 +                               assert("edward-812", cplug->free != NULL);
35594 +                               cplug->free(get_coa(tc, id, act), act);
35595 +                               set_coa(tc, id, act, 0);
35596 +                       }
35597 +               }
35598 +}
35599 +
35600 +static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35601 +{
35602 +       return tc->tun[id];
35603 +}
35604 +
35605 +static inline void
35606 +set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
35607 +{
35608 +       tc->tun[id] = ts;
35609 +}
35610 +
35611 +static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
35612 +{
35613 +       return ts_data(tfm_stream(tc, id));
35614 +}
35615 +
35616 +static inline void
35617 +set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
35618 +{
35619 +       tfm_stream(tc, id)->data = data;
35620 +}
35621 +
35622 +static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
35623 +{
35624 +       return ts_size(tfm_stream(tc, id));
35625 +}
35626 +
35627 +static inline void
35628 +set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
35629 +{
35630 +       tfm_stream(tc, id)->size = size;
35631 +}
35632 +
35633 +/* Install a fresh stream descriptor with a @size-byte buffer in slot @id. */
35634 +static inline int
35635 +alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35636 +{
35637 +       assert("edward-939", tc != NULL);
35638 +       assert("edward-940", !tfm_stream(tc, id));
35639 +       /* kzalloc() zeroes the descriptor, replacing kmalloc() + memset() */
35640 +       tc->tun[id] = kzalloc(sizeof(tfm_stream_t), GFP_KERNEL);
35641 +       if (!tc->tun[id])
35642 +               return -ENOMEM;
35643 +       return alloc_ts_data(tfm_stream(tc, id), size);
35644 +}
35645 +
35646 +static inline int
35647 +realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35648 +{
35649 +       assert("edward-941", tfm_stream_size(tc, id) < size);
35650 +       free_ts_data(tfm_stream(tc, id));
35651 +       return alloc_ts_data(tfm_stream(tc, id), size);
35652 +}
35653 +
35654 +static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35655 +{
35656 +       free_ts_data(tfm_stream(tc, id));
35657 +       free_ts(tfm_stream(tc, id));
35658 +       set_tfm_stream(tc, id, 0);
35659 +}
35660 +
35661 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
35662 +{
35663 +       return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
35664 +}
35665 +
35666 +/* Free every stream that is currently attached to @tc. */
35667 +static inline void free_tfm_unit(tfm_cluster_t * tc)
35668 +{
35669 +       tfm_stream_id slot;
35670 +
35671 +       for (slot = 0; slot < LAST_STREAM; slot++)
35672 +               if (tfm_stream(tc, slot) != NULL)
35673 +                       free_tfm_stream(tc, slot);
35674 +}
35675 +
35676 +static inline void put_tfm_cluster(tfm_cluster_t * tc)
35677 +{
35678 +       assert("edward-942", tc != NULL);
35679 +       free_coa_set(tc);
35680 +       free_tfm_unit(tc);
35681 +}
35682 +
35683 +static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
35684 +{
35685 +       assert("edward-943", tc != NULL);
35686 +       assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
35687 +       return (tc->uptodate == 1);
35688 +}
35689 +
35690 +/* Raise the uptodate flag of the transform cluster @tc. */
35691 +static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
35692 +{
35693 +       assert("edward-945", tc != NULL);
35694 +       assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
35695 +       tc->uptodate = 1;
35696 +}
35697 +
35698 +/* Drop the uptodate flag of the transform cluster @tc. */
35699 +static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
35700 +{
35701 +       assert("edward-947", tc != NULL);
35702 +       assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
35703 +       tc->uptodate = 0;
35704 +}
35705 +
35706 +static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
35707 +{
35708 +       return (tfm_stream(tc, id) &&
35709 +               tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
35710 +}
35711 +
35712 +/* 1 iff each stream of @tc has a descriptor, a buffer and a non-zero size */
35713 +static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
35714 +{
35715 +       int id = 0;
35716 +       while (id < LAST_STREAM && tfm_stream_is_set(tc, id))
35717 +               id++;
35718 +       return id == LAST_STREAM;
35719 +}
35720 +
35721 +static inline void alternate_streams(tfm_cluster_t * tc)
35722 +{
35723 +       tfm_stream_t *was_output = tfm_stream(tc, OUTPUT_STREAM);
35724 +       /* exchange the two stream descriptors of the transform unit */
35725 +       set_tfm_stream(tc, OUTPUT_STREAM, tfm_stream(tc, INPUT_STREAM));
35726 +       set_tfm_stream(tc, INPUT_STREAM, was_output);
35727 +}
35728 +
35729 +/* a kind of data that we can write to the window */
35730 +typedef enum {
35731 +       DATA_WINDOW,            /* the data we copy from user space */
35732 +       HOLE_WINDOW             /* zeroes if we write hole */
35733 +} window_stat;
35734 +
35735 +/* Sliding window of cluster size which should be set to the appropriate
35736 +   position (defined by cluster index) in a file before page cluster
35737 +   modification by file_write. Then we translate file size, offset to write
35738 +   from, number of bytes to write, etc. to the following configuration needed
35739 +   to estimate number of pages to read before write, etc.
35740 +*/
35741 +typedef struct reiser4_slide {
35742 +       unsigned off;           /* offset we start to write/truncate from */
35743 +       unsigned count;         /* number of bytes (zeroes) to write/truncate */
35744 +       unsigned delta;         /* number of bytes to append to the hole */
35745 +       window_stat stat;       /* a kind of data to write to the window */
35746 +} reiser4_slide_t;
35747 +
35748 +/* The following is a set of possible disk cluster states */
35749 +typedef enum {
35750 +       INVAL_DISK_CLUSTER,     /* unknown state */
35751 +       PREP_DISK_CLUSTER,      /* disk cluster got converted by flush
35752 +                                  at least 1 time */
35753 +       UNPR_DISK_CLUSTER,      /* disk cluster just created and should be
35754 +                                  converted by flush */
35755 +       FAKE_DISK_CLUSTER       /* disk cluster exists neither in memory
35756 +                                  nor on disk */
35757 +} disk_cluster_stat;
35758 +
35759 +/*
35760 +   While implementing all transforms (from page to disk cluster, and back)
35761 +   reiser4 cluster manager fills the following structure encapsulating pointers
35762 +   to all the clusters for the same index including the sliding window above
35763 +*/
35764 +typedef struct reiser4_cluster {
35765 +       tfm_cluster_t tc;       /* transform cluster */
35766 +       int nr_pages;           /* number of pages */
35767 +       struct page **pages;    /* page cluster */
35768 +       page_cluster_op op;     /* page cluster operation */
35769 +       struct file *file;
35770 +       hint_t *hint;           /* disk cluster item for traversal */
35771 +       disk_cluster_stat dstat;        /* state of the current disk cluster */
35772 +       cloff_t index;          /* offset in the units of cluster size */
35773 +       reiser4_slide_t *win;   /* sliding window of cluster size */
35774 +       int reserved;           /* this indicates that space for disk
35775 +                                  cluster modification is reserved */
35776 +#if REISER4_DEBUG
35777 +       reiser4_context *ctx;
35778 +       int reserved_prepped;
35779 +       int reserved_unprepped;
35780 +#endif
35781 +
35782 +} reiser4_cluster_t;
35783 +
35784 +static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
35785 +{
35786 +       return tfm_stream_data(&clust->tc, INPUT_STREAM);
35787 +}
35788 +
35789 +static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
35790 +{
35791 +       return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35792 +}
35793 +
35794 +static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35795 +{
35796 +       assert("edward-1057", clust->pages != NULL);
35797 +       memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35798 +       return 0;
35799 +}
35800 +
35801 +/* Allocate a zeroed array of @nrpages page pointers for @clust. */
35802 +static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35803 +{
35804 +       assert("edward-949", clust != NULL);
35805 +       assert("edward-1362", clust->pages == NULL);
35806 +       assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35807 +
35808 +       /* kcalloc() zero-fills and guards the size multiplication */
35809 +       clust->pages = kcalloc(nrpages, sizeof(*clust->pages), GFP_KERNEL);
35810 +       if (!clust->pages)
35811 +               return RETERR(-ENOMEM);
35812 +       return 0;
35813 +}
35814 +
35815 +static inline void free_cluster_pgset(reiser4_cluster_t * clust)
35816 +{
35817 +       assert("edward-951", clust->pages != NULL);
35818 +       kfree(clust->pages);
35819 +       clust->pages = NULL;
35820 +}
35821 +
35822 +static inline void put_cluster_handle(reiser4_cluster_t * clust)
35823 +{
35824 +       assert("edward-435", clust != NULL);
35825 +
35826 +       put_tfm_cluster(&clust->tc);
35827 +       if (clust->pages)
35828 +               free_cluster_pgset(clust);
35829 +       memset(clust, 0, sizeof *clust);
35830 +}
35831 +
35832 +static inline void inc_keyload_count(crypto_stat_t * data)
35833 +{
35834 +       assert("edward-1410", data != NULL);
35835 +       data->keyload_count++;
35836 +}
35837 +
35838 +static inline void dec_keyload_count(crypto_stat_t * data)
35839 +{
35840 +       assert("edward-1411", data != NULL);
35841 +       assert("edward-1412", data->keyload_count > 0);
35842 +       data->keyload_count--;
35843 +}
35844 +
35845 +/* cryptcompress specific part of reiser4_inode */
35846 +typedef struct cryptcompress_info {
35847 +       struct rw_semaphore lock;
35848 +       crypto_stat_t *crypt;
35849 +       int compress_toggle;      /* current status of compressibility
35850 +                                    is set by compression mode plugin */
35851 +#if REISER4_DEBUG
35852 +       int pgcount;              /* number of captured pages */
35853 +#endif
35854 +} cryptcompress_info_t;
35855 +
35856 +
35857 +static inline void toggle_compression (cryptcompress_info_t * info, int val)
35858 +{
35859 +       info->compress_toggle = val;
35860 +}
35861 +
35862 +static inline int compression_is_on (cryptcompress_info_t * info)
35863 +{
35864 +       return info->compress_toggle;
35865 +}
35866 +
35867 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
35868 +int equal_to_rdk(znode *, const reiser4_key *);
35869 +int goto_right_neighbor(coord_t *, lock_handle *);
35870 +int load_file_hint(struct file *, hint_t *);
35871 +void save_file_hint(struct file *, const hint_t *);
35872 +void hint_init_zero(hint_t *);
35873 +int crc_inode_ok(struct inode *inode);
35874 +extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *, int);
35875 +extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
35876 +                            struct page * page);
35877 +extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
35878 +                                         struct inode * inode);
35879 +int bind_cryptcompress(struct inode *child, struct inode *parent);
35880 +void destroy_inode_cryptcompress(struct inode * inode);
35881 +crypto_stat_t * inode_crypto_stat (struct inode * inode);
35882 +void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
35883 +                               int (*can_inherit)(struct inode * child,
35884 +                                                  struct inode * parent));
35885 +void attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
35886 +void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
35887 +crypto_stat_t * alloc_crypto_stat (struct inode * inode);
35888 +
35889 +
35890 +static inline reiser4_tfma_t *
35891 +info_get_tfma (crypto_stat_t * info, reiser4_tfm id)
35892 +{
35893 +       return &info->tfma[id];
35894 +}
35895 +
35896 +static inline struct crypto_tfm *
35897 +info_get_tfm (crypto_stat_t * info, reiser4_tfm id)
35898 +{
35899 +       return info_get_tfma(info, id)->tfm;
35900 +}
35901 +
35902 +static inline void
35903 +info_set_tfm (crypto_stat_t * info, reiser4_tfm id, struct crypto_tfm * tfm)
35904 +{
35905 +       info_get_tfma(info, id)->tfm = tfm;
35906 +}
35907 +
35908 +static inline struct crypto_tfm *
35909 +info_cipher_tfm (crypto_stat_t * info)
35910 +{
35911 +       return info_get_tfm(info, CIPHER_TFM);
35912 +}
35913 +
35914 +static inline struct crypto_tfm *
35915 +info_digest_tfm (crypto_stat_t * info)
35916 +{
35917 +       return info_get_tfm(info, DIGEST_TFM);
35918 +}
35919 +
35920 +static inline cipher_plugin *
35921 +info_cipher_plugin (crypto_stat_t * info)
35922 +{
35923 +       return &info_get_tfma(info, CIPHER_TFM)->plug->cipher;
35924 +}
35925 +
35926 +static inline digest_plugin *
35927 +info_digest_plugin (crypto_stat_t * info)
35928 +{
35929 +       return &info_get_tfma(info, DIGEST_TFM)->plug->digest;
35930 +}
35931 +
35932 +static inline void
35933 +info_set_plugin(crypto_stat_t * info, reiser4_tfm id, reiser4_plugin * plugin)
35934 +{
35935 +       info_get_tfma(info, id)->plug = plugin;
35936 +}
35937 +
35938 +static inline void
35939 +info_set_cipher_plugin(crypto_stat_t * info, cipher_plugin * cplug)
35940 +{
35941 +       info_set_plugin(info, CIPHER_TFM, cipher_plugin_to_plugin(cplug));
35942 +}
35943 +
35944 +static inline void
35945 +info_set_digest_plugin(crypto_stat_t * info, digest_plugin * plug)
35946 +{
35947 +       info_set_plugin(info, DIGEST_TFM, digest_plugin_to_plugin(plug));
35948 +}
35949 +
35950 +#endif                         /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35951 +
35952 +/* Make Linus happy.
35953 +   Local variables:
35954 +   c-indentation-style: "K&R"
35955 +   mode-name: "LC"
35956 +   c-basic-offset: 8
35957 +   tab-width: 8
35958 +   fill-column: 120
35959 +   scroll-step: 1
35960 +   End:
35961 +*/
35962 diff --git a/fs/reiser4/plugin/file/file.c b/fs/reiser4/plugin/file/file.c
35963 new file mode 100644
35964 index 0000000..0efa698
35965 --- /dev/null
35966 +++ b/fs/reiser4/plugin/file/file.c
35967 @@ -0,0 +1,2713 @@
35968 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35969 + * reiser4/README */
35970 +
35971 +/*
35972 + * this file contains implementations of inode/file/address_space/file plugin
35973 + * operations specific for "unix file plugin" (plugin id is
35974 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35975 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
35976 + * no items but stat data)
35977 + */
35978 +
35979 +#include "../../inode.h"
35980 +#include "../../super.h"
35981 +#include "../../tree_walk.h"
35982 +#include "../../carry.h"
35983 +#include "../../page_cache.h"
35984 +#include "../../ioctl.h"
35985 +#include "../object.h"
35986 +#include "../../safe_link.h"
35987 +
35988 +#include <linux/writeback.h>
35989 +#include <linux/pagevec.h>
35990 +#include <linux/syscalls.h>
35991 +
35992 +
35993 +static int unpack(struct file *file, struct inode *inode, int forever);
35994 +
35995 +/* get unix file plugin specific portion of inode */
35996 +unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35997 +{
35998 +       return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35999 +}
36000 +
36001 +/**
36002 + * equal_to_rdk - compare key and znode's right delimiting key
36003 + * @node: node whose right delimiting key to compare with @key
36004 + * @key: key to compare with @node's right delimiting key
36005 + *
36006 + * Returns true if @key is equal to right delimiting key of @node.
36007 + */
36008 +int equal_to_rdk(znode *node, const reiser4_key *key)
36009 +{
36010 +       int result;
36011 +
36012 +       read_lock_dk(znode_get_tree(node));
36013 +       result = keyeq(key, znode_get_rd_key(node));
36014 +       read_unlock_dk(znode_get_tree(node));
36015 +       return result;
36016 +}
36017 +
36018 +#if REISER4_DEBUG
36019 +
36020 +/**
36021 + * equal_to_ldk - compare key and znode's left delimiting key
36022 + * @node: node whose left delimiting key to compare with @key
36023 + * @key: key to compare with @node's left delimiting key
36024 + *
36025 + * Returns true if @key is equal to left delimiting key of @node.
36026 + */
36027 +int equal_to_ldk(znode *node, const reiser4_key *key)
36028 +{
36029 +       int result;
36030 +
36031 +       read_lock_dk(znode_get_tree(node));
36032 +       result = keyeq(key, znode_get_ld_key(node));
36033 +       read_unlock_dk(znode_get_tree(node));
36034 +       return result;
36035 +}
36036 +
36037 +/**
36038 + * check_coord - check whether coord corresponds to key
36039 + * @coord: coord to check
36040 + * @key: key @coord has to correspond to
36041 + *
36042 + * Returns true if @coord is set as if it was set as result of lookup with @key
36043 + * in coord->node.
36044 + */
36045 +static int check_coord(const coord_t *coord, const reiser4_key *key)
36046 +{
36047 +       coord_t twin;
36048 +
36049 +       node_plugin_by_node(coord->node)->lookup(coord->node, key,
36050 +                                                FIND_MAX_NOT_MORE_THAN, &twin);
36051 +       return coords_equal(coord, &twin);
36052 +}
36053 +
36054 +#endif /* REISER4_DEBUG */
36055 +
36056 +/**
36057 + * init_uf_coord - initialize extended coord
36058 + * @uf_coord: extended coord to initialize
36059 + * @lh: lock handle to attach to @uf_coord, initialized here as well
36060 + *
36061 + * Resets the coord, zeroes the extension and marks @uf_coord as not valid.
36062 + */
36063 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
36064 +{
36065 +       coord_init_zero(&uf_coord->coord);
36066 +       coord_clear_iplug(&uf_coord->coord);
36067 +       uf_coord->lh = lh;
36068 +       init_lh(lh);
36069 +       memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
36070 +       uf_coord->valid = 0;
36071 +}
36072 +
36073 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
36074 +{
36075 +       assert("vs-1333", uf_coord->valid == 0);
36076 +
36077 +       if (coord_is_between_items(&uf_coord->coord))
36078 +               return;
36079 +
36080 +       assert("vs-1348",
36081 +              item_plugin_by_coord(&uf_coord->coord)->s.file.
36082 +              init_coord_extension);
36083 +
36084 +       item_body_by_coord(&uf_coord->coord);
36085 +       item_plugin_by_coord(&uf_coord->coord)->s.file.
36086 +           init_coord_extension(uf_coord, offset);
36087 +}
36088 +
36089 +/**
36090 + * goto_right_neighbor - lock right neighbor, drop current node lock
36091 + * @coord: coord in current node; on success set up in the right neighbor
36092 + * @lh: lock handle on current node; on success re-pointed at the neighbor
36093 + *
36094 + * Obtain lock on right neighbor and drop lock on current node.
36095 + */
36096 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
36097 +{
36098 +       int result;
36099 +       lock_handle lh_right;
36100 +
36101 +       assert("vs-1100", znode_is_locked(coord->node));
36102 +
36103 +       init_lh(&lh_right);
36104 +       result = reiser4_get_right_neighbor(&lh_right, coord->node,
36105 +                                           znode_is_wlocked(coord->node) ?
36106 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
36107 +                                           GN_CAN_USE_UPPER_LEVELS);
36108 +       if (result) {
36109 +               done_lh(&lh_right);
36110 +               return result;
36111 +       }
36112 +
36113 +       /*
36114 +        * we hold two longterm locks on neighboring nodes. Unlock left of
36115 +        * them
36116 +        */
36117 +       done_lh(lh);
36118 +
36119 +       coord_init_first_unit_nocheck(coord, lh_right.node);
36120 +       move_lh(lh, &lh_right);
36121 +
36122 +       return 0;
36123 +
36124 +}
36125 +
36126 +/**
36127 + * set_file_state
36128 + * @uf_info: unix file info whose ->container is set when still unknown
36129 + * @cbk_result: lookup result; nothing is done if cbk_errored() is true for it
36130 + * @level: tree level of the lookup result, LEAF_LEVEL or TWIG_LEVEL
36131 + *
36132 + * This is to be used by find_file_item and in find_file_state to
36133 + * determine real state of file
36134 + */
36135 +static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
36136 +                          tree_level level)
36137 +{
36138 +       if (cbk_errored(cbk_result))
36139 +               /* error happened in find_file_item */
36140 +               return;
36141 +
36142 +       assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
36143 +
36144 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
36145 +               /*
36146 +                * container is unknown, therefore conversion can not be in
36147 +                * progress
36148 +                */
36149 +               assert("", !inode_get_flag(unix_file_info_to_inode(uf_info),
36150 +                                          REISER4_PART_IN_CONV));
36151 +               if (cbk_result == CBK_COORD_NOTFOUND)
36152 +                       uf_info->container = UF_CONTAINER_EMPTY;
36153 +               else if (level == LEAF_LEVEL)
36154 +                       uf_info->container = UF_CONTAINER_TAILS;
36155 +               else
36156 +                       uf_info->container = UF_CONTAINER_EXTENTS;
36157 +       } else {
36158 +               /*
36159 +                * file state is known, check whether it is set correctly if
36160 +                * file is not being tail converted
36161 +                */
36162 +               if (!inode_get_flag(unix_file_info_to_inode(uf_info),
36163 +                                   REISER4_PART_IN_CONV)) {
36164 +                       assert("vs-1162",
36165 +                              ergo(level == LEAF_LEVEL &&
36166 +                                   cbk_result == CBK_COORD_FOUND,
36167 +                                   uf_info->container == UF_CONTAINER_TAILS));
36168 +                       assert("vs-1165",
36169 +                              ergo(level == TWIG_LEVEL &&
36170 +                                   cbk_result == CBK_COORD_FOUND,
36171 +                                   uf_info->container == UF_CONTAINER_EXTENTS));
36172 +               }
36173 +       }
36174 +}
36175 +
36176 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
36177 +                         const reiser4_key *key, znode_lock_mode lock_mode,
36178 +                         struct inode *inode)
36179 +{
36180 +       return object_lookup(inode, key, coord, lh, lock_mode,
36181 +                            FIND_MAX_NOT_MORE_THAN,
36182 +                            TWIG_LEVEL, LEAF_LEVEL,
36183 +                            (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
36184 +                            (CBK_UNIQUE | CBK_FOR_INSERT),
36185 +                            NULL /* ra_info */ );
36186 +}
36187 +
36188 +/**
36189 + * find_file_item - look for file item in the tree
36190 + * @hint: provides coordinate, lock handle, seal
36191 + * @key: key for search
36192 + * @lock_mode: mode of lock to put on returned node
36193 + * @inode: file being searched; its container state is updated from the result
36194 + *
36195 + * Finds the position in the tree corresponding to @key. It first tries to
36196 + * validate @hint's seal; if that fails it falls back to a full lookup done by
36197 + * find_file_item_nohint() and returns that lookup's result.
36198 + */
36199 +int find_file_item(hint_t *hint, const reiser4_key *key,
36200 +                  znode_lock_mode lock_mode,
36201 +                  struct inode *inode)
36202 +{
36203 +       int result;
36204 +       coord_t *coord;
36205 +       lock_handle *lh;
36206 +
36207 +       assert("nikita-3030", schedulable());
36208 +       assert("vs-1707", hint != NULL);
36209 +       assert("vs-47", inode != NULL);
36210 +
36211 +       coord = &hint->ext_coord.coord;
36212 +       lh = hint->ext_coord.lh;
36213 +       init_lh(lh);
36214 +
36215 +       result = hint_validate(hint, key, 1 /* check key */, lock_mode);
36216 +       if (!result) {  /* hint validated: reuse its coord instead of searching */
36217 +               if (coord->between == AFTER_UNIT &&
36218 +                   equal_to_rdk(coord->node, key)) {
36219 +                       result = goto_right_neighbor(coord, lh);
36220 +                       if (result == -E_NO_NEIGHBOR)   /* a missing right neighbor is reported as -EIO */
36221 +                               return RETERR(-EIO);
36222 +                       if (result)
36223 +                               return result;
36224 +                       assert("vs-1152", equal_to_ldk(coord->node, key));
36225 +                       /*
36226 +                        * we moved to different node. Invalidate coord
36227 +                        * extension, zload is necessary to init it again
36228 +                        */
36229 +                       hint->ext_coord.valid = 0;
36230 +               }
36231 +
36232 +               set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
36233 +                              znode_get_level(coord->node));
36234 +
36235 +               return CBK_COORD_FOUND;
36236 +       }
36237 +
36238 +       coord_init_zero(coord); /* hint unusable: start a fresh lookup from a zeroed coord */
36239 +       result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
36240 +       set_file_state(unix_file_inode_data(inode), result,
36241 +                      znode_get_level(coord->node));
36242 +
36243 +       /* FIXME: we might already have coord extension initialized */
36244 +       hint->ext_coord.valid = 0;
36245 +       return result;
36246 +}
36247 +
36248 +/* plugin->u.file.write_flow = NULL
36249 +   plugin->u.file.read_flow = NULL */
36250 +
36251 +void hint_init_zero(hint_t * hint)
36252 +{      /* zero @hint, then point its coord extension at the embedded lock handle */
36253 +       memset(hint, 0, sizeof(*hint));
36254 +       init_lh(&hint->lh);
36255 +       hint->ext_coord.lh = &hint->lh;
36256 +}
36257 +
36258 +static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
36259 +{      /* resolve uf_info->container if it is still UF_CONTAINER_UNKNOWN */
36260 +       int result;
36261 +       reiser4_key key;
36262 +       coord_t coord;  /* NOTE(review): not initialized before the lookup below - confirm it is always filled in */
36263 +       lock_handle lh;
36264 +
36265 +       assert("vs-1628", ea_obtained(uf_info));
36266 +
36267 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
36268 +               key_by_inode_and_offset_common(inode, 0, &key); /* key of file offset 0 */
36269 +               init_lh(&lh);
36270 +               result = find_file_item_nohint(&coord, &lh, &key,
36271 +                                              ZNODE_READ_LOCK, inode);
36272 +               set_file_state(uf_info, result, znode_get_level(coord.node));
36273 +               done_lh(&lh);
36274 +               if (!cbk_errored(result))       /* lookup results that are not errors are reported as 0 */
36275 +                       result = 0;
36276 +       } else
36277 +               result = 0;
36278 +       assert("vs-1074",
36279 +              ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
36280 +       txn_restart_current();
36281 +       return result;
36282 +}
36283 +
36284 +/* estimate and reserve space needed to truncate a page which gets partially truncated: one block for the page itself,
36285 +   a stat data update (estimate_one_insert_into_item) and one item insertion (estimate_one_insert_into_item), which
36286 +   may happen if the page corresponds to a hole extent and an unallocated one has to be created */
36287 +static int reserve_partial_page(reiser4_tree * tree)
36288 +{
36289 +       grab_space_enable();
36290 +       return reiser4_grab_reserved(reiser4_get_current_sb(),
36291 +                                    1 +
36292 +                                    2 * estimate_one_insert_into_item(tree),
36293 +                                    BA_CAN_COMMIT);
36294 +}
36295 +
36296 +/* estimate and reserve space needed to cut one item and update one stat data; returns reiser4_grab_reserved() result */
36297 +static int reserve_cut_iteration(reiser4_tree * tree)
36298 +{
36299 +       __u64 estimate = estimate_one_item_removal(tree)
36300 +           + estimate_one_insert_into_item(tree);      /* item removal plus stat data update */
36301 +
36302 +       assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
36303 +
36304 +       grab_space_enable();
36305 +       /* We need to double our estimate now that we can delete more than one
36306 +          node. */
36307 +       return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
36308 +                                    BA_CAN_COMMIT);
36309 +}
36310 +
36311 +int update_file_size(struct inode *inode, reiser4_key * key, int update_sd)
36312 +{      /* set i_size to the offset of @key; when @update_sd is set, also stamp times and write stat data */
36313 +       int result = 0;
36314 +
36315 +       INODE_SET_FIELD(inode, i_size, get_key_offset(key));
36316 +       if (update_sd) {
36317 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
36318 +               result = reiser4_update_sd(inode);
36319 +       }
36320 +       return result;  /* 0 unless reiser4_update_sd() was called and failed */
36321 +}
36322 +
36323 +/* cut file items one by one starting from the last one until new file size (@new_size) is reached. Reserve space
36324 +   and call @update_actor (which updates file size / stat data) after every single cut from the tree */
36325 +int
36326 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
36327 +              loff_t cur_size, int (*update_actor) (struct inode *,
36328 +                                                    reiser4_key *, int))
36329 +{
36330 +       reiser4_key from_key, to_key;
36331 +       reiser4_key smallest_removed;
36332 +       file_plugin *fplug = inode_file_plugin(inode);
36333 +       int result;
36334 +       int progress = 0;
36335 +
36336 +       assert("vs-1248",
36337 +              fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
36338 +              fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
36339 +
36340 +       fplug->key_by_inode(inode, new_size, &from_key);
36341 +       to_key = from_key;
36342 +       set_key_offset(&to_key, cur_size - 1 /*get_key_offset(max_key()) */ );
36343 +       /* this loop normally runs just once */
36344 +       while (1) {
36345 +               result = reserve_cut_iteration(tree_by_inode(inode));
36346 +               if (result)
36347 +                       break;
36348 +
36349 +               result = cut_tree_object(current_tree, &from_key, &to_key,
36350 +                                        &smallest_removed, inode, 1,
36351 +                                        &progress);
36352 +               if (result == -E_REPEAT) {
36353 +                       /* -E_REPEAT is a signal to interrupt a long file truncation process */
36354 +                       if (progress) {
36355 +                               result =
36356 +                                   update_actor(inode, &smallest_removed,
36357 +                                                update_sd);
36358 +                               if (result)
36359 +                                       break;
36360 +                       }
36361 +
36362 +                       /* the below does up(sbinfo->delete_sema). Do not get fooled */
36363 +                       reiser4_release_reserved(inode->i_sb);
36364 +
36365 +                       /* cut_tree_object() was interrupted probably because
36366 +                        * current atom requires commit, we have to release
36367 +                        * transaction handle to allow atom commit. */
36368 +                       txn_restart_current();
36369 +                       continue;
36370 +               }
36371 +               if (result
36372 +                   && !(result == CBK_COORD_NOTFOUND && new_size == 0
36373 +                        && inode->i_size == 0))        /* "not found" while emptying an empty file is tolerated */
36374 +                       break;
36375 +
36376 +               set_key_offset(&smallest_removed, new_size);
36377 +               /* Final sd update after the file gets its correct size */
36378 +               result = update_actor(inode, &smallest_removed, update_sd);
36379 +               break;
36380 +       }
36381 +
36382 +       /* the below does up(sbinfo->delete_sema). Do not get fooled */
36383 +       reiser4_release_reserved(inode->i_sb);
36384 +
36385 +       return result;
36386 +}
36387 +
36388 +int find_or_create_extent(struct page *page);
36389 +
36390 +static int filler(void *vp, struct page *page)
36391 +{      /* page filler passed to read_cache_page(); forwards to readpage_unix_file_nolock() */
36392 +       return readpage_unix_file_nolock(vp, page);
36393 +}
36394 +
36395 +/* part of truncate_file_body: it is called when truncate is used to make file
36396 +   shorter. Cuts the items past @new_size, then zeroes the tail of the last page */
36397 +static int shorten_file(struct inode *inode, loff_t new_size)
36398 +{
36399 +       int result;
36400 +       struct page *page;
36401 +       int padd_from;  /* offset within the last page from which it is zeroed */
36402 +       unsigned long index;
36403 +       char *kaddr;
36404 +       unix_file_info_t *uf_info;
36405 +
36406 +       /*
36407 +        * all items of ordinary reiser4 file are grouped together. That is why
36408 +        * we can use cut_tree. Plan B files (for instance) can not be
36409 +        * truncated that simply
36410 +        */
36411 +       result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
36412 +                               get_key_offset(max_key()), update_file_size);
36413 +       if (result)
36414 +               return result;
36415 +
36416 +       uf_info = unix_file_inode_data(inode);
36417 +       assert("vs-1105", new_size == inode->i_size);
36418 +       if (new_size == 0) {
36419 +               uf_info->container = UF_CONTAINER_EMPTY;
36420 +               return 0;
36421 +       }
36422 +
36423 +       result = find_file_state(inode, uf_info);
36424 +       if (result)
36425 +               return result;
36426 +       if (uf_info->container == UF_CONTAINER_TAILS)
36427 +               /*
36428 +                * No need to worry about zeroing last page after new file
36429 +                * end
36430 +                */
36431 +               return 0;
36432 +
36433 +       padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
36434 +       if (!padd_from)
36435 +               /* file is truncated to page boundary */
36436 +               return 0;
36437 +
36438 +       result = reserve_partial_page(tree_by_inode(inode));
36439 +       if (result) {
36440 +               reiser4_release_reserved(inode->i_sb);
36441 +               return result;
36442 +       }
36443 +
36444 +       /* last page is partially truncated - zero its content */
36445 +       index = (inode->i_size >> PAGE_CACHE_SHIFT);
36446 +       page = read_cache_page(inode->i_mapping, index, filler, NULL);
36447 +       if (IS_ERR(page)) {
36448 +               /*
36449 +                * the below does up(sbinfo->delete_sema). Do not get
36450 +                * confused
36451 +                */
36452 +               reiser4_release_reserved(inode->i_sb);
36453 +               if (likely(PTR_ERR(page) == -EINVAL)) {
36454 +                       /* looks like file is built of tail items */
36455 +                       return 0;
36456 +               }
36457 +               return PTR_ERR(page);
36458 +       }
36459 +       wait_on_page_locked(page);
36460 +       if (!PageUptodate(page)) {
36461 +               page_cache_release(page);
36462 +               /*
36463 +                * the below does up(sbinfo->delete_sema). Do not get
36464 +                * confused
36465 +                */
36466 +               reiser4_release_reserved(inode->i_sb);
36467 +               return RETERR(-EIO);
36468 +       }
36469 +
36470 +       /*
36471 +        * if page corresponds to hole extent unit - unallocated one will be
36472 +        * created here. This is not necessary
36473 +        */
36474 +       result = find_or_create_extent(page);
36475 +
36476 +       /*
36477 +        * FIXME: cut_file_items has already updated inode. Probably it would
36478 +        * be better to update it here when file is really truncated
36479 +        */
36480 +       if (result) {
36481 +               page_cache_release(page);
36482 +               /*
36483 +                * the below does up(sbinfo->delete_sema). Do not get
36484 +                * confused
36485 +                */
36486 +               reiser4_release_reserved(inode->i_sb);
36487 +               return result;
36488 +       }
36489 +
36490 +       lock_page(page);
36491 +       assert("vs-1066", PageLocked(page));
36492 +       kaddr = kmap_atomic(page, KM_USER0);
36493 +       memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);      /* zero from new EOF to end of page */
36494 +       flush_dcache_page(page);
36495 +       kunmap_atomic(kaddr, KM_USER0);
36496 +       unlock_page(page);
36497 +       page_cache_release(page);
36498 +       /* the below does up(sbinfo->delete_sema). Do not get confused */
36499 +       reiser4_release_reserved(inode->i_sb);
36500 +       return 0;
36501 +}
36502 +
36503 +/**
36504 + * should_have_notail - decide whether a file should be stored without tails
36505 + * @uf_info: unix file info whose formatting plugin (->tplug) is consulted
36506 + * @new_size: file size to ask the formatting plugin about
36507 + *
36508 + * Calls formatting plugin to see whether file of size @new_size has to be
36509 + * stored in unformatted nodes or in tail items (0 is returned for the latter).
36510 + */
36511 +static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
36512 +{
36513 +       if (!uf_info->tplug)
36514 +               return 1;       /* no formatting plugin set: report "no tails" */
36515 +       return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
36516 +                                         new_size);
36517 +
36518 +}
36519 +
36520 +/**
36521 + * truncate_file_body - change length of file
36522 + * @inode: inode of file
36523 + * @new_size: new file length
36524 + *
36525 + * Adjusts items file @inode is built of to match @new_size. It may either cut
36526 + * items or add them to represent a hole at the end of file. The caller has to
36527 + * obtain exclusive access to the file.
36528 + */
36529 +static int truncate_file_body(struct inode *inode, loff_t new_size)
36530 +{
36531 +       int result;
36532 +
36533 +       if (inode->i_size < new_size) {
36534 +               /* expanding truncate */
36535 +               struct dentry dentry;   /* on-stack stand-ins so that write_extent()/write_tail() can be called */
36536 +               struct file file;
36537 +               unix_file_info_t *uf_info;
36538 +
36539 +               dentry.d_inode = inode;
36540 +               file.f_dentry = &dentry;
36541 +               file.private_data = NULL;
36542 +               file.f_pos = new_size;
36543 +               file.private_data = NULL;       /* repeats the assignment made two lines above */
36544 +               uf_info = unix_file_inode_data(inode);
36545 +               result = find_file_state(inode, uf_info);
36546 +               if (result)
36547 +                       return result;
36548 +
36549 +               if (should_have_notail(uf_info, new_size)) {
36550 +                       /*
36551 +                        * file of size @new_size has to be built of
36552 +                        * extents. If it is built of tails - convert to
36553 +                        * extents
36554 +                        */
36555 +                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
36556 +                               /*
36557 +                                * if file is being converted by another process
36558 +                                * - wait until it completes
36559 +                                */
36560 +                               while (1) {
36561 +                                       if (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
36562 +                                               drop_exclusive_access(uf_info);
36563 +                                               schedule();
36564 +                                               get_exclusive_access(uf_info);
36565 +                                               continue;
36566 +                                       }
36567 +                                       break;
36568 +                               }
36569 +
36570 +                               if (uf_info->container ==  UF_CONTAINER_TAILS) {        /* re-checked: access was dropped above */
36571 +                                       result = tail2extent(uf_info);
36572 +                                       if (result)
36573 +                                               return result;
36574 +                               }
36575 +                       }
36576 +                       result = write_extent(&file, NULL, 0, &new_size);
36577 +                       if (result)
36578 +                               return result;
36579 +                       uf_info->container = UF_CONTAINER_EXTENTS;
36580 +               } else {
36581 +                       if (uf_info->container ==  UF_CONTAINER_EXTENTS) {
36582 +                               result = write_extent(&file, NULL, 0, &new_size);
36583 +                               if (result)
36584 +                                       return result;
36585 +                       } else {
36586 +                               result = write_tail(&file, NULL, 0, &new_size);
36587 +                               if (result)
36588 +                                       return result;
36589 +                               uf_info->container = UF_CONTAINER_TAILS;
36590 +                       }
36591 +               }
36592 +               BUG_ON(result > 0);
36593 +               INODE_SET_FIELD(inode, i_size, new_size);
36594 +               file_update_time(&file);
36595 +               result = reiser4_update_sd(inode);
36596 +               BUG_ON(result != 0);    /* NOTE(review): a failed stat data update is treated as a bug, not returned */
36597 +               reiser4_free_file_fsdata(&file);
36598 +       } else
36599 +               result = shorten_file(inode, new_size); /* new size is not larger than the current one */
36600 +       return result;
36601 +}
36602 +
36603 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
36604 +
36605 +/**
36606 + * load_file_hint - copy hint from struct file to local variable
36607 + * @file: file to get hint from, may be NULL
36608 + * @hint: structure to fill
36609 + *
36610 + * Reiser4 specific portion of struct file may contain information (hint)
36611 + * stored on exiting from previous read or write. That information includes
36612 + * seal of znode and coord within that znode where previous read or write
36613 + * stopped. This function copies that information to @hint if it was stored or
36614 + * zero-initializes @hint otherwise. Returns 0 or the fsdata lookup error.
36615 + */
36616 +int load_file_hint(struct file *file, hint_t *hint)
36617 +{
36618 +       reiser4_file_fsdata *fsdata;
36619 +
36620 +       if (file) {
36621 +               fsdata = reiser4_get_file_fsdata(file);
36622 +               if (IS_ERR(fsdata))
36623 +                       return PTR_ERR(fsdata);
36624 +
36625 +               spin_lock_inode(file->f_dentry->d_inode);
36626 +               if (seal_is_set(&fsdata->reg.hint.seal)) {
36627 +                       *hint = fsdata->reg.hint;
36628 +                       init_lh(&hint->lh);     /* the copy gets its own lock handle */
36629 +                       hint->ext_coord.lh = &hint->lh;
36630 +                       spin_unlock_inode(file->f_dentry->d_inode);
36631 +                       /*
36632 +                        * force re-validation of the coord on the first
36633 +                        * iteration of the read/write loop.
36634 +                        */
36635 +                       hint->ext_coord.valid = 0;
36636 +                       assert("nikita-19892", coords_equal(&hint->seal.coord1,
36637 +                                                           &hint->ext_coord.
36638 +                                                           coord));
36639 +                       return 0;
36640 +               }
36641 +               memset(&fsdata->reg.hint, 0, sizeof(hint_t));   /* no seal stored: clear the saved hint too */
36642 +               spin_unlock_inode(file->f_dentry->d_inode);
36643 +       }
36644 +       hint_init_zero(hint);
36645 +       return 0;
36646 +}
36647 +
36648 +/**
36649 + * save_file_hint - copy hint to reiser4 private struct file's part
36650 + * @file: file to save hint in, may be NULL
36651 + * @hint: hint to save; ignored when its seal is not set
36652 + *
36653 + * Copies @hint to reiser4 private part of struct file to speed up future
36654 + * accesses. Silently does nothing if the private part cannot be obtained.
36655 + */
36656 +void save_file_hint(struct file *file, const hint_t *hint)
36657 +{
36658 +       reiser4_file_fsdata *fsdata;
36659 +
36660 +       assert("edward-1337", hint != NULL);
36661 +
36662 +       if (!file || !seal_is_set(&hint->seal))
36663 +               return;
36664 +       fsdata = reiser4_get_file_fsdata(file);
36665 +       if (IS_ERR(fsdata))     /* never dereference an error pointer; the hint is only an optimization */
36666 +               return;
36667 +       assert("nikita-19891",
36668 +              coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
36669 +       assert("vs-30", hint->lh.owner == NULL);
36670 +       spin_lock_inode(file->f_dentry->d_inode);       /* the same lock load_file_hint() reads the hint under */
36671 +       fsdata->reg.hint = *hint;
36672 +       spin_unlock_inode(file->f_dentry->d_inode);
36673 +}
36674 +
36675 +void unset_hint(hint_t * hint)
36676 +{      /* invalidate the coord extension, then call seal_done() and done_lh() on the hint's seal and lock handle */
36677 +       assert("vs-1315", hint);
36678 +       hint->ext_coord.valid = 0;
36679 +       seal_done(&hint->seal);
36680 +       done_lh(&hint->lh);
36681 +}
36682 +
36683 +/* coord must already be set properly, so set_hint only seals it under @key, records offset and mode, drops the lock */
36684 +void set_hint(hint_t * hint, const reiser4_key * key, znode_lock_mode mode)
36685 +{
36686 +       ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
36687 +       assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
36688 +
36689 +       seal_init(&hint->seal, &hint->ext_coord.coord, key);
36690 +       hint->offset = get_key_offset(key);     /* compared against the key offset in hint_validate() */
36691 +       hint->mode = mode;      /* compared against the requested lock mode in hint_validate() */
36692 +       done_lh(&hint->lh);
36693 +}
36694 +
36695 +int hint_is_set(const hint_t * hint)
36696 +{      /* a hint counts as set exactly when its seal is set */
36697 +       return seal_is_set(&hint->seal);
36698 +}
36699 +
36700 +#if REISER4_DEBUG
36701 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
36702 +{      /* debug-only helper: true if both keys agree in every field compared below; the offset is not compared */
36703 +       return (get_key_locality(k1) == get_key_locality(k2) &&
36704 +               get_key_type(k1) == get_key_type(k2) &&
36705 +               get_key_band(k1) == get_key_band(k2) &&
36706 +               get_key_ordering(k1) == get_key_ordering(k2) &&
36707 +               get_key_objectid(k1) == get_key_objectid(k2));
36708 +}
36709 +#endif
36710 +
36711 +int
36712 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
36713 +             znode_lock_mode lock_mode)
36714 +{      /* returns -E_REPEAT when the hint cannot be used, otherwise the result of seal_validate() */
36715 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
36716 +               /* hint either not set or set by different operation */
36717 +               return RETERR(-E_REPEAT);
36718 +
36719 +       assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
36720 +
36721 +       if (check_key && get_key_offset(key) != hint->offset)
36722 +               /* hint is set for different key */
36723 +               return RETERR(-E_REPEAT);
36724 +
36725 +       assert("vs-31", hint->ext_coord.lh == &hint->lh);
36726 +       return seal_validate(&hint->seal, &hint->ext_coord.coord, key,
36727 +                            hint->ext_coord.lh, lock_mode, ZNODE_LOCK_LOPRI);
36728 +}
36729 +
36730 +/**
36731 + * find_or_create_extent - make sure @page is backed by an extent and captured
36732 + * @page: page of a file; page->mapping->host must be set
36733 + *
36734 + * Returns 0 on success, error of jnode_of_page() or update_extent() otherwise.
36735 + */
36736 +/* look for place at twig level for extent corresponding to page, call update_extent() to create
36737 +   unallocated extent if it does not exist yet, initialize jnode, capture page */
36738 +int find_or_create_extent(struct page *page)
36739 +{
36740 +       int result;
36741 +       struct inode *inode;
36742 +       int plugged_hole;
36743 +
36744 +       jnode *node;
36745 +
36746 +       assert("vs-1065", page->mapping && page->mapping->host);
36747 +       inode = page->mapping->host;
36748 +
36749 +       lock_page(page);
36750 +       node = jnode_of_page(page);
36751 +       if (IS_ERR(node)) {
36752 +               unlock_page(page);
36753 +               return PTR_ERR(node);
36754 +       }
36755 +       JF_SET(node, JNODE_WRITE_PREPARED);
36756 +       unlock_page(page);
36757 +
36758 +       if (node->blocknr == 0) {       /* no block number yet: create/update the extent */
36759 +               plugged_hole = 0;
36760 +               result = update_extent(inode, node,
36761 +                                      (loff_t)page->index << PAGE_CACHE_SHIFT,
36762 +                                      &plugged_hole);
36763 +               if (result) {
36764 +                       JF_CLR(node, JNODE_WRITE_PREPARED);
36765 +                       jput(node);
36766 +                       warning("", "update_extent failed: %d", result);
36767 +                       return result;
36768 +               }
36769 +               if (plugged_hole)
36770 +                       reiser4_update_sd(inode);       /* NOTE(review): return value is ignored */
36771 +       } else {
36772 +               spin_lock_jnode(node);
36773 +               result = try_capture(node, ZNODE_WRITE_LOCK, 0);
36774 +               BUG_ON(result != 0);
36775 +               jnode_make_dirty_locked(node);
36776 +               spin_unlock_jnode(node);
36777 +       }
36778 +
36779 +       BUG_ON(node->atom == NULL);
36780 +       JF_CLR(node, JNODE_WRITE_PREPARED);
36781 +
36782 +       if (get_current_context()->entd) {
36783 +               entd_context *ent = get_entd_context(node->tree->super);
36784 +
36785 +               if (ent->cur_request->page == page)
36786 +                       ent->cur_request->node = node;
36787 +       }
36788 +       jput(node);     /* dropped last: @node is still dereferenced by the entd block above */
36789 +       return 0;
36790 +}
36791 +
36792 +/**
36793 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
36794 + * @inode: inode to check
36795 + *
36796 + * Returns true if inode's mapping has dirty pages which do not belong to any
36797 + * atom, i.e. pages tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page tree.
36798 + * NOTE: only that page tag is tested here; jnodes tagged
36799 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes are not examined.
36800 + */
36801 +static int has_anonymous_pages(struct inode *inode)
36802 +{
36803 +       int result;
36804 +
36805 +       read_lock_irq(&inode->i_mapping->tree_lock);
36806 +       result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36807 +       read_unlock_irq(&inode->i_mapping->tree_lock);
36808 +       return result;
36809 +}
36810 +
36811 +/**
36812 + * capture_page_and_create_extent - reserve space and capture a file page
36813 + * @page: page to be captured
36814 + *
36815 + * Grabs space for extent creation and stat data update and calls function to
36816 + * do actual work. Sets PageError on @page and returns non-zero on failure.
36817 + */
36818 +static int capture_page_and_create_extent(struct page *page)
36819 +{
36820 +       int result;
36821 +       struct inode *inode;
36822 +
36823 +       assert("vs-1084", page->mapping && page->mapping->host);
36824 +       inode = page->mapping->host;
36825 +       assert("vs-1139",
36826 +              unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36827 +       /* page belongs to file */
36828 +       assert("vs-1393",
36829 +              inode->i_size > ((loff_t) page->index << PAGE_CACHE_SHIFT));
36830 +
36831 +       /* page capture may require extent creation (if it does not exist yet)
36832 +          and stat data's update (number of blocks changes on extent
36833 +          creation) */
36834 +       grab_space_enable();
36835 +       result =
36836 +           reiser4_grab_space(2 *
36837 +                              estimate_one_insert_into_item(tree_by_inode
36838 +                                                            (inode)),
36839 +                              BA_CAN_COMMIT);
36840 +       if (likely(!result))    /* only do the work if the space reservation succeeded */
36841 +               result = find_or_create_extent(page);
36842 +
36843 +       if (result != 0)
36844 +               SetPageError(page);
36845 +       return result;
36846 +}
36847 +
36848 +/* this is implementation of method commit_write of struct
36849 +   address_space_operations for unix file plugin; @from and @to are not used */
36850 +int
36851 +commit_write_unix_file(struct file *file, struct page *page,
36852 +                      unsigned from, unsigned to)
36853 +{
36854 +       reiser4_context *ctx;
36855 +       struct inode *inode;
36856 +       int result;
36857 +
36858 +       assert("umka-3101", file != NULL);
36859 +       assert("umka-3102", page != NULL);
36860 +       assert("umka-3093", PageLocked(page));
36861 +
36862 +       SetPageUptodate(page);
36863 +
36864 +       inode = page->mapping->host;    /* NOTE(review): assigned but not read again in this function */
36865 +       ctx = init_context(page->mapping->host->i_sb);
36866 +       if (IS_ERR(ctx))
36867 +               return PTR_ERR(ctx);
36868 +       page_cache_get(page);   /* keep a reference while the page is unlocked for the capture below */
36869 +       unlock_page(page);
36870 +       result = capture_page_and_create_extent(page);
36871 +       lock_page(page);        /* the page was locked on entry; hand it back locked */
36872 +       page_cache_release(page);
36873 +
36874 +       /* don't commit transaction under inode semaphore */
36875 +       context_set_commit_async(ctx);
36876 +       reiser4_exit_context(ctx);
36877 +       return result;
36878 +}
36879 +
36880 +/*
36881 + * Support for "anonymous" pages and jnodes.
36882 + *
36883 + * When file is write-accessed through mmap pages can be dirtied from the user
36884 + * level. In this case kernel is not notified until one of following happens:
36885 + *
36886 + *     (1) msync()
36887 + *
36888 + *     (2) truncate() (either explicit or through unlink)
36889 + *
36890 + *     (3) VM scanner starts reclaiming mapped pages, dirtying them before
36891 + *     starting write-back.
36892 + *
36893 + * As a result of (3) ->writepage may be called on a dirty page without
36894 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
36895 + * (iozone) generate huge number of anonymous pages. Emergency flush handles
36896 + * this situation by creating jnode for anonymous page, starting IO on the
36897 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
36898 + * memory. Such jnode is also called anonymous.
36899 + *
36900 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36901 + * tree. This is done by capture_anonymous_*() functions below.
36902 + */
36903 +
36904 +/**
36905 + * capture_anonymous_page - involve page into transaction
36906 + * @page: page to deal with
36907 + *
36908 + * Makes sure @page has metadata in the tree and a captured jnode. Returns 1 on
36909 + * success, 0 if @page is under writeback, non-zero capture error otherwise.
36910 + */
36911 +static int capture_anonymous_page(struct page *page)
36912 +{
36913 +       int result;
36914 +
36915 +       if (PageWriteback(page))
36916 +               /* FIXME: do nothing? */
36917 +               return 0;
36918 +
36919 +       result = capture_page_and_create_extent(page);
36920 +       if (result == 0) {
36921 +               result = 1;     /* 0 from the helper means captured; callers count a return of 1 */
36922 +       } else
36923 +               warning("nikita-3329",
36924 +                               "Cannot capture anon page: %i", result);
36925 +
36926 +       return result;
36927 +}
36928 +
36929 +/**
36930 + * capture_anonymous_pages - find and capture pages dirtied via mmap
36931 + * @mapping: address space where to look for pages
36932 + * @index: start index; set to (pgoff_t)-1 when no tagged page is found
36933 + * @to_capture: maximum number of pages to capture
36934 + *
36935 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36936 + * captures (involves into atom) them, returns number of captured pages or a
36937 + * negative error, updates @index to next page after the last captured one.
36938 + */
36939 +static int
36940 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36941 +                       unsigned int to_capture)
36942 +{
36943 +       int result;
36944 +       struct pagevec pvec;
36945 +       unsigned int i, count;
36946 +       int nr;
36947 +
36948 +       pagevec_init(&pvec, 0);
36949 +       count = min(pagevec_space(&pvec), to_capture);
36950 +       nr = 0;
36951 +
36952 +       /* find at most @count pages tagged MOVED, under the tree lock */
36953 +       write_lock_irq(&mapping->tree_lock);
36954 +       pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36955 +                                            (void **)pvec.pages, *index, count,
36956 +                                            PAGECACHE_TAG_REISER4_MOVED);
36957 +       if (pagevec_count(&pvec) == 0) {
36958 +               /*
36959 +                * there are no pages tagged MOVED in mapping->page_tree
36960 +                * starting from *index
36961 +                */
36962 +               write_unlock_irq(&mapping->tree_lock);
36963 +               *index = (pgoff_t)-1;
36964 +               return 0;
36965 +       }
36966 +
36967 +       /* pin every found page and clear its MOVED tag */
36968 +       for (i = 0; i < pagevec_count(&pvec); i++) {
36969 +               void *p;
36970 +
36971 +               page_cache_get(pvec.pages[i]);
36972 +               p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36973 +                                        PAGECACHE_TAG_REISER4_MOVED);
36974 +               assert("vs-49", p == pvec.pages[i]);
36975 +       }
36976 +       write_unlock_irq(&mapping->tree_lock);
36977 +
36978 +
36979 +       *index = pvec.pages[i - 1]->index + 1;
36980 +
36981 +       for (i = 0; i < pagevec_count(&pvec); i++) {
36982 +               /*
36983 +                * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36984 +                * set_page_dirty_internal which is called when jnode is
36985 +                * captured
36986 +                */
36987 +               result = capture_anonymous_page(pvec.pages[i]);
36988 +               if (result == 1)
36989 +                       nr++;
36990 +               else {
36991 +                       if (result < 0) {
36992 +                               warning("vs-1454",
36993 +                                       "failed to capture page: "
36994 +                                       "result=%d, captured=%d)\n",
36995 +                                       result, i);
36996 +
36997 +                               /*
36998 +                                * set MOVED tag again on all pages which were
36999 +                                * left not captured (the failed one included)
37000 +                                */
37001 +                               write_lock_irq(&mapping->tree_lock);
37002 +                               for (; i < pagevec_count(&pvec); i ++) {
37003 +                                       radix_tree_tag_set(&mapping->page_tree,
37004 +                                                          pvec.pages[i]->index,
37005 +                                                          PAGECACHE_TAG_REISER4_MOVED);
37006 +                               }
37007 +                               write_unlock_irq(&mapping->tree_lock);
37008 +
37009 +                               pagevec_release(&pvec);
37010 +                               return result;
37011 +                       } else {
37012 +                               /*
37013 +                                * result == 0. capture_anonymous_page returns
37014 +                                * 0 for Writeback-ed page. Set MOVED tag on
37015 +                                * that page
37016 +                                */
37017 +                               write_lock_irq(&mapping->tree_lock);
37018 +                               radix_tree_tag_set(&mapping->page_tree,
37019 +                                                  pvec.pages[i]->index,
37020 +                                                  PAGECACHE_TAG_REISER4_MOVED);
37021 +                               write_unlock_irq(&mapping->tree_lock);
37022 +                               if (i == 0)
37023 +                                       *index = pvec.pages[0]->index;
37024 +                               else
37025 +                                       *index = pvec.pages[i - 1]->index + 1;
37026 +                       }
37027 +               }
37028 +       }
37029 +       pagevec_release(&pvec);
37030 +       return nr;
37031 +}
37032 +
37033 +/**
37034 + * capture_anonymous_jnodes - find and capture anonymous jnodes
37035 + * @mapping: address space where to look for jnodes (not used by this body)
37036 + * @from: start index; set to @to on return
37037 + * @to: end index
37038 + * @to_capture: maximum number of jnodes to capture (not used by this body)
37039 + *
37040 + * This body is a stub: no jnodes tagged EFLUSH_TAG_ANONYMOUS are looked up
37041 + * or captured here. It only advances *@from to @to and returns 0, i.e. it
37042 + * reports zero captured jnodes.
37043 + */
37044 +static int
37045 +capture_anonymous_jnodes(struct address_space *mapping,
37046 +                        pgoff_t *from, pgoff_t to, int to_capture)
37047 +{
37048 +       *from = to;
37049 +       return 0;
37050 +}
37051 +
37052 +/*
37053 + * Commit atom of the jnode of a page; repeats while sync_atom() is -E_REPEAT.
37054 + */
37055 +static int sync_page(struct page *page)
37056 +{
37057 +       int result;
37058 +       do {
37059 +               jnode *node;
37060 +               txn_atom *atom;
37061 +
37062 +               lock_page(page);
37063 +               node = jprivate(page);
37064 +               if (node != NULL) {
37065 +                       spin_lock_jnode(node);
37066 +                       atom = jnode_get_atom(node);
37067 +                       spin_unlock_jnode(node);
37068 +               } else
37069 +                       atom = NULL;
37070 +               unlock_page(page);
37071 +               result = sync_atom(atom);
37072 +       } while (result == -E_REPEAT);
37073 +       /*
37074 +        * ZAM-FIXME-HANS: document the logic of this loop, is it just to
37075 +        * handle the case where more pages get added to the atom while we are
37076 +        * syncing it?
37077 +        */
37078 +       assert("nikita-3485", ergo(result == 0,
37079 +                                  get_current_context()->trans->atom == NULL));
37080 +       return result;
37081 +}
37082 +
37083 +/*
37084 + * Commit atoms of all pages of @inode: walk inode->i_mapping's page tree and
37085 + * call sync_page for each page found; stops at the first non-zero result.
37086 + */
37087 +static int sync_page_list(struct inode *inode)
37088 +{
37089 +       int result;
37090 +       struct address_space *mapping;
37091 +       unsigned long from;     /* start index for radix_tree_gang_lookup */
37092 +       unsigned int found;     /* return value for radix_tree_gang_lookup */
37093 +
37094 +       mapping = inode->i_mapping;
37095 +       from = 0;
37096 +       result = 0;
37097 +       read_lock_irq(&mapping->tree_lock);
37098 +       while (result == 0) {
37099 +               struct page *page;
37100 +
37101 +               found =
37102 +                   radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
37103 +                                          from, 1);
37104 +               assert("", found < 2);
37105 +               if (found == 0)
37106 +                       break;
37107 +
37108 +               /* page may not leave radix tree because it is protected from truncation by inode->i_mutex locked by
37109 +                  sys_fsync; tree_lock is dropped around sync_page() and retaken after it */
37110 +               page_cache_get(page);
37111 +               read_unlock_irq(&mapping->tree_lock);
37112 +
37113 +               from = page->index + 1;
37114 +
37115 +               result = sync_page(page);
37116 +
37117 +               page_cache_release(page);
37118 +               read_lock_irq(&mapping->tree_lock);
37119 +       }
37120 +
37121 +       read_unlock_irq(&mapping->tree_lock);
37122 +       return result;
37123 +}
37124 +
37125 +static int commit_file_atoms(struct inode *inode)
37126 +{
37127 +       int result;
37128 +       unix_file_info_t *uf_info;
37129 +
37130 +       uf_info = unix_file_inode_data(inode);
37131 +
37132 +       get_exclusive_access(uf_info);
37133 +       /*
37134 +        * find what items file is made from (sets uf_info->container)
37135 +        */
37136 +       result = find_file_state(inode, uf_info);
37137 +       drop_exclusive_access(uf_info);
37138 +       if (result != 0)
37139 +               return result;
37140 +
37141 +       /*
37142 +        * file state cannot change because we are under ->i_mutex
37143 +        */
37144 +       switch (uf_info->container) {
37145 +       case UF_CONTAINER_EXTENTS:
37146 +               /* find_file_state might open or join an atom */
37147 +               txn_restart_current();
37148 +               result =
37149 +                   /*
37150 +                    * when we are called by
37151 +                    * filemap_fdatawrite->
37152 +                    *    do_writepages()->
37153 +                    *       reiser4_writepages()
37154 +                    *
37155 +                    * inode->i_mapping->dirty_pages are spliced into
37156 +                    * ->io_pages, leaving ->dirty_pages dirty.
37157 +                    *
37158 +                    * When we are called from
37159 +                    * reiser4_fsync()->sync_unix_file(), we have to
37160 +                    * commit atoms of all pages on the ->dirty_list.
37161 +                    *
37162 +                    * So for simplicity we just commit ->io_pages and
37163 +                    * ->dirty_pages.
37164 +                    */
37165 +                   sync_page_list(inode);
37166 +               break;
37167 +       case UF_CONTAINER_TAILS:
37168 +               /*
37169 +                * NOTE-NIKITA probably we can be smarter for tails. For now
37170 +                * just commit all existing atoms.
37171 +                */
37172 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
37173 +               break;
37174 +       case UF_CONTAINER_EMPTY:
37175 +               result = 0;
37176 +               break;
37177 +       case UF_CONTAINER_UNKNOWN:
37178 +       default:
37179 +               result = -EIO;
37180 +               break;
37181 +       }
37182 +
37183 +       /*
37184 +        * commit current transaction: there can be captured nodes from
37185 +        * find_file_state() and finish_conversion().
37186 +        */
37187 +       txn_restart_current();
37188 +       return result;
37189 +}
37190 +
37191 +/**
37192 + * writepages_unix_file - writepages of struct address_space_operations
37193 + * @mapping: address space whose host inode is processed
37194 + * @wbc: writeback control; range_start, sync_mode and nr_to_write are used
37195 + *
37196 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
37197 + * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
37198 + * created by reiser4_writepage.
37199 + */
37200 +int writepages_unix_file(struct address_space *mapping,
37201 +                    struct writeback_control *wbc)
37202 +{
37203 +       int result;
37204 +       unix_file_info_t *uf_info;
37205 +       pgoff_t pindex, jindex, nr_pages;
37206 +       long to_capture;
37207 +       struct inode *inode;
37208 +
37209 +       inode = mapping->host;
37210 +       if (!has_anonymous_pages(inode)) {
37211 +               result = 0;
37212 +               goto end;
37213 +       }
37214 +       jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
37215 +       result = 0;
37216 +       nr_pages =
37217 +           (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37218 +       uf_info = unix_file_inode_data(inode);
37219 +
37220 +       do {
37221 +               reiser4_context *ctx;
37222 +
37223 +               if (wbc->sync_mode != WB_SYNC_ALL)
37224 +                       to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
37225 +               else
37226 +                       to_capture = CAPTURE_APAGE_BURST;
37227 +
37228 +               ctx = init_context(inode->i_sb);
37229 +               if (IS_ERR(ctx)) {
37230 +                       result = PTR_ERR(ctx);
37231 +                       break;
37232 +               }
37233 +               /* avoid recursive calls to ->sync_inodes */
37234 +               ctx->nobalance = 1;
37235 +               assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
37236 +               assert("", LOCK_CNT_NIL(inode_sem_w));
37237 +               assert("", LOCK_CNT_NIL(inode_sem_r));
37238 +
37239 +               txn_restart_current();
37240 +
37241 +               /* we have to get nonexclusive access to the file */
37242 +               if (get_current_context()->entd) {
37243 +                       /*
37244 +                        * use nonblocking version of nonexclusive_access to
37245 +                        * avoid deadlock which might look like the following:
37246 +                        * process P1 holds NEA on file F1 and called entd to
37247 +                        * reclaim some memory. Entd works for P1 and is going
37248 +                        * to capture pages of file F2. To do that entd has to
37249 +                        * get NEA to F2. F2 is held by process P2 which also
37250 +                        * called entd. But entd is serving P1 at the moment
37251 +                        * and P2 has to wait. Process P3 is trying to get EA to
37252 +                        * file F2. Existence of pending EA request to file F2
37253 +                        * makes it impossible for entd to get NEA to file
37254 +                        * F2. None of these processes can continue. Using
37255 +                        * nonblocking version of getting NEA is supposed to
37256 +                        * avoid this deadlock.
37257 +                        */
37258 +                       if (try_to_get_nonexclusive_access(uf_info) == 0) {
37259 +                               result = RETERR(-EBUSY);
37260 +                               reiser4_exit_context(ctx);
37261 +                               break;
37262 +                       }
37263 +               } else
37264 +                       get_nonexclusive_access(uf_info);
37265 +
37266 +               while (to_capture > 0) {
37267 +                       pgoff_t start;
37268 +
37269 +                       assert("vs-1727", jindex <= pindex);
37270 +                       if (pindex == jindex) {
37271 +                               start = pindex;
37272 +                               result =
37273 +                                   capture_anonymous_pages(inode->i_mapping,
37274 +                                                           &pindex,
37275 +                                                           to_capture);
37276 +                               if (result <= 0)
37277 +                                       break;
37278 +                               to_capture -= result;
37279 +                               wbc->nr_to_write -= result;
37280 +                               if (start + result == pindex) {
37281 +                                       jindex = pindex;
37282 +                                       continue;
37283 +                               }
37284 +                               if (to_capture <= 0)
37285 +                                       break;
37286 +                       }
37287 +                       /* deal with anonymous jnodes between jindex and pindex */
37288 +                       result =
37289 +                           capture_anonymous_jnodes(inode->i_mapping, &jindex,
37290 +                                                    pindex, to_capture);
37291 +                       if (result < 0)
37292 +                               break;
37293 +                       to_capture -= result;
37294 +                       get_current_context()->nr_captured += result;
37295 +
37296 +                       if (jindex == (pgoff_t) - 1) {
37297 +                               assert("vs-1728", pindex == (pgoff_t) - 1);
37298 +                               break;
37299 +                       }
37300 +               }
37301 +               if (to_capture <= 0)
37302 +                       /* there may be more pages left: mark the inode dirty */
37303 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
37304 +
37305 +               drop_nonexclusive_access(uf_info);
37306 +               if (result < 0) {
37307 +                       /* error happened */
37308 +                       reiser4_exit_context(ctx);
37309 +                       return result;
37310 +               }
37311 +               if (wbc->sync_mode != WB_SYNC_ALL) {
37312 +                       reiser4_exit_context(ctx);
37313 +                       return 0;
37314 +               }
37315 +               result = commit_file_atoms(inode);
37316 +               reiser4_exit_context(ctx);
37317 +               if (pindex >= nr_pages && jindex == pindex)
37318 +                       break;
37319 +       } while (1);
37320 +
37321 +      end:
37322 +       if (is_in_reiser4_context()) {
37323 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
37324 +                       /*
37325 +                        * there are already pages to flush, flush them out, do
37326 +                        * not delay until end of reiser4_sync_inodes
37327 +                        */
37328 +                       writeout(inode->i_sb, wbc);
37329 +                       get_current_context()->nr_captured = 0;
37330 +               }
37331 +       }
37332 +       return result;
37333 +}
37334 +
37335 +/*
37336 + * ->sync() method for unix file.
37337 + *
37338 + * Grabs space for a stat data update (returns -ENOSPC if that fails), calls
37339 + * write_sd_by_inode_common() and then force_commit_atom() on the context's
37340 + * transaction handle. @file and @datasync are not used by this body.
37341 + *
37342 + * Situation is complicated by anonymous pages: i.e., extent-less pages
37343 + * dirtied through mmap. Fortunately sys_fsync() first calls
37344 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37345 + * all missing extents and capture anonymous pages.
37346 + */
37347 +int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
37348 +{
37349 +       reiser4_context *ctx;
37350 +       txn_atom *atom;
37351 +       reiser4_block_nr reserve;
37352 +
37353 +       ctx = init_context(dentry->d_inode->i_sb);
37354 +       if (IS_ERR(ctx))
37355 +               return PTR_ERR(ctx);
37356 +
37357 +       reserve = estimate_update_common(dentry->d_inode);
37358 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37359 +               reiser4_exit_context(ctx);
37360 +               return RETERR(-ENOSPC);
37361 +       }
37362 +       write_sd_by_inode_common(dentry->d_inode);
37363 +
37364 +       atom = get_current_atom_locked();
37365 +       spin_lock_txnh(ctx->trans);
37366 +       force_commit_atom(ctx->trans);
37367 +       reiser4_exit_context(ctx);
37368 +       return 0;
37369 +}
37370 +
37371 +/**
37372 + * readpage_unix_file_nolock - readpage of struct address_space_operations
37373 + * @file: file whose hint is loaded before and saved after the lookup
37374 + * @page: locked, not yet uptodate page to read
37375 + *
37376 + * Compose a key and search for item containing information about @page data.
37377 + * If item is found its readpage method is called. Error paths unlock @page.
37378 + */
37379 +int readpage_unix_file_nolock(struct file *file, struct page *page)
37380 +{
37381 +       reiser4_context *ctx;
37382 +       int result;
37383 +       struct inode *inode;
37384 +       reiser4_key key;
37385 +       item_plugin *iplug;
37386 +       hint_t *hint;
37387 +       lock_handle *lh;
37388 +       coord_t *coord;
37389 +
37390 +       assert("vs-1062", PageLocked(page));
37391 +       assert("vs-976", !PageUptodate(page));
37392 +       assert("vs-1061", page->mapping && page->mapping->host);
37393 +
37394 +       if ((page->mapping->host->i_size <=
37395 +            ((loff_t) page->index << PAGE_CACHE_SHIFT))) {
37396 +               /* page is out of file already */
37397 +               unlock_page(page);
37398 +               return -EINVAL;
37399 +       }
37400 +
37401 +       inode = page->mapping->host;
37402 +       ctx = init_context(inode->i_sb);
37403 +       if (IS_ERR(ctx)) {
37404 +               unlock_page(page);
37405 +               return PTR_ERR(ctx);
37406 +       }
37407 +
37408 +       hint = kmalloc(sizeof(*hint), get_gfp_mask());
37409 +       if (hint == NULL) {
37410 +               unlock_page(page);
37411 +               reiser4_exit_context(ctx);
37412 +               return RETERR(-ENOMEM);
37413 +       }
37414 +
37415 +       result = load_file_hint(file, hint);
37416 +       if (result) {
37417 +               kfree(hint);
37418 +               unlock_page(page);
37419 +               reiser4_exit_context(ctx);
37420 +               return result;
37421 +       }
37422 +       lh = &hint->lh;
37423 +
37424 +       /* get key of first byte of the page */
37425 +       key_by_inode_and_offset_common(inode,
37426 +                                      (loff_t) page->index << PAGE_CACHE_SHIFT,
37427 +                                      &key);
37428 +
37429 +       /* look for file metadata corresponding to first byte of page; the page is pinned and unlocked meanwhile */
37430 +       page_cache_get(page);
37431 +       unlock_page(page);
37432 +       result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
37433 +       lock_page(page);
37434 +       page_cache_release(page);
37435 +
37436 +       if (page->mapping == NULL) {
37437 +               /*
37438 +                * readpage allows truncate to run concurrently. Page was
37439 +                * truncated while it was not locked
37440 +                */
37441 +               done_lh(lh);
37442 +               kfree(hint);
37443 +               unlock_page(page);
37444 +               txn_restart(ctx);
37445 +               reiser4_exit_context(ctx);
37446 +               return -EINVAL;
37447 +       }
37448 +
37449 +       if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
37450 +               if (result == CBK_COORD_FOUND &&
37451 +                   hint->ext_coord.coord.between != AT_UNIT)
37452 +                       /* file is truncated */
37453 +                       result = -EINVAL;
37454 +               done_lh(lh);
37455 +               kfree(hint);
37456 +               unlock_page(page);
37457 +               txn_restart(ctx);
37458 +               reiser4_exit_context(ctx);
37459 +               return result;
37460 +       }
37461 +
37462 +       /*
37463 +        * item corresponding to page is found. It can not be removed because
37464 +        * znode lock is held
37465 +        */
37466 +       if (PageUptodate(page)) {
37467 +               done_lh(lh);
37468 +               kfree(hint);
37469 +               unlock_page(page);
37470 +               txn_restart(ctx);
37471 +               reiser4_exit_context(ctx);
37472 +               return 0;
37473 +       }
37474 +
37475 +       coord = &hint->ext_coord.coord;
37476 +       result = zload(coord->node);
37477 +       if (result) {
37478 +               done_lh(lh);
37479 +               kfree(hint);
37480 +               unlock_page(page);
37481 +               txn_restart(ctx);
37482 +               reiser4_exit_context(ctx);
37483 +               return result;
37484 +       }
37485 +
37486 +       validate_extended_coord(&hint->ext_coord,
37487 +                               (loff_t) page->index << PAGE_CACHE_SHIFT);
37488 +
37489 +       if (!coord_is_existing_unit(coord)) {
37490 +               /* this indicates corruption */
37491 +               warning("vs-280",
37492 +                       "Looking for page %lu of file %llu (size %lli). "
37493 +                       "No file items found (%d). File is corrupted?\n",
37494 +                       page->index, (unsigned long long)get_inode_oid(inode),
37495 +                       inode->i_size, result);
37496 +               zrelse(coord->node);
37497 +               done_lh(lh);
37498 +               kfree(hint);
37499 +               unlock_page(page);
37500 +               txn_restart(ctx);
37501 +               reiser4_exit_context(ctx);
37502 +               return RETERR(-EIO);
37503 +       }
37504 +
37505 +       /*
37506 +        * get plugin of found item and call its readpage method; if the plugin
37507 +        * has none, fail with -EINVAL
37508 +        */
37509 +       iplug = item_plugin_by_coord(coord);
37510 +       if (iplug->s.file.readpage)
37511 +               result = iplug->s.file.readpage(coord, page);
37512 +       else
37513 +               result = RETERR(-EINVAL);
37514 +
37515 +       if (!result) {
37516 +               set_key_offset(&key,
37517 +                              (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
37518 +               /* FIXME should call set_hint() */
37519 +               unset_hint(hint);
37520 +       } else {
37521 +               unlock_page(page);
37522 +               unset_hint(hint);
37523 +       }
37524 +       assert("vs-979",
37525 +              ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
37526 +       assert("vs-9791", ergo(result != 0, !PageLocked(page)));
37527 +
37528 +       zrelse(coord->node);
37529 +       done_lh(lh);
37530 +
37531 +       save_file_hint(file, hint);
37532 +       kfree(hint);
37533 +
37534 +       /*
37535 +        * FIXME: explain why it is needed. HINT: page allocation in write can
37536 +        * not be done when atom is not NULL because reiser4_writepage can not
37537 +        * kick entd and have to eflush
37538 +        */
37539 +       txn_restart(ctx);
37540 +       reiser4_exit_context(ctx);
37541 +       return result;
37542 +}
37543 +
37544 +/**
37545 + * readpage_unix_file - readpage of struct address_space_operations
37546 + * @file: file @page belongs to
37547 + * @page: page to read
37548 + *
37549 + * Simply forwards to readpage_unix_file_nolock() and returns its result; no
37550 + * nonexclusive access to the file is taken in this function itself.
37551 + */
37552 +int readpage_unix_file(struct file *file, struct page *page)
37553 +{
37554 +       return readpage_unix_file_nolock(file, page);
37555 +}
37556 +
37557 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
37558 +                                               loff_t count UNUSED_ARG)
37559 +{
37560 +       /* Space is reserved because a read updates the stat data item (atime);
37561 +          the estimate is that of a common stat data update, @count is ignored */
37562 +       assert("vs-1249",
37563 +              inode_file_plugin(inode)->estimate.update ==
37564 +              estimate_update_common);
37565 +       return estimate_update_common(inode);
37566 +}
37567 +
37568 +/* this is called with nonexclusive access obtained, file's container can not change */
37569 +static size_t read_file(hint_t * hint, struct file *file,      /* file to read from */
37570 +                       char __user *buf,       /* address of user-space buffer */
37571 +                       size_t count,   /* number of bytes to read */
37572 +                       loff_t * off)   /* position in file to read from */
37573 +{
37574 +       int result;
37575 +       struct inode *inode;
37576 +       flow_t flow;
37577 +       int (*read_f) (struct file *, flow_t *, hint_t *);
37578 +       coord_t *coord;
37579 +       znode *loaded;
37580 +
37581 +       inode = file->f_dentry->d_inode;
37582 +
37583 +       /* build flow */
37584 +       assert("vs-1250",
37585 +              inode_file_plugin(inode)->flow_by_inode ==
37586 +              flow_by_inode_unix_file);
37587 +       result =
37588 +           flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
37589 +                                   *off, READ_OP, &flow);
37590 +       if (unlikely(result))
37591 +               return result;
37592 +
37593 +       /* get seal and coord sealed with it from reiser4 private data
37594 +          of struct file.  The coord will tell us where our last read
37595 +          of this file finished, and the seal will help to determine
37596 +          if that location is still valid.
37597 +        */
37598 +       coord = &hint->ext_coord.coord;
37599 +       while (flow.length && result == 0) {
37600 +               result =
37601 +                       find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
37602 +               if (cbk_errored(result))
37603 +                       /* error happened */
37604 +                       break;
37605 +
37606 +               if (coord->between != AT_UNIT) {
37607 +                       /* there were no items corresponding to given offset */
37608 +                       done_lh(hint->ext_coord.lh);
37609 +                       break;
37610 +               }
37611 +
37612 +               loaded = coord->node;
37613 +               result = zload(loaded);
37614 +               if (unlikely(result)) {
37615 +                       done_lh(hint->ext_coord.lh);
37616 +                       break;
37617 +               }
37618 +
37619 +               if (hint->ext_coord.valid == 0)
37620 +                       validate_extended_coord(&hint->ext_coord,
37621 +                                               get_key_offset(&flow.key));
37622 +
37623 +               assert("vs-4", hint->ext_coord.valid == 1);
37624 +               assert("vs-33", hint->ext_coord.lh == &hint->lh);
37625 +               /* call item's read method */
37626 +               read_f = item_plugin_by_coord(coord)->s.file.read;
37627 +               result = read_f(file, &flow, hint);
37628 +               zrelse(loaded);
37629 +               done_lh(hint->ext_coord.lh);
37630 +       }
37631 +       /* bytes consumed from the flow if any, else result (int converted to size_t) */
37632 +       return (count - flow.length) ? (count - flow.length) : result;
37633 +}
37634 +
37635 +/**
37636 + * read_unix_file - read of struct file_operations
37637 + * @file: file to read from
37638 + * @buf: address of user-space buffer
37639 + * @read_amount: number of bytes to read
37640 + * @off: position in file to read from; advanced by the number of bytes read
37641 + *
37642 + * VFS read method of the unix file plugin. Reads at most PAGE_CACHE_SIZE bytes
37643 + * per iteration; returns bytes read, or an error code if nothing was read.
37644 + */
37645 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37646 +                      loff_t *off)
37647 +{
37648 +       reiser4_context *ctx;
37649 +       int result;
37650 +       struct inode *inode;
37651 +       hint_t *hint;
37652 +       unix_file_info_t *uf_info;
37653 +       size_t count, read, left;
37654 +       reiser4_block_nr needed;
37655 +       loff_t size;
37656 +
37657 +       if (unlikely(read_amount == 0))
37658 +               return 0;
37659 +
37660 +       assert("umka-072", file != NULL);
37661 +       assert("umka-074", off != NULL);
37662 +       inode = file->f_dentry->d_inode;
37663 +       assert("vs-972", !inode_get_flag(inode, REISER4_NO_SD));
37664 +
37665 +       ctx = init_context(inode->i_sb);
37666 +       if (IS_ERR(ctx))
37667 +               return PTR_ERR(ctx);
37668 +
37669 +       hint = kmalloc(sizeof(*hint), get_gfp_mask());
37670 +       if (hint == NULL) {
37671 +               context_set_commit_async(ctx);
37672 +               reiser4_exit_context(ctx);
37673 +               return RETERR(-ENOMEM);
37674 +       }
37675 +
37676 +       result = load_file_hint(file, hint);
37677 +       if (result) {
37678 +               kfree(hint);
37679 +               context_set_commit_async(ctx);
37680 +               reiser4_exit_context(ctx);
37681 +               return result;
37682 +       }
37683 +
37684 +       left = read_amount;
37685 +       count = 0;
37686 +       uf_info = unix_file_inode_data(inode);
37687 +       while (left > 0) {
37688 +               txn_restart_current();
37689 +
37690 +               get_nonexclusive_access(uf_info);
37691 +
37692 +               size = i_size_read(inode);
37693 +               if (*off >= size) {
37694 +                       /* position to read from is past the end of file */
37695 +                       drop_nonexclusive_access(uf_info);
37696 +                       break;
37697 +               }
37698 +               if (*off + left > size)
37699 +                       left = size - *off;
37700 +
37701 +               /* faultin user page */
37702 +               result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
37703 +               if (result) {
37704 +                       drop_nonexclusive_access(uf_info);
37705 +                       break;
37706 +               }
37707 +
37708 +               read = read_file(hint, file, buf,
37709 +                                left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37710 +                                off);
37711 +
37712 +               drop_nonexclusive_access(uf_info);
37713 +               /* read_file() hands errors back through size_t: test the value as signed */
37714 +               if ((ssize_t)read < 0) {
37715 +                       result = (int)read;
37716 +                       break;
37717 +               }
37718 +               left -= read;
37719 +               buf += read;
37720 +
37721 +               /* update position in a file */
37722 +               *off += read;
37723 +               /* total number of read bytes */
37724 +               count += read;
37725 +       }
37726 +       save_file_hint(file, hint);
37727 +       kfree(hint);
37728 +
37729 +       if (count) {
37730 +               /*
37731 +                * something was read. Grab space for stat data update and
37732 +                * update atime
37733 +                */
37734 +               needed = unix_file_estimate_read(inode, read_amount);
37735 +               result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37736 +               if (result == 0)
37737 +                       file_accessed(file);
37738 +               else
37739 +                       warning("", "failed to grab space for atime update");
37740 +       }
37741 +
37742 +       context_set_commit_async(ctx);
37743 +       reiser4_exit_context(ctx);
37744 +
37745 +       /* return number of read bytes or error code if nothing is read */
37746 +       return count ? count : result;
37747 +}
37748 +
37749 +/*
37750 + * check_pages_unix_file - drop cached pages and convert tails to extents
37751 + *
37752 + * Invalidates the page cache pages of @inode that lie within i_size and then
37753 + * calls unpack() without the "forever" flag, which turns a file built of
37754 + * tails into one built of extents. The point is to not keep two copies of the
37755 + * data (page cache and tails). unpack() asserts that exclusive access is held.
37756 + */
37757 +static int check_pages_unix_file(struct file *file, struct inode *inode)
37758 +{
37759 +       struct address_space *mapping = inode->i_mapping;
37760 +
37761 +       /* number of pages is i_size rounded up to whole pages */
37762 +       reiser4_invalidate_pages(mapping, 0,
37763 +               (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT, 0);
37764 +       return unpack(file, inode, 0 /* not forever */ );
37765 +}
37766 +
37767 +/**
37768 + * mmap_unix_file - mmap of struct file_operations
37769 + * @file: file to mmap
37770 + * @vma: memory area the file is being mapped into
37771 + *
37772 + * Unix file plugin implementation of the vfs mmap method.
37773 + *
37774 + * Everything runs under the file's write semaphore and exclusive access.
37775 + * Unless IS_RDONLY() holds for the inode, a mapping with VM_MAYWRITE or
37776 + * VM_SHARED set needs the file to be built of extent items, so a file built
37777 + * of tails has its cached pages dropped and is converted first.
37778 + *
37779 + * Space for the stat data update is grabbed before generic_file_mmap() is
37780 + * called. When that call succeeds the inode gets the REISER4_HAS_MMAP flag.
37781 + *
37782 + * Returns 0 on success. Otherwise returns the error of the step that failed:
37783 + * context creation, file state lookup, tail conversion, space reservation or
37784 + * generic_file_mmap().
37785 + */
37786 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37787 +{
37788 +       struct inode *inode = file->f_dentry->d_inode;
37789 +       unix_file_info_t *uf_info;
37790 +       reiser4_context *ctx;
37791 +       reiser4_block_nr needed;
37792 +       int result;
37793 +
37794 +       ctx = init_context(inode->i_sb);
37795 +       if (IS_ERR(ctx))
37796 +               return PTR_ERR(ctx);
37797 +
37798 +       uf_info = unix_file_inode_data(inode);
37799 +
37800 +       /*
37801 +        * take the file's write semaphore first, exclusive access second
37802 +        */
37803 +       down(&uf_info->write);
37804 +       get_exclusive_access(uf_info);
37805 +
37806 +       if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37807 +               /*
37808 +                * the file has to be built of extent items. Learn what it is
37809 +                * built of right now
37810 +                */
37811 +               result = find_file_state(inode, uf_info);
37812 +               if (result != 0)
37813 +                       goto out;
37814 +
37815 +               assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37816 +                                  uf_info->container == UF_CONTAINER_EXTENTS ||
37817 +                                  uf_info->container == UF_CONTAINER_EMPTY));
37818 +               if (uf_info->container == UF_CONTAINER_TAILS) {
37819 +                       /*
37820 +                        * drop cached pages and turn tails into extents
37821 +                        */
37822 +                       result = check_pages_unix_file(file, inode);
37823 +                       if (result)
37824 +                               goto out;
37825 +               }
37826 +       }
37827 +
37828 +       /*
37829 +        * generic_file_mmap will do update_atime. Reserve space for the stat
37830 +        * data update that goes with it.
37831 +        */
37832 +       needed = inode_file_plugin(inode)->estimate.update(inode);
37833 +       result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37834 +       if (result)
37835 +               goto out;
37836 +
37837 +       result = generic_file_mmap(file, vma);
37838 +       if (result == 0) {
37839 +               /* remember that the file has a mapping */
37840 +               inode_set_flag(inode, REISER4_HAS_MMAP);
37841 +       }
37842 +
37843 +out:
37844 +       /* common exit: undo the locking in reverse order, leave context */
37845 +       drop_exclusive_access(uf_info);
37846 +       up(&uf_info->write);
37847 +       reiser4_exit_context(ctx);
37848 +       return result;
37849 +}
37850 +
37851 +/**
37852 + * find_first_item - find the item holding the first byte of a file
37853 + * @inode: file to inspect
37854 + * Returns EXTENT_POINTER_ID or FORMATTING_ID, -EIO on any other item or when
37855 + * the coord is not AT_UNIT, otherwise the lookup or zload() result.
37856 + */
37857 +static int find_first_item(struct inode *inode)
37858 +{
37859 +       coord_t coord;
37860 +       lock_handle lh;
37861 +       reiser4_key key;
37862 +       int result;
37863 +
37864 +       coord_init_zero(&coord);
37865 +       init_lh(&lh);
37866 +       inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37867 +       result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
37868 +       if (result != CBK_COORD_FOUND)
37869 +               goto out;
37870 +       if (coord.between != AT_UNIT)
37871 +               goto eio;
37872 +       result = zload(coord.node);
37873 +       if (result != 0)
37874 +               goto out;
37875 +       result = item_id_by_coord(&coord);
37876 +       zrelse(coord.node);
37877 +       if (result == EXTENT_POINTER_ID || result == FORMATTING_ID)
37878 +               goto out;
37879 +eio:
37880 +       result = RETERR(-EIO);
37881 +out:
37882 +       done_lh(&lh);
37883 +       return result;
37884 +}
37885 +
37886 +/**
37887 + * open_unix_file - open of struct file_operations
37888 + * @inode: inode of the file being opened
37889 + * @file: file being opened (not looked at here)
37890 + *
37891 + * Completes a tail conversion that was left unfinished, if there is one.
37892 + * Nothing is done when IS_RDONLY() holds for the inode or the inode does not
37893 + * carry REISER4_PART_MIXED.
37894 + *
37895 + * Returns 0 when there was nothing to do or the conversion succeeded,
37896 + * otherwise an error code.
37897 + */
37898 +int open_unix_file(struct inode *inode, struct file *file)
37899 +{
37900 +       unix_file_info_t *uf_info;
37901 +       reiser4_context *ctx;
37902 +       int first_id;
37903 +       int result;
37904 +
37905 +       if (IS_RDONLY(inode) || !inode_get_flag(inode, REISER4_PART_MIXED))
37906 +               return 0;
37907 +
37908 +       ctx = init_context(inode->i_sb);
37909 +       if (IS_ERR(ctx))
37910 +               return PTR_ERR(ctx);
37911 +
37912 +       uf_info = unix_file_inode_data(inode);
37913 +       get_exclusive_access(uf_info);
37914 +
37915 +       /*
37916 +        * another process may be in the middle of a tail conversion. Give up
37917 +        * exclusive access and yield until it is done
37918 +        */
37919 +       while (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37920 +               drop_exclusive_access(uf_info);
37921 +               schedule();
37922 +               get_exclusive_access(uf_info);
37923 +       }
37924 +
37925 +       if (!inode_get_flag(inode, REISER4_PART_MIXED)) {
37926 +               /* other process completed the conversion */
37927 +               result = 0;
37928 +               goto out;
37929 +       }
37930 +
37931 +       /*
37932 +        * file left in semi converted state after unclean shutdown or another
37933 +        * thread is doing conversion and dropped exclusive access while doing
37934 +        * balance dirty pages. The kind of the first item shows which
37935 +        * conversion has to be completed
37936 +        */
37937 +       first_id = find_first_item(inode);
37938 +       if (first_id == EXTENT_POINTER_ID) {
37939 +               /*
37940 +                * first item is extent, therefore there was incomplete
37941 +                * tail2extent conversion. Complete it
37942 +                */
37943 +               result = tail2extent(unix_file_inode_data(inode));
37944 +       } else if (first_id == FORMATTING_ID) {
37945 +               /*
37946 +                * first item is formatting item, therefore there was
37947 +                * incomplete extent2tail conversion. Complete it
37948 +                */
37949 +               result = extent2tail(unix_file_inode_data(inode));
37950 +       } else {
37951 +               /* lookup failed or the item is of an unexpected kind */
37952 +               result = -EIO;
37953 +       }
37954 +
37955 +       assert("vs-1712",
37956 +              ergo(result == 0, (!inode_get_flag(inode, REISER4_PART_MIXED) &&
37957 +                                 !inode_get_flag(inode, REISER4_PART_IN_CONV))));
37958 +
37959 +out:
37960 +       drop_exclusive_access(uf_info);
37961 +       reiser4_exit_context(ctx);
37962 +       return result;
37963 +}
37964 +
37965 +#define NEITHER_OBTAINED 0 /* value of "ea" in write_unix_file(): no access held */
37966 +#define EA_OBTAINED 1 /* set there after get_exclusive_access() */
37967 +#define NEA_OBTAINED 2 /* set there after get_nonexclusive_access() */
37968 +
37969 +static void drop_access(unix_file_info_t *uf_info)
37970 +{
37971 +       if (!uf_info->exclusive_use) /* release in the mode recorded there */
37972 +               drop_nonexclusive_access(uf_info);
37973 +       else
37974 +               drop_exclusive_access(uf_info);
37975 +}
37976 +
37977 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37978 +                             __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) /* printk tagged with file, line and function */
37979 +
37980 +/**
37981 + * write_unix_file - write of struct file_operations
37982 + * @file: file to write to
37983 + * @buf: address of user-space buffer
37984 + * @count: number of bytes to write
37985 + * @pos: position in file to write to
37986 + *
37987 + * Unix file plugin implementation of the vfs write method. Returns number of
37988 + * bytes written or, if nothing was written, an error code.
37989 + */
37990 +ssize_t write_unix_file(struct file *file, const char __user *buf,
37991 +                       size_t count, loff_t *pos)
37992 +{
37993 +       int result;
37994 +       reiser4_context *ctx;
37995 +       struct inode *inode;
37996 +       unix_file_info_t *uf_info;
37997 +       ssize_t written;
37998 +       int try_free_space;
37999 +       int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
38000 +       size_t left;
38001 +       ssize_t (*write_op)(struct file *, const char __user *, size_t,
38002 +                           loff_t *pos);
38003 +       int ea;
38004 +       loff_t new_size;
38005 +
38006 +       inode = file->f_dentry->d_inode;
38007 +       ctx = init_context(inode->i_sb);
38008 +       if (IS_ERR(ctx))
38009 +               return PTR_ERR(ctx);
38010 +
38011 +       mutex_lock(&inode->i_mutex);
38012 +
38013 +       assert("vs-947", !inode_get_flag(inode, REISER4_NO_SD));
38014 +       assert("vs-9471", (!inode_get_flag(inode, REISER4_PART_MIXED)));
38015 +
38016 +       /* check amount of bytes to write and writing position */
38017 +       result = generic_write_checks(file, pos, &count, 0);
38018 +       if (result) {
38019 +               mutex_unlock(&inode->i_mutex);
38020 +               context_set_commit_async(ctx);
38021 +               reiser4_exit_context(ctx);
38022 +               return result;
38023 +       }
38024 +
38025 +       result = remove_suid(file->f_dentry);
38026 +       if (result) {
38027 +               mutex_unlock(&inode->i_mutex);
38028 +               context_set_commit_async(ctx);
38029 +               reiser4_exit_context(ctx);
38030 +               return result;
38031 +       }
38032 +
38033 +       uf_info = unix_file_inode_data(inode);
38034 +
38035 +       current->backing_dev_info = inode->i_mapping->backing_dev_info;
38036 +       written = 0;
38037 +       try_free_space = 0;
38038 +       left = count;
38039 +       ea = NEITHER_OBTAINED;
38040 +
38041 +       new_size = i_size_read(inode);
38042 +       if (*pos + count > new_size)
38043 +               new_size = *pos + count;
38044 +
38045 +       while (left) {
38046 +               if (left < to_write)
38047 +                       to_write = left;
38048 +
38049 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
38050 +                       get_exclusive_access(uf_info);
38051 +                       ea = EA_OBTAINED;
38052 +                       if (uf_info->container != UF_CONTAINER_EMPTY) {
38053 +                               /* file is made not empty by another process */
38054 +                               drop_exclusive_access(uf_info);
38055 +                               ea = NEITHER_OBTAINED;
38056 +                               continue;
38057 +                       }
38058 +               } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
38059 +                       /*
38060 +                        * get exclusive access directly just to not have to
38061 +                        * re-obtain it if file will appear empty
38062 +                        */
38063 +                       get_exclusive_access(uf_info);
38064 +                       ea = EA_OBTAINED;
38065 +                       result = find_file_state(inode, uf_info);
38066 +                       if (result) {
38067 +                               drop_exclusive_access(uf_info);
38068 +                               ea = NEITHER_OBTAINED;
38069 +                               break;
38070 +                       }
38071 +               } else {
38072 +                       get_nonexclusive_access(uf_info);
38073 +                       ea = NEA_OBTAINED;
38074 +               }
38075 +
38076 +               /* either EA or NEA is obtained. Choose item write method */
38077 +               if (uf_info->container == UF_CONTAINER_EXTENTS) {
38078 +                       /* file is built of extent items */
38079 +                       write_op = write_extent;
38080 +               } else if (uf_info->container == UF_CONTAINER_EMPTY) {
38081 +                       /* file is empty */
38082 +                       if (should_have_notail(uf_info, new_size))
38083 +                               write_op = write_extent;
38084 +                       else
38085 +                               write_op = write_tail;
38086 +               } else {
38087 +                       /* file is built of tail items */
38088 +                       if (should_have_notail(uf_info, new_size)) {
38089 +                               if (ea == NEA_OBTAINED) {
38090 +                                       drop_nonexclusive_access(uf_info);
38091 +                                       get_exclusive_access(uf_info);
38092 +                                       ea = EA_OBTAINED;
38093 +                               }
38094 +                               if (uf_info->container == UF_CONTAINER_TAILS) {
38095 +                                       /*
38096 +                                        * if file is being converted by another
38097 +                                        * process - wait until it completes
38098 +                                        */
38099 +                                       while (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
38100 +                                               drop_exclusive_access(uf_info);
38101 +                                               schedule();
38102 +                                               get_exclusive_access(uf_info);
38103 +                                       }
38104 +                                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
38105 +                                               result = tail2extent(uf_info);
38106 +                                               if (result) {
38107 +                                                       /* do not leave the loop holding EA */
38108 +                                                       drop_exclusive_access(uf_info);
38109 +                                                       ea = NEITHER_OBTAINED;
38110 +                                                       break;
38111 +                                               }
38112 +                                       }
38113 +                               }
38114 +                               drop_exclusive_access(uf_info);
38115 +                               ea = NEITHER_OBTAINED;
38116 +                               continue;
38117 +                       }
38118 +                       write_op = write_tail;
38119 +               }
38120 +
38121 +               written = write_op(file, buf, to_write, pos);
38122 +               if (written == -ENOSPC && try_free_space) {
38123 +                       drop_access(uf_info);
38124 +                       txnmgr_force_commit_all(inode->i_sb, 0);
38125 +                       try_free_space = 0;
38126 +                       continue;
38127 +               }
38128 +               if (written < 0) {
38129 +                       drop_access(uf_info);
38130 +                       result = written;
38131 +                       break;
38132 +               }
38133 +               /* something is written. */
38134 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
38135 +                       assert("", ea == EA_OBTAINED);
38136 +                       uf_info->container = (write_op == write_extent) ?
38137 +                               UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
38138 +               } else {
38139 +                       assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
38140 +                                       write_op == write_extent));
38141 +                       assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
38142 +                                       write_op == write_tail));
38143 +               }
38144 +               if (*pos + written > inode->i_size)
38145 +                       INODE_SET_FIELD(inode, i_size, *pos + written);
38146 +               file_update_time(file);
38147 +               result = reiser4_update_sd(inode);
38148 +               if (result) {
38149 +                       mutex_unlock(&inode->i_mutex);
38150 +                       current->backing_dev_info = NULL;
38151 +                       drop_access(uf_info);
38152 +                       context_set_commit_async(ctx);
38153 +                       reiser4_exit_context(ctx);
38154 +                       return result;
38155 +               }
38156 +               drop_access(uf_info);
38157 +               ea = NEITHER_OBTAINED;
38158 +               txn_restart(ctx);
38159 +               current->journal_info = NULL;
38160 +               /*
38161 +                * tell VM how many pages were dirtied. Maybe number of pages
38162 +                * which were dirty already should not be counted
38163 +                */
38164 +               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
38165 +                                                  (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
38166 +               current->journal_info = ctx;
38167 +
38168 +               left -= written;
38169 +               buf += written;
38170 +               *pos += written;
38171 +       }
38172 +
38173 +       mutex_unlock(&inode->i_mutex);
38174 +
38175 +       if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
38176 +               txn_restart_current();
38177 +               grab_space_enable();
38178 +               result = sync_unix_file(file, file->f_dentry,
38179 +                                       0 /* data and stat data */ );
38180 +               if (result)
38181 +                       warning("reiser4-7", "failed to sync file %llu",
38182 +                               (unsigned long long)get_inode_oid(inode));
38183 +       }
38184 +
38185 +       current->backing_dev_info = NULL;
38186 +
38187 +       reiser4_exit_context(ctx);
38188 +
38189 +       /*
38190 +        * return number of written bytes or error code if nothing is
38191 +        * written. Note, that it does not work correctly in case when
38192 +        * sync_unix_file returns error
38193 +        */
38194 +       return (count - left) ? (count - left) : result;
38195 +}
38196 +
38197 +/**
38198 + * release_unix_file - release of struct file_operations
38199 + * @inode: inode of released file
38200 + * @file: file to release
38201 + *
38202 + * Implementation of release method of struct file_operations for unix file
38203 + * plugin. If last reference to inode is released - convert all extent items
38204 + * into tail items if necessary. Frees reiser4 specific file data.
38205 + */
38206 +int release_unix_file(struct inode *inode, struct file *file)
38207 +{
38208 +       reiser4_context *ctx;
38209 +       unix_file_info_t *uf_info;
38210 +       int nested;
38211 +       int result = 0;
38212 +
38213 +       /* sampled before init_context() is called for this invocation */
38214 +       nested = is_in_reiser4_context();
38215 +
38216 +       ctx = init_context(inode->i_sb);
38217 +       if (IS_ERR(ctx))
38218 +               return PTR_ERR(ctx);
38219 +
38220 +       if (nested != 0) {
38221 +               /*
38222 +                  we were already within reiser4 context when called. How
38223 +                  is that possible? Simple:
38224 +
38225 +                  (gdb) bt
38226 +                  #0  get_exclusive_access ()
38227 +                  #2  0xc01e56d3 in release_unix_file ()
38228 +                  #3  0xc01c3643 in reiser4_release ()
38229 +                  #4  0xc014cae0 in __fput ()
38230 +                  #5  0xc013ffc3 in remove_vm_struct ()
38231 +                  #6  0xc0141786 in exit_mmap ()
38232 +                  #7  0xc0118480 in mmput ()
38233 +                  #8  0xc0133205 in oom_kill ()
38234 +                  #9  0xc01332d1 in out_of_memory ()
38235 +                  #10 0xc013bc1d in try_to_free_pages ()
38236 +                  #11 0xc013427b in __alloc_pages ()
38237 +                  #12 0xc013f058 in do_anonymous_page ()
38238 +                  #13 0xc013f19d in do_no_page ()
38239 +                  #14 0xc013f60e in handle_mm_fault ()
38240 +                  #15 0xc01131e5 in do_page_fault ()
38241 +                  #16 0xc0104935 in error_code ()
38242 +                  #17 0xc025c0c6 in __copy_to_user_ll ()
38243 +                  #18 0xc01d496f in read_tail ()
38244 +                  #19 0xc01e4def in read_unix_file ()
38245 +                  #20 0xc01c3504 in reiser4_read ()
38246 +                  #21 0xc014bd4f in vfs_read ()
38247 +                  #22 0xc014bf66 in sys_read ()
38248 +                */
38249 +               warning("vs-44", "out of memory?");
38250 +       } else {
38251 +               uf_info = unix_file_inode_data(inode);
38252 +
38253 +               down(&uf_info->write);
38254 +               get_exclusive_access(uf_info);
38255 +               if (atomic_read(&file->f_dentry->d_count) == 1 &&
38256 +                   uf_info->container == UF_CONTAINER_EXTENTS &&
38257 +                   !should_have_notail(uf_info, inode->i_size) &&
38258 +                   !rofs_inode(inode)) {
38259 +                       result = extent2tail(uf_info);
38260 +                       if (result != 0) {
38261 +                               warning("nikita-3233",
38262 +                                       "Failed (%d) to convert in %s (%llu)",
38263 +                                       result, __FUNCTION__,
38264 +                                       (unsigned long long)
38265 +                                       get_inode_oid(inode));
38266 +                       }
38267 +               }
38268 +               drop_exclusive_access(uf_info);
38269 +               up(&uf_info->write);
38270 +       }
38271 +
38272 +       reiser4_free_file_fsdata(file);
38273 +
38274 +       reiser4_exit_context(ctx);
38275 +       return result;
38276 +}
38277 +
38278 +static void set_file_notail(struct inode *inode)
38279 +{
38280 +       reiser4_inode *info = reiser4_inode_data(inode);
38281 +       formatting_plugin *never_tails;
38282 +
38283 +       /* switch the inode to the NEVER_TAILS formatting plugin */
38284 +       never_tails = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
38285 +       plugin_set_formatting(&info->pset, never_tails);
38286 +       inode_set_plugin(inode,
38287 +                        formatting_plugin_to_plugin(never_tails), PSET_FORMATTING);
38288 +}
38289 +
38290 +/*
38291 + * if file is built of tails - convert it to extents. With @forever set, also
38292 + * switch the inode to the NEVER_TAILS formatting plugin and save stat data
38293 + */
38294 +static int unpack(struct file *filp, struct inode *inode, int forever)
38295 +{
38296 +       int result = 0;
38297 +       unix_file_info_t *uf_info;
38298 +
38299 +       uf_info = unix_file_inode_data(inode);
38300 +       assert("vs-1628", ea_obtained(uf_info));
38301 +
38302 +       result = find_file_state(inode, uf_info);
38303 +       if (result)
38304 +               return result;
38305 +       assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
38306 +
38307 +       if (uf_info->container == UF_CONTAINER_TAILS) {
38308 +               /*
38309 +                * if file is being converted by another process - wait until
38310 +                * it completes
38311 +                */
38312 +               while (inode_get_flag(inode, REISER4_PART_IN_CONV)) {
38313 +                       drop_exclusive_access(uf_info);
38314 +                       schedule();
38315 +                       get_exclusive_access(uf_info);
38316 +               }
38317 +               if (uf_info->container == UF_CONTAINER_TAILS) {
38318 +                       result = tail2extent(uf_info);
38319 +                       if (result)
38320 +                               return result;
38321 +               }
38322 +       }
38323 +       if (forever) {
38324 +               /* save new formatting plugin in stat data */
38325 +               __u64 tograb;
38326 +
38327 +               set_file_notail(inode);
38328 +               grab_space_enable();
38329 +               tograb = inode_file_plugin(inode)->estimate.update(inode);
38330 +               result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
38331 +               /* do not touch stat data if the space could not be grabbed */
38332 +               if (result == 0)
38333 +                       result = reiser4_update_sd(inode);
38334 +       }
38335 +
38336 +       return result;
38337 +}
38338 +
38339 +/*
38340 + * implementation of vfs' ioctl method of struct file_operations for unix file
38341 + * plugin. REISER4_IOC_UNPACK is the only command handled: the file is
38342 + * unpacked "forever" under exclusive access. Other commands get -ENOSYS.
38343 + */
38344 +int
38345 +ioctl_unix_file(struct inode *inode, struct file *filp,
38346 +               unsigned int cmd, unsigned long arg UNUSED_ARG)
38347 +{
38348 +       reiser4_context *ctx;
38349 +       unix_file_info_t *uf_info;
38350 +       int result;
38351 +
38352 +       ctx = init_context(inode->i_sb);
38353 +       if (IS_ERR(ctx))
38354 +               return PTR_ERR(ctx);
38355 +
38356 +       if (cmd == REISER4_IOC_UNPACK) {
38357 +               uf_info = unix_file_inode_data(inode);
38358 +               get_exclusive_access(uf_info);
38359 +               result = unpack(filp, inode, 1 /* forever */ );
38360 +               drop_exclusive_access(uf_info);
38361 +       } else {
38362 +               result = RETERR(-ENOSYS);
38363 +       }
38364 +       reiser4_exit_context(ctx);
38365 +       return result;
38366 +}
38367 +
38368 +/*
38369 + * implementation of vfs' bmap method of struct address_space_operations for
38370 + * unix file plugin. Finds the item covering logical block @lblock of the
38371 + * file and asks its item plugin (get_block) for the block number. Note that
38372 + * error codes are returned through the same sector_t value.
38373 + */
38374 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
38375 +{
38376 +       struct inode *inode = mapping->host;
38377 +       reiser4_context *ctx;
38378 +       item_plugin *iplug;
38379 +       reiser4_key key;
38380 +       lock_handle lh;
38381 +       coord_t coord;
38382 +       sector_t block;
38383 +       sector_t result;
38384 +
38385 +       ctx = init_context(inode->i_sb);
38386 +       if (IS_ERR(ctx))
38387 +               return PTR_ERR(ctx);
38388 +
38389 +       /* key of the byte at which logical block @lblock starts */
38390 +       key_by_inode_and_offset_common(inode,
38391 +                                      (loff_t) lblock * current_blocksize,
38392 +                                      &key);
38393 +
38394 +       init_lh(&lh);
38395 +       result =
38396 +           find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
38397 +       if (cbk_errored(result))
38398 +               goto out;
38399 +
38400 +       result = zload(coord.node);
38401 +       if (result)
38402 +               goto out;
38403 +
38404 +       iplug = item_plugin_by_coord(&coord);
38405 +       if (iplug->s.file.get_block == NULL) {
38406 +               /* this item plugin provides no get_block method */
38407 +               result = RETERR(-EINVAL);
38408 +       } else {
38409 +               result = iplug->s.file.get_block(&coord, lblock, &block);
38410 +               if (result == 0)
38411 +                       result = block;
38412 +       }
38413 +       zrelse(coord.node);
38414 +
38415 +out:
38416 +       done_lh(&lh);
38417 +       reiser4_exit_context(ctx);
38418 +       return result;
38419 +}
38420 +
38421 +/**
38422 + * flow_by_inode_unix_file - initialize structure flow
38423 + * @inode: inode of file for which read or write is about to be done
38424 + * @buf: buffer to perform read to or write from
38425 + * @user: flag showing whether @buf is user space or kernel space
38426 + * @size: size of buffer @buf
38427 + * @off: start offset for read or write
38428 + * @op: READ or WRITE
38429 + * @flow: flow to fill in
38430 + *
38431 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
38432 + */
38433 +int flow_by_inode_unix_file(struct inode *inode,
38434 +                           const char __user *buf, int user,
38435 +                           loff_t size, loff_t off,
38436 +                           rw_op op, flow_t *flow)
38437 +{
38438 +       assert("nikita-1100", inode != NULL);
38439 +       assert("nikita-1931", inode_file_plugin(inode) != NULL);
38440 +       assert("nikita-1932",
38441 +              inode_file_plugin(inode)->key_by_inode ==
38442 +              key_by_inode_and_offset_common);
38443 +
38444 +       flow->op = op;
38445 +       flow->user = user;
38446 +       flow->length = size;
38447 +       /* copy the pointer value of @buf itself into flow->data */
38448 +       memcpy(&flow->data, &buf, sizeof(buf));
38449 +       return key_by_inode_and_offset_common(inode, off, &flow->key);
38450 +}
38451 +
38452 +/* plugin->u.file.set_plug_in_sd = NULL
38453 +   plugin->u.file.set_plug_in_inode = NULL
38454 +   plugin->u.file.create_blank_sd = NULL */
38455 +/* plugin->u.file.delete */
38456 +/*
38457 +   plugin->u.file.add_link = add_link_common
38458 +   plugin->u.file.rem_link = NULL */
38459 +
38460 +/* plugin->u.file.owns_item
38461 +   owns_item_common() narrowed to UNIX_FILE_METADATA_ITEM_TYPE items; returns 1 or 0 */
38462 +/* Audited by: green(2002.06.15) */
38463 +int
38464 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
38465 +                   const coord_t * coord /* coord to check */ )
38466 +{
38467 +       int result;
38468 +
38469 +       result = owns_item_common(inode, coord);
38470 +       if (!result)
38471 +               return 0;
38472 +       if (item_type_by_coord(coord) != UNIX_FILE_METADATA_ITEM_TYPE)
38473 +               return 0;
38474 +       assert("vs-547",
38475 +              item_id_by_coord(coord) == EXTENT_POINTER_ID ||
38476 +              item_id_by_coord(coord) == FORMATTING_ID);
38477 +       return 1;
38478 +}
38479 +
38480 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
38481 +{
38482 +       int result;             /* truncate status; this is what the caller gets */
38483 +       int s_result;           /* safe-link removal status; only logged */
38484 +       loff_t old_size;
38485 +       reiser4_tree *tree;
38486 +
38487 +       inode_check_scale(inode, inode->i_size, attr->ia_size);
38488 +
38489 +       old_size = inode->i_size;
38490 +       tree = tree_by_inode(inode);
38491 +       /* add a SAFE_TRUNCATE safe-link first, then truncate the file body */
38492 +       result = safe_link_grab(tree, BA_CAN_COMMIT);
38493 +       if (result == 0)
38494 +               result = safe_link_add(inode, SAFE_TRUNCATE);
38495 +       if (result == 0)
38496 +               result = truncate_file_body(inode, attr->ia_size);
38497 +       if (result)
38498 +               warning("vs-1588", "truncate_file failed: oid %llu, "
38499 +                       "old size %lld, new size %lld, retval %d",
38500 +                       (unsigned long long)get_inode_oid(inode),
38501 +                       old_size, attr->ia_size, result);
38502 +       /* remove the safe-link no matter how the truncate went */
38503 +       s_result = safe_link_grab(tree, BA_CAN_COMMIT);
38504 +       if (s_result == 0)
38505 +               s_result =
38506 +                   safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
38507 +       if (s_result != 0) {
38508 +               warning("nikita-3417", "Cannot kill safelink %llu: %i",
38509 +                       (unsigned long long)get_inode_oid(inode), s_result);
38510 +       }
38511 +       safe_link_release(tree);
38512 +       return result;
38513 +}
38514 +
38515 +/* plugin->u.file.setattr method */
38516 +/* A size change (ATTR_SIZE) is done by setattr_truncate() under uf_info->write and
38517 +   exclusive access; any other change is passed on to setattr_common() */
38518 +int setattr_unix_file(struct dentry *dentry,   /* Object to change attributes */
38519 +                     struct iattr *attr /* change description */ )
38520 +{
38521 +       int result;
38522 +
38523 +       if (attr->ia_valid & ATTR_SIZE) {
38524 +               reiser4_context *ctx;
38525 +               unix_file_info_t *uf_info;
38526 +
38527 +               /* truncate does reservation itself and requires exclusive
38528 +                  access obtained */
38529 +               ctx = init_context(dentry->d_inode->i_sb);
38530 +               if (IS_ERR(ctx))
38531 +                       return PTR_ERR(ctx);
38532 +
38533 +               uf_info = unix_file_inode_data(dentry->d_inode);
38534 +               down(&uf_info->write);
38535 +               get_exclusive_access(uf_info);
38536 +               result = setattr_truncate(dentry->d_inode, attr);
38537 +               drop_exclusive_access(uf_info);
38538 +               up(&uf_info->write);
38539 +               context_set_commit_async(ctx);
38540 +               reiser4_exit_context(ctx);
38541 +       } else
38542 +               result = setattr_common(dentry, attr);
38543 +
38544 +       return result;
38545 +}
38546 +
38547 +/* plugin->u.file.init_inode_data: initialize the unix_file_info_t of @inode */
38548 +void
38549 +init_inode_data_unix_file(struct inode *inode,
38550 +                         reiser4_object_create_data * crd, int create)
38551 +{
38552 +       unix_file_info_t *data;
38553 +
38554 +       data = unix_file_inode_data(inode);
38555 +       data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
38556 +       init_rwsem(&data->latch);
38557 +       sema_init(&data->write, 1);
38558 +       data->tplug = inode_formatting_plugin(inode);
38559 +       data->exclusive_use = 0;
38560 +       /* NOTE(review): the REISER4_DEBUG field last_reader is not set here */
38561 +#if REISER4_DEBUG
38562 +       data->ea_owner = NULL;
38563 +       atomic_set(&data->nr_neas, 0);
38564 +#endif
38565 +       init_inode_ordering(inode, crd, create);
38566 +}
38567 +
38568 +/**
38569 + * delete_object_unix_file - delete_object of file_plugin
38570 + * @inode: inode to be deleted
38571 + *
38572 + * Truncates body to 0 (failure is only logged), then returns delete_object_common().
38573 + */
38574 +int delete_object_unix_file(struct inode *inode)
38575 +{
38576 +       unix_file_info_t *uf_info;
38577 +       int result;
38578 +
38579 +       if (inode_get_flag(inode, REISER4_NO_SD))
38580 +               return 0;
38581 +
38582 +       /* truncate file body first */
38583 +       uf_info = unix_file_inode_data(inode);
38584 +       get_exclusive_access(uf_info);
38585 +       result = truncate_file_body(inode, 0 /* size */ );
38586 +       drop_exclusive_access(uf_info);
38587 +
38588 +       if (result)
38589 +               warning("", "failed to truncate file (%llu) on removal: %d",
38590 +                       (unsigned long long)get_inode_oid(inode), result);
38591 +
38592 +       /* remove stat data and safe link */
38593 +       return delete_object_common(inode);
38594 +}
38595 +
38596 +/**
38597 + * sendfile_unix_file - sendfile of struct file_operations
38598 + * @file: file to be sent
38599 + * @ppos: position to start from
38600 + * @count: number of bytes to send
38601 + * @actor: function to copy data
38602 + * @target: where to copy read data
38603 + *
38604 + * Reads @count bytes from @file and calls @actor for every page read. This is
38605 + * needed for loop back devices support.
38606 + */
38607 +ssize_t
38608 +sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
38609 +                  read_actor_t actor, void *target)
38610 +{
38611 +       reiser4_context *ctx;
38612 +       ssize_t result;
38613 +       struct inode *inode;
38614 +       unix_file_info_t *uf_info;
38615 +
38616 +       inode = file->f_dentry->d_inode;
38617 +       ctx = init_context(inode->i_sb);
38618 +       if (IS_ERR(ctx))
38619 +               return PTR_ERR(ctx);
38620 +
38621 +       /*
38622 +        * generic_file_sendfile may want to call update_atime. Grab space for
38623 +        * stat data update
38624 +        */
38625 +       result = reiser4_grab_space(estimate_update_common(inode),
38626 +                                   BA_CAN_COMMIT);
38627 +       if (result)
38628 +               goto error;
38629 +       mutex_lock(&inode->i_mutex);
38630 +       inode_set_flag(inode, REISER4_HAS_MMAP);
38631 +       mutex_unlock(&inode->i_mutex);
38632 +
38633 +       uf_info = unix_file_inode_data(inode);
38634 +       get_nonexclusive_access(uf_info);
38635 +       result = generic_file_sendfile(file, ppos, count, actor, target);
38636 +       drop_nonexclusive_access(uf_info);
38637 + error:
38638 +       reiser4_exit_context(ctx);
38639 +       return result;
38640 +}
38641 +
38642 +int
38643 +prepare_write_unix_file(struct file *file, struct page *page,
38644 +                       unsigned from, unsigned to)
38645 +{
38646 +       reiser4_context *ctx;
38647 +       unix_file_info_t *uf_info;
38648 +       int ret;
38649 +
38650 +       ctx = init_context(file->f_dentry->d_inode->i_sb);
38651 +       if (IS_ERR(ctx))
38652 +               return PTR_ERR(ctx);
38653 +       /* under exclusive access: -EINVAL for files built of tails, else do_prepare_write() */
38654 +       uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38655 +       get_exclusive_access(uf_info);
38656 +       ret = find_file_state(file->f_dentry->d_inode, uf_info);
38657 +       if (ret == 0) {
38658 +               if (uf_info->container == UF_CONTAINER_TAILS)
38659 +                       ret = -EINVAL;
38660 +               else
38661 +                       ret = do_prepare_write(file, page, from, to);
38662 +       }
38663 +       drop_exclusive_access(uf_info);
38664 +
38665 +       /* don't commit transaction under inode semaphore */
38666 +       context_set_commit_async(ctx);
38667 +       reiser4_exit_context(ctx);
38668 +       return ret;
38669 +}
38670 +
38671 +/*
38672 + * Local variables:
38673 + * c-indentation-style: "K&R"
38674 + * mode-name: "LC"
38675 + * c-basic-offset: 8
38676 + * tab-width: 8
38677 + * fill-column: 79
38678 + * scroll-step: 1
38679 + * End:
38680 + */
38681 diff --git a/fs/reiser4/plugin/file/file.h b/fs/reiser4/plugin/file/file.h
38682 new file mode 100644
38683 index 0000000..c1f83c3
38684 --- /dev/null
38685 +++ b/fs/reiser4/plugin/file/file.h
38686 @@ -0,0 +1,257 @@
38687 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38688 + * reiser4/README */
38689 +
38690 +/* this file contains declarations of methods implementing file plugins
38691 +   (UNIX_FILE_PLUGIN_ID, SYMLINK_FILE_PLUGIN_ID and CRC_FILE_PLUGIN_ID) */
38692 +
38693 +#if !defined( __REISER4_FILE_H__ )
38694 +#define __REISER4_FILE_H__
38695 +
38696 +/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38697 +
38698 +/* inode operations */
38699 +int setattr_unix_file(struct dentry *, struct iattr *);
38700 +
38701 +/* file operations */
38702 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38703 +                      loff_t *off);
38704 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38705 +                       loff_t * off);
38706 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38707 +                   unsigned long arg);
38708 +int mmap_unix_file(struct file *, struct vm_area_struct *);
38709 +int open_unix_file(struct inode *, struct file *);
38710 +int release_unix_file(struct inode *, struct file *);
38711 +int sync_unix_file(struct file *, struct dentry *, int datasync);
38712 +ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38713 +                          read_actor_t, void *target);
38714 +
38715 +/* address space operations */
38716 +int readpage_unix_file(struct file *, struct page *);
38717 +int readpage_unix_file_nolock(struct file *, struct page *);
38718 +int writepages_unix_file(struct address_space *, struct writeback_control *);
38719 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38720 +                           unsigned to);
38721 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
38722 +                          unsigned to);
38723 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38724 +
38725 +/* file plugin operations */
38726 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38727 +                           int user, loff_t, loff_t, rw_op, flow_t *);
38728 +int owns_item_unix_file(const struct inode *, const coord_t *);
38729 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38730 +                              int create);
38731 +int delete_object_unix_file(struct inode *);
38732 +
38733 +/*
38734 + * all writes into a unix file are performed by the item write method. The write
38735 + * method of the unix file plugin only decides which item plugin (extent or tail)
38736 + * to call and in which mode (one of the values of the enum below)
38737 + */
38738 +typedef enum {
38739 +       FIRST_ITEM = 1,
38740 +       APPEND_ITEM = 2,
38741 +       OVERWRITE_ITEM = 3
38742 +} write_mode_t;
38743 +
38744 +/* a unix file may be in one of the following states */
38745 +typedef enum {
38746 +       UF_CONTAINER_UNKNOWN = 0,
38747 +       UF_CONTAINER_TAILS = 1,
38748 +       UF_CONTAINER_EXTENTS = 2,
38749 +       UF_CONTAINER_EMPTY = 3
38750 +} file_container_t;
38751 +
38752 +struct formatting_plugin;
38753 +struct inode;
38754 +
38755 +/* unix file plugin specific part of reiser4 inode */
38756 +typedef struct unix_file_info {
38757 +       /*
38758 +        * this read-write lock protects file containerization change. Accesses
38759 +        * which do not change file containerization (see file_container_t)
38760 +        * (read, readpage, writepage, write (until tail conversion is
38761 +        * involved)) take read-lock. Accesses which modify file
38762 +        * containerization (truncate, conversion from tail to extent and back)
38763 +        * take write-lock.
38764 +        */
38765 +       struct rw_semaphore latch;
38766 +       /*
38767 +        * this semaphore is used to serialize writes instead of inode->i_mutex,
38768 +        * because write_unix_file uses get_user_pages which is to be used
38769 +        * under mm->mmap_sem and because it is required to take mm->mmap_sem
38770 +        * before inode->i_mutex, so inode->i_mutex would have to be unlocked
38771 +        * before calling to get_user_pages which is unacceptable
38772 +        */
38773 +       struct semaphore write;
38774 +       /* this enum specifies which items are used to build the file */
38775 +       file_container_t container;
38776 +       /*
38777 +        * plugin which controls when file is to be converted to extents and
38778 +        * back to tail
38779 +        */
38780 +       struct formatting_plugin *tplug;
38781 +       /* if this is set, file is in exclusive use */
38782 +       int exclusive_use;
38783 +#if REISER4_DEBUG
38784 +       /* pointer to task struct of thread owning exclusive access to file */
38785 +       void *ea_owner;
38786 +       atomic_t nr_neas;       /* presumably count of non-exclusive accessors - TODO confirm */
38787 +       void *last_reader;      /* NOTE(review): undocumented; purpose not visible here */
38788 +#endif
38789 +} unix_file_info_t;
38790 +
38791 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38792 +void get_exclusive_access(unix_file_info_t *);
38793 +void drop_exclusive_access(unix_file_info_t *);
38794 +void get_nonexclusive_access(unix_file_info_t *);
38795 +void drop_nonexclusive_access(unix_file_info_t *);
38796 +int try_to_get_nonexclusive_access(unix_file_info_t *);
38797 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38798 +                  struct inode *);
38799 +int find_file_item_nohint(coord_t *, lock_handle *,
38800 +                         const reiser4_key *, znode_lock_mode,
38801 +                         struct inode *);
38802 +
38803 +int load_file_hint(struct file *, hint_t *);
38804 +void save_file_hint(struct file *, const hint_t *);
38805 +
38806 +
38807 +#include "../item/extent.h"
38808 +#include "../item/tail.h"
38809 +#include "../item/ctail.h"
38810 +
38811 +struct uf_coord {
38812 +       coord_t coord;          /* the coord itself */
38813 +       lock_handle *lh;        /* presumably set by init_uf_coord() - TODO confirm */
38814 +       int valid;              /* presumably nonzero while @extension is usable - TODO confirm */
38815 +       union {                 /* item-specific data: extent, tail or ctail */
38816 +               extent_coord_extension_t extent;
38817 +               tail_coord_extension_t tail;
38818 +               ctail_coord_extension_t ctail;
38819 +       } extension;
38820 +};
38821 +
38822 +#include "../../forward.h"
38823 +#include "../../seal.h"
38824 +#include "../../lock.h"
38825 +
38826 +/*
38827 + * This structure is used to speed up file operations (reads and writes).  A
38828 + * hint is a suggestion about where a key resolved to last time.  A seal
38829 + * indicates whether a node has been modified since a hint was last recorded.
38830 + * You check the seal, and if the seal is still valid, you can use the hint
38831 + * without traversing the tree again.
38832 + */
38833 +struct hint {
38834 +       seal_t seal; /* a seal over last file item accessed */
38835 +       uf_coord_t ext_coord; /* presumably a struct uf_coord - TODO confirm typedef */
38836 +       loff_t offset;
38837 +       znode_lock_mode mode; /* lock mode, cf. the mode argument of set_hint() */
38838 +       lock_handle lh;
38839 +};
38840 +
38841 +void set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38842 +int hint_is_set(const hint_t *);
38843 +void unset_hint(hint_t *);
38844 +int hint_validate(hint_t *, const reiser4_key *, int check_key,
38845 +                 znode_lock_mode);
38846 +void hint_init_zero(hint_t *);
38847 +
38848 +int update_file_size(struct inode *, reiser4_key *, int update_sd);
38849 +int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38850 +                  loff_t cur_size, int (*update_actor) (struct inode *,
38851 +                                                        reiser4_key *, int));
38852 +
38853 +
38854 +#if REISER4_DEBUG
38855 +
38856 +/* return 1 if exclusive access is obtained, 0 - otherwise */
38857 +static inline int ea_obtained(unix_file_info_t * uf_info)
38858 +{
38859 +       /* probe: try to take the latch for read without blocking */
38860 +       if (!down_read_trylock(&uf_info->latch))
38861 +               return 1;
38862 +       /* the probe got the read lock, so give it back */
38863 +       up_read(&uf_info->latch);
38864 +       return 0;
38865 +}
38866 +
38867 +#endif
38868 +
38869 +/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38870 +int create_symlink(struct inode *symlink, struct inode *dir,
38871 +                  reiser4_object_create_data *);
38872 +void destroy_inode_symlink(struct inode *);
38873 +
38874 +/* declarations of functions implementing CRC_FILE_PLUGIN_ID file plugin */
38875 +
38876 +/* inode operations */
38877 +int setattr_cryptcompress(struct dentry *, struct iattr *);
38878 +
38879 +/* file operations */
38880 +ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38881 +                          loff_t * off);
38882 +ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38883 +                           loff_t * off);
38884 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38885 +ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38886 +                              read_actor_t actor, void *target);
38887 +int release_cryptcompress(struct inode *, struct file *);
38888 +
38889 +/* address space operations */
38890 +extern int readpage_cryptcompress(struct file *, struct page *);
38891 +extern int writepages_cryptcompress(struct address_space *,
38892 +                                    struct writeback_control *);
38893 +
38894 +
38895 +/* file plugin operations */
38896 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38897 +                               int user, loff_t, loff_t, rw_op, flow_t *);
38898 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38899 +int create_cryptcompress(struct inode *, struct inode *,
38900 +                        reiser4_object_create_data *);
38901 +int delete_cryptcompress(struct inode *);
38902 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38903 +                                  int create);
38904 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38905 +                                 const reiser4_key * to_key,
38906 +                                 reiser4_key * smallest_removed,
38907 +                                 struct inode *object, int truncate,
38908 +                                 int *progress);
38909 +void destroy_inode_cryptcompress(struct inode *);
38910 +
38911 +extern reiser4_plugin_ops cryptcompress_plugin_ops;
38912 +
38913 +#define WRITE_GRANULARITY 32
38914 +
38915 +
38916 +int tail2extent(unix_file_info_t *);
38917 +int extent2tail(unix_file_info_t *);
38918 +
38919 +int goto_right_neighbor(coord_t *, lock_handle *);
38920 +int find_or_create_extent(struct page *);
38921 +int equal_to_ldk(znode *, const reiser4_key *);
38922 +
38923 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38924 +/* nonzero unless @cbk_result is CBK_COORD_NOTFOUND or CBK_COORD_FOUND */
38925 +static inline int cbk_errored(int cbk_result)
38926 +{
38927 +       return !(cbk_result == CBK_COORD_NOTFOUND
38928 +                || cbk_result == CBK_COORD_FOUND);
38929 +}
38930 +
38931 +/* __REISER4_FILE_H__ */
38932 +#endif
38933 +
38934 +/*
38935 + * Local variables:
38936 + * c-indentation-style: "K&R"
38937 + * mode-name: "LC"
38938 + * c-basic-offset: 8
38939 + * tab-width: 8
38940 + * fill-column: 79
38941 + * scroll-step: 1
38942 + * End:
38943 +*/
38944 diff --git a/fs/reiser4/plugin/file/invert.c b/fs/reiser4/plugin/file/invert.c
38945 new file mode 100644
38946 index 0000000..a0a2576
38947 --- /dev/null
38948 +++ b/fs/reiser4/plugin/file/invert.c
38949 @@ -0,0 +1,493 @@
38950 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38951 +
38952 +/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
38953 +   buffer, without having a separate buffer for each 8 byte or so file.  Inverts are the way to do that.  An invert
38954 +   provides you with the contents of a set of subfiles plus its own contents.  It is a file which inherits other files
38955 +   when you read it, and allows you to write to it and through it to the files that it inherits from.  In order for it
38956 +   to know which subfiles each part of your write should go into, there must be delimiters indicating that.  It tries to
38957 +   make that easy for you by providing those delimiters in what you read from it.
38958 +
38959 +  When you read it, an invert performs an inverted assignment.  Instead of taking an assignment command and writing a
38960 +  bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
38961 +  would create those files.  But which files?  Well, that must be specified in the body of the invert using a special
38962 +  syntax, and that specification is called the invert of the assignment.
38963 +
38964 +  When written to, an invert performs the assignment command that is written
38965 +  to it, and modifies its own body to contain the invert of that
38966 +  assignment.
38967 +
38968 +  In other words, writing to an invert file what you have read from it
38969 +  is the identity operation.
38970 +
38971 +  Malformed assignments cause write errors.  Partial writes are not
38972 +  supported in v4.0, but will be.
38973 +
38974 +  Example:
38975 +
38976 +    If an invert contains:
38977 +
38978 +    /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
38979 +
38980 +======================
38981 +Each element in this definition should be an invert, and all files
38982 +should be called recursively - too.  This is bad. If one of the
38983 +included files is not a regular or invert file, then we can't read
38984 +main file.
38985 +
38986 +I think to make it is possible easier:
38987 +
38988 +internal structure of invert file should be like symlink file. But
38989 +read and write method should be explicitly indicated in i/o operation..
38990 +
38991 +By default we read and write (if probably) as symlink and if we
38992 +specify ..invert at reading time that too we can specify it at write time.
38993 +
38994 +example:
38995 +/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
38996 +will create  /my_invert_file as invert, and will create /filenameA and /filenameB with specified body.
38997 +
38998 +read of /my_invert_file/..invert will be
38999 +/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
39000 +
39001 +but read of /my_invert_file/ will be
39002 +The contents of filenameAsome text stored in the invertThe contents of filenameB
39003 +
39004 +we can also create this file as
39005 +/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
39006 +will create  /my_invert_file , and use existing files /filenameA and /filenameB.
39007 +
39008 +and when we will read it will be as previously invert file.
39009 +
39010 +This is correct?
39011 +
39012 + vv
39013 +DEMIDOV-FIXME-HANS:
39014 +
39015 +Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
39016 +
39017 +Do you agree?  Discuss it on reiserfs-list....
39018 +
39019 +-Hans
39020 +=======================
39021 +
39022 +  Then a read will return:
39023 +
39024 +    /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
39025 +
39026 +    and a write of the line above to the invert will set the contents of
39027 +    the invert and filenameA and filenameB to their original values.
39028 +
39029 +  Note that the contents of an invert have no influence on the effect
39030 +  of a write unless the write is a partial write (and a write of a
39031 +  shorter file without using truncate first is a partial write).
39032 +
39033 +  truncate() has no effect on filenameA and filenameB, it merely
39034 +  resets the value of the invert.
39035 +
39036 +  Writes to subfiles via the invert are implemented by preceding them
39037 +  with truncates.
39038 +
39039 +  Parse failures cause write failures.
39040 +
39041 +  Questions to ponder: should the invert be acted on prior to file
39042 +  close when writing to an open filedescriptor?
39043 +
39044 + Example:
39045 +
39046 + If an invert contains:
39047 +
39048 +   "(This text and a pair of quotes are all that is here.)
39049 +
39050 +Then a read will return:
39051 +
39052 +   "(This text and a pair of quotes are all that is here.)
39053 +
39054 +*/
39055 +
39056 +/* OPEN method places a struct file in memory associated with invert body
39057 +  and returns something like file descriptor to the user for the future access
39058 +  to the invert file.
39059 +  During opening we parse the body of invert and get a list of the 'entries'
39060 +  (that describes all its subfiles) and place pointer on the first struct in
39061 +  reiserfs-specific part of invert inode (arbitrary decision).
39062 +
39063 +  Each subfile is described by the struct inv_entry that has a pointer @sd on
39064 +  in-core based stat-data and  a pointer on struct file @f (if we find that the
39065 +  subfile uses more than one unformatted node (arbitrary decision), we load
39066 +  struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
39067 +  of some other information we need)
39068 +
39069 +  Since READ and WRITE methods for inverts were formulated in assignment
39070 +  language, they don't contain arguments 'size' and 'offset' that make sense
39071 +  only in ordinary read/write methods.
39072 +
39073 +  READ method is a combination of two methods:
39074 +  1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
39075 +  with @f != 0, this method uses pointer on struct file as an argument
39076 +  2) read method for inode-less files with @sd != 0, this method uses
39077 +  in-core based stat-data instead struct file as an argument.
39078 +  in the first case we don't use pagecache, just copy data that we got after
39079 +  cbk() into userspace.
39080 +
39081 +  WRITE method for invert files is more complex.
39082 +  Besides the WRITE interface declared in the assignment language above, we need
39083 +  to have an opportunity to edit unwrapped body of invert file with some
39084 +  text editor, it means we need GENERIC WRITE METHOD for invert file:
39085 +
39086 +  my_invert_file/..invert <- "string"
39087 +
39088 +  this method parses "string" and looks for correct subfile signatures, also
39089 +  the parsing process splits this "string" on the set of flows in  accordance
39090 +  with the set of subfiles specified by this signature.
39091 +  The found list of signatures #S is compared with the opened one #I of invert
39092 +  file. If it doesn't have this one (#I==0, it will be so for instance if we
39093 +  have just created this invert file) the write method assigns the found signature
39094 +  (#I=#S;) to the invert file. Then if #I==#S, generic write method splits
39095 +  itself to the some write methods for ordinary or light-weight, or call itself
39096 +  recursively for invert files with corresponding flows.
39097 +  I am not sure, but the list of signatures looks like what mr.Demidov means
39098 +  by 'delimiters'.
39099 +
39100 +  The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available
39101 +  and cause delete (create new) subfiles (arbitrary decision - it may looks
39102 +  too complex, but this interface will be the completest). The order of entries
39103 +  of list #S (#I) and inherited order on #I (#S) must coincide.
39104 +  The other parsing results give malformed signature that aborts READ method
39105 +  and releases all resources.
39106 +
39107 +  Format of subfile (entry) signature:
39108 +
39109 +  "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
39110 +
39111 +  Legend:
39112 +
39113 +    START_MAGIC - keyword indicates the start of subfile signature;
39114 +
39115 +    <> indicates the start of 'subfile metadata', that is the pair
39116 +  (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
39117 +
39118 +    TYPE - the string "type" indicates the start of one of the three words:
39119 +  - ORDINARY_FILE,
39120 +  - LIGHT_WEIGHT_FILE,
39121 +  - INVERT_FILE;
39122 +
39123 +    LOOKUP_ARG - lookup argument depends on previous type:
39124 +  */
39125 +
39126 + /************************************************************/
39127 + /*       TYPE        *          LOOKUP ARGUMENT             */
39128 + /************************************************************/
39129 + /* LIGHT_WEIGHT_FILE *           stat-data key              */
39130 + /************************************************************/
39131 + /*   ORDINARY_FILE   *             filename                 */
39132 + /************************************************************/
39133 + /*   INVERT_FILE     *             filename                 */
39134 + /************************************************************/
39135 +
39136 + /* where:
39137 +  *stat-data key - the string contains stat data key of this subfile, it will be
39138 +  passed to fast-access lookup method for light-weight files;
39139 +  *filename - pathname of this subfile, it will be passed to VFS lookup methods
39140 +  for ordinary and invert files;
39141 +
39142 +  SUBFILE_BODY - data of this subfile (it will go to the flow)
39143 +  END_MAGIC - the keyword indicates the end of subfile signature.
39144 +
39145 +  The other symbols inside the signature are interpreted as 'unformatted content',
39146 +  which is available with VFS's read_link() (arbitrary decision).
39147 +
39148 +  NOTE: Parse method for a body of invert file uses mentioned signatures _without_
39149 +  subfile bodies.
39150 +
39151 +  Now the only unclear thing is WRITE in regular light-weight subfile A that we
39152 +  can describe only in  assignment language:
39153 +
39154 +  A <- "some_string"
39155 +
39156 +  I guess we don't want to change stat-data and body items of file A
39157 +  if this file exist, and size(A) != size("some_string") because this operation is
39158 +  expensive, so we only do the partial write if size(A) > size("some_string")
39159 +  and do truncate of the "some_string", and then do A <- "truncated string", if
39160 +  size(A) < size("some_string"). This decision is also arbitrary..
39161 +  */
39162 +
39163 +/* here is infrastructure for formatted flows */
39164 +
39165 +#define SUBFILE_HEADER_MAGIC 0x19196605
39166 +#define FLOW_HEADER_MAGIC 0x01194304
39167 +
39168 +#include "../plugin.h"
39169 +#include "../../debug.h"
39170 +#include "../../forward.h"
39171 +#include "../object.h"
39172 +#include "../item/item.h"
39173 +#include "../item/static_stat.h"
39174 +#include "../../dformat.h"
39175 +#include "../znode.h"
39176 +#include "../inode.h"
39177 +
39178 +#include <linux/types.h>
39179 +#include <linux/fs.h>          /* for struct file  */
39180 +#include <linux/list.h>                /* for struct list_head */
39181 +
39182 +typedef enum {
39183 +       LIGHT_WEIGHT_FILE,
39184 +       ORDINARY_FILE,
39185 +       INVERT_FILE
39186 +} inv_entry_type;
39187 +
39188 +typedef struct flow_header {
39189 +       d32 fl_magic;           /* flow magic (cf. FLOW_HEADER_MAGIC) */
39190 +       d16 fl_nr;              /* number of subfiles in the flow */
39191 +} flow_header;
39192 +
39193 +typedef struct subfile_header {
39194 +       d32 sh_magic;           /* subfile magic */
39195 +       d16 sh_type;            /* type of subfile: light-weight, ordinary, invert */
39196 +       d16 sh_arg_len;         /* length of lookup argument (filename, key) */
39197 +       d32 sh_body_len;        /* length of subfile body */
39198 +} subfile_header;
39199 +
39200 +/* functions to get/set fields of flow header (struct fields are fl_magic, fl_nr) */
39201 +
39202 +static void fl_set_magic(flow_header * fh, __u32 value)
39203 +{
39204 +       cputod32(value, &fh->fl_magic);
39205 +}
39206 +
39207 +static __u32 fl_get_magic(flow_header * fh)
39208 +{
39209 +       return d32tocpu(&fh->fl_magic);
39210 +}
39211 +static void fl_set_number(flow_header * fh, __u16 value)
39212 +{
39213 +       cputod16(value, &fh->fl_nr);
39214 +}
39215 +static unsigned fl_get_number(flow_header * fh)
39216 +{
39217 +       return d16tocpu(&fh->fl_nr);
39218 +}
39219 +
39220 +/* functions to get/set fields of subfile header; each accessor touches only its own field */
39221 +
39222 +static void sh_set_magic(subfile_header * sh, __u32 value)
39223 +{
39224 +       cputod32(value, &sh->sh_magic);
39225 +}
39226 +
39227 +static __u32 sh_get_magic(subfile_header * sh)
39228 +{
39229 +       return d32tocpu(&sh->sh_magic);
39230 +}
39231 +static void sh_set_type(subfile_header * sh, __u16 value)
39232 +{
39233 +       cputod16(value, &sh->sh_type);
39234 +}
39235 +static unsigned sh_get_type(subfile_header * sh)
39236 +{
39237 +       return d16tocpu(&sh->sh_type);
39238 +}
39239 +static void sh_set_arg_len(subfile_header * sh, __u16 value)
39240 +{
39241 +       cputod16(value, &sh->sh_arg_len);
39242 +}
39243 +static unsigned sh_get_arg_len(subfile_header * sh)
39244 +{
39245 +       return d16tocpu(&sh->sh_arg_len);
39246 +}
39247 +static void sh_set_body_len(subfile_header * sh, __u32 value)
39248 +{
39249 +       cputod32(value, &sh->sh_body_len);
39250 +}
39251 +
39252 +static __u32 sh_get_body_len(subfile_header * sh)
39253 +{
39254 +       return d32tocpu(&sh->sh_body_len);
39255 +}
39256 +
39257 +/* in-core minimal stat-data, light-weight analog of inode */
39258 +
39259 +struct incore_sd_base {
39260 +       umode_t isd_mode;
39261 +       nlink_t isd_nlink;
39262 +       loff_t isd_size;
39263 +       char *isd_data;         /* 'subflow' to write */
39264 +};
39265 +
39266 +/* open invert creates a list of invert entries,
39267 +   every entry is represented by structure inv_entry */
39268 +
39269 +struct inv_entry {
39270 +       struct list_head ie_list;       /* linkage into the list of entries */
39271 +       struct file *ie_file;   /* this is NULL if the file doesn't
39272 +                                  have unformatted nodes */
39273 +       struct incore_sd_base *ie_sd;   /* inode-less analog of struct file */
39274 +};
39275 +
39276 +/* allocate and init invert entry; returns ERR_PTR() of -ENOMEM on failure */
39277 +
39278 +static struct inv_entry *allocate_inv_entry(void)
39279 +{
39280 +       struct inv_entry *inv_entry;
39281 +
39282 +       inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
39283 +       if (!inv_entry)
39284 +               return ERR_PTR(RETERR(-ENOMEM));
39285 +       inv_entry->ie_file = NULL;
39286 +       inv_entry->ie_sd = NULL;
39287 +       INIT_LIST_HEAD(&inv_entry->ie_list);
39288 +       return inv_entry;
39289 +}
39290 +
39291 +static int put_inv_entry(struct inv_entry *ientry)
39292 +{
39293 +       int result = 0;
39294 +
39295 +       assert("edward-96", ientry != NULL);
39296 +
39297 +       list_del(&ientry->ie_list);
39298 +       /* close the file first: ientry must not be touched once it is freed */
39299 +       if (ientry->ie_file != NULL)
39300 +               result = filp_close(ientry->ie_file, NULL);
39301 +       /* free the entry unconditionally (kfree(NULL) is a no-op), so that
39302 +          entries without in-core stat-data are not leaked */
39303 +       kfree(ientry->ie_sd);
39304 +       kfree(ientry);
39305 +       return result;
39306 +}
39307 +
39308 +static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39309 +{
39310 +       /* the entry must be fresh: no file and no stat-data attached yet */
39311 +       assert("edward-98", inv_entry != NULL);
39312 +       assert("edward-99", inv_entry->ie_file == NULL);
39313 +       assert("edward-100", inv_entry->ie_sd == NULL);
39314 +       inv_entry->ie_sd =
39315 +           reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39316 +       if (inv_entry->ie_sd == NULL)
39317 +               return RETERR(-ENOMEM);
39318 +       return 0;
39319 +}
39320 +
39321 +/* this can be installed as ->init_inv_entry () method of
39322 +   item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39323 +   Copies mode, nlink and size from on-disk stat-data @sd into inv_entry->ie_sd.
39324 +   Doesn't handle stat-data extensions. */
39325 +
39326 +static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39327 +{
39328 +       reiser4_stat_data_base *sd_base;
39329 +
39330 +       assert("edward-101", inv_entry != NULL);
39331 +       assert("edward-101", inv_entry->ie_sd != NULL);
39332 +       assert("edward-102", sd != NULL);
39333 +
39334 +       sd_base = (reiser4_stat_data_base *) sd;
39335 +       inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
39336 +       inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
39337 +       inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
39338 +       inv_entry->ie_sd->isd_data = NULL;
39339 +}
39340 +
39341 +/* initialise incore stat-data of @inv_entry from the body of the item at @coord */
39342 +
39343 +static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39344 +{
39345 +       reiser4_plugin *plugin = item_plugin_by_coord(coord);
39346 +       void *body = item_body_by_coord(coord);
39347 +
39348 +       assert("edward-103", inv_entry != NULL);
39349 +       assert("edward-104", plugin != NULL);
39350 +       assert("edward-105", body != NULL);
39351 +
39352 +       sd_base_load(inv_entry, body);
39353 +}
39354 +
39355 +/* takes a key or filename, allocates and inits a new inv_entry; light-weight
39356 +   files use lookup_sd_by_key(), others VFS lookup by filename. FIXME: entry is
39357 +   not yet added to a list, not freed on error, and ie_sd is never allocated */
39358 +
39359 +int get_inv_entry(struct inode *invert_inode,  /* inode of invert's body */
39360 +                 inv_entry_type type,  /* LIGHT-WEIGHT or ORDINARY */
39361 +                 const reiser4_key * key,      /* key of invert entry stat-data */
39362 +                 char *filename,       /* filename of the file to be opened */
39363 +                 int flags, int mode)
39364 +{
39365 +       int result;
39366 +       struct inv_entry *ientry;
39367 +
39368 +       assert("edward-107", invert_inode != NULL);
39369 +
39370 +       ientry = allocate_inv_entry();
39371 +       if (IS_ERR(ientry))
39372 +               return (PTR_ERR(ientry));
39373 +
39374 +       if (type == LIGHT_WEIGHT_FILE) {
39375 +               coord_t coord;
39376 +               lock_handle lh;
39377 +
39378 +               assert("edward-108", key != NULL);
39379 +
39380 +               init_coord(&coord);
39381 +               init_lh(&lh);
39382 +               result =
39383 +                   lookup_sd_by_key(tree_by_inode(invert_inode),
39384 +                                    ZNODE_READ_LOCK, &coord, &lh, key);
39385 +               if (result == 0)
39386 +                       init_incore_sd_base(ientry, &coord);
39387 +
39388 +               done_lh(&lh);
39389 +               done_coord(&coord);
39390 +               return (result);
39391 +       } else {
39392 +               struct file *file;
39393 +
39394 +               assert("edward-108", filename != NULL);
39395 +               /* FIXME_EDWARD here we need to check if we
39396 +                  didn't follow to any mount point */
39397 +               file = filp_open(filename, flags, mode);
39398 +               if (IS_ERR(file))
39399 +                       return (PTR_ERR(file));
39400 +               ientry->ie_file = file;
39401 +               return 0;
39402 +       }
39403 +}
39404 +
39405 +/* is meant to take the invert file, read and parse its body, open all invert
39406 +   entries and return a pointer to the first inv_entry; stubs below are unimplemented */
39407 +
39408 +struct inv_entry *open_invert(struct file *invert_file)
39409 +{
39410 +       return ERR_PTR(RETERR(-ENOSYS));        /* FIXME: not implemented yet */
39411 +}
39412 +
39413 +ssize_t subfile_read(struct inv_entry *invert_entry, flow * f)
39414 +{
39415 +       return RETERR(-ENOSYS); /* FIXME: not implemented yet */
39416 +}
39417 +
39418 +ssize_t subfile_write(struct inv_entry *invert_entry, flow * f)
39419 +{
39420 +       return RETERR(-ENOSYS); /* FIXME: not implemented yet */
39421 +}
39422 +
39423 +ssize_t invert_read(struct file *file, flow * f)
39424 +{
39425 +       return RETERR(-ENOSYS); /* FIXME: not implemented yet */
39426 +}
39427 +
39428 +ssize_t invert_write(struct file *file, flow * f)
39429 +{
39430 +       return RETERR(-ENOSYS); /* FIXME: not implemented yet */
39431 +}
39432 +
39433 +/* Make Linus happy.
39434 +   Local variables:
39435 +   c-indentation-style: "K&R"
39436 +   mode-name: "LC"
39437 +   c-basic-offset: 8
39438 +   tab-width: 8
39439 +   fill-column: 120
39440 +   scroll-step: 1
39441 +   End:
39442 +*/
39443 diff --git a/fs/reiser4/plugin/file/symfile.c b/fs/reiser4/plugin/file/symfile.c
39444 new file mode 100644
39445 index 0000000..814dfb8
39446 --- /dev/null
39447 +++ b/fs/reiser4/plugin/file/symfile.c
39448 @@ -0,0 +1,87 @@
39449 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39450 +
39451 +/* Symfiles are a generalization of Unix symlinks.
39452 +
39453 +   A symfile when read behaves as though you took its contents and
39454 +   substituted them into the reiser4 naming system as the right hand side
39455 +   of an assignment, and then read that which you had assigned to it.
39456 +
39457 +   A key issue for symfiles is how to implement writes through to
39458 +   subfiles.  In general, one must have some method of determining what
39459 +   of that which is written to the symfile is written to what subfile.
39460 +   This can be done by use of custom plugin methods written by users, or
39461 +   by using a few general methods we provide for those willing to endure
39462 +   the insertion of delimiters into what is read.
39463 +
39464 +   Writing to symfiles without delimiters to denote what is written to
39465 +   what subfile is not supported by any plugins we provide in this
39466 +   release.  Our most sophisticated support for writes is that embodied
39467 +   by the invert plugin (see invert.c).
39468 +
39469 +   A read only version of the /etc/passwd file might be
39470 +   constructed as a symfile whose contents are as follows:
39471 +
39472 +   /etc/passwd/userlines/*
39473 +
39474 +   or
39475 +
39476 +   /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39477 +
39478 +   or
39479 +
39480 +   /etc/passwd/userlines/(demidov+edward+reiser+root)
39481 +
39482 +   A symfile with contents
39483 +
39484 +   /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39485 +
39486 +   will return when read
39487 +
39488 +   The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39489 +
39490 +   and write of what has been read will not be possible to implement as
39491 +   an identity operation because there are no delimiters denoting the
39492 +   boundaries of what is to be written to what subfile.
39493 +
39494 +   Note that one could make this a read/write symfile if one specified
39495 +   delimiters, and the write method understood those delimiters delimited
39496 +   what was written to subfiles.
39497 +
39498 +   So, specifying the symfile in a manner that allows writes:
39499 +
39500 +   /etc/passwd/userlines/demidov+"(
39501 +   )+/etc/passwd/userlines/edward+"(
39502 +   )+/etc/passwd/userlines/reiser+"(
39503 +   )+/etc/passwd/userlines/root+"(
39504 +   )
39505 +
39506 +   or
39507 +
39508 +   /etc/passwd/userlines/(demidov+"(
39509 +   )+edward+"(
39510 +   )+reiser+"(
39511 +   )+root+"(
39512 +   ))
39513 +
39514 +   and the file demidov might be specified as:
39515 +
39516 +   /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39517 +
39518 +   or
39519 +
39520 +   /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39521 +
39522 +   Notice that if the file demidov has a carriage return in it, the
39523 +   parsing fails, but then if you put carriage returns in the wrong place
39524 +   in a normal /etc/passwd file it breaks things also.
39525 +
39526 +   Note that it is forbidden to have no text between two interpolations
39527 +   if one wants to be able to define what parts of a write go to what
39528 +   subfiles referenced in an interpolation.
39529 +
39530 +   If one wants to be able to add new lines by writing to the file, one
39531 +   must either write a custom plugin for /etc/passwd that knows how to
39532 +   name an added line, or one must use an invert, or one must use a more
39533 +   sophisticated symfile syntax that we are not planning to write for
39534 +   version 4.0.
39535 +*/
39536 diff --git a/fs/reiser4/plugin/file/symlink.c b/fs/reiser4/plugin/file/symlink.c
39537 new file mode 100644
39538 index 0000000..4f3f05a
39539 --- /dev/null
39540 +++ b/fs/reiser4/plugin/file/symlink.c
39541 @@ -0,0 +1,92 @@
39542 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39543 +
39544 +#include "../../inode.h"
39545 +
39546 +#include <linux/types.h>
39547 +#include <linux/fs.h>
39548 +
39549 +/* file plugin methods specific for symlink files
39550 +   (SYMLINK_FILE_PLUGIN_ID) */
39551 +
39552 +/* this is implementation of create_object method of file plugin for
39553 +   SYMLINK_FILE_PLUGIN_ID
39554 + */
39555 +
39556 +/**
39557 + * create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39558 + * @symlink: inode of symlink object
39559 + * @dir: inode of parent directory
39560 + * @data: parameters of new object; data->name is the path the symlink points to
39561 + *
39562 + * Inserts stat data with symlink extension into the tree.
39563 + */
39564 +int create_symlink(struct inode *symlink,
39565 +                  struct inode *dir UNUSED_ARG,
39566 +                  reiser4_object_create_data *data     /* info passed to us,
39567 +                                                        * this is filled by
39568 +                                                        * reiser4() syscall
39569 +                                                        * in particular */ )
39570 +{
39571 +       int result;
39572 +
39573 +       assert("nikita-680", symlink != NULL);
39574 +       assert("nikita-681", S_ISLNK(symlink->i_mode));
39575 +       assert("nikita-685", inode_get_flag(symlink, REISER4_NO_SD));
39576 +       assert("nikita-682", dir != NULL);
39577 +       assert("nikita-684", data != NULL);
39578 +       assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39579 +
39580 +       /*
39581 +        * stat data of symlink has symlink extension in which we store
39582 +        * symlink content, that is, path symlink is pointing to.
39583 +        */
39584 +       reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39585 +
39586 +       assert("vs-838", symlink->i_private == NULL);
39587 +       symlink->i_private = (void *)data->name;
39588 +
39589 +       assert("vs-843", symlink->i_size == 0);
39590 +       INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39591 +
39592 +       /* insert stat data appended with data->name */
39593 +       result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39594 +       if (result) {
39595 +               /* FIXME-VS: Make sure that symlink->i_private is not attached
39596 +                  to kmalloced data */
39597 +               INODE_SET_FIELD(symlink, i_size, 0);
39598 +       } else {
39599 +               assert("vs-849", symlink->i_private
39600 +                      && inode_get_flag(symlink, REISER4_GENERIC_PTR_USED));
39601 +               assert("vs-850",
39602 +                      !memcmp((char *)symlink->i_private, data->name,
39603 +                              (size_t) symlink->i_size + 1));
39604 +       }
39605 +       return result;
39606 +}
39607 +
39608 +/* destroy_inode method of file plugin for SYMLINK_FILE_PLUGIN_ID: frees the
39609 +   buffer attached to inode->i_private and clears REISER4_GENERIC_PTR_USED
39610 + */
39611 +void destroy_inode_symlink(struct inode *inode)
39612 +{
39613 +       assert("edward-799",
39614 +              inode_file_plugin(inode) ==
39615 +              file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39616 +       assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39617 +       assert("edward-801", inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
39618 +       assert("vs-839", S_ISLNK(inode->i_mode));
39619 +
39620 +       kfree(inode->i_private);
39621 +       inode->i_private = NULL;
39622 +       inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39623 +}
39624 +
39625 +/* Local variables:
39626 +   c-indentation-style: "K&R"
39627 +   mode-name: "LC"
39628 +   c-basic-offset: 8
39629 +   tab-width: 8
39630 +   fill-column: 120
39631 +   scroll-step: 1
39632 +   End:
39633 +*/
39634 diff --git a/fs/reiser4/plugin/file/tail_conversion.c b/fs/reiser4/plugin/file/tail_conversion.c
39635 new file mode 100644
39636 index 0000000..c53dc9b
39637 --- /dev/null
39638 +++ b/fs/reiser4/plugin/file/tail_conversion.c
39639 @@ -0,0 +1,728 @@
39640 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39641 +
39642 +#include "../../inode.h"
39643 +#include "../../super.h"
39644 +#include "../../page_cache.h"
39645 +#include "../../carry.h"
39646 +#include "../../safe_link.h"
39647 +#include "../../vfs_ops.h"
39648 +
39649 +#include <linux/writeback.h>
39650 +
39651 +/* this file contains:
39652 +   tail2extent and extent2tail */
39653 +
39654 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39655 +void get_exclusive_access(unix_file_info_t * uf_info)
39656 +{
39657 +       assert("nikita-3028", schedulable());
39658 +       assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39659 +       assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39660 +       /*
39661 +        * "deadlock avoidance": sometimes we commit a transaction under
39662 +        * rw-semaphore on a file. Such commit can deadlock with another
39663 +        * thread that captured some block (hence preventing atom from being
39664 +        * committed) and waits on rw-semaphore.
39665 +        */
39666 +       txn_restart_current();
39667 +       LOCK_CNT_INC(inode_sem_w);
39668 +       down_write(&uf_info->latch);
39669 +       uf_info->exclusive_use = 1;
39670 +       assert("vs-1713", uf_info->ea_owner == NULL);
39671 +       assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39672 +       ON_DEBUG(uf_info->ea_owner = current);
39673 +}
39674 +
39675 +void drop_exclusive_access(unix_file_info_t * uf_info)
39676 +{
39677 +       assert("vs-1714", uf_info->ea_owner == current);
39678 +       assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39679 +       ON_DEBUG(uf_info->ea_owner = NULL);
39680 +       uf_info->exclusive_use = 0;
39681 +       up_write(&uf_info->latch);
39682 +       assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39683 +       assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39684 +       LOCK_CNT_DEC(inode_sem_w);
39685 +       txn_restart_current();
39686 +}
39687 +
39688 +/**
39689 + * nea_grabbed - do something when file semaphore is down_read-ed
39690 + * @uf_info: unix file specific part of inode nonexclusive access was obtained on
39691 + *
39692 + * This is called when nonexclusive access is obtained on file. All it does is
39693 + * for debugging purposes (the body is compiled only if REISER4_DEBUG is set).
39694 + */
39695 +static void nea_grabbed(unix_file_info_t *uf_info)
39696 +{
39697 +#if REISER4_DEBUG
39698 +       LOCK_CNT_INC(inode_sem_r);
39699 +       assert("vs-1716", uf_info->ea_owner == NULL);
39700 +       atomic_inc(&uf_info->nr_neas);
39701 +       uf_info->last_reader = current;
39702 +#endif
39703 +}
39704 +
39705 +/**
39706 + * get_nonexclusive_access - get nonexclusive access to a file
39707 + * @uf_info: unix file specific part of inode to obtain access to
39708 + *
39709 + * Nonexclusive access is obtained on a file before read, write, readpage.
39710 + */
39711 +void get_nonexclusive_access(unix_file_info_t *uf_info)
39712 +{
39713 +       assert("nikita-3029", schedulable());
39714 +       assert("nikita-3361", get_current_context()->trans->atom == NULL);
39715 +
39716 +       down_read(&uf_info->latch);
39717 +       nea_grabbed(uf_info);
39718 +}
39719 +
39720 +/**
39721 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39722 + * @uf_info: unix file specific part of inode to obtain access to
39723 + *
39724 + * Non-blocking version of nonexclusive access obtaining.
39725 + */
39726 +int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39727 +{
39728 +       int result;
39729 +
39730 +       result = down_read_trylock(&uf_info->latch);
39731 +       if (result)
39732 +               nea_grabbed(uf_info);
39733 +       return result;
39734 +}
39735 +
39736 +void drop_nonexclusive_access(unix_file_info_t * uf_info)
39737 +{
39738 +       assert("vs-1718", uf_info->ea_owner == NULL);
39739 +       assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
39740 +       ON_DEBUG(atomic_dec(&uf_info->nr_neas));
39741 +
39742 +       up_read(&uf_info->latch);
39743 +
39744 +       LOCK_CNT_DEC(inode_sem_r);
39745 +       txn_restart_current();
39746 +}
39747 +
39748 +/* part of tail2extent. Cut all items covering @count bytes starting from
39749 +   @offset */
39750 +/* Audited by: green(2002.06.15) */
39751 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
39752 +{
39753 +       reiser4_key from, to;
39754 +
39755 +       /* AUDIT: How about putting an assertion here, what would check
39756 +          all provided range is covered by tail items only? */
39757 +       /* key of first byte in the range to be cut  */
39758 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
39759 +
39760 +       /* key of last byte in that range */
39761 +       to = from;
39762 +       set_key_offset(&to, (__u64) (offset + count - 1));
39763 +
39764 +       /* cut everything between those keys */
39765 +       return cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
39766 +}
39767 +
39768 +static void release_all_pages(struct page **pages, unsigned nr_pages)
39769 +{
39770 +       unsigned i;
39771 +
39772 +       for (i = 0; i < nr_pages; i++) {
39773 +               if (pages[i] == NULL) {
39774 +                       unsigned j;
39775 +                       for (j = i + 1; j < nr_pages; j++)
39776 +                               assert("vs-1620", pages[j] == NULL);
39777 +                       break;
39778 +               }
39779 +               page_cache_release(pages[i]);
39780 +               pages[i] = NULL;
39781 +       }
39782 +}
39783 +
39784 +/* part of tail2extent. replace tail items with extent one. Content of tail
39785 +   items (@count bytes) being cut are copied already into
39786 +   pages. extent_writepage method is called to create extents corresponding to
39787 +   those pages */
39788 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
39789 +{
39790 +       int result;
39791 +       unsigned i;
39792 +       STORE_COUNTERS;
39793 +
39794 +       if (nr_pages == 0)
39795 +               return 0;
39796 +
39797 +       assert("vs-596", pages[0]);
39798 +
39799 +       /* cut copied items */
39800 +       result =
39801 +           cut_formatting_items(inode,
39802 +                                (loff_t) pages[0]->index << PAGE_CACHE_SHIFT,
39803 +                                count);
39804 +       if (result)
39805 +               return result;
39806 +
39807 +       CHECK_COUNTERS;
39808 +
39809 +       /* put into tree replacement for just removed items: extent item, namely */
39810 +       for (i = 0; i < nr_pages; i++) {
39811 +               result = add_to_page_cache_lru(pages[i], inode->i_mapping,
39812 +                                              pages[i]->index,
39813 +                                              mapping_gfp_mask(inode->
39814 +                                                               i_mapping));
39815 +               if (result)
39816 +                       break;
39817 +               unlock_page(pages[i]);
39818 +               result = find_or_create_extent(pages[i]);
39819 +               if (result)
39820 +                       break;
39821 +               SetPageUptodate(pages[i]);
39822 +       }
39823 +       return result;
39824 +}
39825 +
39826 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
39827 +                                * items */
39828 +
39829 +static int reserve_tail2extent_iteration(struct inode *inode)
39830 +{
39831 +       reiser4_block_nr unformatted_nodes;
39832 +       reiser4_tree *tree;
39833 +
39834 +       tree = tree_by_inode(inode);
39835 +
39836 +       /* number of unformatted nodes which will be created (not used below) */
39837 +       unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
39838 +
39839 +       /*
39840 +        * space required for one iteration of tail->extent conversion:
39841 +        *
39842 +        *     1. kill N tail items
39843 +        *
39844 +        *     2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
39845 +        *
39846 +        *     3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
39847 +        *     extents) extent units.
39848 +        *
39849 +        *     4. drilling to the leaf level by coord_by_key()
39850 +        *
39851 +        *     5. possible update of stat-data
39852 +        *
39853 +        */
39854 +       grab_space_enable();
39855 +       return reiser4_grab_space
39856 +           (2 * tree->height +
39857 +            TAIL2EXTENT_PAGE_NUM +
39858 +            TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
39859 +            1 + estimate_one_insert_item(tree) +
39860 +            inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
39861 +}
39862 +
39863 +/* clear REISER4_PART_MIXED and update stat data; failure is only warned, 0 is returned */
39864 +static int complete_conversion(struct inode *inode)
39865 +{
39866 +       int result;
39867 +
39868 +       grab_space_enable();
39869 +       result =
39870 +           reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39871 +                              BA_CAN_COMMIT);
39872 +       if (result == 0) {
39873 +               inode_clr_flag(inode, REISER4_PART_MIXED);
39874 +               result = reiser4_update_sd(inode);
39875 +       }
39876 +       if (result)
39877 +               warning("vs-1696", "Failed to clear converting bit of %llu: %i",
39878 +                       (unsigned long long)get_inode_oid(inode), result);
39879 +       return 0;
39880 +}
39881 +
39882 +/**
39883 + * find_start - find where a previous uncompleted conversion stopped
39884 + * @inode: inode of the file being converted
39885 + * @id: item plugin id to look for
39886 + * @offset: in: offset to start searching from; out: offset of the key reached
39887 + *
39888 + * this is used by tail2extent and extent2tail to detect where previous
39889 + * uncompleted conversion stopped
39890 + */
39891 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
39892 +{
39893 +       int result;
39894 +       lock_handle lh;
39895 +       coord_t coord;
39896 +       unix_file_info_t *ufo;
39897 +       int found;
39898 +       reiser4_key key;
39899 +
39900 +       ufo = unix_file_inode_data(inode);
39901 +       init_lh(&lh);
39902 +       result = 0;
39903 +       found = 0;
39904 +       inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
39905 +       do {
39906 +               init_lh(&lh);
39907 +               result = find_file_item_nohint(&coord, &lh, &key,
39908 +                                              ZNODE_READ_LOCK, inode);
39909 +
39910 +               if (result == CBK_COORD_FOUND) {
39911 +                       if (coord.between == AT_UNIT) {
39912 +                               /*coord_clear_iplug(&coord); */
39913 +                               result = zload(coord.node);
39914 +                               if (result == 0) {
39915 +                                       if (item_id_by_coord(&coord) == id)
39916 +                                               found = 1;
39917 +                                       else
39918 +                                               item_plugin_by_coord(&coord)->s.
39919 +                                                   file.append_key(&coord,
39920 +                                                                   &key);
39921 +                                       zrelse(coord.node);
39922 +                               }
39923 +                       } else
39924 +                               result = RETERR(-ENOENT);
39925 +               }
39926 +               done_lh(&lh);
39927 +       } while (result == 0 && !found);
39928 +       *offset = get_key_offset(&key);
39929 +       return result;
39930 +}
39931 +
39932 +/**
39933 + * tail2extent
39934 + * @uf_info:
39935 + *
39936 + *
39937 + */
39938 +int tail2extent(unix_file_info_t *uf_info)
39939 +{
39940 +       int result;
39941 +       reiser4_key key;        /* key of next byte to be moved to page */
39942 +       char *p_data;           /* data of page */
39943 +       unsigned page_off = 0,  /* offset within the page where to copy data */
39944 +           count;              /* number of bytes of item which can be
39945 +                                * copied to page */
39946 +       struct page *pages[TAIL2EXTENT_PAGE_NUM];
39947 +       struct page *page;
39948 +       int done;               /* set to 1 when all file is read */
39949 +       char *item;
39950 +       int i;
39951 +       struct inode *inode;
39952 +       int first_iteration;
39953 +       int bytes;
39954 +       __u64 offset;
39955 +
39956 +       assert("nikita-3362", ea_obtained(uf_info));
39957 +       inode = unix_file_info_to_inode(uf_info);
39958 +       assert("nikita-3412", !IS_RDONLY(inode));
39959 +       assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
39960 +       assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
39961 +
39962 +       offset = 0;
39963 +       first_iteration = 1;
39964 +       result = 0;
39965 +       if (inode_get_flag(inode, REISER4_PART_MIXED)) {
39966 +               /*
39967 +                * file is marked on disk as there was a conversion which did
39968 +                * not complete due to either crash or some error. Find which
39969 +                * offset tail conversion stopped at
39970 +                */
39971 +               result = find_start(inode, FORMATTING_ID, &offset);
39972 +               if (result == -ENOENT) {
39973 +                       /* no tail items found, everything is converted */
39974 +                       uf_info->container = UF_CONTAINER_EXTENTS;
39975 +                       complete_conversion(inode);
39976 +                       return 0;
39977 +               } else if (result != 0)
39978 +                       /* some other error */
39979 +                       return result;
39980 +               first_iteration = 0;
39981 +       }
39982 +
39983 +       inode_set_flag(inode, REISER4_PART_IN_CONV);
39984 +
39985 +       /* get key of first byte of a file */
39986 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
39987 +
39988 +       done = 0;
39989 +       while (done == 0) {
39990 +               memset(pages, 0, sizeof(pages));
39991 +               result = reserve_tail2extent_iteration(inode);
39992 +               if (result != 0)
39993 +                       goto out;
39994 +               if (first_iteration) {
39995 +                       inode_set_flag(inode, REISER4_PART_MIXED);
39996 +                       reiser4_update_sd(inode);
39997 +                       first_iteration = 0;
39998 +               }
39999 +               bytes = 0;
40000 +               for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
40001 +                       assert("vs-598",
40002 +                              (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
40003 +                       page = alloc_page(get_gfp_mask());
40004 +                       if (!page) {
40005 +                               result = RETERR(-ENOMEM);
40006 +                               goto error;
40007 +                       }
40008 +
40009 +                       page->index =
40010 +                           (unsigned long)(get_key_offset(&key) >>
40011 +                                           PAGE_CACHE_SHIFT);
40012 +                       /*
40013 +                        * usually when one is going to longterm lock znode (as
40014 +                        * find_file_item does, for instance) he must not hold
40015 +                        * locked pages. However, there is an exception for
40016 +                        * case tail2extent. Pages appearing here are not
40017 +                        * reachable to everyone else, they are clean, they do
40018 +                        * not have jnodes attached so keeping them locked do
40019 +                        * not risk deadlock appearance
40020 +                        */
40021 +                       assert("vs-983", !PagePrivate(page));
40022 +                       reiser4_invalidate_pages(inode->i_mapping, page->index,
40023 +                                                1, 0);
40024 +
40025 +                       for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
40026 +                               coord_t coord;
40027 +                               lock_handle lh;
40028 +
40029 +                               /* get next item */
40030 +                               /* FIXME: we might want to readahead here */
40031 +                               init_lh(&lh);
40032 +                               result =
40033 +                                   find_file_item_nohint(&coord, &lh, &key,
40034 +                                                         ZNODE_READ_LOCK,
40035 +                                                         inode);
40036 +                               if (result != CBK_COORD_FOUND) {
40037 +                                       /*
40038 +                                        * error happened or no items of file
40039 +                                        * were found
40040 +                                        */
40041 +                                       done_lh(&lh);
40042 +                                       page_cache_release(page);
40043 +                                       goto error;
40044 +                               }
40045 +
40046 +                               if (coord.between == AFTER_UNIT) {
40047 +                                       /*
40048 +                                        * end of file is reached. Pad page
40049 +                                        * with zeros
40050 +                                        */
40051 +                                       done_lh(&lh);
40052 +                                       done = 1;
40053 +                                       p_data = kmap_atomic(page, KM_USER0);
40054 +                                       memset(p_data + page_off, 0,
40055 +                                              PAGE_CACHE_SIZE - page_off);
40056 +                                       kunmap_atomic(p_data, KM_USER0);
40057 +                                       break;
40058 +                               }
40059 +
40060 +                               result = zload(coord.node);
40061 +                               if (result) {
40062 +                                       page_cache_release(page);
40063 +                                       done_lh(&lh);
40064 +                                       goto error;
40065 +                               }
40066 +                               assert("vs-856", coord.between == AT_UNIT);
40067 +                               item = ((char *)item_body_by_coord(&coord)) +
40068 +                                       coord.unit_pos;
40069 +
40070 +                               /* how many bytes to copy */
40071 +                               count =
40072 +                                   item_length_by_coord(&coord) -
40073 +                                   coord.unit_pos;
40074 +                               /* limit length of copy to end of page */
40075 +                               if (count > PAGE_CACHE_SIZE - page_off)
40076 +                                       count = PAGE_CACHE_SIZE - page_off;
40077 +
40078 +                               /*
40079 +                                * copy item (as much as will fit starting from
40080 +                                * the beginning of the item) into the page
40081 +                                */
40082 +                               p_data = kmap_atomic(page, KM_USER0);
40083 +                               memcpy(p_data + page_off, item, count);
40084 +                               kunmap_atomic(p_data, KM_USER0);
40085 +
40086 +                               page_off += count;
40087 +                               bytes += count;
40088 +                               set_key_offset(&key,
40089 +                                              get_key_offset(&key) + count);
40090 +
40091 +                               zrelse(coord.node);
40092 +                               done_lh(&lh);
40093 +                       } /* end of loop which fills one page by content of
40094 +                          * formatting items */
40095 +
40096 +                       if (page_off) {
40097 +                               /* something was copied into page */
40098 +                               pages[i] = page;
40099 +                       } else {
40100 +                               page_cache_release(page);
40101 +                               assert("vs-1648", done == 1);
40102 +                               break;
40103 +                       }
40104 +               } /* end of loop through pages of one conversion iteration */
40105 +
40106 +               if (i > 0) {
40107 +                       result = replace(inode, pages, i, bytes);
40108 +                       release_all_pages(pages, sizeof_array(pages));
40109 +                       if (result)
40110 +                               goto error;
40111 +                       /*
40112 +                        * we have to drop exclusive access to avoid deadlock
40113 +                        * which may happen because called by
40114 +                        * reiser4_writepages capture_unix_file requires to get
40115 +                        * non-exclusive access to a file. It is safe to drop
40116 +                        * EA in the middle of tail2extent conversion because
40117 +                        * write_unix_file/unix_setattr(truncate)/release_unix_file(extent2tail)
40118 +                        * are serialized by uf_info->write semaphore and
40119 +                        * because read_unix_file works (should at least) on
40120 +                        * partially converted files
40121 +                        */
40122 +                       drop_exclusive_access(uf_info);
40123 +                       /* throttle the conversion */
40124 +                       reiser4_throttle_write(inode);
40125 +                       get_exclusive_access(uf_info);
40126 +
40127 +                       /*
40128 +                        * nobody is allowed to complete conversion but a
40129 +                        * process which started it
40130 +                        */
40131 +                       assert("", inode_get_flag(inode, REISER4_PART_MIXED));
40132 +               }
40133 +       }
40134 +
40135 +       inode_clr_flag(inode, REISER4_PART_IN_CONV);
40136 +
40137 +       if (result == 0) {
40138 +               /* file is converted to extent items */
40139 +               assert("vs-1697", inode_get_flag(inode, REISER4_PART_MIXED));
40140 +
40141 +               uf_info->container = UF_CONTAINER_EXTENTS;
40142 +               complete_conversion(inode);
40143 +       } else {
40144 +               /*
40145 +                * conversion is not complete. Inode was already marked as
40146 +                * REISER4_PART_MIXED and stat-data were updated at the first
40147 +                * iteration of the loop above.
40148 +                */
40149 +             error:
40150 +               release_all_pages(pages, sizeof_array(pages));
40151 +               warning("nikita-2282", "Partial conversion of %llu: %i",
40152 +                       (unsigned long long)get_inode_oid(inode), result);
40153 +       }
40154 +
40155 +      out:
40156 +       return result;
40157 +}
40158 +
40159 +static int reserve_extent2tail_iteration(struct inode *inode)
40160 +{
40161 +       reiser4_tree *fs_tree = tree_by_inode(inode);
40162 +
40163 +       /*
40164 +        * one step of extent2tail conversion needs blocks reserved for
40165 +        * (in this order):
40166 +        *
40167 +        *     1. removal of extent item
40168 +        *
40169 +        *     2. insertion of tail by insert_flow()
40170 +        *
40171 +        *     3. drilling to the leaf level by coord_by_key()
40172 +        *
40173 +        *     4. possible update of stat-data
40174 +        */
40175 +       grab_space_enable();
40176 +       return reiser4_grab_space(
40177 +               estimate_one_item_removal(fs_tree) +
40178 +               estimate_insert_flow(fs_tree->height) +
40179 +               1 + estimate_one_insert_item(fs_tree) +
40180 +               inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
40181 +}
40182 +
40183 +static int filler(void *vp, struct page *page)
40184 +{      /* read_cache_page() filler used by extent2tail() */
40185 +       return readpage_unix_file_nolock(vp, page);
40186 +}
40187 +
40188 +/*
40189 + * extent2tail - convert a file built of extents into one built of tails.
40190 + * For every page of file: read page, cut part of extent pointing to this
40191 + * page, put data of page into tree as tail item. Exclusive access to
40192 + * @uf_info must be held on entry; it is dropped and re-taken after each
40193 + * page. Returns 0 when the whole file is converted, else the failure code.
40194 + */
40195 +int extent2tail(unix_file_info_t *uf_info)
40196 +{
40197 +       int result;
40198 +       struct inode *inode;
40199 +       struct page *page;
40200 +       unsigned long num_pages, i, start_page;
40201 +       reiser4_key from;
40202 +       reiser4_key to;
40203 +       unsigned count;
40204 +       __u64 offset;
40205 +
40206 +       assert("nikita-3362", ea_obtained(uf_info));
40207 +       inode = unix_file_info_to_inode(uf_info);
40208 +       assert("nikita-3412", !IS_RDONLY(inode));
40209 +       assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
40210 +       assert("", !inode_get_flag(inode, REISER4_PART_IN_CONV));
40211 +
40212 +       offset = 0;
40213 +       if (inode_get_flag(inode, REISER4_PART_MIXED)) {
40214 +               /*
40215 +                * a previous conversion did not complete (crash or error) and
40216 +                * left the file marked on disk. Find offset it stopped at
40217 +                */
40218 +               result = find_start(inode, EXTENT_POINTER_ID, &offset);
40219 +               if (result == -ENOENT) {
40220 +                       /* no extent found, everything is converted */
40221 +                       uf_info->container = UF_CONTAINER_TAILS;
40222 +                       complete_conversion(inode);
40223 +                       return 0;
40224 +               } else if (result != 0)
40225 +                       /* some other error */
40226 +                       return result;
40227 +       }
40228 +
40229 +       inode_set_flag(inode, REISER4_PART_IN_CONV);
40230 +
40231 +       /* number of pages left to convert, counting from @offset */
40232 +       num_pages =
40233 +           (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
40234 +       start_page = offset >> PAGE_CACHE_SHIFT;
40235 +
40236 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
40237 +       to = from;
40238 +
40239 +       result = 0;
40240 +       for (i = 0; i < num_pages; i++) {
40241 +               __u64 start_byte;
40242 +
40243 +               result = reserve_extent2tail_iteration(inode);
40244 +               if (result != 0)
40245 +                       break;
40246 +               if (i == 0 && offset == 0) {
40247 +                       inode_set_flag(inode, REISER4_PART_MIXED);
40248 +                       reiser4_update_sd(inode);
40249 +               }
40250 +
40251 +               page = read_cache_page(inode->i_mapping, i + start_page,
40252 +                                      filler, NULL);
40253 +               if (IS_ERR(page)) {
40254 +                       result = PTR_ERR(page);
40255 +                       break;
40256 +               }
40257 +
40258 +               wait_on_page_locked(page);
40259 +
40260 +               if (!PageUptodate(page)) {
40261 +                       page_cache_release(page);
40262 +                       result = RETERR(-EIO);
40263 +                       break;
40264 +               }
40265 +
40266 +               /* cut part of file we have read, i.e. page i + start_page
40267 +                * (not page i: a resumed conversion starts at @offset).
40268 +                * Widen to 64 bits before shifting to avoid overflow */
40269 +               start_byte = ((__u64)(i + start_page)) << PAGE_CACHE_SHIFT;
40270 +               set_key_offset(&from, start_byte);
40271 +               set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
40272 +               /*
40273 +                * cut_tree_object() returns -E_REPEAT to allow atom commits
40274 +                * during over-long truncates. But extent->tail conversion
40275 +                * should be performed in one transaction.
40276 +                */
40277 +               result = cut_tree(tree_by_inode(inode), &from, &to, inode, 0);
40278 +               if (result) {
40279 +                       page_cache_release(page);
40280 +                       break;
40281 +               }
40282 +
40283 +               /* put page data into tree via write_tail().
40284 +                * NOTE(review): a short write restarts at start_byte - confirm */
40285 +               count = PAGE_CACHE_SIZE;
40286 +               if ((i == (num_pages - 1)) &&
40287 +                   (inode->i_size & ~PAGE_CACHE_MASK))
40288 +                       /* last page can be incompleted */
40289 +                       count = (inode->i_size & ~PAGE_CACHE_MASK);
40290 +               while (count) {
40291 +                       struct dentry dentry;
40292 +                       struct file file;
40293 +                       loff_t pos;
40294 +
40295 +                       dentry.d_inode = inode;
40296 +                       file.f_dentry = &dentry;
40297 +                       file.private_data = NULL;
40298 +                       file.f_pos = start_byte;
40299 +                       pos = start_byte;
40300 +                       result = write_tail(&file, (char __user *)kmap(page),
40301 +                                           count, &pos);
40302 +                       kunmap(page);
40303 +                       reiser4_free_file_fsdata(&file);
40304 +                       if (result <= 0) {
40305 +                               warning("", "write_tail failed");
40306 +                               page_cache_release(page);
40307 +                               inode_clr_flag(inode, REISER4_PART_IN_CONV);
40308 +                               return result;
40309 +                       }
40310 +                       count -= result;
40311 +               }
40312 +
40313 +               /* release page */
40314 +               lock_page(page);
40315 +               /* page is already detached from jnode and mapping. */
40316 +               assert("vs-1086", page->mapping == NULL);
40317 +               assert("nikita-2690",
40318 +                      (!PagePrivate(page) && jprivate(page) == 0));
40319 +               /* waiting for writeback with page lock held is valid */
40320 +               wait_on_page_writeback(page);
40321 +               drop_page(page);
40322 +               /* release reference taken by read_cache_page() above */
40323 +               page_cache_release(page);
40324 +
40325 +               drop_exclusive_access(uf_info);
40326 +               /* throttle the conversion */
40327 +               reiser4_throttle_write(inode);
40328 +               get_exclusive_access(uf_info);
40329 +               /* only the process which started conversion may complete it */
40330 +               assert("", inode_get_flag(inode, REISER4_PART_MIXED));
40331 +       }
40332 +
40333 +       inode_clr_flag(inode, REISER4_PART_IN_CONV);
40334 +
40335 +       if (i == num_pages) {
40336 +               /* file is converted to formatted items */
40337 +               assert("vs-1698", inode_get_flag(inode, REISER4_PART_MIXED));
40338 +               assert("vs-1260",
40339 +                      inode_has_no_jnodes(reiser4_inode_data(inode)));
40340 +
40341 +               uf_info->container = UF_CONTAINER_TAILS;
40342 +               complete_conversion(inode);
40343 +               return 0;
40344 +       }
40345 +       /*
40346 +        * conversion is not complete. Inode was already marked as
40347 +        * REISER4_PART_MIXED and stat-data were updated at the first
40348 +        * iteration of the loop above.
40349 +        */
40350 +       warning("nikita-2282",
40351 +               "Partial conversion of %llu: %lu of %lu: %i",
40352 +               (unsigned long long)get_inode_oid(inode), i,
40353 +               num_pages, result);
40354 +
40355 +       return result;
40356 +}
40357 +
40358 +/*
40359 + * Local variables:
40360 + * c-indentation-style: "K&R"
40361 + * mode-name: "LC"
40362 + * c-basic-offset: 8
40363 + * tab-width: 8
40364 + * fill-column: 79
40365 + * scroll-step: 1
40366 + * End:
40367 + */
40368 diff --git a/fs/reiser4/plugin/file_ops.c b/fs/reiser4/plugin/file_ops.c
40369 new file mode 100644
40370 index 0000000..315d1cc
40371 --- /dev/null
40372 +++ b/fs/reiser4/plugin/file_ops.c
40373 @@ -0,0 +1,167 @@
40374 +/* Copyright 2005 by Hans Reiser, licensing governed by
40375 +   reiser4/README */
40376 +
40377 +/* this file contains typical implementations for some of methods of
40378 +   struct file_operations and of struct address_space_operations
40379 +*/
40380 +
40381 +#include "../inode.h"
40382 +#include "object.h"
40383 +
40384 +/* file operations */
40385 +
40386 +/* llseek method of struct file_operations for typical directory. Only the
40387 +   prototype is here; the implementation can be found in readdir_common.c
40388 +   (NOTE(review): that file name may be stale - confirm) */
40389 +loff_t llseek_common_dir(struct file *, loff_t, int origin);
40390 +
40391 +/* readdir method of struct file_operations for typical directory. Only the
40392 +   prototype is here; the implementation can be found in readdir_common.c
40393 +   (NOTE(review): that file name may be stale - confirm) */
40394 +int readdir_common(struct file *, void *dirent, filldir_t);
40395 +
40396 +/**
40397 + * release_dir_common - release method of struct file_operations
40398 + * @inode: inode of the directory being released
40399 + * @file: file whose reiser4 specific data is freed
40400 + *
40401 + * Release implementation for typical directory. Inside a reiser4 context it
40402 + * frees reiser4 specific file data; init_context() failure is returned as is.
40403 + */
40404 +int release_dir_common(struct inode *inode, struct file *file)
40405 +{
40406 +       reiser4_context *context = init_context(inode->i_sb);
40407 +
40408 +       if (!IS_ERR(context)) {
40409 +               reiser4_free_file_fsdata(file);
40410 +               reiser4_exit_context(context);
40411 +               return 0;
40412 +       }
40413 +       return PTR_ERR(context);
40414 +}
40415 +
40416 +/* common implementation of vfs's fsync method of struct file_operations:
40417 +   calls txnmgr_force_commit_all() for the super block of @dentry's inode
40418 +*/
40419 +int sync_common(struct file *file, struct dentry *dentry, int datasync)
40420 +{
40421 +       struct super_block *sb = dentry->d_inode->i_sb;
40422 +       reiser4_context *ctx = init_context(sb);
40423 +       int result;
40424 +
40425 +       if (IS_ERR(ctx))
40426 +               return PTR_ERR(ctx);
40427 +
40428 +       result = txnmgr_force_commit_all(sb, 0);
40429 +       context_set_commit_async(ctx);
40430 +       reiser4_exit_context(ctx);
40431 +       return result;
40432 +}
40433 +
40434 +/* this is common implementation of vfs's sendfile method of struct
40435 +   file_operations. It is compiled out by the "#if 0" around it.
40436 +
40437 +   Runs generic_file_sendfile() inside a reiser4 context, passing @count,
40438 +   @actor and @target through. This is needed for loop back devices support.
40439 +*/
40440 +#if 0
40441 +ssize_t
40442 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
40443 +               read_actor_t actor, void *target)
40444 +{
40445 +       reiser4_context *ctx;
40446 +       ssize_t result;
40447 +
40448 +       ctx = init_context(file->f_dentry->d_inode->i_sb);
40449 +       if (IS_ERR(ctx))
40450 +               return PTR_ERR(ctx);
40451 +       result = generic_file_sendfile(file, ppos, count, actor, target);
40452 +       reiser4_exit_context(ctx);
40453 +       return result;
40454 +}
40455 +#endif  /*  0  */
40456 +
40457 +/* address space operations */
40458 +
40459 +/* common implementation of vfs's prepare_write method of struct
40460 +   address_space_operations: runs do_prepare_write() in a reiser4 context.
40461 +   Returns its result, or the init_context() error if no context is set up */
40462 +int
40463 +prepare_write_common(struct file *file, struct page *page, unsigned from,
40464 +                    unsigned to)
40465 +{
40466 +       reiser4_context *ctx;
40467 +       int result;
40468 +
40469 +       ctx = init_context(page->mapping->host->i_sb);
40470 +       if (IS_ERR(ctx))
40471 +               return PTR_ERR(ctx);
40472 +       result = do_prepare_write(file, page, from, to);
40473 +       /* don't commit transaction under inode semaphore */
40474 +       context_set_commit_async(ctx);
40475 +       reiser4_exit_context(ctx);
40476 +       return result;
40477 +}
40478 +
40479 +/*
40480 + * do_prepare_write - helper for prepare_write_common and
40481 + * prepare_write_unix_file. Makes sure the locked @page is up to date before
40482 + * the range @from..@to of it is written, reading it via ->readpage() when
40483 + * needed. Returns 0 or an error code; @page is locked on return.
40484 + */
40485 +int
40486 +do_prepare_write(struct file *file, struct page *page, unsigned from,
40487 +                unsigned to)
40488 +{
40489 +       int result;
40490 +
40491 +       assert("umka-3099", file != NULL);
40492 +       assert("umka-3100", page != NULL);
40493 +       assert("umka-3095", PageLocked(page));
40494 +
40495 +       /* whole page is overwritten or it is already read: nothing to do */
40496 +       if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40497 +               return 0;
40498 +
40499 +       if (page->mapping->a_ops->readpage == NULL)
40500 +               return RETERR(-EINVAL);
40501 +
40502 +       result = page->mapping->a_ops->readpage(file, page);
40503 +       if (result != 0) {
40504 +               SetPageError(page);
40505 +               ClearPageUptodate(page);
40506 +               /* All reiser4 readpage() implementations should return the
40507 +                * page locked in case of error. */
40508 +               assert("nikita-3472", PageLocked(page));
40509 +       } else {
40510 +               /*
40511 +                * ->readpage() either:
40512 +                *
40513 +                *     1. starts IO against @page. @page is locked for IO in
40514 +                *     this case.
40515 +                *
40516 +                *     2. doesn't start IO. @page is unlocked.
40517 +                *
40518 +                * In either case, lock_page() below leaves @page locked by us.
40519 +                */
40520 +               lock_page(page);
40521 +               /*
40522 +                * IO (if any) is completed at this point. Check for IO
40523 +                * errors.
40524 +                */
40525 +               if (!PageUptodate(page))
40526 +                       result = RETERR(-EIO);
40527 +       }
40528 +       assert("umka-3098", PageLocked(page));
40529 +       return result;
40530 +}
40531 +
40532 +/*
40533 + * Local variables:
40534 + * c-indentation-style: "K&R"
40535 + * mode-name: "LC"
40536 + * c-basic-offset: 8
40537 + * tab-width: 8
40538 + * fill-column: 79
40539 + * End:
40540 + */
40541 diff --git a/fs/reiser4/plugin/file_ops_readdir.c b/fs/reiser4/plugin/file_ops_readdir.c
40542 new file mode 100644
40543 index 0000000..8438907
40544 --- /dev/null
40545 +++ b/fs/reiser4/plugin/file_ops_readdir.c
40546 @@ -0,0 +1,655 @@
40547 +/* Copyright 2005 by Hans Reiser, licensing governed by
40548 + * reiser4/README */
40549 +
40550 +#include "../inode.h"
40551 +
40552 +/* true iff item at @coord is a directory entry item and file plugin of
40553 + * @inode reports (via ->owns_item()) that the item belongs to @inode. */
40554 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40555 +{
40556 +       if (item_type_by_coord(coord) != DIR_ENTRY_ITEM_TYPE)
40557 +               return 0;
40558 +       return inode_file_plugin(inode)->owns_item(inode, coord) != 0;
40559 +}
40560 +
40561 +/* order two logical positions within the same directory: by entry key
40562 + * first and, when keys are equal, by ->pos */
40563 +static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40564 +{
40565 +       cmp_t by_key;
40566 +       int delta;
40567 +
40568 +       assert("nikita-2534", p1 != NULL);
40569 +       assert("nikita-2535", p2 != NULL);
40570 +
40571 +       by_key = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40572 +       if (by_key != EQUAL_TO)
40573 +               return by_key;
40574 +       delta = p1->pos - p2->pos;
40575 +       if (delta < 0)
40576 +               return LESS_THAN;
40577 +       return delta ? GREATER_THAN : EQUAL_TO;
40578 +}
40579 +
40580 +
40581 +/* update @readdir_spot after an entry was added (@adj == +1) or removed
40582 + * (@adj == -1) at @mod_point; see comment before readdir_common() for why. */
40583 +static void
40584 +adjust_dir_pos(struct file *dir,
40585 +              readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40586 +{
40587 +       dir_pos *pos;
40588 +
40589 +       /*
40590 +        * new directory entry was added (adj == +1) or removed (adj == -1) at
40591 +        * the @mod_point. Directory file descriptor @dir is doing readdir and
40592 +        * is currently positioned at @readdir_spot. Latter has to be updated
40593 +        * to maintain stable readdir.
40594 +        */
40595 +       /* directory is positioned to the beginning. */
40596 +       if (readdir_spot->entry_no == 0)
40597 +               return;
40598 +
40599 +       pos = &readdir_spot->position;
40600 +       switch (dir_pos_cmp(mod_point, pos)) {
40601 +       case LESS_THAN:
40602 +               /* @mod_point is _before_ @readdir_spot, that is, entry was
40603 +                * added/removed on the left (in key order) of current
40604 +                * position. */
40605 +               /* logical number of directory entry readdir is "looking" at
40606 +                * changes */
40607 +               readdir_spot->entry_no += adj;
40608 +               assert("nikita-2577",
40609 +                      ergo(dir != NULL, get_dir_fpos(dir) + adj >= 0));
40610 +               if (de_id_cmp(&pos->dir_entry_key,
40611 +                             &mod_point->dir_entry_key) == EQUAL_TO) {
40612 +                       assert("nikita-2575", mod_point->pos < pos->pos);
40613 +                       /*
40614 +                        * if entry added/removed has the same key as current
40615 +                        * for readdir, update counter of duplicate keys in
40616 +                        * @readdir_spot.
40617 +                        */
40618 +                       pos->pos += adj;
40619 +               }
40620 +               break;
40621 +       case GREATER_THAN:
40622 +               /* directory is modified after @pos: nothing to do. */
40623 +               break;
40624 +       case EQUAL_TO:
40625 +               /* cannot insert an entry readdir is looking at, because it
40626 +                  already exists. */
40627 +               assert("nikita-2576", adj < 0);
40628 +               /* directory entry to which @pos points to is being
40629 +                  removed.
40630 +
40631 +                  NOTE-NIKITA: Right thing to do is to update @pos to point
40632 +                  to the next entry. This is complex (we are under spin-lock
40633 +                  for one thing). Just rewind it to the beginning. Next
40634 +                  readdir will have to scan the beginning of
40635 +                  directory. Proper solution is to use semaphore in
40636 +                  spin lock's stead and use rewind_right() here.
40637 +
40638 +                  NOTE-NIKITA: now, semaphore is used, so...
40639 +                */
40640 +               memset(readdir_spot, 0, sizeof *readdir_spot);
40641 +       }
40642 +}
40643 +
40644 +/* an entry was added to or removed from @dir: scan all file-descriptors
40645 +   on readdir list of @dir and adjust their positions respectively. Should
40646 +   be used by implementations of add_entry and rem_entry of dir plugin */
40647 +void
40648 +adjust_dir_file(struct inode *dir, const struct dentry *de, int offset, int adj)
40649 +{
40650 +       dir_pos changed;
40651 +       reiser4_file_fsdata *fsdata;
40652 +
40653 +       assert("nikita-2536", dir != NULL);
40654 +       assert("nikita-2538", de != NULL);
40655 +       assert("nikita-2539", adj != 0);
40656 +
40657 +       /* logical position at which the directory was modified */
40658 +       changed.pos = offset;
40659 +       build_de_id(dir, &de->d_name, &changed.dir_entry_key);
40660 +
40661 +       spin_lock_inode(dir);
40662 +       /*
40663 +        * new entry was added/removed in directory @dir. Scan all file
40664 +        * descriptors for @dir that are currently involved into @readdir and
40665 +        * update them.
40666 +        */
40667 +       list_for_each_entry(fsdata, get_readdir_list(dir), dir.linkage) {
40668 +               adjust_dir_pos(fsdata->back, &fsdata->dir.readdir,
40669 +                              &changed, adj);
40670 +       }
40671 +       spin_unlock_inode(dir);
40672 +}
40673 +
40674 +/*
40675 + * traverse tree to start/continue readdir from the readdir position @pos:
40676 + * look the readdir key of @dir up on the leaf level and, when it is found,
40677 + * hand @tap to rewind_right() together with @pos->position.pos. Every other
40678 + * lookup outcome releases @tap's lock handle and is reported as -EIO.
40679 + */
40680 +static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40681 +{
40682 +       struct inode *host;
40683 +       reiser4_key rd_key;
40684 +       int ret;
40685 +
40686 +       assert("nikita-2554", pos != NULL);
40687 +
40688 +       host = dir->f_dentry->d_inode;
40689 +       ret = inode_dir_plugin(host)->build_readdir_key(dir, &rd_key);
40690 +       if (ret != 0)
40691 +               return ret;
40692 +
40693 +       ret = object_lookup(host, &rd_key, tap->coord, tap->lh, tap->mode,
40694 +                           FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, 0,
40695 +                           &tap->ra_info);
40696 +       if (ret != CBK_COORD_FOUND) {
40697 +               /* nothing to position on: reset coord, drop the lock */
40698 +               tap->coord->node = NULL;
40699 +               done_lh(tap->lh);
40700 +               return RETERR(-EIO);
40701 +       }
40702 +
40703 +       return rewind_right(tap, (int)pos->position.pos);
40704 +}
40705 +
40706 +/*
40707 + * handling of non-unique keys: store in @pos the key of the entry @tap points
40708 + * to and its ordinal position within the run of entries with identical keys.
40709 + */
40710 +static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40711 +{
40712 +       int result;
40713 +       coord_t coord;
40714 +       lock_handle lh;
40715 +       tap_t scan;
40716 +       de_id *did;
40717 +       reiser4_key de_key;
40718 +
40719 +       coord_init_zero(&coord);
40720 +       init_lh(&lh);
40721 +       tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40722 +       tap_copy(&scan, tap);
40723 +       tap_load(&scan); /* NOTE(review): return value, if any, is ignored */
40724 +       pos->position.pos = 0;
40725 +
40726 +       did = &pos->position.dir_entry_key;
40727 +
40728 +       if (is_valid_dir_coord(inode, scan.coord)) {
40729 +               /* remember key of the entry @scan starts at */
40730 +               build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40731 +
40732 +               while (1) {
40733 +                       /* step back; count the unit if its key is the same */
40734 +                       result = go_prev_unit(&scan);
40735 +                       if (result != 0)
40736 +                               break;
40737 +
40738 +                       if (!is_valid_dir_coord(inode, scan.coord)) {
40739 +                               result = -EINVAL;
40740 +                               break;
40741 +                       }
40742 +
40743 +                       /* get key of directory entry */
40744 +                       unit_key_by_coord(scan.coord, &de_key);
40745 +                       if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
40746 +                               /* duplicate-sequence is over */
40747 +                               break;
40748 +                       }
40749 +                       pos->position.pos++;
40750 +               }
40751 +       } else
40752 +               result = RETERR(-ENOENT);
40753 +       tap_relse(&scan);
40754 +       tap_done(&scan);
40755 +       return result;
40756 +}
40757 +
40758 +/*
40759 + * "rewind" directory to file position of @dir: set @pos and @tap to match it.
40760 + */
40761 +static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
40762 +{
40763 +       __u64 destination;      /* entry number to end up at */
40764 +       __s64 shift;            /* distance from pos->fpos to dirpos */
40765 +       int result;
40766 +       struct inode *inode;
40767 +       loff_t dirpos;          /* file position to rewind to */
40768 +
40769 +       assert("nikita-2553", dir != NULL);
40770 +       assert("nikita-2548", pos != NULL);
40771 +       assert("nikita-2551", tap->coord != NULL);
40772 +       assert("nikita-2552", tap->lh != NULL);
40773 +
40774 +       dirpos = get_dir_fpos(dir);
40775 +       shift = dirpos - pos->fpos;
40776 +       /* this is logical directory entry within @dir which we are rewinding
40777 +        * to */
40778 +       destination = pos->entry_no + shift;
40779 +
40780 +       inode = dir->f_dentry->d_inode;
40781 +       if (dirpos < 0)
40782 +               return RETERR(-EINVAL);
40783 +       else if (destination == 0ll || dirpos == 0) {
40784 +               /* rewind to the beginning of directory */
40785 +               memset(pos, 0, sizeof *pos);
40786 +               return dir_go_to(dir, pos, tap);
40787 +       } else if (destination >= inode->i_size)
40788 +               return RETERR(-ENOENT);
40789 +
40790 +       if (shift < 0) {
40791 +               /* from now on shift is the distance to go to the left */
40792 +               shift = -shift;
40793 +               /* rewinding to the left */
40794 +               if (shift <= (int)pos->position.pos) {
40795 +                       /* destination is within sequence of entries with
40796 +                          duplicate keys. */
40797 +                       result = dir_go_to(dir, pos, tap);
40798 +               } else {
40799 +                       shift -= pos->position.pos;
40800 +                       while (1) {
40801 +                               /* repetitions: deadlock is possible when
40802 +                                  going to the left. */
40803 +                               result = dir_go_to(dir, pos, tap);
40804 +                               if (result == 0) {
40805 +                                       result = rewind_left(tap, shift);
40806 +                                       if (result == -E_DEADLOCK) {
40807 +                                               tap_done(tap);
40808 +                                               continue;
40809 +                                       }
40810 +                               }
40811 +                               break;
40812 +                       }
40813 +               }
40814 +       } else {
40815 +               /* rewinding to the right */
40816 +               result = dir_go_to(dir, pos, tap);
40817 +               if (result == 0)
40818 +                       result = rewind_right(tap, shift);
40819 +       }
40820 +       if (result == 0) {
40821 +               result = set_pos(inode, pos, tap);
40822 +               if (result == 0) {
40823 +                       /* set_pos() updated pos->position, record the rest */
40824 +                       pos->entry_no = destination;
40825 +                       pos->fpos = dirpos;
40826 +               }
40827 +       }
40828 +       return result;
40829 +}
40830 +
40831 +/*
40832 + * Function that is called by common_readdir() on each directory entry while
40833 + * doing readdir. ->filldir callback may block, so we had to release long term
40834 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
40835 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
40836 + *
40837 + * Whether node is unlocked in case of any other error is undefined. It is
40838 + * guaranteed to be still locked if success (0) is returned.
40839 + *
40840 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
40841 + * unlocked.
40842 + */
40843 +static int
40844 +feed_entry(struct file *f,
40845 +          readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
40846 +{
40847 +       item_plugin *iplug;
40848 +       char *name;
40849 +       reiser4_key sd_key;
40850 +       int result;
40851 +       char buf[DE_NAME_BUF_LEN];
40852 +       char name_buf[32];      /* names that fit here need no kmalloc */
40853 +       char *local_name;       /* copy of name that outlives the lock */
40854 +       unsigned file_type;
40855 +       seal_t seal;
40856 +       coord_t *coord;
40857 +       reiser4_key entry_key;
40858 +
40859 +       coord = tap->coord;
40860 +       iplug = item_plugin_by_coord(coord);
40861 +
40862 +       /* pointer to name within the node */
40863 +       name = iplug->s.dir.extract_name(coord, buf);
40864 +       assert("nikita-1371", name != NULL);
40865 +
40866 +       /* key of object the entry points to */
40867 +       if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
40868 +               return RETERR(-EIO);
40869 +
40870 +       /* we must release longterm znode lock before calling filldir to avoid
40871 +          deadlock which may happen if filldir causes page fault. So, copy
40872 +          name to intermediate buffer and hand that copy to filldir */
40873 +       if (strlen(name) + 1 > sizeof(name_buf)) {
40874 +               local_name = kmalloc(strlen(name) + 1, get_gfp_mask());
40875 +               if (local_name == NULL)
40876 +                       return RETERR(-ENOMEM);
40877 +       } else
40878 +               local_name = name_buf;
40879 +
40880 +       strcpy(local_name, name);
40881 +       file_type = iplug->s.dir.extract_file_type(coord);
40882 +
40883 +       unit_key_by_coord(coord, &entry_key);
40884 +       seal_init(&seal, coord, &entry_key);
40885 +
40886 +       longterm_unlock_znode(tap->lh);
40887 +
40888 +       /*
40889 +        * send information about directory entry to the ->filldir() filler
40890 +        * supplied to us by caller (VFS).
40891 +        *
40892 +        * ->filldir is entitled to do weird things. For example, ->filldir
40893 +        * supplied by knfsd re-enters file system. Make sure no locks are
40894 +        * held.
40895 +        */
40896 +       assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
40897 +
40898 +       txn_restart_current();
40899 +       result = filldir(dirent, local_name, (int)strlen(local_name),
40900 +                        /* offset of this entry */
40901 +                        f->f_pos,
40902 +                        /* inode number of object bounden by this entry */
40903 +                        oid_to_uino(get_key_objectid(&sd_key)), file_type);
40904 +       if (local_name != name_buf)
40905 +               kfree(local_name);
40906 +       if (result < 0)
40907 +               /* ->filldir() is satisfied. (no space in buffer, IOW) */
40908 +               result = 1;
40909 +       else
40910 +               result = seal_validate(&seal, coord, &entry_key,
40911 +                                      tap->lh, tap->mode, ZNODE_LOCK_HIPRI);
40912 +       return result;
40913 +}
40914 +
40915 +static void move_entry(readdir_pos * pos, coord_t * coord)
40916 +{
40917 +       de_id *last_id = &pos->position.dir_entry_key;
40918 +       reiser4_key cur_key;
40919 +
40920 +       /* key of the directory entry @coord is now at */
40921 +       unit_key_by_coord(coord, &cur_key);
40922 +
40923 +       /* one entry further, both logically and in file position */
40924 +       pos->entry_no++;
40925 +       pos->fpos++;
40926 +
40927 +       if (de_id_key_cmp(last_id, &cur_key) != EQUAL_TO) {
40928 +               /* new key: restart counting of duplicates and remember
40929 +                  the key */
40930 +               pos->position.pos = 0;
40931 +               build_de_id_by_key(&cur_key, last_id);
40932 +       } else {
40933 +               /* still within sequence of entries with duplicate keys */
40934 +               pos->position.pos++;
40935 +       }
40936 +}
40937 +
40938 +/*
40939 + *     STATELESS READDIR
40940 + *
40941 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
40942 + * into reiser4_file_fsdata on each directory modification (name insertion and
40943 + * removal), see readdir_common() function below. This obviously doesn't work
40944 + * when reiser4 is accessed over NFS, because NFS doesn't keep any state
40945 + * across client READDIR requests for the same directory.
40946 + *
40947 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
40948 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
40949 + * find detached reiser4_file_fsdata corresponding to previous readdir
40950 + * request. In other words, additional state is maintained on the
40951 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
40952 + *
40953 + * To efficiently detect when our ->readdir() method is called by NFS server,
40954 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
40955 + * file_is_stateless() function).
40956 + *
40957 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
40958 + * bits of NFS readdir cookie: when first readdir request comes to the given
40959 + * directory from the given client, cookie is set to 0. This situation is
40960 + * detected, global cid_counter is incremented, and stored in highest bits of
40961 + * all direntry offsets returned to the client, including last one. As the
40962 + * only valid readdir cookie is one obtained as direntry->offset, we are
40963 + * guaranteed that next readdir request (continuing current one) will have
40964 + * current cid in the highest bits of starting readdir cookie. All d_cursors
40965 + * are hashed into per-super-block hash table by (oid, cid) key.
40966 + *
40967 + * In addition d_cursors are placed into per-super-block radix tree where they
40968 + * are keyed by oid alone. This is necessary to efficiently remove them during
40969 + * rmdir.
40970 + *
40971 + * At last, currently unused d_cursors are linked into special list. This list
40972 + * is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
40973 + *
40974 + */
40975 +
40976 +
40977 +/*
40978 + * prepare for readdir: attach readdir state of @f and rewind @tap to it.
40979 + */
40980 +static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
40981 +{
40982 +       reiser4_file_fsdata *fsdata;
40983 +       struct inode *inode;
40984 +       int ret;
40985 +
40986 +       assert("nikita-1359", f != NULL);
40987 +       inode = f->f_dentry->d_inode;
40988 +       assert("nikita-1360", inode != NULL);
40989 +
40990 +       if (!S_ISDIR(inode->i_mode))
40991 +               return RETERR(-ENOTDIR);
40992 +
40993 +       /* try to find detached readdir state */
40994 +       ret = try_to_attach_fsdata(f, inode);
40995 +       if (ret != 0)
40996 +               return ret;
40997 +
40998 +       fsdata = reiser4_get_file_fsdata(f);
40999 +       assert("nikita-2571", fsdata != NULL);
41000 +       if (IS_ERR(fsdata))
41001 +               return PTR_ERR(fsdata);
41002 +
41003 +       /* under the inode spin lock, make sure this file descriptor is on the
41004 +        * readdir list hanging off directory inode. This list is used to scan
41005 +        * "readdirs-in-progress" while inserting or removing names. */
41006 +       spin_lock_inode(inode);
41007 +       *pos = &fsdata->dir.readdir;
41008 +       if (list_empty_careful(&fsdata->dir.linkage))
41009 +               list_add(&fsdata->dir.linkage, get_readdir_list(inode));
41010 +       spin_unlock_inode(inode);
41011 +
41012 +       /* move @tap to the current position */
41013 +       return dir_rewind(f, *pos, tap);
41014 +}
41015 +
41016 +/* this is implementation of vfs's llseek method of struct file_operations for
41017 +   typical directory. Returns what default_llseek() returned, or error code.
41018 +   See comment before readdir_common() for explanation.
41019 +*/
41020 +loff_t llseek_common_dir(struct file * file, loff_t off, int origin)
41021 +{
41022 +       reiser4_context *ctx;
41023 +       loff_t result;
41024 +       struct inode *inode;
41025 +
41026 +       inode = file->f_dentry->d_inode;
41027 +
41028 +       ctx = init_context(inode->i_sb);
41029 +       if (IS_ERR(ctx))
41030 +               return PTR_ERR(ctx);
41031 +
41032 +       mutex_lock(&inode->i_mutex);
41033 +
41034 +       /* update ->f_pos */
41035 +       result = default_llseek(file, off, origin);
41036 +       if (result >= 0) {
41037 +               int ff;         /* result of dir_readdir_init() */
41038 +               coord_t coord;
41039 +               lock_handle lh;
41040 +               tap_t tap;
41041 +               readdir_pos *pos;
41042 +
41043 +               coord_init_zero(&coord);
41044 +               init_lh(&lh);
41045 +               tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
41046 +               /* bring readdir position in line with the new ->f_pos */
41047 +               ff = dir_readdir_init(file, &tap, &pos);
41048 +               detach_fsdata(file);    /* NOTE(review): repeated below */
41049 +               if (ff != 0)
41050 +                       result = (loff_t) ff;
41051 +               tap_done(&tap);
41052 +       }
41053 +       detach_fsdata(file);
41054 +       mutex_unlock(&inode->i_mutex);
41055 +
41056 +       reiser4_exit_context(ctx);
41057 +       return result;
41058 +}
41059 +
41060 +/* this is common implementation of vfs's readdir method of struct
41061 +   file_operations
41062 +
41063 +   readdir problems:
41064 +
41065 +   readdir(2)/getdents(2) interface is based on implicit assumption that
41066 +   readdir can be restarted from any particular point by supplying file system
41067 +   with off_t-full of data. That is, file system fills ->d_off field in struct
41068 +   dirent and later user passes ->d_off to the seekdir(3), which is, actually,
41069 +   implemented by glibc as lseek(2) on directory.
41070 +
41071 +   Reiser4 cannot restart readdir from 64 bits of data, because two last
41072 +   components of the key of directory entry are unknown, which gives 128 bits:
41073 +   locality and type fields in the key of directory entry are always known, to
41074 +   start readdir() from given point objectid and offset fields have to be
41075 +   filled.
41076 +
41077 +   Traditional UNIX API for scanning through directory
41078 +   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on the
41079 +   assumption that directory is structured very much like regular file, in
41080 +   particular, it is implied that each name within given directory (directory
41081 +   entry) can be uniquely identified by scalar offset and that such offset is
41082 +   stable across the life-time of the name it identifies.
41083 +
41084 +   This is manifestly not so for reiser4. In reiser4 the only stable unique
41085 +   identifier for the directory entry is its key that doesn't fit into
41086 +   seekdir/telldir API.
41087 +
41088 +   solution:
41089 +
41090 +   Within each file descriptor participating in readdir-ing of directory
41091 +   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
41092 +   the "current" directory entry that file descriptor looks at. It contains a
41093 +   key of directory entry (plus some additional info to deal with non-unique
41094 +   keys that we wouldn't dwell onto here) and a logical position of this
41095 +   directory entry starting from the beginning of the directory, that is
41096 +   ordinal number of this entry in the readdir order.
41097 +
41098 +   Obviously this logical position is not stable in the face of directory
41099 +   modifications. To work around this, on each addition or removal of directory
41100 +   entry all file descriptors for directory inode are scanned and their
41101 +   readdir_pos are updated accordingly (adjust_dir_pos()).
41102 +*/
41103 +int readdir_common(struct file *f /* directory file being read */ ,
41104 +                  void *dirent /* opaque data passed to us by VFS */ ,
41105 +                  filldir_t filld /* filler function passed to us by VFS */ )
41106 +{
41107 +       reiser4_context *ctx;
41108 +       int result;
41109 +       struct inode *inode;
41110 +       coord_t coord;
41111 +       lock_handle lh;
41112 +       tap_t tap;
41113 +       readdir_pos *pos;
41114 +
41115 +       assert("nikita-1359", f != NULL);
41116 +       inode = f->f_dentry->d_inode;
41117 +       assert("nikita-1360", inode != NULL);
41118 +
41119 +       if (!S_ISDIR(inode->i_mode))
41120 +               return RETERR(-ENOTDIR);
41121 +
41122 +       ctx = init_context(inode->i_sb);
41123 +       if (IS_ERR(ctx))
41124 +               return PTR_ERR(ctx);
41125 +
41126 +       coord_init_zero(&coord);
41127 +       init_lh(&lh);
41128 +       tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
41129 +
41130 +       reiser4_readdir_readahead_init(inode, &tap);
41131 +
41132 +      repeat:
41133 +       result = dir_readdir_init(f, &tap, &pos);
41134 +       if (result == 0) {
41135 +               result = tap_load(&tap);
41136 +               /* scan entries one by one feeding them to @filld */
41137 +               while (result == 0) {
41138 +                       coord_t *coord;
41139 +
41140 +                       coord = tap.coord;
41141 +                       assert("nikita-2572", coord_is_existing_unit(coord));
41142 +                       assert("nikita-3227", is_valid_dir_coord(inode, coord));
41143 +
41144 +                       result = feed_entry(f, pos, &tap, filld, dirent);
41145 +                       if (result > 0) {
41146 +                               break;
41147 +                       } else if (result == 0) {
41148 +                               ++f->f_pos;
41149 +                               result = go_next_unit(&tap);
41150 +                               if (result == -E_NO_NEIGHBOR ||
41151 +                                   result == -ENOENT) {
41152 +                                       result = 0;
41153 +                                       break;
41154 +                               } else if (result == 0) {
41155 +                                       if (is_valid_dir_coord(inode, coord))
41156 +                                               move_entry(pos, coord);
41157 +                                       else
41158 +                                               break;
41159 +                               }
41160 +                       } else if (result == -E_REPEAT) {
41161 +                               /* feed_entry() had to restart. */
41162 +                               ++f->f_pos;
41163 +                               tap_relse(&tap);
41164 +                               goto repeat;
41165 +                       } else
41166 +                               warning("vs-1617",
41167 +                                       "readdir_common: unexpected error %d",
41168 +                                       result);
41169 +               }
41170 +               tap_relse(&tap);
41171 +
41172 +               if (result >= 0)
41173 +                       f->f_version = inode->i_version;
41174 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
41175 +               result = 0;
41176 +       tap_done(&tap);
41177 +       detach_fsdata(f);
41178 +
41179 +       /* try to update directory's atime; %llu needs an explicit cast */
41180 +       if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
41181 +                              BA_CAN_COMMIT) != 0)
41182 +               warning("", "failed to update atime on readdir: %llu",
41183 +                       (unsigned long long)get_inode_oid(inode));
41184 +       else
41185 +               file_accessed(f);
41186 +
41187 +       context_set_commit_async(ctx);
41188 +       reiser4_exit_context(ctx);
41189 +
41190 +       return (result <= 0) ? result : 0;
41191 +}
41192 +
41193 +/*
41194 + * Local variables:
41195 + * c-indentation-style: "K&R"
41196 + * mode-name: "LC"
41197 + * c-basic-offset: 8
41198 + * tab-width: 8
41199 + * fill-column: 79
41200 + * End:
41201 + */
41202 diff --git a/fs/reiser4/plugin/file_plugin_common.c b/fs/reiser4/plugin/file_plugin_common.c
41203 new file mode 100644
41204 index 0000000..33f649c
41205 --- /dev/null
41206 +++ b/fs/reiser4/plugin/file_plugin_common.c
41207 @@ -0,0 +1,929 @@
41208 +/* Copyright 2005 by Hans Reiser, licensing governed by
41209 +   reiser4/README */
41210 +
41211 +/* this file contains typical implementations for most of methods of
41212 +   file plugin
41213 +*/
41214 +
41215 +#include "../inode.h"
41216 +#include "object.h"
41217 +#include "../safe_link.h"
41218 +
41219 +#include <linux/quotaops.h>
41220 +
41221 +static int insert_new_sd(struct inode *inode);
41222 +static int update_sd(struct inode *inode);
41223 +
41224 +/* common implementation of write_sd_by_inode method of file plugin: stat
41225 +   data is inserted if the object has none yet, and updated otherwise
41226 + */
41227 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
41228 +{
41229 +       int ret;
41230 +
41231 +       assert("nikita-730", inode != NULL);
41232 +
41233 +       /* REISER4_NO_SD is set while object doesn't have stat-data yet */
41234 +       ret = inode_get_flag(inode, REISER4_NO_SD) ?
41235 +           insert_new_sd(inode) : update_sd(inode);
41236 +
41237 +       /* failures other than "name is too long" and "out of memory" are
41238 +          worth a warning */
41239 +       if (!(ret == 0 || ret == -ENAMETOOLONG || ret == -ENOMEM))
41240 +               warning("nikita-2221", "Failed to save sd for %llu: %i",
41241 +                       (unsigned long long)get_inode_oid(inode), ret);
41242 +       return ret;
41243 +}
41244 +
41245 +/* common implementation of key_by_inode method of file plugin: fills @key
41246 +   with KEY_BODY_MINOR key of @inode at offset @off. Always returns 0. */
41247 +int key_by_inode_and_offset_common(struct inode *inode, loff_t off,
41248 +                                  reiser4_key * key)
41249 +{
41250 +       reiser4_key_init(key);
41251 +       set_key_type(key, KEY_BODY_MINOR);
41252 +       set_key_offset(key, (__u64) off);
41253 +       /* FIXME: inode->i_ino */
41254 +       set_key_objectid(key, get_inode_oid(inode));
41255 +       set_key_ordering(key, get_inode_ordering(inode));
41256 +       set_key_locality(key, reiser4_inode_data(inode)->locality_id);
41257 +       return 0;
41258 +}
41259 +
41260 +/* common implementation of set_plug_in_inode method of file plugin: sets up
41261 +   in-core fields of newly created @object. Always returns 0. */
41262 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
41263 +                            struct inode *parent /* parent object */ ,
41264 +                            reiser4_object_create_data * data  /* creational
41265 +                                                                * data */ )
41266 +{
41267 +       __u64 extensions;
41268 +       int bsd_gid;
41269 +
41270 +       object->i_mode = data->mode;
41271 +       /* this should be plugin decision */
41272 +       object->i_uid = current->fsuid;
41273 +       object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
41274 +
41275 +       /* group comes from parent either with BSD style group-id assignment
41276 +          (see description of bsdgroups ext2 mount option in mount's manual
41277 +          page) or when parent directory has sgid bit */
41278 +       bsd_gid = reiser4_is_set(object->i_sb, REISER4_BSD_GID) != 0;
41279 +       if (bsd_gid || (parent->i_mode & S_ISGID))
41280 +               object->i_gid = parent->i_gid;
41281 +       else
41282 +               object->i_gid = current->fsgid;
41283 +       /* without bsdgroups, sgid is inherited by sub-directories */
41284 +       if (!bsd_gid && (parent->i_mode & S_ISGID) && S_ISDIR(object->i_mode))
41285 +               object->i_mode |= S_ISGID;
41286 +
41287 +       /* this object doesn't have stat-data yet */
41288 +       inode_set_flag(object, REISER4_NO_SD);
41289 +#if 0
41290 +       /* this is now called after all inode plugins are initialized:
41291 +          do_create_vfs_child after adjust_to_parent */
41292 +       /* setup inode and file-operations for this inode */
41293 +       setup_inode_ops(object, data);
41294 +#endif
41295 +       object->i_nlink = 0;
41296 +       seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41297 +       extensions = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41298 +       if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41299 +               extensions |= (1 << LARGE_TIMES_STAT);
41300 +
41301 +       reiser4_inode_data(object)->extmask = extensions;
41302 +       return 0;
41303 +}
41304 +
41305 +/* this is common implementation of adjust_to_parent method of file plugin for
41306 +   regular files. Returns 0, or error code of the first failed grab_plugin()
41307 + */
41308 +int adjust_to_parent_common(struct inode *object /* new object */ ,
41309 +                           struct inode *parent /* parent directory */ ,
41310 +                           struct inode *root /* root directory */ )
41311 +{
41312 +       int result;
41313 +       assert("nikita-2165", object != NULL);
41314 +       if (parent == NULL)
41315 +               parent = root;
41316 +       assert("nikita-2069", parent != NULL);
41317 +       /* inherit missing plugins from parent; do not hide failures */
41318 +       result = grab_plugin(object, parent, PSET_FILE);
41319 +       if (result == 0)
41320 +               result = grab_plugin(object, parent, PSET_SD);
41321 +       if (result == 0)
41322 +               result = grab_plugin(object, parent, PSET_FORMATTING);
41323 +       if (result == 0)
41324 +               result = grab_plugin(object, parent, PSET_PERM);
41325 +       return result;
41326 +}
41327 +
41328 +/* common implementation of adjust_to_parent method of file plugin for
41329 +   typical directories: plugin set members below PSET_LAST are inherited
41330 + */
41331 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41332 +                               struct inode *parent /* parent directory */ ,
41333 +                               struct inode *root /* root directory */ )
41334 +{
41335 +       pset_member memb;
41336 +       int ret;
41337 +
41338 +       assert("nikita-2166", object != NULL);
41339 +       if (parent == NULL)
41340 +               parent = root;
41341 +       assert("nikita-2167", parent != NULL);
41342 +
41343 +       /*
41344 +        * inherit missing plugins from parent, give up on the first failure
41345 +        */
41346 +
41347 +       ret = 0;
41348 +       for (memb = 0; ret == 0 && memb < PSET_LAST; ++memb)
41349 +               ret = grab_plugin(object, parent, memb);
41350 +
41351 +       return ret;
41352 +}
41353 +
41354 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41355 +                                  struct inode *parent /* parent directory */,
41356 +                                  struct inode *root /* root directory */)
41357 +{
41358 +       int result;
41359 +       result = adjust_to_parent_common(object, parent, root);
41360 +       if (result)
41361 +               return result;
41362 +       assert("edward-1416", parent != NULL);
41363 +
41364 +       /* inherit cryptcompress specific plugins; first failure is returned */
41365 +       if ((result = grab_plugin(object, parent, PSET_CLUSTER)) == 0 &&
41366 +           (result = grab_plugin(object, parent, PSET_CIPHER)) == 0 &&
41367 +           (result = grab_plugin(object, parent, PSET_DIGEST)) == 0 &&
41368 +           (result = grab_plugin(object, parent, PSET_COMPRESSION)) == 0)
41369 +               result = grab_plugin(object, parent, PSET_COMPRESSION_MODE);
41370 +       return result;
41371 +}
41372 +
41373 +/* common implementation of create_object method of file plugin: grabs disk
41374 +   space estimated for creation and saves stat data of @object */
41375 +int create_object_common(struct inode *object, struct inode *parent UNUSED_ARG,
41376 +                        reiser4_object_create_data * data UNUSED_ARG)
41377 +{
41378 +       reiser4_block_nr needed;
41379 +
41380 +       assert("nikita-744", object != NULL);
41381 +       assert("nikita-745", parent != NULL);
41382 +       assert("nikita-747", data != NULL);
41383 +       assert("nikita-748", inode_get_flag(object, REISER4_NO_SD));
41384 +
41385 +       needed = estimate_create_common(object);
41386 +       if (reiser4_grab_space(needed, BA_CAN_COMMIT) != 0)
41387 +               return RETERR(-ENOSPC);
41388 +       return write_sd_by_inode_common(object);
41389 +}
41390 +
41391 +static int common_object_delete_no_reserve(struct inode *inode);
41392 +
41393 +/**
41394 + * delete_object_common - delete_object of file_plugin
41395 + * @inode: inode to be deleted
41396 + *
41397 + * This is common implementation of delete_object method of file_plugin. It
41398 + * applies to objects whose deletion consists of removing two items - stat
41399 + * data and safe-link. Nothing is done if @inode has no stat data.
41400 + */
41401 +int delete_object_common(struct inode *inode)
41402 +{
41403 +       reiser4_block_nr needed;
41404 +       int ret;
41405 +
41406 +       assert("nikita-1477", inode != NULL);
41407 +       /* FIXME: if file body deletion failed (i/o error, for instance),
41408 +          inode->i_size can be != 0 here */
41409 +       assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41410 +       assert("nikita-3421", inode->i_nlink == 0);
41411 +
41412 +       /* object without stat data has nothing to remove from the tree */
41413 +       if (inode_get_flag(inode, REISER4_NO_SD))
41414 +               return 0;
41415 +
41416 +       /* grab space which is needed to remove 2 items from the tree:
41417 +          stat data and safe-link */
41418 +       needed = 2 * estimate_one_item_removal(tree_by_inode(inode));
41419 +       if (reiser4_grab_space_force(needed, BA_RESERVED | BA_CAN_COMMIT))
41420 +               return RETERR(-ENOSPC);
41421 +
41422 +       ret = common_object_delete_no_reserve(inode);
41423 +
41424 +       return ret;
41425 +}
41426 +
41427 +/**
41428 + * delete_directory_common - delete_object of file_plugin
41429 + * @inode: inode to be deleted
41430 + *
41431 + * Common implementation of delete_object method of file_plugin for typical
41432 + * directory: cursors attached to @inode are killed, done method of dir_plugin
41433 + * is called (to remove "."), then stat data and safe-link are removed.
41434 + */
41435 +int delete_directory_common(struct inode *inode)
41436 +{
41437 +       dir_plugin *dplug;
41438 +       int ret;
41439 +
41440 +       assert("", (get_current_context() &&
41441 +                   get_current_context()->trans->atom == NULL));
41442 +
41443 +       dplug = inode_dir_plugin(inode);
41444 +       assert("vs-1101", dplug && dplug->done);
41445 +
41446 +       /* kill cursors which might be attached to inode */
41447 +       kill_cursors(inode);
41448 +
41449 +       /* grab space enough for removing two items */
41450 +       if (reiser4_grab_space(2 *
41451 +                              estimate_one_item_removal(tree_by_inode(inode)),
41452 +                              BA_RESERVED | BA_CAN_COMMIT) != 0)
41453 +               return RETERR(-ENOSPC);
41454 +
41455 +       ret = dplug->done(inode);
41456 +       if (ret != 0)
41457 +               return ret;
41458 +       return common_object_delete_no_reserve(inode);
41459 +}
41460 +
41461 +/* Common add_link method of file plugin: increments ->i_nlink of @object and
41462 +   sets its ->i_ctime to CURRENT_TIME. @parent is unused. Always returns 0. */
41463 +int add_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
41464 +{
41465 +       /*
41466 +        * increment ->i_nlink and update ->i_ctime
41467 +        */
41468 +
41469 +       INODE_INC_FIELD(object, i_nlink);
41470 +       object->i_ctime = CURRENT_TIME;
41471 +       return 0;
41472 +}
41473 +
41474 +/* Common rem_link method of file plugin: decrements ->i_nlink of @object
41475 +   (asserted to be > 0) and sets ->i_ctime to CURRENT_TIME. Returns 0. */
41476 +int rem_link_common(struct inode *object, struct inode *parent UNUSED_ARG)
41477 +{
41478 +       assert("nikita-2021", object != NULL);
41479 +       assert("nikita-2163", object->i_nlink > 0);
41480 +
41481 +       /*
41482 +        * decrement ->i_nlink and update ->i_ctime
41483 +        */
41484 +
41485 +       INODE_DEC_FIELD(object, i_nlink);
41486 +       object->i_ctime = CURRENT_TIME;
41487 +       return 0;
41488 +}
41489 +
41490 +/* Common rem_link method of file plugin for a typical directory: decrements
41491 +   ->i_nlink of @object and, if that leaves it at 1, decrements it once more
41492 +   to 0; then sets ->i_ctime to CURRENT_TIME. Always returns 0. */
41493 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41494 +{
41495 +       assert("nikita-20211", object != NULL);
41496 +       assert("nikita-21631", object->i_nlink > 0);
41497 +
41498 +       /*
41499 +        * decrement ->i_nlink (twice if it drops to 1) and update ->i_ctime
41500 +        */
41501 +       INODE_DEC_FIELD(object, i_nlink);
41502 +       if (object->i_nlink == 1) /* presumably the one link left is the directory's own "." - TODO confirm */
41503 +               INODE_DEC_FIELD(object, i_nlink);
41504 +       object->i_ctime = CURRENT_TIME;
41505 +       return 0;
41506 +}
41507 +
41508 +/* Common owns_item method of file plugin: returns non-zero iff @coord is an
41509 +   existing item whose key has the same objectid as @inode's stat-data key */
41510 +int owns_item_common(const struct inode *inode,        /* object to check
41511 +                                                * against */
41512 +                    const coord_t * coord /* coord to check */ )
41513 +{
41514 +       reiser4_key item_key;
41515 +       reiser4_key file_key;
41516 +
41517 +       assert("nikita-760", inode != NULL);
41518 +       assert("nikita-761", coord != NULL);
41519 +
41520 +       return coord_is_existing_item(coord) &&
41521 +           (get_key_objectid(build_sd_key(inode, &file_key)) ==
41522 +            get_key_objectid(item_key_by_coord(coord, &item_key)));
41523 +}
41524 +
41525 +/* Common owns_item method of file plugin for a typical directory: a
41526 +   directory entry item is owned if its key locality equals the oid of
41527 +   @inode; any other item type is checked by owns_item_common() */
41528 +int owns_item_common_dir(const struct inode *inode,    /* object to check against */
41529 +                        const coord_t * coord /* coord of item to check */ )
41530 +{
41531 +       reiser4_key item_key;
41532 +
41533 +       assert("nikita-1335", inode != NULL);
41534 +       assert("nikita-1334", coord != NULL);
41535 +
41536 +       if (item_type_by_coord(coord) == DIR_ENTRY_ITEM_TYPE)
41537 +               return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41538 +                   get_inode_oid(inode);
41539 +       else
41540 +               return owns_item_common(inode, coord);
41541 +}
41542 +
41543 +/* Common can_add_link method of file plugin: returns non-zero if one more
41544 +   hard link to @object can be added, i.e. if incrementing ->i_nlink would
41545 +   not wrap around to 0 */
41546 +int can_add_link_common(const struct inode *object /* object to check */ )
41547 +{
41548 +       assert("nikita-732", object != NULL);
41549 +
41550 +       /* inode->i_nlink is unsigned int, so just check for integer
41551 +          overflow */
41552 +       return object->i_nlink + 1 != 0;
41553 +}
41554 +
41555 +/* Common can_rem_link method of file plugin for a typical directory:
41556 +   returns the logical negation of is_dir_empty(@inode)
41557 +*/
41558 +int can_rem_link_common_dir(const struct inode *inode)
41559 +{
41560 +       /* is_dir_empty() returns 0 if dir is empty */
41561 +       return !is_dir_empty(inode);
41562 +}
41563 +
41564 +/* Common detach method of file plugin for a typical directory: forwards to
41565 +   the ->detach() method of @child's directory plugin and returns its result
41566 +*/
41567 +int detach_common_dir(struct inode *child, struct inode *parent)
41568 +{
41569 +       dir_plugin *dplug;
41570 +
41571 +       dplug = inode_dir_plugin(child);
41572 +       assert("nikita-2883", dplug != NULL);
41573 +       assert("nikita-2884", dplug->detach != NULL);
41574 +       return dplug->detach(child, parent);
41575 +}
41576 +
41577 +/* Common bind method of file plugin for a typical directory: forwards to
41578 +   the ->attach() method of @child's directory plugin and returns its result
41579 +*/
41580 +int bind_common_dir(struct inode *child, struct inode *parent)
41581 +{
41582 +       dir_plugin *dplug;
41583 +
41584 +       dplug = inode_dir_plugin(child);
41585 +       assert("nikita-2646", dplug != NULL);
41586 +       return dplug->attach(child, parent); /* NOTE(review): ->attach is not asserted non-NULL, unlike ->detach in detach_common_dir() */
41587 +}
41588 +
41589 +static int process_truncate(struct inode *, __u64 size);
41590 +
41591 +/* Common safelink method of file plugin: SAFE_UNLINK needs no work here,
41592 +   SAFE_TRUNCATE is handled by process_truncate(), anything else is -EIO */
41593 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41594 +{
41595 +       int result;
41596 +
41597 +       assert("vs-1705", get_current_context()->trans->atom == NULL);
41598 +       if (link == SAFE_UNLINK)
41599 +               /* nothing to do. iput() in the caller (process_safelink) will
41600 +                * finish with file */
41601 +               result = 0;
41602 +       else if (link == SAFE_TRUNCATE)
41603 +               result = process_truncate(object, value); /* @value is passed as the new size */
41604 +       else {
41605 +               warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41606 +               result = RETERR(-EIO);
41607 +       }
41608 +       return result;
41609 +}
41610 +
41611 +/* Common estimate.create method of file plugin. Usable when object creation
41612 +   inserts a single item (usually stat data) into the tree: returns
41613 +   estimate_one_insert_item() for the tree of @object
41614 +*/
41615 +reiser4_block_nr estimate_create_common(const struct inode * object)
41616 +{
41617 +       return estimate_one_insert_item(tree_by_inode(object));
41618 +}
41619 +
41620 +/* Common estimate.create method of file plugin for a typical directory.
41621 +   Usable when directory creation inserts two items (usually stat data and
41622 +   the item containing "." and "..") into the tree: returns twice
41623 +   estimate_one_insert_item() for the tree of @object
41624 +*/
41625 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41626 +{
41627 +       return 2 * estimate_one_insert_item(tree_by_inode(object));
41628 +}
41629 +
41630 +/* Common estimate.update method of file plugin. Usable when a stat data
41631 +   update does no more than insert a unit into the stat data item, which is
41632 +   probably true for most cases
41633 +*/
41634 +reiser4_block_nr estimate_update_common(const struct inode * inode)
41635 +{
41636 +       return estimate_one_insert_into_item(tree_by_inode(inode));
41637 +}
41638 +
41639 +/* Common estimate.unlink method of file plugin: both arguments are unused
41640 +   and the estimate is always 0 */
41641 +reiser4_block_nr
41642 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
41643 +                      const struct inode * parent UNUSED_ARG)
41644 +{
41645 +       return 0;
41646 +}
41647 +
41648 +/* Common estimate.unlink method of file plugin for a typical directory:
41649 +   forwards to ->estimate.unlink() of @object's directory plugin
41650 +*/
41651 +reiser4_block_nr
41652 +estimate_unlink_common_dir(const struct inode * object,
41653 +                          const struct inode * parent)
41654 +{
41655 +       dir_plugin *dplug;
41656 +
41657 +       dplug = inode_dir_plugin(object);
41658 +       assert("nikita-2888", dplug != NULL);
41659 +       assert("nikita-2887", dplug->estimate.unlink != NULL);
41660 +       return dplug->estimate.unlink(object, parent);
41661 +}
41662 +
41663 +char *wire_write_common(struct inode *inode, char *start)
41664 +{
41665 +       return build_inode_onwire(inode, start); /* thin wrapper: result of build_inode_onwire() is returned unchanged */
41666 +}
41667 +
41668 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41669 +{
41670 +       return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id); /* thin wrapper: hands @addr and &obj->u.std.key_id to the helper */
41671 +}
41672 +
41673 +struct dentry *wire_get_common(struct super_block *sb,
41674 +                              reiser4_object_on_wire * obj)
41675 +{
41676 +       struct inode *inode;
41677 +       struct dentry *dentry;
41678 +       reiser4_key key;
41679 +
41680 +       extract_key_from_id(&obj->u.std.key_id, &key); /* key comes from the id stored in @obj */
41681 +       inode = reiser4_iget(sb, &key, 1);
41682 +       if (!IS_ERR(inode)) {
41683 +               reiser4_iget_complete(inode);
41684 +               dentry = d_alloc_anon(inode);
41685 +               if (dentry == NULL) {
41686 +                       iput(inode); /* no dentry was created: drop the inode and report -ENOMEM */
41687 +                       dentry = ERR_PTR(-ENOMEM);
41688 +               } else
41689 +                       dentry->d_op = &get_super_private(sb)->ops.dentry;
41690 +       } else if (PTR_ERR(inode) == -ENOENT)
41691 +               /*
41692 +                * inode wasn't found at the key encoded in the file
41693 +                * handle. Hence, file handle is stale.
41694 +                */
41695 +               dentry = ERR_PTR(RETERR(-ESTALE));
41696 +       else
41697 +               dentry = (void *)inode; /* any other error pointer is passed through unchanged */
41698 +       return dentry;
41699 +}
41700 +
41701 +int wire_size_common(struct inode *inode)
41702 +{
41703 +       return inode_onwire_size(inode); /* thin wrapper: result of inode_onwire_size() is returned unchanged */
41704 +}
41705 +
41706 +void wire_done_common(reiser4_object_on_wire * obj)
41707 +{
41708 +       /* nothing to do: @obj is not touched */
41709 +}
41710 +
41711 +/* error-reporting helper: warns and prints @key unless @code is -ENOMEM; @inode is not used */
41712 +static void key_warning(const reiser4_key * key /* key to print */ ,
41713 +                       const struct inode *inode,
41714 +                       int code /* error code to print */ )
41715 +{
41716 +       assert("nikita-716", key != NULL);
41717 +
41718 +       if (code != -ENOMEM) {
41719 +               warning("nikita-717", "Error for inode %llu (%i)",
41720 +                       (unsigned long long)get_key_objectid(key), code);
41721 +               print_key("for key", key);
41722 +       }
41723 +}
41724 +
41725 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41726 +#if REISER4_DEBUG
41727 +static void
41728 +check_inode_seal(const struct inode *inode,
41729 +                const coord_t * coord, const reiser4_key * key)
41730 +{
41731 +       reiser4_key unit_key;
41732 +
41733 +       unit_key_by_coord(coord, &unit_key); /* asserts below: unit key at @coord equals @key, and @key's objectid is the inode's oid */
41734 +       assert("nikita-2752",
41735 +              WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
41736 +       assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
41737 +}
41738 +
41739 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
41740 +{
41741 +       reiser4_key ukey;
41742 +
41743 +       coord_clear_iplug(coord);
41744 +       if (zload(coord->node))
41745 +               return; /* node could not be loaded: the check is skipped silently */
41746 +
41747 +       if (!coord_is_existing_unit(coord) ||
41748 +           !item_plugin_by_coord(coord) ||
41749 +           !keyeq(unit_key_by_coord(coord, &ukey), key) ||
41750 +           (znode_get_level(coord->node) != LEAF_LEVEL) ||
41751 +           !item_is_statdata(coord)) {
41752 +               warning("nikita-1901", "Conspicuous seal");
41753 +               print_key("key", key);
41754 +               print_coord("coord", coord, 1);
41755 +               impossible("nikita-2877", "no way");
41756 +       }
41757 +       zrelse(coord->node); /* pairs with the zload() above */
41758 +}
41759 +
41760 +#else /* !REISER4_DEBUG: both checks expand to noop */
41761 +#define check_inode_seal(inode, coord, key) noop
41762 +#define check_sd_coord(coord, key) noop
41763 +#endif /* REISER4_DEBUG */
41764 +
41765 +/* insert new stat-data into tree. Called with inode state
41766 +    locked. Return inode state locked. Returns 0 or a negative error code. */
41767 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
41768 +{
41769 +       int result;
41770 +       reiser4_key key;
41771 +       coord_t coord;
41772 +       reiser4_item_data data;
41773 +       char *area;
41774 +       reiser4_inode *ref;
41775 +       lock_handle lh;
41776 +       oid_t oid;
41777 +
41778 +       assert("nikita-723", inode != NULL);
41779 +       assert("nikita-3406", inode_get_flag(inode, REISER4_NO_SD));
41780 +
41781 +       ref = reiser4_inode_data(inode);
41782 +       spin_lock_inode(inode); /* NOTE(review): header says "called with inode state locked" - confirm which lock that means */
41783 +
41784 +       if (ref->plugin_mask != 0)
41785 +               /* inode has non-standard plugins */
41786 +               inode_set_extension(inode, PLUGIN_STAT);
41787 +       /*
41788 +        * prepare specification of new item to be inserted
41789 +        */
41790 +
41791 +       data.iplug = inode_sd_plugin(inode);
41792 +       data.length = data.iplug->s.sd.save_len(inode);
41793 +       spin_unlock_inode(inode);
41794 +
41795 +       data.data = NULL;
41796 +       data.user = 0;
41797 +/* could be optimized for case where there is only one node format in
41798 + * use in the filesystem, probably there are lots of such
41799 + * places we could optimize for only one node layout.... -Hans */
41800 +       if (data.length > tree_by_inode(inode)->nplug->max_item_size()) {
41801 +               /* This is a crude check: the actual node the insertion
41802 +                  will go into is not known at this point. */
41803 +               return RETERR(-ENAMETOOLONG);
41804 +       }
41805 +       oid = oid_allocate(inode->i_sb);
41806 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
41807 +       if (oid == ABSOLUTE_MAX_OID)
41808 +               return RETERR(-EOVERFLOW);
41809 +
41810 +       set_inode_oid(inode, oid); /* NOTE(review): no oid_release() on the failure paths below - confirm this is intended */
41811 +
41812 +       coord_init_zero(&coord);
41813 +       init_lh(&lh);
41814 +
41815 +       result = insert_by_key(tree_by_inode(inode),
41816 +                              build_sd_key(inode, &key), &data, &coord, &lh,
41817 +                              /* stat data lives on a leaf level */
41818 +                              LEAF_LEVEL, CBK_UNIQUE);
41819 +
41820 +       /* we don't want to re-check that somebody didn't insert
41821 +          stat-data while we were doing io, because if somebody did,
41822 +          insert_by_key() returned error. */
41823 +       /* but what _is_ possible is that plugin for inode's stat-data,
41824 +          list of non-standard plugins or their state would change
41825 +          during io, so that stat-data wouldn't fit into sd. To avoid
41826 +          this race we keep inode_state lock. This lock has to be
41827 +          taken each time you access inode in a way that would cause
41828 +          changes in sd size: changing plugins etc.
41829 +        */
41830 +
41831 +       if (result == IBK_INSERT_OK) {
41832 +               coord_clear_iplug(&coord);
41833 +               result = zload(coord.node);
41834 +               if (result == 0) {
41835 +                       /* have we really inserted stat data? */
41836 +                       assert("nikita-725", item_is_statdata(&coord));
41837 +
41838 +                       /* inode was just created. It is inserted into hash
41839 +                          table, but no directory entry was yet inserted into
41840 +                          parent. So, inode is inaccessible through
41841 +                          ->lookup(). All places that directly grab inode
41842 +                          from hash-table (like old knfsd), should check
41843 +                          IMMUTABLE flag that is set by common_create_child.
41844 +                        */
41845 +                       assert("nikita-3240", data.iplug != NULL);
41846 +                       assert("nikita-3241", data.iplug->s.sd.save != NULL);
41847 +                       area = item_body_by_coord(&coord);
41848 +                       result = data.iplug->s.sd.save(inode, &area);
41849 +                       znode_make_dirty(coord.node);
41850 +                       if (result == 0) {
41851 +                               /* object has stat-data now */
41852 +                               inode_clr_flag(inode, REISER4_NO_SD);
41853 +                               inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41854 +                               /* initialise stat-data seal */
41855 +                               seal_init(&ref->sd_seal, &coord, &key);
41856 +                               ref->sd_coord = coord;
41857 +                               check_inode_seal(inode, &coord, &key);
41858 +                       } else if (result != -ENOMEM)
41859 +                               /*
41860 +                                * convert any other error code to -EIO to
41861 +                                * avoid confusing user level with unexpected
41862 +                                * errors.
41863 +                                */
41864 +                               result = RETERR(-EIO);
41865 +                       zrelse(coord.node); /* pairs with the zload() above */
41866 +               }
41867 +       }
41868 +       done_lh(&lh);
41869 +
41870 +       if (result != 0)
41871 +               key_warning(&key, inode, result);
41872 +       else
41873 +               oid_count_allocated();
41874 +
41875 +       return result;
41876 +}
41877 +
41878 +/* find sd of inode in a tree by @key; warn about failures unless @silent */
41879 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
41880 +             znode_lock_mode lock_mode /* lock mode */ ,
41881 +             coord_t * coord /* resulting coord */ ,
41882 +             lock_handle * lh /* resulting lock handle */ ,
41883 +             const reiser4_key * key /* key of the stat data to look for */ ,
41884 +             int silent /* non-zero: do not warn on failure */ )
41885 +{
41886 +       int result;
41887 +       __u32 flags;
41888 +
41889 +       assert("nikita-1692", inode != NULL);
41890 +       assert("nikita-1693", coord != NULL);
41891 +       assert("nikita-1694", key != NULL);
41892 +
41893 +       /* look for the object's stat data in a tree.
41894 +          This returns in "node" pointer to a locked znode and in "pos"
41895 +          position of an item found in node. Both are only valid if
41896 +          coord_found is returned. */
41897 +       flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
41898 +       flags |= CBK_UNIQUE;
41899 +       /*
41900 +        * traverse tree to find stat data. We cannot use vroot here, because
41901 +        * it only covers _body_ of the file, and stat data don't belong
41902 +        * there.
41903 +        */
41904 +       result = coord_by_key(tree_by_inode(inode),
41905 +                             key,
41906 +                             coord,
41907 +                             lh,
41908 +                             lock_mode,
41909 +                             FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
41910 +       if (REISER4_DEBUG && result == 0)
41911 +               check_sd_coord(coord, key);
41912 +
41913 +       if (result != 0 && !silent)
41914 +               key_warning(key, inode, result);
41915 +       return result;
41916 +}
41917 +
41918 +static int
41919 +locate_inode_sd(struct inode *inode,
41920 +               reiser4_key * key, coord_t * coord, lock_handle * lh)
41921 +{
41922 +       reiser4_inode *state;
41923 +       seal_t seal;
41924 +       int result;
41925 +
41926 +       assert("nikita-3483", inode != NULL);
41927 +
41928 +       state = reiser4_inode_data(inode);
41929 +       spin_lock_inode(inode); /* copy the cached coord and seal while holding the inode spinlock */
41930 +       *coord = state->sd_coord;
41931 +       coord_clear_iplug(coord);
41932 +       seal = state->sd_seal;
41933 +       spin_unlock_inode(inode);
41934 +
41935 +       build_sd_key(inode, key);
41936 +       if (seal_is_set(&seal)) {
41937 +               /* first, try to use seal */
41938 +               result = seal_validate(&seal,
41939 +                                      coord,
41940 +                                      key,
41941 +                                      lh, ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
41942 +               if (result == 0)
41943 +                       check_sd_coord(coord, key);
41944 +       } else
41945 +               result = -E_REPEAT; /* no seal: force the tree lookup below */
41946 +
41947 +       if (result != 0) { /* seal absent or not validated: fall back to lookup_sd() */
41948 +               coord_init_zero(coord);
41949 +               result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
41950 +       }
41951 +       return result;
41952 +}
41953 +
41954 +/* update stat-data at @coord: resize the item if needed, save inode into it, re-seal */
41955 +static int
41956 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
41957 +            lock_handle * lh)
41958 +{
41959 +       int result;
41960 +       reiser4_item_data data;
41961 +       char *area;
41962 +       reiser4_inode *state;
41963 +       znode *loaded;
41964 +
41965 +       state = reiser4_inode_data(inode);
41966 +
41967 +       coord_clear_iplug(coord);
41968 +       result = zload(coord->node);
41969 +       if (result != 0)
41970 +               return result;
41971 +       loaded = coord->node; /* remember which node was zload()ed so it can be zrelse()d later */
41972 +
41973 +       spin_lock_inode(inode);
41974 +       assert("nikita-728", inode_sd_plugin(inode) != NULL);
41975 +       data.iplug = inode_sd_plugin(inode);
41976 +
41977 +       /* if inode has non-standard plugins, add appropriate stat data
41978 +        * extension */
41979 +       if (state->plugin_mask != 0)
41980 +               inode_set_extension(inode, PLUGIN_STAT);
41981 +
41982 +       /* data.length is how much space to add to (or remove
41983 +          from if negative) sd */
41984 +       if (!inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
41985 +               /* recalculate stat-data length */
41986 +               data.length =
41987 +                   data.iplug->s.sd.save_len(inode) -
41988 +                   item_length_by_coord(coord);
41989 +               inode_set_flag(inode, REISER4_SDLEN_KNOWN);
41990 +       } else
41991 +               data.length = 0;
41992 +       spin_unlock_inode(inode);
41993 +
41994 +       /* if on-disk stat data is of different length than required
41995 +          for this inode, resize it */
41996 +       if (data.length != 0) {
41997 +               data.data = NULL;
41998 +               data.user = 0;
41999 +
42000 +               /* insertion code requires that insertion point (coord) was
42001 +                * between units. */
42002 +               coord->between = AFTER_UNIT;
42003 +               result = resize_item(coord,
42004 +                                    &data, key, lh, COPI_DONT_SHIFT_LEFT);
42005 +               if (result != 0) {
42006 +                       key_warning(key, inode, result);
42007 +                       zrelse(loaded);
42008 +                       return result;
42009 +               }
42010 +               if (loaded != coord->node) {
42011 +                       /* resize_item moved coord to another node. Zload it */
42012 +                       zrelse(loaded);
42013 +                       coord_clear_iplug(coord);
42014 +                       result = zload(coord->node);
42015 +                       if (result != 0)
42016 +                               return result; /* old node already released above; nothing is loaded here */
42017 +                       loaded = coord->node;
42018 +               }
42019 +       }
42020 +
42021 +       area = item_body_by_coord(coord);
42022 +       spin_lock_inode(inode);
42023 +       result = data.iplug->s.sd.save(inode, &area);
42024 +       znode_make_dirty(coord->node);
42025 +
42026 +       /* re-initialise stat-data seal */
42027 +
42028 +       /*
42029 +        * coord.between was possibly skewed from AT_UNIT when stat-data size
42030 +        * was changed and new extensions were pasted into item.
42031 +        */
42032 +       coord->between = AT_UNIT;
42033 +       seal_init(&state->sd_seal, coord, key);
42034 +       state->sd_coord = *coord;
42035 +       spin_unlock_inode(inode);
42036 +       check_inode_seal(inode, coord, key);
42037 +       zrelse(loaded);
42038 +       return result;
42039 +}
42040 +
42041 +/* Update existing stat-data in a tree. Called with inode state locked. Return
42042 +   inode state locked. Returns 0 on success or the error of the failing step. */
42043 +static int update_sd(struct inode *inode /* inode to update sd for */ )
42044 +{
42045 +       int result;
42046 +       reiser4_key key;
42047 +       coord_t coord;
42048 +       lock_handle lh;
42049 +
42050 +       assert("nikita-726", inode != NULL);
42051 +
42052 +       /* no stat-data, nothing to update?! */
42053 +       assert("nikita-3482", !inode_get_flag(inode, REISER4_NO_SD));
42054 +
42055 +       init_lh(&lh);
42056 +
42057 +       result = locate_inode_sd(inode, &key, &coord, &lh);
42058 +       if (result == 0)
42059 +               result = update_sd_at(inode, &coord, &key, &lh);
42060 +       done_lh(&lh); /* lock handle is released whether or not the update ran */
42061 +
42062 +       return result;
42063 +}
42064 +
42065 +/* helper for delete_object_common and delete_directory_common. Removes the
42066 +   object's stat data, releases its oid and deletes the SAFE_UNLINK safe-link.
42067 +   Space for that must be reserved by the caller beforehand. */
42068 +static int
42069 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
42070 +{
42071 +       int result;
42072 +
42073 +       assert("nikita-1477", inode != NULL);
42074 +
42075 +       if (!inode_get_flag(inode, REISER4_NO_SD)) {
42076 +               reiser4_key sd_key;
42077 +
42078 +               DQUOT_FREE_INODE(inode);
42079 +               DQUOT_DROP(inode);
42080 +
42081 +               build_sd_key(inode, &sd_key);
42082 +               result =
42083 +                   cut_tree(tree_by_inode(inode), &sd_key, &sd_key, NULL, 0); /* same key is passed as both ends of the range */
42084 +               if (result == 0) {
42085 +                       inode_set_flag(inode, REISER4_NO_SD);
42086 +                       result = oid_release(inode->i_sb, get_inode_oid(inode));
42087 +                       if (result == 0) {
42088 +                               oid_count_released();
42089 +
42090 +                               result = safe_link_del(tree_by_inode(inode),
42091 +                                                      get_inode_oid(inode),
42092 +                                                      SAFE_UNLINK);
42093 +                       }
42094 +               }
42095 +       } else
42096 +               result = 0; /* inode has no stat data: nothing to remove */
42097 +       return result;
42098 +}
42099 +
42100 +/* helper for safelink_common: calls ->setattr() with ATTR_SIZE | ATTR_CTIME and ia_size = @size */
42101 +static int process_truncate(struct inode *inode, __u64 size)
42102 +{
42103 +       int result;
42104 +       struct iattr attr;
42105 +       file_plugin *fplug;
42106 +       reiser4_context *ctx;
42107 +       struct dentry dentry; /* on-stack dentry: only d_inode is set before ->setattr() is called */
42108 +
42109 +       assert("vs-21", is_in_reiser4_context());
42110 +       ctx = init_context(inode->i_sb);
42111 +       assert("vs-22", !IS_ERR(ctx)); /* NOTE(review): ctx is only checked by this assertion - confirm it cannot fail here */
42112 +
42113 +       attr.ia_size = size;
42114 +       attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
42115 +       fplug = inode_file_plugin(inode); /* NOTE(review): fplug is assigned but not used below */
42116 +
42117 +       mutex_lock(&inode->i_mutex);
42118 +       assert("vs-1704", get_current_context()->trans->atom == NULL);
42119 +       dentry.d_inode = inode;
42120 +       result = inode->i_op->setattr(&dentry, &attr);
42121 +       mutex_unlock(&inode->i_mutex);
42122 +
42123 +       context_set_commit_async(ctx);
42124 +       reiser4_exit_context(ctx);
42125 +
42126 +       return result;
42127 +}
42128 +
42129 +/* Local variables:
42130 +   c-indentation-style: "K&R"
42131 +   mode-name: "LC"
42132 +   c-basic-offset: 8
42133 +   tab-width: 8
42134 +   fill-column: 120
42135 +   End:
42136 +*/
42137 diff --git a/fs/reiser4/plugin/hash.c b/fs/reiser4/plugin/hash.c
42138 new file mode 100644
42139 index 0000000..0e861d0
42140 --- /dev/null
42141 +++ b/fs/reiser4/plugin/hash.c
42142 @@ -0,0 +1,350 @@
42143 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
42144 + * reiser4/README */
42145 +
42146 +/* Hash functions */
42147 +
42148 +#include "../debug.h"
42149 +#include "plugin_header.h"
42150 +#include "plugin.h"
42151 +#include "../super.h"
42152 +#include "../inode.h"
42153 +
42154 +#include <linux/types.h>
42155 +
42156 +/* old rupasov (yura) hash; NOTE(review): results are presumably persisted on disk, so arithmetic must not change - confirm */
42157 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
42158 +                         int len /* @name's length */ )
42159 +{
42160 +       int i;
42161 +       int j;
42162 +       int pow;
42163 +       __u64 a;
42164 +       __u64 c;
42165 +
42166 +       assert("nikita-672", name != NULL);
42167 +       assert("nikita-673", len >= 0);
42168 +
42169 +       for (pow = 1, i = 1; i < len; ++i) /* pow = 10^(len - 1), computed in int */
42170 +               pow = pow * 10;
42171 +
42172 +       if (len == 1)
42173 +               a = name[0] - 48; /* 48 is ASCII '0' */
42174 +       else
42175 +               a = (name[0] - 48) * pow;
42176 +
42177 +       for (i = 1; i < len; ++i) {
42178 +               c = name[i] - 48;
42179 +               for (pow = 1, j = i; j < len - 1; ++j)
42180 +                       pow = pow * 10;
42181 +               a = a + c * pow;
42182 +       }
42183 +       for (; i < 40; ++i) {
42184 +               c = '0' - 48; /* '0' is 48 in ASCII, so c is 0 and this loop adds nothing to a */
42185 +               for (pow = 1, j = i; j < len - 1; ++j)
42186 +                       pow = pow * 10;
42187 +               a = a + c * pow;
42188 +       }
42189 +
42190 +       for (; i < 256; ++i) {
42191 +               c = i; /* here the loop index itself is mixed in, not a byte of @name */
42192 +               for (pow = 1, j = i; j < len - 1; ++j)
42193 +                       pow = pow * 10;
42194 +               a = a + c * pow;
42195 +       }
42196 +
42197 +       a = a << 7;
42198 +       return a;
42199 +}
42200 +
42201 +/* r5 hash: folds each byte of @name up to its terminating NUL into the result */
42202 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
42203 +                    int len UNUSED_ARG /* @name's length */ )
42204 +{
42205 +       __u64 a = 0;
42206 +
42207 +       assert("nikita-674", name != NULL);
42208 +       assert("nikita-675", len >= 0);
42209 +
42210 +       while (*name) { /* stops at the first NUL byte; @len is only checked by the assertion above */
42211 +               a += *name << 4;
42212 +               a += *name >> 4;
42213 +               a *= 11;
42214 +               name++;
42215 +       }
42216 +       return a;
42217 +}
42218 +
42219 +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function
42220 +     H0 = Key
42221 +     Hi = E Mi(Hi-1) + Hi-1
42222 +
42223 +   (see Applied Cryptography, 2nd edition, p448).
42224 +
42225 +   Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
42226 +
42227 +   Jeremy has agreed to the contents of reiserfs/README. -Hans
42228 +
42229 +   This code was blindly upgraded to __u64 by s/__u32/__u64/g.
42230 +*/
42231 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
42232 +                     int len /* @name's length */ )
42233 +{
42234 +       __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
42235 +
42236 +       __u64 h0 = k[0], h1 = k[1];
42237 +       __u64 a, b, c, d;
42238 +       __u64 pad;
42239 +       int i;
42240 +
42241 +       assert("nikita-676", name != NULL);
42242 +       assert("nikita-677", len >= 0);
42243 +
42244 +#define DELTA 0x9E3779B9u
42245 +#define FULLROUNDS 10          /* 32 is overkill, 16 is strong crypto */
42246 +#define PARTROUNDS 6           /* 6 gets complete mixing */
42247 +
42248 +/* a, b, c, d - data; h0, h1 - accumulated hash */
42249 +#define TEACORE(rounds)                                                        \
42250 +       do {                                                            \
42251 +               __u64 sum = 0;                                          \
42252 +               int n = rounds;                                         \
42253 +               __u64 b0, b1;                                           \
42254 +                                                                       \
42255 +               b0 = h0;                                                \
42256 +               b1 = h1;                                                \
42257 +                                                                       \
42258 +               do                                                      \
42259 +               {                                                       \
42260 +                       sum += DELTA;                                   \
42261 +                       b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
42262 +                       b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
42263 +               } while(--n);                                           \
42264 +                                                                       \
42265 +               h0 += b0;                                               \
42266 +               h1 += b1;                                               \
42267 +       } while(0)
42268 +
42269 +       pad = (__u64) len | ((__u64) len << 8);
42270 +       pad |= pad << 16;
42271 +
42272 +       while (len >= 16) {
42273 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42274 +                   16 | (__u64) name[3] << 24;
42275 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42276 +                   16 | (__u64) name[7] << 24;
42277 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42278 +                   16 | (__u64) name[11] << 24;
42279 +               d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
42280 +                   << 16 | (__u64) name[15] << 24;
42281 +
42282 +               TEACORE(PARTROUNDS);
42283 +
42284 +               len -= 16;
42285 +               name += 16;
42286 +       }
42287 +
42288 +       if (len >= 12) {
42289 +               //assert(len < 16);
42290 +               if (len >= 16)
42291 +                       *(int *)0 = 0;
42292 +
42293 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42294 +                   16 | (__u64) name[3] << 24;
42295 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42296 +                   16 | (__u64) name[7] << 24;
42297 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42298 +                   16 | (__u64) name[11] << 24;
42299 +
42300 +               d = pad;
42301 +               for (i = 12; i < len; i++) {
42302 +                       d <<= 8;
42303 +                       d |= name[i];
42304 +               }
42305 +       } else if (len >= 8) {
42306 +               //assert(len < 12);
42307 +               if (len >= 12)
42308 +                       *(int *)0 = 0;
42309 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42310 +                   16 | (__u64) name[3] << 24;
42311 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42312 +                   16 | (__u64) name[7] << 24;
42313 +
42314 +               c = d = pad;
42315 +               for (i = 8; i < len; i++) {
42316 +                       c <<= 8;
42317 +                       c |= name[i];
42318 +               }
42319 +       } else if (len >= 4) {
42320 +               //assert(len < 8);
42321 +               if (len >= 8)
42322 +                       *(int *)0 = 0;
42323 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42324 +                   16 | (__u64) name[3] << 24;
42325 +
42326 +               b = c = d = pad;
42327 +               for (i = 4; i < len; i++) {
42328 +                       b <<= 8;
42329 +                       b |= name[i];
42330 +               }
42331 +       } else {
42332 +               //assert(len < 4);
42333 +               if (len >= 4)
42334 +                       *(int *)0 = 0;
42335 +               a = b = c = d = pad;
42336 +               for (i = 0; i < len; i++) {
42337 +                       a <<= 8;
42338 +                       a |= name[i];
42339 +               }
42340 +       }
42341 +
42342 +       TEACORE(FULLROUNDS);
42343 +
42344 +/*     return 0;*/
42345 +       return h0 ^ h1;
42346 +
42347 +}
42348 +
42349 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42350 +
42351 +   See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42352 +
42353 +   Excerpts:
42354 +
42355 +     FNV hashes are designed to be fast while maintaining a low collision
42356 +     rate.
42357 +
42358 +     [This version also seems to preserve lexicographical order locally.]
42359 +
42360 +     FNV hash algorithms and source code have been released into the public
42361 +     domain.
42362 +   @name is hashed up to its first NUL octet; @len is only asserted on.
42363 +*/
42364 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42365 +                      int len UNUSED_ARG /* @name's length */ )
42366 +{
42367 +       unsigned long long a = 0xcbf29ce484222325ull;   /* running hash value */
42368 +       const unsigned long long fnv_64_prime = 0x100000001b3ull;
42369 +
42370 +       assert("nikita-678", name != NULL);
42371 +       assert("nikita-679", len >= 0);
42372 +
42373 +       /* FNV-1 hash each octet up to (not including) the terminating NUL */
42374 +       for (; *name; ++name) {
42375 +               /* multiply by the 64 bit FNV magic prime mod 2^64 */
42376 +               a *= fnv_64_prime;
42377 +               /* xor the bottom with the current octet */
42378 +               a ^= (unsigned long long)(*name);
42379 +       }
42380 +       /* return our new hash value */
42381 +       return a;
42382 +}
42383 +
42384 +/* degenerate hash function used to simplify testing of non-unique key
42385 +   handling: every name, whatever its length, hashes to the same constant */
42386 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42387 +                     int len UNUSED_ARG /* @name's length */ )
42388 +{
42389 +       return 0xc0c0c0c010101010ull;
42390 +}
42391 +
42392 +static int change_hash(struct inode *inode, reiser4_plugin * plugin)
42393 +{
42394 +       int result;
42395 +       /* ->change() hook of hash plugins: make @plugin the hash of @inode */
42396 +       assert("nikita-3503", inode != NULL);
42397 +       assert("nikita-3504", plugin != NULL);
42398 +       /* @inode must have a directory plugin, @plugin must be a hash plugin */
42399 +       assert("nikita-3505", is_reiser4_inode(inode));
42400 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
42401 +       assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42402 +       /* nothing to do (result stays 0) if this hash is already installed */
42403 +       result = 0;
42404 +       if (inode_hash_plugin(inode) == NULL ||
42405 +           inode_hash_plugin(inode)->h.id != plugin->h.id) {
42406 +               if (is_dir_empty(inode) == 0)
42407 +                       result =
42408 +                           plugin_set_hash(&reiser4_inode_data(inode)->pset,
42409 +                                           &plugin->hash);
42410 +               else    /* is_dir_empty() returned non-zero: refuse the switch */
42411 +                       result = RETERR(-ENOTEMPTY);
42412 +
42413 +       }
42414 +       return result;
42415 +}
42416 +
42417 +static reiser4_plugin_ops hash_plugin_ops = {  /* used by every entry of hash_plugins[] */
42418 +       .init = NULL,
42419 +       .load = NULL,
42420 +       .save_len = NULL,
42421 +       .save = NULL,
42422 +       .change = change_hash   /* the only hook that is implemented */
42423 +};
42424 +
42425 +/* hash plugins: one entry per hash id, all sharing hash_plugin_ops */
42426 +hash_plugin hash_plugins[LAST_HASH_ID] = {
42427 +       [RUPASOV_HASH_ID] = {
42428 +               .h = {
42429 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
42430 +                       .id = RUPASOV_HASH_ID,
42431 +                       .pops = &hash_plugin_ops,
42432 +                       .label = "rupasov",
42433 +                       .desc = "Original Yura's hash",
42434 +                       .linkage = {NULL, NULL}
42435 +               },
42436 +               .hash = hash_rupasov
42437 +       },
42438 +       [R5_HASH_ID] = {
42439 +               .h = {
42440 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
42441 +                       .id = R5_HASH_ID,
42442 +                       .pops = &hash_plugin_ops,
42443 +                       .label = "r5",
42444 +                       .desc = "r5 hash",
42445 +                       .linkage = {NULL, NULL}
42446 +               },
42447 +               .hash = hash_r5
42448 +       },
42449 +       [TEA_HASH_ID] = {
42450 +               .h = {
42451 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
42452 +                       .id = TEA_HASH_ID,
42453 +                       .pops = &hash_plugin_ops,
42454 +                       .label = "tea",
42455 +                       .desc = "tea hash",
42456 +                       .linkage = {NULL, NULL}
42457 +               },
42458 +               .hash = hash_tea
42459 +       },
42460 +       [FNV1_HASH_ID] = {
42461 +               .h = {
42462 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
42463 +                       .id = FNV1_HASH_ID,
42464 +                       .pops = &hash_plugin_ops,
42465 +                       .label = "fnv1",
42466 +                       .desc = "fnv1 hash",
42467 +                       .linkage = {NULL, NULL}
42468 +               },
42469 +               .hash = hash_fnv1
42470 +       },
42471 +       [DEGENERATE_HASH_ID] = {        /* constant hash, see hash_deg() */
42472 +               .h = {
42473 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
42474 +                       .id = DEGENERATE_HASH_ID,
42475 +                       .pops = &hash_plugin_ops,
42476 +                       .label = "degenerate hash",
42477 +                       .desc = "Degenerate hash: only for testing",
42478 +                       .linkage = {NULL, NULL}
42479 +               },
42480 +               .hash = hash_deg
42481 +       }
42482 +};
42483 +
42484 +/* Make Linus happy.
42485 +   Local variables:
42486 +   c-indentation-style: "K&R"
42487 +   mode-name: "LC"
42488 +   c-basic-offset: 8
42489 +   tab-width: 8
42490 +   fill-column: 120
42491 +   End:
42492 +*/
42493 diff --git a/fs/reiser4/plugin/inode_ops.c b/fs/reiser4/plugin/inode_ops.c
42494 new file mode 100644
42495 index 0000000..7c215af
42496 --- /dev/null
42497 +++ b/fs/reiser4/plugin/inode_ops.c
42498 @@ -0,0 +1,886 @@
42499 +/*
42500 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42501 + */
42502 +
42503 +/*
42504 + * this file contains typical implementations of most of the methods of
42505 + * struct inode_operations
42506 + */
42507 +
42508 +#include "../inode.h"
42509 +#include "../safe_link.h"
42510 +
42511 +#include <linux/quotaops.h>
42512 +#include <linux/namei.h>
42513 +
42514 +
42515 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42516 +                     reiser4_object_create_data *data);
42517 +
42518 +/**
42519 + * create_common - create of inode operations
42520 + * @parent: inode of parent directory
42521 + * @dentry: dentry of new object to create
42522 + * @mode: the permissions to use
42523 + * @nameidata: not used by this implementation
42524 + *
42525 + * Common implementation of the vfs create method of struct inode_operations.
42526 + * Creates a regular file (S_IFREG | @mode) using the regular file plugin of
42527 + * @parent and returns the result of create_vfs_object().
42528 + */
42529 +int create_common(struct inode *parent, struct dentry *dentry,
42530 +                 int mode, struct nameidata *nameidata)
42531 +{
42532 +       reiser4_object_create_data data;
42533 +
42534 +       memset(&data, 0, sizeof data);
42535 +       data.mode = S_IFREG | mode;
42536 +       data.id = inode_regular_plugin(parent)->id;
42537 +       return create_vfs_object(parent, dentry, &data);
42538 +}
42539 +
42540 +int lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42541 +void check_light_weight(struct inode *inode, struct inode *parent);
42542 +
42543 +/**
42544 + * lookup_common - lookup of inode operations
42545 + * @parent: inode of directory to lookup into
42546 + * @dentry: name to look for
42547 + * @nameidata: not used by this implementation
42548 + *
42549 + * Common implementation of the vfs lookup method of struct inode_operations.
42550 + * Returns the d_splice_alias() result, NULL if no such name, or an ERR_PTR().
42551 + */
42552 +struct dentry *lookup_common(struct inode *parent, struct dentry *dentry,
42553 +                            struct nameidata *nameidata)
42554 +{
42555 +       reiser4_context *ctx;
42556 +       int result;
42557 +       struct dentry *new;
42558 +       struct inode *inode;
42559 +       reiser4_dir_entry_desc entry;   /* only its key field is used here */
42560 +
42561 +       ctx = init_context(parent->i_sb);
42562 +       if (IS_ERR(ctx))
42563 +               return (struct dentry *)ctx;    /* ctx carries the error code */
42564 +
42565 +       /* set up operations on dentry. */
42566 +       dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42567 +       /* resolve the name into the key of the object it refers to */
42568 +       result = lookup_name(parent, dentry, &entry.key);
42569 +       if (result) {
42570 +               context_set_commit_async(ctx);
42571 +               reiser4_exit_context(ctx);
42572 +               if (result == -ENOENT) {
42573 +                       /* no such name: cache a negative dentry */
42574 +                       if (!IS_DEADDIR(parent))
42575 +                               d_add(dentry, NULL);
42576 +                       return NULL;
42577 +               }
42578 +               return ERR_PTR(result);
42579 +       }
42580 +       /* name found: obtain the inode for the key stored in the entry */
42581 +       inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42582 +       if (IS_ERR(inode)) {
42583 +               context_set_commit_async(ctx);
42584 +               reiser4_exit_context(ctx);
42585 +               return ERR_PTR(PTR_ERR(inode));
42586 +       }
42587 +
42588 +       /* success */
42589 +       check_light_weight(inode, parent);
42590 +       new = d_splice_alias(inode, dentry);
42591 +       reiser4_iget_complete(inode);
42592 +
42593 +       /* prevent balance_dirty_pages() from being called: we don't want to
42594 +        * do this under directory i_mutex. */
42595 +       context_set_commit_async(ctx);
42596 +       reiser4_exit_context(ctx);
42597 +       return new;
42598 +}
42599 +
42600 +static reiser4_block_nr common_estimate_link(struct inode *parent,
42601 +                                            struct inode *object);
42602 +int reiser4_update_dir(struct inode *);
42603 +
42604 +/**
42605 + * link_common - link of inode operations
42606 + * @existing: dentry of object which is to get new name
42607 + * @parent: directory where new name is to be created
42608 + * @newname: new name
42609 + *
42610 + * Common implementation of the vfs link method of struct inode_operations.
42611 + * Returns 0 on success, otherwise the error reported by the failing step.
42612 + */
42613 +int link_common(struct dentry *existing, struct inode *parent,
42614 +               struct dentry *newname)
42615 +{
42616 +       reiser4_context *ctx;
42617 +       int result;
42618 +       struct inode *object;
42619 +       dir_plugin *parent_dplug;
42620 +       reiser4_dir_entry_desc entry;
42621 +       reiser4_object_create_data data;
42622 +       reiser4_block_nr reserve;
42623 +
42624 +       ctx = init_context(parent->i_sb);
42625 +       if (IS_ERR(ctx))
42626 +               return PTR_ERR(ctx);
42627 +
42628 +       assert("nikita-1431", existing != NULL);
42629 +       assert("nikita-1432", parent != NULL);
42630 +       assert("nikita-1433", newname != NULL);
42631 +
42632 +       object = existing->d_inode;
42633 +       assert("nikita-1434", object != NULL);
42634 +
42635 +       /* check for race with create_object() */
42636 +       if (inode_get_flag(object, REISER4_IMMUTABLE)) {
42637 +               context_set_commit_async(ctx);
42638 +               reiser4_exit_context(ctx);
42639 +               return RETERR(-E_REPEAT);
42640 +       }
42641 +
42642 +       parent_dplug = inode_dir_plugin(parent);
42643 +       /* zero both descriptors so ->add_entry() never sees stack garbage */
42644 +       memset(&entry, 0, sizeof entry);
42645 +       entry.obj = object;
42646 +       memset(&data, 0, sizeof data);
42647 +       data.mode = object->i_mode;
42648 +       data.id = inode_file_plugin(object)->h.id;
42649 +
42650 +       reserve = common_estimate_link(parent, existing->d_inode);
42651 +       if ((__s64) reserve < 0) {
42652 +               context_set_commit_async(ctx);
42653 +               reiser4_exit_context(ctx);
42654 +               return reserve;
42655 +       }
42656 +
42657 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
42658 +               context_set_commit_async(ctx);
42659 +               reiser4_exit_context(ctx);
42660 +               return RETERR(-ENOSPC);
42661 +       }
42662 +
42663 +       /*
42664 +        * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
42665 +        * means that link(2) can race against unlink(2) or rename(2), and
42666 +        * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
42667 +        *
42668 +        * For such inode we have to undo special processing done in
42669 +        * reiser4_unlink() viz. creation of safe-link.
42670 +        */
42671 +       if (unlikely(object->i_nlink == 0)) {
42672 +               result = safe_link_del(tree_by_inode(object),
42673 +                                      get_inode_oid(object), SAFE_UNLINK);
42674 +               if (result != 0) {
42675 +                       context_set_commit_async(ctx);
42676 +                       reiser4_exit_context(ctx);
42677 +                       return result;
42678 +               }
42679 +       }
42680 +
42681 +       /* increment nlink of @existing and update its stat data */
42682 +       result = reiser4_add_nlink(object, parent, 1);
42683 +       if (result == 0) {
42684 +               /* add entry to the parent */
42685 +               result =
42686 +                   parent_dplug->add_entry(parent, newname, &data, &entry);
42687 +               if (result != 0) {
42688 +                       /* failed to add entry to the parent, decrement nlink
42689 +                          of @existing */
42690 +                       reiser4_del_nlink(object, parent, 1);
42691 +                       /*
42692 +                        * now, if that failed, we have a file with too big
42693 +                        * nlink---space leak, much better than directory
42694 +                        * entry pointing to nowhere
42695 +                        */
42696 +               }
42697 +       }
42698 +       if (result == 0) {
42699 +               /* Upon successful completion, link() shall mark for update
42700 +                * the st_ctime field of the file. Also, the st_ctime and
42701 +                * st_mtime fields of the directory that contains the new
42702 +                * entry shall be marked for update. --SUS */
42703 +               result = reiser4_update_dir(parent);
42704 +       }
42705 +       if (result == 0) {
42706 +               /* take the reference owned by @newname only now, so that
42707 +                * no error path above has to drop it again */
42708 +               atomic_inc(&object->i_count);
42709 +               d_instantiate(newname, existing->d_inode);
42710 +       }
42711 +       context_set_commit_async(ctx);
42712 +       reiser4_exit_context(ctx);
42713 +       return result;
42714 +}
42715 +
42716 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
42717 +
42718 +/**
42719 + * unlink_common - unlink of inode operations
42720 + * @parent: inode of directory to remove name from
42721 + * @victim: name to be removed
42722 + *
42723 + * Common implementation of the vfs unlink method of struct inode_operations.
42724 + * Returns 0 on success, otherwise the error reported by the failing step.
42725 + */
42726 +int unlink_common(struct inode *parent, struct dentry *victim)
42727 +{
42728 +       reiser4_context *ctx;
42729 +       int result;
42730 +       struct inode *object;
42731 +       file_plugin *fplug;
42732 +
42733 +       ctx = init_context(parent->i_sb);
42734 +       if (IS_ERR(ctx))
42735 +               return PTR_ERR(ctx);
42736 +
42737 +       object = victim->d_inode;
42738 +       fplug = inode_file_plugin(object);
42739 +       assert("nikita-2882", fplug->detach != NULL);
42740 +
42741 +       result = unlink_check_and_grab(parent, victim);
42742 +       if (result != 0) {
42743 +               context_set_commit_async(ctx);
42744 +               reiser4_exit_context(ctx);
42745 +               return result;
42746 +       }
42747 +
42748 +       result = fplug->detach(object, parent);
42749 +       if (result == 0) {
42750 +               dir_plugin *parent_dplug;
42751 +               reiser4_dir_entry_desc entry;
42752 +
42753 +               parent_dplug = inode_dir_plugin(parent);
42754 +               memset(&entry, 0, sizeof entry);
42755 +
42756 +               /* first, delete directory entry */
42757 +               result = parent_dplug->rem_entry(parent, victim, &entry);
42758 +               if (result == 0) {
42759 +                       /*
42760 +                        * if name was removed successfully, we _have_ to
42761 +                        * return 0 from this function, because upper level
42762 +                        * caller (vfs_{rmdir,unlink}) expect this.
42763 +                        *
42764 +                        * now that directory entry is removed, update
42765 +                        * stat-data
42766 +                        */
42767 +                       reiser4_del_nlink(object, parent, 1);
42768 +                       /*
42769 +                        * Upon successful completion, unlink() shall mark for
42770 +                        * update the st_ctime and st_mtime fields of the
42771 +                        * parent directory. Also, if the file's link count is
42772 +                        * not 0, the st_ctime field of the file shall be
42773 +                        * marked for update. --SUS
42774 +                        */
42775 +                       reiser4_update_dir(parent);
42776 +                       /* add safe-link for this file */
42777 +                       if (object->i_nlink == 0)
42778 +                               safe_link_add(object, SAFE_UNLINK);
42779 +               }
42780 +       }
42781 +
42782 +       if (unlikely(result != 0)) {
42783 +               if (result != -ENOMEM)  /* no warning for plain -ENOMEM */
42784 +                       warning("nikita-3398", "Cannot unlink %llu (%i)",
42785 +                               (unsigned long long)get_inode_oid(object),
42786 +                               result);
42787 +               /* if operation failed commit pending inode modifications to
42788 +                * the stat-data */
42789 +               reiser4_update_sd(object);
42790 +               reiser4_update_sd(parent);
42791 +       }
42792 +
42793 +       reiser4_release_reserved(object->i_sb);
42794 +
42795 +       /* @object's i_ctime was updated by the ->rem_link() method. */
42796 +
42797 +       /* @victim can be already removed from the disk by this time. Inode is
42798 +          then marked so that iput() wouldn't try to remove stat data. But
42799 +          inode itself is still there.
42800 +        */
42801 +
42802 +       /*
42803 +        * we cannot release directory semaphore here, because name has
42804 +        * already been deleted, but dentry (@victim) still exists.  Prevent
42805 +        * balance_dirty_pages() from being called on exiting this context: we
42806 +        * don't want to do this under directory i_mutex.
42807 +        */
42808 +       context_set_commit_async(ctx);
42809 +       reiser4_exit_context(ctx);
42810 +       return result;
42811 +}
42812 +
42813 +/**
42814 + * symlink_common - symlink of inode operations
42815 + * @parent: inode of parent directory
42816 + * @dentry: dentry of object to be created
42817 + * @linkname: string symlink is to contain
42818 + *
42819 + * Common implementation of the vfs symlink method of struct inode_operations.
42820 + * Creates the object with file plugin SYMLINK_FILE_PLUGIN_ID and mode
42821 + * S_IFLNK | S_IRWXUGO; returns the result of create_vfs_object().
42822 + */
42823 +int symlink_common(struct inode *parent, struct dentry *dentry,
42824 +                  const char *linkname)
42825 +{
42826 +       reiser4_object_create_data data;
42827 +
42828 +       memset(&data, 0, sizeof data);
42829 +       data.name = linkname;
42830 +       data.id = SYMLINK_FILE_PLUGIN_ID;
42831 +       data.mode = S_IFLNK | S_IRWXUGO;
42832 +       return create_vfs_object(parent, dentry, &data);
42833 +}
42834 +
42835 +/**
42836 + * mkdir_common - mkdir of inode operations
42837 + * @parent: inode of parent directory
42838 + * @dentry: dentry of object to be created
42839 + * @mode: the permissions to use
42840 + *
42841 + * Common implementation of the vfs mkdir method of struct inode_operations.
42842 + * Creates the object with file plugin DIRECTORY_FILE_PLUGIN_ID and mode
42843 + * S_IFDIR | @mode; returns the result of create_vfs_object().
42844 + */
42845 +int mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
42846 +{
42847 +       reiser4_object_create_data data;
42848 +
42849 +       memset(&data, 0, sizeof data);
42850 +       data.mode = S_IFDIR | mode;
42851 +       data.id = DIRECTORY_FILE_PLUGIN_ID;
42852 +       return create_vfs_object(parent, dentry, &data);
42853 +}
42854 +
42855 +/**
42856 + * mknod_common - mknod of inode operations
42857 + * @parent: inode of parent directory
42858 + * @dentry: dentry of object to be created
42859 + * @mode: the permissions to use and file type
42860 + * @rdev: minor and major of new device file
42861 + *
42862 + * Common implementation of the vfs mknod method of struct inode_operations.
42863 + * Creates the object with file plugin SPECIAL_FILE_PLUGIN_ID, passing @mode
42864 + * through unchanged; returns the result of create_vfs_object().
42865 + */
42866 +int mknod_common(struct inode *parent, struct dentry *dentry,
42867 +                int mode, dev_t rdev)
42868 +{
42869 +       reiser4_object_create_data data;
42870 +
42871 +       memset(&data, 0, sizeof data);
42872 +       data.mode = mode;
42873 +       data.rdev = rdev;
42874 +       data.id = SPECIAL_FILE_PLUGIN_ID;
42875 +       return create_vfs_object(parent, dentry, &data);
42876 +}
42877 +
42878 +/*
42879 + * implementation of vfs's rename method of struct inode_operations for typical
42880 + * directory is in inode_ops_rename.c
42881 + */
42882 +
42883 +/**
42884 + * follow_link_common - follow_link of inode operations
42885 + * @dentry: dentry of symlink
42886 + * @nd: nameidata that is handed the link body through nd_set_link()
42887 + *
42888 + * Common implementation of the vfs follow_link method: the link body is taken
42889 + * from i_private of the inode. Returns NULL on success, or an ERR_PTR() built
42890 + * from -EINVAL when i_private is NULL or REISER4_GENERIC_PTR_USED is not set.
42891 + */
42892 +void *follow_link_common(struct dentry *dentry, struct nameidata *nd)
42893 +{
42894 +       assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
42895 +
42896 +       if (!dentry->d_inode->i_private
42897 +           || !inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
42898 +               return ERR_PTR(RETERR(-EINVAL));
42899 +       nd_set_link(nd, dentry->d_inode->i_private);
42900 +       return NULL;
42901 +}
42902 +
42903 +/**
42904 + * permission_common - permission of inode operations
42905 + * @inode: inode to check permissions for
42906 + * @mask: mode bits to check permissions for
42907 + * @nameidata: not used by this implementation
42908 + *
42909 + * Returns generic_permission(@inode, @mask, NULL), the generic rwx check.
42910 + */
42911 +int permission_common(struct inode *inode, int mask,
42912 +                     struct nameidata *nameidata)
42913 +{
42914 +       return generic_permission(inode, mask, NULL);
42915 +}
42916 +
42917 +static int setattr_reserve(reiser4_tree *);
42918 +
42919 +/* common implementation of the vfs setattr method of struct inode_operations.
42920 +   Must not be called with ATTR_SIZE set (asserted below); returns 0 or an error
42921 +*/
42922 +int setattr_common(struct dentry *dentry, struct iattr *attr)
42923 +{
42924 +       reiser4_context *ctx;
42925 +       struct inode *inode;
42926 +       int result;
42927 +
42928 +       inode = dentry->d_inode;
42929 +       result = inode_change_ok(inode, attr);  /* checked before a context exists */
42930 +       if (result)
42931 +               return result;
42932 +
42933 +       ctx = init_context(inode->i_sb);
42934 +       if (IS_ERR(ctx))
42935 +               return PTR_ERR(ctx);
42936 +
42937 +       assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
42938 +
42939 +       /*
42940 +        * grab disk space and call standard inode_setattr().
42941 +        */
42942 +       result = setattr_reserve(tree_by_inode(inode));
42943 +       if (!result) {
42944 +               if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
42945 +                   || (attr->ia_valid & ATTR_GID
42946 +                       && attr->ia_gid != inode->i_gid)) {
42947 +                       result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;     /* owner changes */
42948 +                       if (result) {
42949 +                               context_set_commit_async(ctx);
42950 +                               reiser4_exit_context(ctx);
42951 +                               return result;
42952 +                       }
42953 +               }
42954 +               result = inode_setattr(inode, attr);
42955 +               if (!result)    /* write the changed attributes to the stat-data */
42956 +                       reiser4_update_sd(inode);
42957 +       }
42958 +
42959 +       context_set_commit_async(ctx);
42960 +       reiser4_exit_context(ctx);
42961 +       return result;
42962 +}
42963 +
42964 +/* common implementation of the vfs getattr method of struct inode_operations:
42965 +   fills @stat from the inode of @dentry and its super block, always returns 0
42966 +*/
42967 +int
42968 +getattr_common(struct vfsmount *mnt UNUSED_ARG, struct dentry *dentry,
42969 +              struct kstat *stat)
42970 +{
42971 +       struct inode *obj;
42972 +
42973 +       assert("nikita-2298", dentry != NULL);
42974 +       assert("nikita-2299", stat != NULL);
42975 +       assert("nikita-2300", dentry->d_inode != NULL);
42976 +
42977 +       obj = dentry->d_inode;
42978 +
42979 +       stat->dev = obj->i_sb->s_dev;
42980 +       stat->ino = oid_to_uino(get_inode_oid(obj));
42981 +       stat->mode = obj->i_mode;
42982 +       /* don't confuse userland with huge nlink: cap it at 0x7fff. This is not
42983 +        * entirely correct, because nlink_t is not necessarily 16 bit signed. */
42984 +       stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
42985 +       stat->uid = obj->i_uid;
42986 +       stat->gid = obj->i_gid;
42987 +       stat->rdev = obj->i_rdev;
42988 +       stat->atime = obj->i_atime;
42989 +       stat->mtime = obj->i_mtime;
42990 +       stat->ctime = obj->i_ctime;
42991 +       stat->size = obj->i_size;
42992 +       stat->blocks =
42993 +           (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
42994 +       /* "preferred" blocksize for efficient file system I/O */
42995 +       stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
42996 +
42997 +       return 0;
42998 +}
42999 +
43000 +/* Estimate the maximum number of nodes which might be allocated or changed on
43001 +   typical new object creation. Typical creation consists of calling the create
43002 +   method of the file plugin, adding a directory entry to the parent and
43003 +   updating the parent directory's stat data. Returns the sum of the estimates.
43004 +*/
43005 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent,       /* parent object */
43006 +                                                  struct inode *object
43007 +                                                  /* object */ )
43008 +{
43009 +       assert("vpf-309", parent != NULL);
43010 +       assert("vpf-307", object != NULL);
43011 +
43012 +       return
43013 +           /* object creation estimation */
43014 +           inode_file_plugin(object)->estimate.create(object) +
43015 +           /* stat data of parent directory estimation */
43016 +           inode_file_plugin(parent)->estimate.update(parent) +
43017 +           /* adding entry estimation */
43018 +           inode_dir_plugin(parent)->estimate.add_entry(parent) +
43019 +           /* to undo in the case of failure */
43020 +           inode_dir_plugin(parent)->estimate.rem_entry(parent);
43021 +}
43022 +
43023 +/* Create child in directory.
43024 +
43025 +   . get object's plugin
43026 +   . get fresh inode
43027 +   . initialize inode
43028 +   . add object's stat-data
43029 +   . initialize object's directory
43030 +   . add entry to the parent
43031 +   . instantiate dentry (left to the caller, see create_vfs_object())
43032 +   Returns 0 or error; *@retobj is set as soon as the inode is allocated.
43033 +*/
43034 +static int do_create_vfs_child(reiser4_object_create_data * data,      /* parameters of new
43035 +                                                                          object */
43036 +                              struct inode **retobj)   /* out: new inode */
43037 +{
43038 +       int result;
43039 +
43040 +       struct dentry *dentry;  /* new name */
43041 +       struct inode *parent;   /* parent object */
43042 +
43043 +       dir_plugin *par_dir;    /* directory plugin on the parent */
43044 +       dir_plugin *obj_dir;    /* directory plugin on the new object */
43045 +       file_plugin *obj_plug;  /* object plugin on the new object */
43046 +       struct inode *object;   /* new object */
43047 +       reiser4_block_nr reserve;
43048 +
43049 +       reiser4_dir_entry_desc entry;   /* new directory entry */
43050 +
43051 +       assert("nikita-1420", data != NULL);
43052 +       parent = data->parent;
43053 +       dentry = data->dentry;
43054 +
43055 +       assert("nikita-1418", parent != NULL);
43056 +       assert("nikita-1419", dentry != NULL);
43057 +
43058 +       /* check, that name is acceptable for parent */
43059 +       par_dir = inode_dir_plugin(parent);
43060 +       if (par_dir->is_name_acceptable &&
43061 +           !par_dir->is_name_acceptable(parent,
43062 +                                        dentry->d_name.name,
43063 +                                        (int)dentry->d_name.len))
43064 +               return RETERR(-ENAMETOOLONG);
43065 +
43066 +       result = 0;
43067 +       obj_plug = file_plugin_by_id((int)data->id);
43068 +       if (obj_plug == NULL) {
43069 +               warning("nikita-430", "Cannot find plugin %i", data->id);
43070 +               return RETERR(-ENOENT);
43071 +       }
43072 +       object = new_inode(parent->i_sb);
43073 +       if (object == NULL)
43074 +               return RETERR(-ENOMEM);
43075 +       /* we'll update i_nlink below */
43076 +       object->i_nlink = 0;
43077 +       /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
43078 +        * to simplify error handling: if some error occurs before i_ino is
43079 +        * initialized with oid, i_ino should already be set to some
43080 +        * distinguished value. */
43081 +       object->i_ino = 0;
43082 +
43083 +       /* So that on error iput will be called. */
43084 +       *retobj = object;
43085 +
43086 +       if (DQUOT_ALLOC_INODE(object)) {
43087 +               DQUOT_DROP(object);
43088 +               object->i_flags |= S_NOQUOTA;
43089 +               return RETERR(-EDQUOT);
43090 +       }
43091 +
43092 +       memset(&entry, 0, sizeof entry);
43093 +       entry.obj = object;
43094 +
43095 +       plugin_set_file(&reiser4_inode_data(object)->pset, obj_plug);
43096 +       result = obj_plug->set_plug_in_inode(object, parent, data);
43097 +       if (result) {
43098 +               warning("nikita-431", "Cannot install plugin %i on %llx",
43099 +                       data->id, (unsigned long long)get_inode_oid(object));
43100 +               DQUOT_FREE_INODE(object);
43101 +               object->i_flags |= S_NOQUOTA;
43102 +               return result;
43103 +       }
43104 +
43105 +       /* reget plugin after installation */
43106 +       obj_plug = inode_file_plugin(object);
43107 +
43108 +       if (obj_plug->create_object == NULL) {
43109 +               DQUOT_FREE_INODE(object);
43110 +               object->i_flags |= S_NOQUOTA;
43111 +               return RETERR(-EPERM);
43112 +       }
43113 +
43114 +       /* if any of hash, tail, sd or permission plugins for newly created
43115 +          object are not set yet set them here inheriting them from parent
43116 +          directory
43117 +        */
43118 +       assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
43119 +       result = obj_plug->adjust_to_parent(object,
43120 +                                           parent,
43121 +                                           object->i_sb->s_root->d_inode);
43122 +       if (result != 0) {
43123 +               warning("nikita-432", "Cannot inherit from %llx to %llx",
43124 +                       (unsigned long long)get_inode_oid(parent),
43125 +                       (unsigned long long)get_inode_oid(object));
43126 +               DQUOT_FREE_INODE(object);
43127 +               object->i_flags |= S_NOQUOTA;
43128 +               return result;
43129 +       }
43130 +
43131 +       /* setup inode and file-operations for this inode */
43132 +       setup_inode_ops(object, data);
43133 +
43134 +       /* call file plugin's method to initialize plugin specific part of
43135 +        * inode */
43136 +       if (obj_plug->init_inode_data)
43137 +               obj_plug->init_inode_data(object, data, 1 /*create */ );
43138 +
43139 +       /* obtain directory plugin (if any) for new object. */
43140 +       obj_dir = inode_dir_plugin(object);
43141 +       if (obj_dir != NULL && obj_dir->init == NULL) {
43142 +               DQUOT_FREE_INODE(object);
43143 +               object->i_flags |= S_NOQUOTA;
43144 +               return RETERR(-EPERM);
43145 +       }
43146 +
43147 +       reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
43148 +
43149 +       reserve = estimate_create_vfs_object(parent, object);
43150 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
43151 +               DQUOT_FREE_INODE(object);
43152 +               object->i_flags |= S_NOQUOTA;
43153 +               return RETERR(-ENOSPC);
43154 +       }
43155 +
43156 +       /* mark inode `immutable'. We disable changes to the file being
43157 +          created until valid directory entry for it is inserted. Otherwise,
43158 +          if file were expanded and insertion of directory entry fails, we
43159 +          have to remove file, but we only allotted enough space in
43160 +          transaction to remove _empty_ file. 3.x code used to remove stat
43161 +          data in different transaction thus possibly leaking disk space on
43162 +          crash. This all only matters if it's possible to access file
43163 +          without name, for example, by inode number
43164 +        */
43165 +       inode_set_flag(object, REISER4_IMMUTABLE);
43166 +
43167 +       /* create empty object, this includes allocation of new objectid. For
43168 +          directories this implies creation of dot and dotdot  */
43169 +       assert("nikita-2265", inode_get_flag(object, REISER4_NO_SD));
43170 +
43171 +       /* mark inode as `loaded'. From this point onward
43172 +          reiser4_delete_inode() will try to remove its stat-data. */
43173 +       inode_set_flag(object, REISER4_LOADED);
43174 +
43175 +       result = obj_plug->create_object(object, parent, data);
43176 +       if (result != 0) {
43177 +               inode_clr_flag(object, REISER4_IMMUTABLE);
43178 +               if (result != -ENAMETOOLONG && result != -ENOMEM)
43179 +                       warning("nikita-2219",
43180 +                               "Failed to create sd for %llu",
43181 +                               (unsigned long long)get_inode_oid(object));
43182 +               DQUOT_FREE_INODE(object);
43183 +               object->i_flags |= S_NOQUOTA;
43184 +               return result;
43185 +       }
43186 +
43187 +       if (obj_dir != NULL)
43188 +               result = obj_dir->init(object, parent, data);
43189 +       if (result == 0) {
43190 +               assert("nikita-434", !inode_get_flag(object, REISER4_NO_SD));
43191 +               /* insert inode into VFS hash table */
43192 +               insert_inode_hash(object);
43193 +               /* create entry */
43194 +               result = par_dir->add_entry(parent, dentry, data, &entry);
43195 +               if (result == 0) {
43196 +                       result = reiser4_add_nlink(object, parent, 0);
43197 +                       /* If O_CREAT is set and the file did not previously
43198 +                          exist, upon successful completion, open() shall
43199 +                          mark for update the st_atime, st_ctime, and
43200 +                          st_mtime fields of the file and the st_ctime and
43201 +                          st_mtime fields of the parent directory. --SUS
43202 +                        */
43203 +                       /* @object times are already updated by
43204 +                          reiser4_add_nlink() */
43205 +                       if (result == 0)
43206 +                               reiser4_update_dir(parent);
43207 +                       if (result != 0)
43208 +                               /* cleanup failure to add nlink */
43209 +                               par_dir->rem_entry(parent, dentry, &entry);
43210 +               }
43211 +               if (result != 0)
43212 +                       /* cleanup failure to add entry */
43213 +                       obj_plug->detach(object, parent);
43214 +       } else if (result != -ENOMEM)
43215 +               warning("nikita-2219", "Failed to initialize dir for %llu: %i",
43216 +                       (unsigned long long)get_inode_oid(object), result);
43217 +
43218 +       /*
43219 +        * update stat-data, committing all pending modifications to the inode
43220 +        * fields.
43221 +        */
43222 +       reiser4_update_sd(object);
43223 +       if (result != 0) {
43224 +               DQUOT_FREE_INODE(object);
43225 +               object->i_flags |= S_NOQUOTA;
43226 +               /* if everything was ok (result == 0), parent stat-data is
43227 +                * already updated above (reiser4_update_dir()) */
43228 +               reiser4_update_sd(parent);
43229 +               /* failure to create entry, remove object */
43230 +               obj_plug->delete_object(object);
43231 +       }
43232 +
43233 +       /* file has name now, clear immutable flag */
43234 +       inode_clr_flag(object, REISER4_IMMUTABLE);
43235 +
43236 +       /* on error, iput() will call ->delete_inode(). We should keep track
43237 +          of the existence of stat-data for this inode and avoid attempt to
43238 +          remove it in reiser4_delete_inode(). This is accomplished through
43239 +          REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
43240 +        */
43241 +       return result;
43242 +}
43243 +
43244 +/* Helper for common implementations of reiser4_mkdir, reiser4_create,
43245 +   reiser4_mknod and reiser4_symlink: creates the child within its own reiser4
43246 +   context; instantiates @dentry on success, iput()s the inode on failure. */
43247 +static int
43248 +create_vfs_object(struct inode *parent,
43249 +                 struct dentry *dentry, reiser4_object_create_data * data)
43250 +{
43251 +       reiser4_context *ctx;
43252 +       int result;
43253 +       struct inode *child;
43254 +
43255 +       ctx = init_context(parent->i_sb);
43256 +       if (IS_ERR(ctx))
43257 +               return PTR_ERR(ctx);
43258 +       context_set_commit_async(ctx);
43259 +
43260 +       data->parent = parent;
43261 +       data->dentry = dentry;
43262 +       child = NULL;   /* stays NULL if creation fails before inode allocation */
43263 +       result = do_create_vfs_child(data, &child);
43264 +       if (unlikely(result != 0)) {
43265 +               if (child != NULL) {
43266 +                       reiser4_make_bad_inode(child);
43267 +                       iput(child);
43268 +               }
43269 +       } else
43270 +               d_instantiate(dentry, child);
43271 +
43272 +       reiser4_exit_context(ctx);
43273 +       return result;
43274 +}
43275 +
43276 +/* helper for link_common. Estimate disk space necessary to add a link
43277 +   from @parent to @object
43278 +*/
43279 +static reiser4_block_nr common_estimate_link(struct inode *parent,     /* parent directory */
43280 +                                            struct inode *object
43281 +                                            /* object to which new link is being created */
43282 +                                            )
43283 +{
43284 +       reiser4_block_nr res = 0;
43285 +       file_plugin *fplug;
43286 +       dir_plugin *dplug;
43287 +
43288 +       assert("vpf-317", object != NULL);
43289 +       assert("vpf-318", parent != NULL);
43290 +
43291 +       fplug = inode_file_plugin(object);
43292 +       dplug = inode_dir_plugin(parent);
43293 +       /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
43294 +       /* reiser4_add_nlink(object) */
43295 +       res += fplug->estimate.update(object);
43296 +       /* add_entry(parent) */
43297 +       res += dplug->estimate.add_entry(parent);
43298 +       /* reiser4_del_nlink(object), to undo on failure */
43299 +       res += fplug->estimate.update(object);
43300 +       /* update_dir(parent) */
43301 +       res += inode_file_plugin(parent)->estimate.update(parent);
43302 +       /* safe-link */
43303 +       res += estimate_one_item_removal(tree_by_inode(object));
43304 +
43305 +       return res;
43306 +}
43307 +
43308 +/* Estimate disk space necessary to remove a link between @parent and
43309 +   @object.
43310 +*/
43311 +static reiser4_block_nr estimate_unlink(struct inode *parent,  /* parent directory */
43312 +                                       struct inode *object
43313 +                                       /* object whose link is being removed */
43314 +                                       )
43315 +{
43316 +       reiser4_block_nr res = 0;
43317 +       file_plugin *fplug;
43318 +       dir_plugin *dplug;
43319 +
43320 +       assert("vpf-317", object != NULL);
43321 +       assert("vpf-318", parent != NULL);
43322 +
43323 +       fplug = inode_file_plugin(object);
43324 +       dplug = inode_dir_plugin(parent);
43325 +
43326 +       /* rem_entry(parent) */
43327 +       res += dplug->estimate.rem_entry(parent);
43328 +       /* reiser4_del_nlink(object) */
43329 +       res += fplug->estimate.update(object);
43330 +       /* update_dir(parent) */
43331 +       res += inode_file_plugin(parent)->estimate.update(parent);
43332 +       /* fplug->unlink */
43333 +       res += fplug->estimate.unlink(object, parent);
43334 +       /* safe-link */
43335 +       res += estimate_one_insert_item(tree_by_inode(object));
43336 +
43337 +       return res;
43338 +}
43339 +
43340 +/* helper for unlink_common. Check that @victim may be unlinked, then estimate and grab space for unlink. */
43341 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43342 +{
43343 +       file_plugin *fplug;
43344 +       struct inode *child;
43345 +       int result;
43346 +
43347 +       result = 0;
43348 +       child = victim->d_inode;
43349 +       fplug = inode_file_plugin(child);
43350 +
43351 +       /* check for race with create_object() */
43352 +       if (inode_get_flag(child, REISER4_IMMUTABLE))
43353 +               return RETERR(-E_REPEAT);
43354 +       /* object being deleted should have stat data */
43355 +       assert("vs-949", !inode_get_flag(child, REISER4_NO_SD));
43356 +
43357 +       /* ask object plugin */
43358 +       if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43359 +               return RETERR(-ENOTEMPTY);
43360 +
43361 +       result = (int)estimate_unlink(parent, child);   /* NOTE(review): reiser4_block_nr narrowed to int */
43362 +       if (result < 0)
43363 +               return result;
43364 +
43365 +       return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43366 +}
43367 +
43368 +/* helper for setattr_common: grab estimate_one_insert_into_item(@tree) blocks */
43369 +static int setattr_reserve(reiser4_tree * tree)
43370 +{
43371 +       assert("vs-1096", is_grab_enabled(get_current_context()));
43372 +       return reiser4_grab_space(estimate_one_insert_into_item(tree),
43373 +                                 BA_CAN_COMMIT);
43374 +}
43375 +
43376 +/* helper function. Standards require that for many file-system operations
43377 +   on success ctime and mtime of parent directory are to be updated. */
43378 +int reiser4_update_dir(struct inode *dir)
43379 +{
43380 +       assert("nikita-2525", dir != NULL);
43381 +
43382 +       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43383 +       return reiser4_update_sd(dir);  /* result of the stat-data update */
43384 +}
43385 diff --git a/fs/reiser4/plugin/inode_ops_rename.c b/fs/reiser4/plugin/inode_ops_rename.c
43386 new file mode 100644
43387 index 0000000..ef084c2
43388 --- /dev/null
43389 +++ b/fs/reiser4/plugin/inode_ops_rename.c
43390 @@ -0,0 +1,904 @@
43391 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43392 + * reiser4/README */
43393 +
43394 +#include "../inode.h"
43395 +#include "../safe_link.h"
43396 +
43397 +static const char *possible_leak = "Possible disk space leak.";        /* suffix of warnings issued when dropping a link fails */
43398 +
43399 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43400 +   Returns 0 once the entry is re-keyed, otherwise the first error met.
43401 +   Helper function called from hashed_rename() */
43402 +static int replace_name(struct inode *to_inode,        /* inode where @from_coord is
43403 +                                                * to be re-targeted at */
43404 +                       struct inode *from_dir, /* directory where @from_coord
43405 +                                                * lives */
43406 +                       struct inode *from_inode,       /* inode @from_coord
43407 +                                                        * originally pointed to */
43408 +                       coord_t * from_coord,   /* where directory entry is in
43409 +                                                * the tree */
43410 +                       lock_handle * from_lh /* lock handle on @from_coord */ )
43411 +{
43412 +       item_plugin *from_item;
43413 +       int result;
43414 +       znode *node;
43415 +
43416 +       coord_clear_iplug(from_coord);
43417 +       node = from_coord->node;
43418 +       result = zload(node);
43419 +       if (result != 0)
43420 +               return result;
43421 +       from_item = item_plugin_by_coord(from_coord);
43422 +       if (item_type_by_coord(from_coord) == DIR_ENTRY_ITEM_TYPE) {
43423 +               reiser4_key to_key;
43424 +
43425 +               build_sd_key(to_inode, &to_key);
43426 +
43427 +               /* everything is found and prepared to change directory entry
43428 +                  at @from_coord to point to @to_inode.
43429 +
43430 +                  @to_inode is just about to get new name, so bump its link
43431 +                  counter.
43432 +
43433 +                */
43434 +               result = reiser4_add_nlink(to_inode, from_dir, 0);
43435 +               if (result != 0) {
43436 +                       /* Don't issue warning: this may be plain -EMLINK */
43437 +                       zrelse(node);
43438 +                       return result;
43439 +               }
43440 +
43441 +               result =
43442 +                   from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43443 +               if (result != 0) {
43444 +                       reiser4_del_nlink(to_inode, from_dir, 0);
43445 +                       zrelse(node);
43446 +                       return result;
43447 +               }
43448 +
43449 +               /* @from_inode just lost its name, he-he.
43450 +
43451 +                  If @from_inode was directory, it contained dotdot pointing
43452 +                  to @from_dir. @from_dir i_nlink will be decreased when
43453 +                  iput() will be called on @from_inode.
43454 +
43455 +                  If file-system is not ADG (hard-links are
43456 +                  supported on directories), iput(from_inode) will not remove
43457 +                  @from_inode, and thus above is incorrect, but hard-links on
43458 +                  directories are problematic in many other respects.
43459 +                */
43460 +               result = reiser4_del_nlink(from_inode, from_dir, 0);
43461 +               if (result != 0) {
43462 +                       warning("nikita-2330",
43463 +                               "Cannot remove link from source: %i. %s",
43464 +                               result, possible_leak);
43465 +               }
43466 +               /* Has to return success, because entry is already
43467 +                * modified. */
43468 +               result = 0;
43469 +
43470 +               /* NOTE-NIKITA consider calling plugin method instead of
43471 +                  accessing inode fields directly. */
43472 +               from_dir->i_mtime = CURRENT_TIME;
43473 +       } else {
43474 +               warning("nikita-2326", "Unexpected item type");
43475 +               result = RETERR(-EIO);
43476 +       }
43477 +       zrelse(node);
43478 +       return result;
43479 +}
43480 +
43481 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh.
43482 +   Returns 0, or an error with @inode's link count restored (best effort).
43483 +   Helper function used by hashed_rename(). */
43484 +static int add_name(struct inode *inode,       /* inode the new entry
43485 +                                                * will point to */
43486 +                   struct inode *dir,  /* directory where @coord lives */
43487 +                   struct dentry *name,        /* new name */
43488 +                   coord_t * coord,    /* where directory entry is in the tree */
43489 +                   lock_handle * lh,   /* lock handle on @coord */
43490 +                   int is_dir /* true, if @inode is directory */ )
43491 +{
43492 +       int result;
43493 +       reiser4_dir_entry_desc entry;
43494 +
43495 +       assert("nikita-2333", lh->node == coord->node);
43496 +       assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43497 +
43498 +       memset(&entry, 0, sizeof entry);
43499 +       entry.obj = inode;
43500 +       /* build key of directory entry description */
43501 +       inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43502 +
43503 +       /* ext2 does this in different order: first inserts new entry,
43504 +          then increases directory nlink. We don't want to do this,
43505 +          because reiser4_add_nlink() calls ->add_link() plugin method
43506 +          that can fail for whatever reason. Taking the link first means
43507 +          such a failure leaves nothing to undo: just report it.
43508 +        */
43509 +       result = reiser4_add_nlink(inode, dir, 0);
43510 +       if (result != 0)
43511 +               return result;
43512 +       /* create entry @name in @dir pointing to @inode */
43513 +       result = WITH_COORD(coord,
43514 +                           inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43515 +                                                                       coord,
43516 +                                                                       lh,
43517 +                                                                       name,
43518 +                                                                       &entry));
43519 +       if (result != 0) {
43520 +               int result2;
43521 +               result2 = reiser4_del_nlink(inode, dir, 0);
43522 +               if (result2 != 0) {
43523 +                       warning("nikita-2327",
43524 +                               "Cannot drop link on %lli %i. %s",
43525 +                               (unsigned long long)get_inode_oid(inode),
43526 +                               result2, possible_leak);
43527 +               }
43528 +       } else
43529 +               INODE_INC_FIELD(dir, i_size);
43530 +       return result;
43531 +}
43532 +
43533 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43534 +                                       struct dentry *old_name,        /* old name */
43535 +                                       struct inode *new_dir,  /* directory where @new is located */
43536 +                                       struct dentry *new_name /* new name */ )
43537 +{
43538 +       reiser4_block_nr res1, res2;
43539 +       dir_plugin *p_parent_old, *p_parent_new;
43540 +       file_plugin *p_child_old, *p_child_new;
43541 +
43542 +       assert("vpf-311", old_dir != NULL);
43543 +       assert("vpf-312", new_dir != NULL);
43544 +       assert("vpf-313", old_name != NULL);
43545 +       assert("vpf-314", new_name != NULL);
43546 +
43547 +       p_parent_old = inode_dir_plugin(old_dir);
43548 +       p_parent_new = inode_dir_plugin(new_dir);
43549 +       p_child_old = inode_file_plugin(old_name->d_inode);
43550 +       if (new_name->d_inode)
43551 +               p_child_new = inode_file_plugin(new_name->d_inode);
43552 +       else
43553 +               p_child_new = NULL;     /* target name does not exist yet */
43554 +
43555 +       /* find_entry - can insert one leaf. */
43556 +       res1 = res2 = 1;
43557 +
43558 +       /* replace_name */
43559 +       {
43560 +               /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43561 +               res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43562 +               /* update key */
43563 +               res1 += 1;
43564 +               /* reiser4_del_nlink(p_child_new) */
43565 +               if (p_child_new)
43566 +                       res1 += p_child_new->estimate.update(new_name->d_inode);
43567 +       }
43568 +
43569 +       /* else add_name */
43570 +       {
43571 +               /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43572 +               res2 +=
43573 +                   2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43574 +               /* reiser4_add_nlink(p_child_old) */
43575 +               res2 += p_child_old->estimate.update(old_name->d_inode);
43576 +               /* add_entry(p_parent_new) */
43577 +               res2 += p_parent_new->estimate.add_entry(new_dir);
43578 +               /* reiser4_del_nlink(p_child_old) */
43579 +               res2 += p_child_old->estimate.update(old_name->d_inode);
43580 +       }
43581 +
43582 +       res1 = res1 < res2 ? res2 : res1;       /* only one of the two paths above runs: keep the larger */
43583 +
43584 +       /* reiser4_write_sd(p_parent_new) */
43585 +       res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43586 +
43587 +       /* reiser4_write_sd(p_child_new) */
43588 +       if (p_child_new)
43589 +               res1 += p_child_new->estimate.update(new_name->d_inode);
43590 +
43591 +       /* hashed_rem_entry(p_parent_old) */
43592 +       res1 += p_parent_old->estimate.rem_entry(old_dir);
43593 +
43594 +       /* reiser4_del_nlink(p_child_old) */
43595 +       res1 += p_child_old->estimate.update(old_name->d_inode);
43596 +
43597 +       /* replace_name (NOTE(review): presumably the dotdot re-target of a moved directory - confirm) */
43598 +       {
43599 +               /* reiser4_add_nlink(p_parent_dir_new) */
43600 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43601 +               /* update_key */
43602 +               res1 += 1;
43603 +               /* reiser4_del_nlink(p_parent_new) */
43604 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43605 +               /* reiser4_del_nlink(p_parent_old) */
43606 +               res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43607 +       }
43608 +
43609 +       /* reiser4_write_sd(p_parent_old) */
43610 +       res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43611 +
43612 +       /* reiser4_write_sd(p_child_old) */
43613 +       res1 += p_child_old->estimate.update(old_name->d_inode);
43614 +
43615 +       return res1;
43616 +}
43617 +
43618 +static int hashed_rename_estimate_and_grab(struct inode *old_dir,      /* directory where @old is located */
43619 +                                          struct dentry *old_name,     /* old name */
43620 +                                          struct inode *new_dir,       /* directory where @new is located */
43621 +                                          struct dentry *new_name
43622 +                                          /* new name */ )
43623 +{
43624 +       reiser4_block_nr reserve;       /* estimate_rename() result to grab */
43625 +
43626 +       reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43627 +
43628 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) /* any grab failure is reported as -ENOSPC */
43629 +               return RETERR(-ENOSPC);
43630 +
43631 +       return 0;
43632 +}
43633 +
43634 +/* check whether @old_inode and @new_inode can be moved within file system
43635 + * tree (e.g., rejects pseudo-files). Returns 0 or -ENOTDIR/-EPERM/-EMLINK/-EBUSY. */
43636 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
43637 +                     struct inode *new_dir, struct inode *new_inode)
43638 +{
43639 +       file_plugin *fplug;
43640 +       dir_plugin *dplug;
43641 +
43642 +       assert("nikita-3370", old_inode != NULL);
43643 +
43644 +       dplug = inode_dir_plugin(new_dir);
43645 +       fplug = inode_file_plugin(old_inode);
43646 +
43647 +       if (dplug == NULL)
43648 +               return RETERR(-ENOTDIR);
43649 +       else if (new_dir->i_op->create == NULL)
43650 +               return RETERR(-EPERM);
43651 +       else if (!fplug->can_add_link(old_inode))
43652 +               return RETERR(-EMLINK);
43653 +       else if (new_inode != NULL) {
43654 +               fplug = inode_file_plugin(new_inode);   /* from here on: plugin of the target */
43655 +               if (fplug->can_rem_link != NULL &&
43656 +                   !fplug->can_rem_link(new_inode))
43657 +                       return RETERR(-EBUSY);
43658 +       }
43659 +       return 0;
43660 +}
43661 +
43662 +int find_entry(struct inode *, struct dentry *, lock_handle *,
43663 +              znode_lock_mode, reiser4_dir_entry_desc *);
43664 +int reiser4_update_dir(struct inode *);
43665 +
43666 +/* this is common implementation of vfs's rename method of struct
43667 +   inode_operations
43668 +   See comments in the body.
43669 +
43670 +   It is arguable that this function can be made generic so, that it
43671 +   will be applicable to any kind of directory plugin that deals with
43672 +   directories composed out of directory entries. The only obstacle
43673 +   here is that we don't have any data-type to represent directory
43674 +   entry. This should be re-considered when more than one different
43675 +   directory plugin will be implemented.
43676 +*/
43677 +int rename_common(struct inode *old_dir /* directory where @old is located */ ,
43678 +                 struct dentry *old_name /* old name */ ,
43679 +                 struct inode *new_dir /* directory where @new is located */ ,
43680 +                 struct dentry *new_name /* new name */ )
43681 +{
43682 +       /* From `The Open Group Base Specifications Issue 6'
43683 +
43684 +          If either the old or new argument names a symbolic link, rename()
43685 +          shall operate on the symbolic link itself, and shall not resolve
43686 +          the last component of the argument. If the old argument and the new
43687 +          argument resolve to the same existing file, rename() shall return
43688 +          successfully and perform no other action.
43689 +
43690 +          [this is done by VFS: vfs_rename()]
43691 +
43692 +          If the old argument points to the pathname of a file that is not a
43693 +          directory, the new argument shall not point to the pathname of a
43694 +          directory.
43695 +
43696 +          [checked by VFS: vfs_rename->may_delete()]
43697 +
43698 +          If the link named by the new argument exists, it shall
43699 +          be removed and old renamed to new. In this case, a link named new
43700 +          shall remain visible to other processes throughout the renaming
43701 +          operation and refer either to the file referred to by new or old
43702 +          before the operation began.
43703 +
43704 +          [we should assure this]
43705 +
43706 +          Write access permission is required for
43707 +          both the directory containing old and the directory containing new.
43708 +
43709 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
43710 +
43711 +          If the old argument points to the pathname of a directory, the new
43712 +          argument shall not point to the pathname of a file that is not a
43713 +          directory.
43714 +
43715 +          [checked by VFS: vfs_rename->may_delete()]
43716 +
43717 +          If the directory named by the new argument exists, it
43718 +          shall be removed and old renamed to new. In this case, a link named
43719 +          new shall exist throughout the renaming operation and shall refer
43720 +          either to the directory referred to by new or old before the
43721 +          operation began.
43722 +
43723 +          [we should assure this]
43724 +
43725 +          If new names an existing directory, it shall be
43726 +          required to be an empty directory.
43727 +
43728 +          [we should check this]
43729 +
43730 +          If the old argument points to a pathname of a symbolic link, the
43731 +          symbolic link shall be renamed. If the new argument points to a
43732 +          pathname of a symbolic link, the symbolic link shall be removed.
43733 +
43734 +          The new pathname shall not contain a path prefix that names
43735 +          old. Write access permission is required for the directory
43736 +          containing old and the directory containing new. If the old
43737 +          argument points to the pathname of a directory, write access
43738 +          permission may be required for the directory named by old, and, if
43739 +          it exists, the directory named by new.
43740 +
43741 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
43742 +
43743 +          If the link named by the new argument exists and the file's link
43744 +          count becomes 0 when it is removed and no process has the file
43745 +          open, the space occupied by the file shall be freed and the file
43746 +          shall no longer be accessible. If one or more processes have the
43747 +          file open when the last link is removed, the link shall be removed
43748 +          before rename() returns, but the removal of the file contents shall
43749 +          be postponed until all references to the file are closed.
43750 +
43751 +          [iput() handles this, but we can do this manually, a la
43752 +          reiser4_unlink()]
43753 +
43754 +          Upon successful completion, rename() shall mark for update the
43755 +          st_ctime and st_mtime fields of the parent directory of each file.
43756 +
43757 +          [N/A]
43758 +
43759 +        */
43760 +       reiser4_context *ctx;
43761 +       int result;
43762 +       int is_dir;             /* is @old_name directory */
43763 +
43764 +       struct inode *old_inode;
43765 +       struct inode *new_inode;
43766 +       coord_t *new_coord;
43767 +
43768 +       reiser4_dentry_fsdata *new_fsdata;
43769 +       dir_plugin *dplug;
43770 +       file_plugin *fplug;
43771 +
43772 +       reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
43773 +       lock_handle *new_lh, *dotdot_lh;
43774 +       struct dentry *dotdot_name;
43775 +       reiser4_dentry_fsdata *dataonstack;
43776 +
43777 +       ctx = init_context(old_dir->i_sb);
43778 +       if (IS_ERR(ctx))
43779 +               return PTR_ERR(ctx);
43780 +
43781 +       old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43782 +                           sizeof(*dotdot_name) + sizeof(*dataonstack),
43783 +                           GFP_KERNEL);
43784 +       if (old_entry == NULL) {
43785 +               context_set_commit_async(ctx);
43786 +               reiser4_exit_context(ctx);
43787 +               return RETERR(-ENOMEM);
43788 +       }
43789 +       memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
43790 +              sizeof(*dotdot_name) + sizeof(*dataonstack));
43791 +
43792 +       new_entry = old_entry + 1;
43793 +       dotdot_entry = old_entry + 2;
43794 +       new_lh = (lock_handle *)(old_entry + 3);
43795 +       dotdot_lh = new_lh + 1;
43796 +       dotdot_name = (struct dentry *)(new_lh + 2);
43797 +       dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
43798 +
43799 +       assert("nikita-2318", old_dir != NULL);
43800 +       assert("nikita-2319", new_dir != NULL);
43801 +       assert("nikita-2320", old_name != NULL);
43802 +       assert("nikita-2321", new_name != NULL);
43803 +
43804 +       old_inode = old_name->d_inode;
43805 +       new_inode = new_name->d_inode;
43806 +
43807 +       dplug = inode_dir_plugin(old_dir);
43808 +       fplug = NULL;
43809 +
43810 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
43811 +       if (IS_ERR(new_fsdata)) {
43812 +               kfree(old_entry);
43813 +               context_set_commit_async(ctx);
43814 +               reiser4_exit_context(ctx);
43815 +               return PTR_ERR(new_fsdata);
43816 +       }
43817 +
43818 +       new_coord = &new_fsdata->dec.entry_coord;
43819 +       coord_clear_iplug(new_coord);
43820 +
43821 +       is_dir = S_ISDIR(old_inode->i_mode);
43822 +
43823 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
43824 +
43825 +       /* if target is existing directory and it's not empty---return error.
43826 +
43827 +          This check is done specifically, because is_dir_empty() requires
43828 +          tree traversal and have to be done before locks are taken.
43829 +        */
43830 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
43831 +               kfree(old_entry);
43832 +               context_set_commit_async(ctx);
43833 +               reiser4_exit_context(ctx);
43834 +               return RETERR(-ENOTEMPTY);
43835 +       }
43836 +
43837 +       result = can_rename(old_dir, old_inode, new_dir, new_inode);
43838 +       if (result != 0) {
43839 +               kfree(old_entry);
43840 +               context_set_commit_async(ctx);
43841 +               reiser4_exit_context(ctx);
43842 +               return result;
43843 +       }
43844 +
43845 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
43846 +                                                new_dir, new_name);
43847 +       if (result != 0) {
43848 +               kfree(old_entry);
43849 +               context_set_commit_async(ctx);
43850 +               reiser4_exit_context(ctx);
43851 +               return result;
43852 +       }
43853 +
43854 +       init_lh(new_lh);
43855 +
43856 +       /* find entry for @new_name */
43857 +       result = find_entry(new_dir,
43858 +                           new_name, new_lh, ZNODE_WRITE_LOCK, new_entry);
43859 +
43860 +       if (IS_CBKERR(result)) {
43861 +               done_lh(new_lh);
43862 +               kfree(old_entry);
43863 +               context_set_commit_async(ctx);
43864 +               reiser4_exit_context(ctx);
43865 +               return result;
43866 +       }
43867 +
43868 +       seal_done(&new_fsdata->dec.entry_seal);
43869 +
43870 +       /* add or replace name for @old_inode as @new_name */
43871 +       if (new_inode != NULL) {
43872 +               /* target (@new_name) exists. */
43873 +               /* Not clear what to do with objects that are
43874 +                  both directories and files at the same time. */
43875 +               if (result == CBK_COORD_FOUND) {
43876 +                       result = replace_name(old_inode,
43877 +                                             new_dir,
43878 +                                             new_inode, new_coord, new_lh);
43879 +                       if (result == 0)
43880 +                               fplug = inode_file_plugin(new_inode);
43881 +               } else if (result == CBK_COORD_NOTFOUND) {
43882 +                       /* VFS told us that @new_name is bound to existing
43883 +                          inode, but we failed to find directory entry. */
43884 +                       warning("nikita-2324", "Target not found");
43885 +                       result = RETERR(-ENOENT);
43886 +               }
43887 +       } else {
43888 +               /* target (@new_name) doesn't exists. */
43889 +               if (result == CBK_COORD_NOTFOUND)
43890 +                       result = add_name(old_inode,
43891 +                                         new_dir,
43892 +                                         new_name, new_coord, new_lh, is_dir);
43893 +               else if (result == CBK_COORD_FOUND) {
43894 +                       /* VFS told us that @new_name is "negative" dentry,
43895 +                          but we found directory entry. */
43896 +                       warning("nikita-2331", "Target found unexpectedly");
43897 +                       result = RETERR(-EIO);
43898 +               }
43899 +       }
43900 +
43901 +       assert("nikita-3462", ergo(result == 0,
43902 +                                  old_inode->i_nlink >= 2 + !!is_dir));
43903 +
43904 +       /* We are done with all modifications to the @new_dir, release lock on
43905 +          node. */
43906 +       done_lh(new_lh);
43907 +
43908 +       if (fplug != NULL) {
43909 +               /* detach @new_inode from name-space */
43910 +               result = fplug->detach(new_inode, new_dir);
43911 +               if (result != 0)
43912 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
43913 +                               (unsigned long long)get_inode_oid(new_inode),
43914 +                               result, possible_leak);
43915 +       }
43916 +
43917 +       if (new_inode != NULL)
43918 +               reiser4_update_sd(new_inode);
43919 +
43920 +       if (result == 0) {
43921 +               old_entry->obj = old_inode;
43922 +
43923 +               dplug->build_entry_key(old_dir,
43924 +                                      &old_name->d_name, &old_entry->key);
43925 +
43926 +               /* At this stage new name was introduced for
43927 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
43928 +                  counters were updated.
43929 +
43930 +                  We want to remove @old_name now. If @old_inode wasn't
43931 +                  directory this is simple.
43932 +                */
43933 +               result = dplug->rem_entry(old_dir, old_name, old_entry);
43934 +               if (result != 0 && result != -ENOMEM) {
43935 +                       warning("nikita-2335",
43936 +                               "Cannot remove old name: %i", result);
43937 +               } else {
43938 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
43939 +                       if (result != 0 && result != -ENOMEM) {
43940 +                               warning("nikita-2337",
43941 +                                       "Cannot drop link on old: %i", result);
43942 +                       }
43943 +               }
43944 +
43945 +               if (result == 0 && is_dir) {
43946 +                       /* @old_inode is directory. We also have to update
43947 +                          dotdot entry. */
43948 +                       coord_t *dotdot_coord;
43949 +
43950 +                       memset(dataonstack, 0, sizeof dataonstack);
43951 +                       memset(dotdot_entry, 0, sizeof dotdot_entry);
43952 +                       dotdot_entry->obj = old_dir;
43953 +                       memset(dotdot_name, 0, sizeof dotdot_name);
43954 +                       dotdot_name->d_name.name = "..";
43955 +                       dotdot_name->d_name.len = 2;
43956 +                       /*
43957 +                        * allocate ->d_fsdata on the stack to avoid using
43958 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
43959 +                        * because dentry is private to the current thread.
43960 +                        */
43961 +                       dotdot_name->d_fsdata = dataonstack;
43962 +                       init_lh(dotdot_lh);
43963 +
43964 +                       dotdot_coord = &dataonstack->dec.entry_coord;
43965 +                       coord_clear_iplug(dotdot_coord);
43966 +
43967 +                       result = find_entry(old_inode, dotdot_name, dotdot_lh,
43968 +                                           ZNODE_WRITE_LOCK, dotdot_entry);
43969 +                       if (result == 0) {
43970 +                               /* replace_name() decreases i_nlink on
43971 +                                * @old_dir */
43972 +                               result = replace_name(new_dir,
43973 +                                                     old_inode,
43974 +                                                     old_dir,
43975 +                                                     dotdot_coord, dotdot_lh);
43976 +                       } else
43977 +                               result = RETERR(-EIO);
43978 +                       done_lh(dotdot_lh);
43979 +               }
43980 +       }
43981 +       reiser4_update_dir(new_dir);
43982 +       reiser4_update_dir(old_dir);
43983 +       reiser4_update_sd(old_inode);
43984 +       if (result == 0) {
43985 +               file_plugin *fplug;
43986 +
43987 +               if (new_inode != NULL) {
43988 +                       /* add safe-link for target file (in case we removed
43989 +                        * last reference to the poor fellow */
43990 +                       fplug = inode_file_plugin(new_inode);
43991 +                       if (new_inode->i_nlink == 0)
43992 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
43993 +               }
43994 +       }
43995 +       kfree(old_entry);
43996 +       context_set_commit_async(ctx);
43997 +       reiser4_exit_context(ctx);
43998 +       return result;
43999 +}
44000 +
44001 +#if 0
44002 +int rename_common(struct inode *old_dir /* directory where @old is located */ ,
44003 +                 struct dentry *old_name /* old name */ ,
44004 +                 struct inode *new_dir /* directory where @new is located */ ,
44005 +                 struct dentry *new_name /* new name */ )
44006 +{
44007 +       /* Rename @old_name in @old_dir to @new_name in @new_dir. Returns 0 on success, error code otherwise.
44008 +          Requirements quoted from `The Open Group Base Specifications Issue 6' (bracketed notes are ours):
44009 +          If either the old or new argument names a symbolic link, rename()
44010 +          shall operate on the symbolic link itself, and shall not resolve
44011 +          the last component of the argument. If the old argument and the new
44012 +          argument resolve to the same existing file, rename() shall return
44013 +          successfully and perform no other action.
44014 +
44015 +          [this is done by VFS: vfs_rename()]
44016 +
44017 +          If the old argument points to the pathname of a file that is not a
44018 +          directory, the new argument shall not point to the pathname of a
44019 +          directory.
44020 +
44021 +          [checked by VFS: vfs_rename->may_delete()]
44022 +
44023 +          If the link named by the new argument exists, it shall
44024 +          be removed and old renamed to new. In this case, a link named new
44025 +          shall remain visible to other processes throughout the renaming
44026 +          operation and refer either to the file referred to by new or old
44027 +          before the operation began.
44028 +
44029 +          [we should assure this]
44030 +
44031 +          Write access permission is required for
44032 +          both the directory containing old and the directory containing new.
44033 +
44034 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
44035 +
44036 +          If the old argument points to the pathname of a directory, the new
44037 +          argument shall not point to the pathname of a file that is not a
44038 +          directory.
44039 +
44040 +          [checked by VFS: vfs_rename->may_delete()]
44041 +
44042 +          If the directory named by the new argument exists, it
44043 +          shall be removed and old renamed to new. In this case, a link named
44044 +          new shall exist throughout the renaming operation and shall refer
44045 +          either to the directory referred to by new or old before the
44046 +          operation began.
44047 +
44048 +          [we should assure this]
44049 +
44050 +          If new names an existing directory, it shall be
44051 +          required to be an empty directory.
44052 +
44053 +          [we should check this]
44054 +
44055 +          If the old argument points to a pathname of a symbolic link, the
44056 +          symbolic link shall be renamed. If the new argument points to a
44057 +          pathname of a symbolic link, the symbolic link shall be removed.
44058 +
44059 +          The new pathname shall not contain a path prefix that names
44060 +          old. Write access permission is required for the directory
44061 +          containing old and the directory containing new. If the old
44062 +          argument points to the pathname of a directory, write access
44063 +          permission may be required for the directory named by old, and, if
44064 +          it exists, the directory named by new.
44065 +
44066 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
44067 +
44068 +          If the link named by the new argument exists and the file's link
44069 +          count becomes 0 when it is removed and no process has the file
44070 +          open, the space occupied by the file shall be freed and the file
44071 +          shall no longer be accessible. If one or more processes have the
44072 +          file open when the last link is removed, the link shall be removed
44073 +          before rename() returns, but the removal of the file contents shall
44074 +          be postponed until all references to the file are closed.
44075 +
44076 +          [iput() handles this, but we can do this manually, a la
44077 +          reiser4_unlink()]
44078 +
44079 +          Upon successful completion, rename() shall mark for update the
44080 +          st_ctime and st_mtime fields of the parent directory of each file.
44081 +
44082 +          [N/A]
44083 +
44084 +        */
44085 +       reiser4_context *ctx;
44086 +       int result;
44087 +       int is_dir;             /* is @old_name directory */
44088 +       struct inode *old_inode;
44089 +       struct inode *new_inode;
44090 +       reiser4_dir_entry_desc old_entry;
44091 +       reiser4_dir_entry_desc new_entry;
44092 +       coord_t *new_coord;
44093 +       reiser4_dentry_fsdata *new_fsdata;
44094 +       lock_handle new_lh;
44095 +       dir_plugin *dplug;
44096 +       file_plugin *fplug;
44097 +
44098 +       ctx = init_context(old_dir->i_sb);
44099 +       if (IS_ERR(ctx))
44100 +               return PTR_ERR(ctx);
44101 +
44102 +       assert("nikita-2318", old_dir != NULL);
44103 +       assert("nikita-2319", new_dir != NULL);
44104 +       assert("nikita-2320", old_name != NULL);
44105 +       assert("nikita-2321", new_name != NULL);
44106 +
44107 +       old_inode = old_name->d_inode;
44108 +       new_inode = new_name->d_inode;
44109 +
44110 +       dplug = inode_dir_plugin(old_dir);
44111 +       fplug = NULL;
44112 +
44113 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
44114 +       if (IS_ERR(new_fsdata)) {
44115 +               result = PTR_ERR(new_fsdata);
44116 +               goto exit;
44117 +       }
44118 +
44119 +       new_coord = &new_fsdata->dec.entry_coord;
44120 +       coord_clear_iplug(new_coord);
44121 +
44122 +       is_dir = S_ISDIR(old_inode->i_mode);
44123 +
44124 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
44125 +
44126 +       /* Fail if the target is an existing non-empty directory. Checked here
44127 +          because is_dir_empty() traverses the tree, which has to happen
44128 +          before locks are taken. Leave via "exit" to release the context. */
44129 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
44130 +               result = RETERR(-ENOTEMPTY);
44131 +               goto exit;
44132 +       }
44133 +
44134 +       result = can_rename(old_dir, old_inode, new_dir, new_inode);
44135 +       if (result != 0)
44136 +               goto exit;
44137 +
44138 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
44139 +                                                new_dir, new_name);
44140 +       if (result != 0)
44141 +               goto exit;
44142 +
44143 +       init_lh(&new_lh);
44144 +
44145 +       /* find entry for @new_name */
44146 +       result = find_entry(new_dir,
44147 +                           new_name, &new_lh, ZNODE_WRITE_LOCK, &new_entry);
44148 +
44149 +       if (IS_CBKERR(result)) {
44150 +               done_lh(&new_lh);
44151 +               goto exit;
44152 +       }
44153 +
44154 +       seal_done(&new_fsdata->dec.entry_seal);
44155 +
44156 +       /* add or replace name for @old_inode as @new_name */
44157 +       if (new_inode != NULL) {
44158 +               /* target (@new_name) exists. */
44159 +               /* Not clear what to do with objects that are
44160 +                  both directories and files at the same time. */
44161 +               if (result == CBK_COORD_FOUND) {
44162 +                       result = replace_name(old_inode,
44163 +                                             new_dir,
44164 +                                             new_inode, new_coord, &new_lh);
44165 +                       if (result == 0)
44166 +                               fplug = inode_file_plugin(new_inode);
44167 +               } else if (result == CBK_COORD_NOTFOUND) {
44168 +                       /* VFS told us that @new_name is bound to existing
44169 +                          inode, but we failed to find directory entry. */
44170 +                       warning("nikita-2324", "Target not found");
44171 +                       result = RETERR(-ENOENT);
44172 +               }
44173 +       } else {
44174 +               /* target (@new_name) doesn't exist. */
44175 +               if (result == CBK_COORD_NOTFOUND)
44176 +                       result = add_name(old_inode,
44177 +                                         new_dir,
44178 +                                         new_name, new_coord, &new_lh, is_dir);
44179 +               else if (result == CBK_COORD_FOUND) {
44180 +                       /* VFS told us that @new_name is "negative" dentry,
44181 +                          but we found directory entry. */
44182 +                       warning("nikita-2331", "Target found unexpectedly");
44183 +                       result = RETERR(-EIO);
44184 +               }
44185 +       }
44186 +
44187 +       assert("nikita-3462", ergo(result == 0,
44188 +                                  old_inode->i_nlink >= 2 + !!is_dir));
44189 +
44190 +       /* We are done with all modifications to the @new_dir, release lock on
44191 +          node. */
44192 +       done_lh(&new_lh);
44193 +
44194 +       if (fplug != NULL) {
44195 +               /* detach @new_inode from name-space */
44196 +               result = fplug->detach(new_inode, new_dir);
44197 +               if (result != 0)
44198 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
44199 +                               (unsigned long long)get_inode_oid(new_inode),
44200 +                               result, possible_leak);
44201 +       }
44202 +
44203 +       if (new_inode != NULL)
44204 +               reiser4_update_sd(new_inode);
44205 +
44206 +       if (result == 0) {
44207 +               memset(&old_entry, 0, sizeof old_entry);
44208 +               old_entry.obj = old_inode;
44209 +
44210 +               dplug->build_entry_key(old_dir,
44211 +                                      &old_name->d_name, &old_entry.key);
44212 +
44213 +               /* At this stage new name was introduced for
44214 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
44215 +                  counters were updated.
44216 +
44217 +                  We want to remove @old_name now. If @old_inode wasn't
44218 +                  directory this is simple.
44219 +                */
44220 +               result = dplug->rem_entry(old_dir, old_name, &old_entry);
44221 +               /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
44222 +               if (result != 0 && result != -ENOMEM) {
44223 +                       warning("nikita-2335",
44224 +                               "Cannot remove old name: %i", result);
44225 +               } else {
44226 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
44227 +                       if (result != 0 && result != -ENOMEM) {
44228 +                               warning("nikita-2337",
44229 +                                       "Cannot drop link on old: %i", result);
44230 +                       }
44231 +               }
44232 +
44233 +               if (result == 0 && is_dir) {
44234 +                       /* @old_inode is directory. We also have to update
44235 +                          dotdot entry. */
44236 +                       coord_t *dotdot_coord;
44237 +                       lock_handle dotdot_lh;
44238 +                       struct dentry dotdot_name;
44239 +                       reiser4_dir_entry_desc dotdot_entry;
44240 +                       reiser4_dentry_fsdata dataonstack;
44241 +                       reiser4_dentry_fsdata *fsdata;
44242 +
44243 +                       memset(&dataonstack, 0, sizeof dataonstack);
44244 +                       memset(&dotdot_entry, 0, sizeof dotdot_entry);
44245 +                       dotdot_entry.obj = old_dir;
44246 +                       memset(&dotdot_name, 0, sizeof dotdot_name);
44247 +                       dotdot_name.d_name.name = "..";
44248 +                       dotdot_name.d_name.len = 2;
44249 +                       /*
44250 +                        * allocate ->d_fsdata on the stack to avoid using
44251 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
44252 +                        * because dentry is private to the current thread.
44253 +                        */
44254 +                       dotdot_name.d_fsdata = &dataonstack;
44255 +                       init_lh(&dotdot_lh);
44256 +
44257 +                       fsdata = &dataonstack;
44258 +                       dotdot_coord = &fsdata->dec.entry_coord;
44259 +                       coord_clear_iplug(dotdot_coord);
44260 +
44261 +                       result = find_entry(old_inode, &dotdot_name, &dotdot_lh,
44262 +                                           ZNODE_WRITE_LOCK, &dotdot_entry);
44263 +                       if (result == 0) {
44264 +                               /* replace_name() decreases i_nlink on
44265 +                                * @old_dir */
44266 +                               result = replace_name(new_dir,
44267 +                                                     old_inode,
44268 +                                                     old_dir,
44269 +                                                     dotdot_coord, &dotdot_lh);
44270 +                       } else
44271 +                               result = RETERR(-EIO);
44272 +                       done_lh(&dotdot_lh);
44273 +               }
44274 +       }
44275 +       reiser4_update_dir(new_dir);
44276 +       reiser4_update_dir(old_dir);
44277 +       reiser4_update_sd(old_inode);
44278 +       if (result == 0) {
44279 +               file_plugin *fplug;
44280 +
44281 +               if (new_inode != NULL) {
44282 +                       /* add safe-link for target file (in case we removed
44283 +                        * last reference to the poor fellow) */
44284 +                       fplug = inode_file_plugin(new_inode);
44285 +                       if (new_inode->i_nlink == 0)
44286 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
44287 +               }
44288 +       }
44289 +      exit:
44290 +       context_set_commit_async(ctx);
44291 +       reiser4_exit_context(ctx);
44292 +       return result;
44293 +}
44294 +#endif
44295 diff --git a/fs/reiser4/plugin/item/Makefile b/fs/reiser4/plugin/item/Makefile
44296 new file mode 100644
44297 index 0000000..1bae623
44298 --- /dev/null
44299 +++ b/fs/reiser4/plugin/item/Makefile
44300 @@ -0,0 +1,18 @@
44301 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
44302 +# item_plugins.o is a composite object; its member objects are listed in item_plugins-objs
44303 +item_plugins-objs :=           \
44304 +       item.o                  \
44305 +       static_stat.o           \
44306 +       sde.o                   \
44307 +       cde.o                   \
44308 +       blackbox.o              \
44309 +       internal.o              \
44310 +       tail.o                  \
44311 +       ctail.o                 \
44312 +       extent.o                \
44313 +       extent_item_ops.o       \
44314 +       extent_file_ops.o       \
44315 +       extent_flush_ops.o
44316 +
44317 +
44318 +
44319 diff --git a/fs/reiser4/plugin/item/acl.h b/fs/reiser4/plugin/item/acl.h
44320 new file mode 100644
44321 index 0000000..f26762a
44322 --- /dev/null
44323 +++ b/fs/reiser4/plugin/item/acl.h
44324 @@ -0,0 +1,66 @@
44325 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44326 +
44327 +/* Directory entry. */
44328 +
44329 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44330 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44331 +
44332 +#include "../../forward.h"
44333 +#include "../../dformat.h"
44334 +#include "../../kassign.h"
44335 +#include "../../key.h"
44336 +
44337 +#include <linux/fs.h>
44338 +#include <linux/dcache.h>      /* for struct dentry */
44339 +
44340 +typedef struct directory_entry_format {
44341 +       /* Key of the object's stat-data. It is not necessary to store the
44342 +          whole key here, because it is always a stat-data key, so minor
44343 +          packing locality and offset could be omitted. But that would
44344 +          rely on a particular key allocation scheme for stat-data, so,
44345 +          for extensibility's sake, the whole key can be stored here.
44346 +
44347 +          The key is stored as an array of bytes, because we don't want
44348 +          8-byte alignment of dir entries.
44349 +        */
44350 +       obj_key_id id;
44351 +       /* File name: null-terminated string that trails the fixed part. */
44352 +       d8 name[0];
44353 +} directory_entry_format;
44354 +
44355 +void print_de(const char *prefix, coord_t * coord);
44356 +int extract_key_de(const coord_t * coord, reiser4_key * key);
44357 +int update_key_de(const coord_t * coord, const reiser4_key * key,
44358 +                 lock_handle * lh);
44359 +char *extract_name_de(const coord_t * coord, char *buf);
44360 +unsigned extract_file_type_de(const coord_t * coord);
44361 +int add_entry_de(struct inode *dir, coord_t * coord,
44362 +                lock_handle * lh, const struct dentry *name,
44363 +                reiser4_dir_entry_desc * entry);
44364 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44365 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
44366 +int max_name_len_de(const struct inode *dir);
44367 +
44368 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44369 +
44370 +char *extract_dent_name(const coord_t * coord,
44371 +                       directory_entry_format * dent, char *buf);
44372 +
44373 +#if REISER4_LARGE_KEY
44374 +#define DE_NAME_BUF_LEN (24)
44375 +#else
44376 +#define DE_NAME_BUF_LEN (16)
44377 +#endif
44378 +
44379 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44380 +#endif
44381 +
44382 +/* Make Linus happy.
44383 +   Local variables:
44384 +   c-indentation-style: "K&R"
44385 +   mode-name: "LC"
44386 +   c-basic-offset: 8
44387 +   tab-width: 8
44388 +   fill-column: 120
44389 +   End:
44390 +*/
44391 diff --git a/fs/reiser4/plugin/item/blackbox.c b/fs/reiser4/plugin/item/blackbox.c
44392 new file mode 100644
44393 index 0000000..0f24ca3
44394 --- /dev/null
44395 +++ b/fs/reiser4/plugin/item/blackbox.c
44396 @@ -0,0 +1,142 @@
44397 +/* Copyright 2003 by Hans Reiser, licensing governed by
44398 + * reiser4/README */
44399 +
44400 +/* Black box item implementation */
44401 +
44402 +#include "../../forward.h"
44403 +#include "../../debug.h"
44404 +#include "../../dformat.h"
44405 +#include "../../kassign.h"
44406 +#include "../../coord.h"
44407 +#include "../../tree.h"
44408 +#include "../../lock.h"
44409 +
44410 +#include "blackbox.h"
44411 +#include "item.h"
44412 +#include "../plugin.h"
44413 +/* Insert a BLACK_BOX_ID item built from @data/@length into @tree under @key; returns insert_by_key() result. */
44414 +int
44415 +store_black_box(reiser4_tree * tree,
44416 +               const reiser4_key * key, void *data, int length)
44417 +{
44418 +       int result;
44419 +       reiser4_item_data idata;
44420 +       coord_t coord;
44421 +       lock_handle lh;
44422 +
44423 +       memset(&idata, 0, sizeof idata);
44424 +       /* describe the item to insert: body pointer, length and item plugin */
44425 +       idata.data = data;
44426 +       idata.user = 0;
44427 +       idata.length = length;
44428 +       idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44429 +
44430 +       init_lh(&lh);
44431 +       result = insert_by_key(tree, key,
44432 +                              &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44433 +       /* on success the item at @coord is expected to be exactly @length bytes long */
44434 +       assert("nikita-3413",
44435 +              ergo(result == 0,
44436 +                   WITH_COORD(&coord,
44437 +                              item_length_by_coord(&coord) == length)));
44438 +
44439 +       done_lh(&lh);
44440 +       return result;
44441 +}
44442 +/* Look up @key in @tree; if the item found fits in @length bytes, copy its body to @data and update @key via unit_key_by_coord(). */
44443 +int
44444 +load_black_box(reiser4_tree * tree,
44445 +              reiser4_key * key, void *data, int length, int exact)
44446 +{
44447 +       int result;
44448 +       coord_t coord;
44449 +       lock_handle lh;
44450 +
44451 +       init_lh(&lh);
44452 +       result = coord_by_key(tree, key,
44453 +                             &coord, &lh, ZNODE_READ_LOCK,
44454 +                             exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44455 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44456 +       /* only a zero lookup result is processed; any other value is returned as is */
44457 +       if (result == 0) {
44458 +               int ilen;
44459 +               /* the item is only accessed between zload() and zrelse() of its node */
44460 +               result = zload(coord.node);
44461 +               if (result == 0) {
44462 +                       ilen = item_length_by_coord(&coord);
44463 +                       if (ilen <= length) {
44464 +                               memcpy(data, item_body_by_coord(&coord), ilen);
44465 +                               unit_key_by_coord(&coord, key);
44466 +                       } else if (exact) {
44467 +                               /*
44468 +                                * item is larger than buffer provided by the
44469 +                                * user. Only issue a warning if @exact is
44470 +                                * set. If @exact is false, we are iterating
44471 +                                * over all safe-links and here we are reaching
44472 +                                * the end of the iteration.
44473 +                                */
44474 +                               warning("nikita-3415",
44475 +                                       "Wrong black box length: %i > %i",
44476 +                                       ilen, length);
44477 +                               result = RETERR(-EIO);
44478 +                       }
44479 +                       zrelse(coord.node);
44480 +               }
44481 +       }
44482 +
44483 +       done_lh(&lh);
44484 +       return result;
44485 +
44486 +}
44487 +/* overwrite first @length bytes of the black box item with @key by @data; -EIO if the item is shorter than @length */
44488 +int
44489 +update_black_box(reiser4_tree * tree,
44490 +                const reiser4_key * key, void *data, int length)
44491 +{
44492 +       int result;
44493 +       coord_t coord;
44494 +       lock_handle lh;
44495 +
44496 +       init_lh(&lh);
44497 +       result = coord_by_key(tree, key,
44498 +                             &coord, &lh, ZNODE_WRITE_LOCK,    /* item body is modified below */
44499 +                             FIND_EXACT,
44500 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44501 +       if (result == 0) {
44502 +               int ilen;       /* length of the item that was found */
44503 +
44504 +               result = zload(coord.node);
44505 +               if (result == 0) {
44506 +                       ilen = item_length_by_coord(&coord);
44507 +                       if (length <= ilen) {
44508 +                               memcpy(item_body_by_coord(&coord), data, length);
44509 +                               znode_make_dirty(coord.node);   /* node content changed: it has to be written out */
44510 +                       } else {
44511 +                               warning("nikita-3437",
44512 +                                       "Wrong black box length: %i < %i",
44513 +                                       ilen, length);
44514 +                               result = RETERR(-EIO);
44515 +                       }
44516 +                       zrelse(coord.node);
44517 +               }
44518 +       }
44519 +
44520 +       done_lh(&lh);
44521 +       return result;
44522 +
44523 +}
44524 +/* remove black box item with @key: calls cut_tree() with @key as both ends of the key range; returns its result */
44525 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44526 +{
44527 +       return cut_tree(tree, key, key, NULL, 1);
44528 +}
44529 +
44530 +/* Make Linus happy.
44531 +   Local variables:
44532 +   c-indentation-style: "K&R"
44533 +   mode-name: "LC"
44534 +   c-basic-offset: 8
44535 +   tab-width: 8
44536 +   fill-column: 120
44537 +   End:
44538 +*/
44539 diff --git a/fs/reiser4/plugin/item/blackbox.h b/fs/reiser4/plugin/item/blackbox.h
44540 new file mode 100644
44541 index 0000000..f5b7af3
44542 --- /dev/null
44543 +++ b/fs/reiser4/plugin/item/blackbox.h
44544 @@ -0,0 +1,33 @@
44545 +/* Copyright 2003 by Hans Reiser, licensing governed by
44546 + * reiser4/README */
44547 +
44548 +/* "Black box" entry to contain fixed-width user supplied data */
44549 +
44550 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44551 +#define __FS_REISER4_BLACK_BOX_H__
44552 +
44553 +#include "../../forward.h"
44554 +#include "../../dformat.h"
44555 +#include "../../kassign.h"
44556 +#include "../../key.h"
44557 +/* black box item operations, implemented in blackbox.c; all return 0 on success */
44558 +extern int store_black_box(reiser4_tree * tree,        /* insert new item */
44559 +                          const reiser4_key * key, void *data, int length);
44560 +extern int load_black_box(reiser4_tree * tree, /* copy item body into @data, item key into @key */
44561 +                         reiser4_key * key, void *data, int length, int exact);
44562 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);       /* remove item */
44563 +extern int update_black_box(reiser4_tree * tree,       /* overwrite body of existing item */
44564 +                           const reiser4_key * key, void *data, int length);
44565 +
44566 +/* __FS_REISER4_BLACK_BOX_H__ */
44567 +#endif
44568 +
44569 +/* Make Linus happy.
44570 +   Local variables:
44571 +   c-indentation-style: "K&R"
44572 +   mode-name: "LC"
44573 +   c-basic-offset: 8
44574 +   tab-width: 8
44575 +   fill-column: 120
44576 +   End:
44577 +*/
44578 diff --git a/fs/reiser4/plugin/item/cde.c b/fs/reiser4/plugin/item/cde.c
44579 new file mode 100644
44580 index 0000000..65ccd66
44581 --- /dev/null
44582 +++ b/fs/reiser4/plugin/item/cde.c
44583 @@ -0,0 +1,1007 @@
44584 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44585 +
44586 +/* Directory entry implementation */
44587 +
44588 +/* DESCRIPTION:
44589 +
44590 +   This is "compound" directory item plugin implementation. This directory
44591 +   item type is compound (as opposed to the "simple directory item" in
44592 +   fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44593 +   entries.
44594 +
44595 +   The reason behind this decision is disk space efficiency: all directory
44596 +   entries inside the same directory have identical fragment in their
44597 +   keys. This, of course, depends on key assignment policy. In our default key
44598 +   assignment policy, all directory entries have the same locality which is
44599 +   equal to the object id of their directory.
44600 +
44601 +   Composing directory item out of several directory entries for the same
44602 +   directory allows us to store said key fragment only once. That is, this is
44603 +   some ad hoc form of key compression (stem compression) that is implemented
44604 +   here, because general key compression is not supposed to be implemented in
44605 +   v4.0.
44606 +
44607 +   Another decision that was made regarding all directory item plugins, is
44608 +   that they will store entry keys unaligned. This is for the sake of disk
44609 +   space efficiency again.
44610 +
44611 +   It should be noted, that storing keys unaligned increases CPU consumption,
44612 +   at least on some architectures.
44613 +
44614 +   Internal on-disk structure of the compound directory item is the following:
44615 +
44616 +        HEADER          cde_item_format.        Here number of entries is stored.
44617 +        ENTRY_HEADER_0  cde_unit_header.        Here part of entry key and
44618 +        ENTRY_HEADER_1                          offset of entry body are stored.
44619 +        ENTRY_HEADER_2                         (basically two last parts of key)
44620 +        ...
44621 +        ENTRY_HEADER_N
44622 +        ENTRY_BODY_0    directory_entry_format. Here part of stat data key and
44623 +        ENTRY_BODY_1                            NUL-terminated name are stored.
44624 +        ENTRY_BODY_2                           (part of stat data key in the
44625 +                                                sense that since all SDs have
44626 +                                                zero offset, this offset is not
44627 +                                                stored on disk).
44628 +        ...
44629 +        ENTRY_BODY_N
44630 +
44631 +   When it comes to the balancing, each directory entry in compound directory
44632 +   item is unit, that is, something that can be cut from one item and pasted
44633 +   into another item of the same type. Handling of unit cut and paste is major
44634 +   reason for the complexity of code below.
44635 +
44636 +*/
44637 +
44638 +#include "../../forward.h"
44639 +#include "../../debug.h"
44640 +#include "../../dformat.h"
44641 +#include "../../kassign.h"
44642 +#include "../../key.h"
44643 +#include "../../coord.h"
44644 +#include "sde.h"
44645 +#include "cde.h"
44646 +#include "item.h"
44647 +#include "../node/node.h"
44648 +#include "../plugin.h"
44649 +#include "../../znode.h"
44650 +#include "../../carry.h"
44651 +#include "../../tree.h"
44652 +#include "../../inode.h"
44653 +
44654 +#include <linux/fs.h>          /* for struct inode */
44655 +#include <linux/dcache.h>      /* for struct dentry */
44656 +#include <linux/quotaops.h>
44657 +
44658 +#if 0
44659 +#define CHECKME(coord)                                         \
44660 +({                                                             \
44661 +       const char *message;                                    \
44662 +       coord_t dup;                                            \
44663 +                                                               \
44664 +       coord_dup_nocheck(&dup, (coord));                       \
44665 +       dup.unit_pos = 0;                                       \
44666 +       assert("nikita-2871", cde_check(&dup, &message) == 0);  \
44667 +})
44668 +#else
44669 +#define CHECKME(coord) noop
44670 +#endif
44671 +
44672 +/* return body of compound directory item at @coord, typed as cde_item_format (the item HEADER) */
44673 +static inline cde_item_format *formatted_at(const coord_t * coord)
44674 +{
44675 +       assert("nikita-1282", coord != NULL);
44676 +       return item_body_by_coord(coord);
44677 +}
44678 +
44679 +/* return header of @idx-th entry of the item at @coord; @idx is not range-checked here */
44680 +static inline cde_unit_header *header_at(const coord_t *
44681 +                                        coord /* coord of item */ ,
44682 +                                        int idx /* index of unit */ )
44683 +{
44684 +       assert("nikita-1283", coord != NULL);
44685 +       return &formatted_at(coord)->entry[idx];
44686 +}
44687 +
44688 +/* return number of units in compound directory item at @coord, decoded from little-endian ->num_of_entries */
44689 +static int units(const coord_t * coord /* coord of item */ )
44690 +{
44691 +       return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
44692 +}
44693 +
44694 +/* return offset (from the start of item body) of the body of @idx-th entry in @coord */
44695 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
44696 +                             int idx /* index of unit */ )
44697 +{
44698 +       if (idx < units(coord))
44699 +               return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
44700 +       else if (idx == units(coord))   /* one past the last entry: its "offset" is the end of item */
44701 +               return item_length_by_coord(coord);
44702 +       else
44703 +               impossible("nikita-1308", "Wrong idx");
44704 +       return 0;       /* reached only for @idx > units(@coord) */
44705 +}
44706 +
44707 +/* set offset of the body of @idx-th entry in @coord; @offset is truncated to 16 bits and stored little-endian */
44708 +static void set_offset(const coord_t * coord /* coord of item */ ,
44709 +                      int idx /* index of unit */ ,
44710 +                      unsigned int offset /* new offset */ )
44711 +{
44712 +       put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
44713 +}
44714 +/* add @delta to the stored offset of the body of @idx-th entry in @coord (16-bit arithmetic) */
44715 +static void adj_offset(const coord_t * coord /* coord of item */ ,
44716 +                      int idx /* index of unit */ ,
44717 +                      int delta /* offset change */ )
44718 +{
44719 +       d16 *doffset;   /* on-disk location of the offset */
44720 +       __u16 offset;   /* its value in cpu byte order */
44721 +
44722 +       doffset = &header_at(coord, idx)->offset;
44723 +       offset = le16_to_cpu(get_unaligned(doffset));
44724 +       offset += delta;
44725 +       put_unaligned(cpu_to_le16((__u16) offset), doffset);
44726 +}
44727 +
44728 +/* return pointer to @offset-th byte from the beginning of the item body at @coord */
44729 +static char *address(const coord_t * coord /* coord of item */ ,
44730 +                    int offset /* byte offset within item body */ )
44731 +{
44732 +       return ((char *)item_body_by_coord(coord)) + offset;
44733 +}
44734 +
44735 +/* return pointer to the body of @idx-th entry in @coord, located through offset_of() */
44736 +static directory_entry_format *entry_at(const coord_t * coord  /* coord of
44737 +                                                                * item */ ,
44738 +                                       int idx /* index of unit */ )
44739 +{
44740 +       return (directory_entry_format *) address(coord,
44741 +                                                 (int)offset_of(coord, idx));
44742 +}
44743 +
44744 +/* return number of unit referenced by @coord, i.e. its ->unit_pos */
44745 +static int idx_of(const coord_t * coord /* coord of item */ )
44746 +{
44747 +       assert("nikita-1285", coord != NULL);
44748 +       return coord->unit_pos;
44749 +}
44750 +
44751 +/* find position where entry with @entry_key would be inserted into @coord: index of the first entry not less than @entry_key, or -ENOENT if there is none */
44752 +static int find(const coord_t * coord /* coord of item */ ,
44753 +               const reiser4_key * entry_key /* key to look for */ ,
44754 +               cmp_t * last /* result of last comparison; not written if item has no entries */ )
44755 +{
44756 +       int entries;
44757 +
44758 +       int left;
44759 +       int right;
44760 +
44761 +       cde_unit_header *header;
44762 +
44763 +       assert("nikita-1295", coord != NULL);
44764 +       assert("nikita-1296", entry_key != NULL);
44765 +       assert("nikita-1297", last != NULL);
44766 +       /* binary search until [left, right] is shorter than REISER4_SEQ_SEARCH_BREAK */
44767 +       entries = units(coord);
44768 +       left = 0;
44769 +       right = entries - 1;
44770 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
44771 +               int median;
44772 +
44773 +               median = (left + right) >> 1;
44774 +
44775 +               header = header_at(coord, median);
44776 +               *last = de_id_key_cmp(&header->hash, entry_key);
44777 +               switch (*last) {
44778 +               case LESS_THAN:
44779 +                       left = median;
44780 +                       break;
44781 +               case GREATER_THAN:
44782 +                       right = median;
44783 +                       break;
44784 +               case EQUAL_TO:{ /* step back to the first of the entries that compare equal */
44785 +                               do {
44786 +                                       median--;
44787 +                                       header--;
44788 +                               } while (median >= 0 &&
44789 +                                        de_id_key_cmp(&header->hash,
44790 +                                                      entry_key) == EQUAL_TO);
44791 +                               return median + 1;      /* *last is still EQUAL_TO here */
44792 +                       }
44793 +               }
44794 +       }
44795 +       header = header_at(coord, left);        /* finish with sequential scan starting from @left */
44796 +       for (; left < entries; ++left, ++header) {
44797 +               prefetch(header + 1);
44798 +               *last = de_id_key_cmp(&header->hash, entry_key);
44799 +               if (*last != LESS_THAN)
44800 +                       break;
44801 +       }
44802 +       if (left < entries)
44803 +               return left;
44804 +       else    /* every entry is less than @entry_key */
44805 +               return RETERR(-ENOENT);
44806 +
44807 +}
44808 +
44809 +/* expand @coord as to accommodate for insertion of @no new entries starting
44810 +   from @pos; new bodies take @data_size bytes in total. Always returns 0. */
44811 +static int expand_item(const coord_t * coord /* coord of item */ ,
44812 +                      int pos /* unit position */ , int no     /* number of new
44813 +                                                                * units*/ ,
44814 +                      int size /* offset of the end of data already in the item */ ,
44815 +                      unsigned int data_size   /* total size of bodies of
44816 +                                                * the new units */ )
44817 +{
44818 +       int entries;
44819 +       cde_unit_header *header;
44820 +       char *dent;
44821 +       int i;
44822 +
44823 +       assert("nikita-1310", coord != NULL);
44824 +       assert("nikita-1311", pos >= 0);
44825 +       assert("nikita-1312", no > 0);
44826 +       assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
44827 +       assert("nikita-1343",
44828 +              item_length_by_coord(coord) >=
44829 +              (int)(size + data_size + no * sizeof *header));
44830 +
44831 +       entries = units(coord);
44832 +
44833 +       if (pos == entries)     /* appending: new body goes right after existing data */
44834 +               dent = address(coord, size);
44835 +       else
44836 +               dent = (char *)entry_at(coord, pos);
44837 +       /* place where new header will be in */
44838 +       header = header_at(coord, pos);
44839 +       /* free space for new entry headers */
44840 +       memmove(header + no, header,
44841 +               (unsigned)(address(coord, size) - (char *)header));
44842 +       /* if adding to the end initialise first new header */
44843 +       if (pos == entries) {
44844 +               set_offset(coord, pos, (unsigned)size);
44845 +       }
44846 +
44847 +       /* adjust entry pointer and size */
44848 +       dent = dent + no * sizeof *header;
44849 +       size += no * sizeof *header;
44850 +       /* free space for new entries */
44851 +       memmove(dent + data_size, dent,
44852 +               (unsigned)(address(coord, size) - dent));
44853 +
44854 +       /* increase counter */
44855 +       entries += no;
44856 +       put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
44857 +
44858 +       /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
44859 +          bytes.  */
44860 +       for (i = 0; i <= pos; ++i)
44861 +               adj_offset(coord, i, no * sizeof *header);
44862 +       /* [ pos + no ... +\infty ) entries were shifted by ( no *
44863 +          sizeof *header + data_size ) bytes */
44864 +       for (i = pos + no; i < entries; ++i)
44865 +               adj_offset(coord, i, no * sizeof *header + data_size);
44866 +       return 0;
44867 +}
44868 +
44869 +/* make room in the item for new @entry: find its position, store it in @pos and call expand_item(); returns 0 */
44870 +static int expand(const coord_t * coord /* coord of item */ ,
44871 +                 cde_entry * entry /* entry to insert */ ,
44872 +                 int len /* length of @entry data */ ,
44873 +                 int *pos /* position to insert */ ,
44874 +                 reiser4_dir_entry_desc * dir_entry    /* parameters for new
44875 +                                                        * entry */ )
44876 +{
44877 +       cmp_t cmp_res;  /* filled by find(), not used here */
44878 +       int datasize;   /* size of the body of the new entry */
44879 +
44880 +       *pos = find(coord, &dir_entry->key, &cmp_res);
44881 +       if (*pos < 0)   /* no entry with key >= new key: append */
44882 +               *pos = units(coord);
44883 +
44884 +       datasize = sizeof(directory_entry_format);
44885 +       if (is_longname(entry->name->name, entry->name->len))
44886 +               datasize += entry->name->len + 1;       /* name and its terminating NUL */
44887 +
44888 +       expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
44889 +                   datasize);  /* return value is ignored */
44890 +       return 0;
44891 +}
44892 +
44893 +/* fill header and body of entry at @pos (space for which was made by expand()) from @entry and @dir_entry; returns 0 */
44894 +static int paste_entry(const coord_t * coord /* coord of item */ ,
44895 +                      cde_entry * entry /* new entry */ ,
44896 +                      int pos /* position to insert */ ,
44897 +                      reiser4_dir_entry_desc * dir_entry       /* parameters for
44898 +                                                                * new entry */ )
44899 +{
44900 +       cde_unit_header *header;
44901 +       directory_entry_format *dent;
44902 +       const char *name;
44903 +       int len;
44904 +
44905 +       header = header_at(coord, pos);
44906 +       dent = entry_at(coord, pos);
44907 +
44908 +       build_de_id_by_key(&dir_entry->key, &header->hash);
44909 +       build_inode_key_id(entry->obj, &dent->id);
44910 +       /* Copy exactly @len bytes of the name and terminate it explicitly:
44911 +          expand() reserved len + 1 bytes for a long name, so a bounded
44912 +          memcpy() cannot write past that even if ->name is not
44913 +          NUL-terminated at ->len.
44914 +
44915 +          There still is no way to figure out amount of space in dent -> name
44916 +          and to check that we are not going to overwrite more than that. */
44917 +       name = entry->name->name;
44918 +       len = entry->name->len;
44919 +       if (is_longname(name, len)) {
44920 +               memcpy(dent->name, name, len);
44921 +               put_unaligned(0, &dent->name[len]);
44922 +       }
44923 +       return 0;
44924 +}
44925 +
44926 +/* estimate how much space is necessary in item to insert/paste set of entries
44927 +   described in @data. The estimate is returned and also stored in @data->length. */
44928 +int estimate_cde(const coord_t * coord /* coord of item, NULL when new item is inserted */ ,
44929 +                const reiser4_item_data * data /* parameters for new item */ )
44930 +{
44931 +       cde_entry_data *e;
44932 +       int result;
44933 +       int i;
44934 +
44935 +       e = (cde_entry_data *) data->data;
44936 +
44937 +       assert("nikita-1288", e != NULL);
44938 +       assert("nikita-1289", e->num_of_entries >= 0);
44939 +
44940 +       if (coord == NULL)
44941 +               /* insert */
44942 +               result = sizeof(cde_item_format);
44943 +       else
44944 +               /* paste */
44945 +               result = 0;
44946 +
44947 +       result += e->num_of_entries *
44948 +           (sizeof(cde_unit_header) + sizeof(directory_entry_format));
44949 +       for (i = 0; i < e->num_of_entries; ++i) {
44950 +               const char *name;
44951 +               int len;
44952 +
44953 +               name = e->entry[i].name->name;
44954 +               len = e->entry[i].name->len;
44955 +               assert("nikita-2054", strlen(name) == len);
44956 +               if (is_longname(name, len))
44957 +                       result += len + 1;      /* long name is stored in the body with its NUL */
44958 +       }
44959 +       ((reiser4_item_data *) data)->length = result; /* note: writes through const-qualified @data */
44960 +       return result;
44961 +}
44962 +
44963 +/* ->nr_units() method for this item plugin: number of directory entries in the item. */
44964 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
44965 +{
44966 +       return units(coord);
44967 +}
44968 +
44969 +/* ->unit_key() method for this item plugin: builds key of the unit at @coord in @key and returns @key. */
44970 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
44971 +                         reiser4_key * key /* resulting key */ )
44972 +{
44973 +       assert("nikita-1452", coord != NULL);
44974 +       assert("nikita-1345", idx_of(coord) < units(coord));
44975 +       assert("nikita-1346", key != NULL);
44976 +
44977 +       item_key_by_coord(coord, key);  /* start from item key: directory id is taken from it */
44978 +       extract_key_from_de_id(extract_dir_id_from_key(key),
44979 +                              &header_at(coord, idx_of(coord))->hash, key);
44980 +       return key;
44981 +}
44982 +
44983 +/* mergeable_cde(): implementation of ->mergeable() item method.
44984 +
44985 +   Two items are mergeable iff they have the same item plugin and
44986 +   their keys yield the same directory id. That simple.
44987 +
44988 +*/
44989 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
44990 +                 const coord_t * p2 /* coord of second item */ )
44991 +{
44992 +       reiser4_key k1; /* key of item at @p1 */
44993 +       reiser4_key k2; /* key of item at @p2 */
44994 +
44995 +       assert("nikita-1339", p1 != NULL);
44996 +       assert("nikita-1340", p2 != NULL);
44997 +
44998 +       return
44999 +           (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
45000 +           (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
45001 +            extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
45002 +
45003 +}
45004 +
45005 +/* ->max_key_inside() method for this item plugin: item key with ordering, fulloid and offset taken from max_key(). */
45006 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
45007 +                               reiser4_key * result /* resulting key */ )
45008 +{
45009 +       assert("nikita-1342", coord != NULL);
45010 +
45011 +       item_key_by_coord(coord, result);       /* remaining key fields stay those of the item key */
45012 +       set_key_ordering(result, get_key_ordering(max_key()));
45013 +       set_key_fulloid(result, get_key_fulloid(max_key()));
45014 +       set_key_offset(result, get_key_offset(max_key()));
45015 +       return result;
45016 +}
45017 +
45018 +/* true iff item at @coord has item plugin @data->iplug and @key has the same directory id as the item key */
45019 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
45020 +                       const reiser4_key * key /* key to check */ ,
45021 +                       const reiser4_item_data * data  /* parameters of new
45022 +                                                        * item/unit being
45023 +                                                        * created */ )
45024 +{
45025 +       reiser4_key item_key;   /* key of the item at @coord */
45026 +
45027 +       /* FIXME-VS: do not rely on anything but iplug field of @data. Only
45028 +          data->iplug is initialized */
45029 +       assert("vs-457", data && data->iplug);
45030 +/*     assert( "vs-553", data -> user == 0 );*/
45031 +       item_key_by_coord(coord, &item_key);
45032 +
45033 +       return (item_plugin_by_coord(coord) == data->iplug) &&
45034 +           (extract_dir_id_from_key(&item_key) ==
45035 +            extract_dir_id_from_key(key));
45036 +}
45037 +
45038 +#if REISER4_DEBUG
45039 +/* check_cde(): ->check() method for compound directory items
45040 +
45041 +   used for debugging, every item should have here the most complete
45042 +   possible check of the consistency of the item that the inventor can
45043 +   construct. Returns 0 if item looks sane, -1 (with message in *@error) if not.
45044 +*/
45045 +int check_cde(const coord_t * coord /* coord of item to check */ ,
45046 +             const char **error /* where to store error message */ )
45047 +{
45048 +       int i;
45049 +       int result;
45050 +       char *item_start;       /* first byte of item body */
45051 +       char *item_end;         /* first byte after item body */
45052 +       reiser4_key key;
45053 +
45054 +       coord_t c;      /* copy of @coord; not used after coord_dup() below */
45055 +
45056 +       assert("nikita-1357", coord != NULL);
45057 +       assert("nikita-1358", error != NULL);
45058 +       /* NOTE(review): dot key is required when ->item_pos != 0, not == 0 -- confirm this is intended */
45059 +       if (!ergo(coord->item_pos != 0,
45060 +                 is_dot_key(item_key_by_coord(coord, &key)))) {
45061 +               *error = "CDE doesn't start with dot";
45062 +               return -1;
45063 +       }
45064 +       item_start = item_body_by_coord(coord);
45065 +       item_end = item_start + item_length_by_coord(coord);
45066 +
45067 +       coord_dup(&c, coord);
45068 +       result = 0;
45069 +       for (i = 0; i < units(coord); ++i) {
45070 +               directory_entry_format *entry;
45071 +               /* headers must leave room for at least one minimal body per unit */
45072 +               if ((char *)(header_at(coord, i) + 1) >
45073 +                   item_end - units(coord) * sizeof *entry) {
45074 +                       *error = "CDE header is out of bounds";
45075 +                       result = -1;
45076 +                       break;
45077 +               }
45078 +               entry = entry_at(coord, i);
45079 +               if ((char *)entry < item_start + sizeof(cde_item_format)) {    /* body overlaps item header */
45080 +                       *error = "CDE header is too low";
45081 +                       result = -1;
45082 +                       break;
45083 +               }
45084 +               if ((char *)(entry + 1) > item_end) {   /* body sticks out of the item */
45085 +                       *error = "CDE header is too high";
45086 +                       result = -1;
45087 +                       break;
45088 +               }
45089 +       }
45090 +
45091 +       return result;
45092 +}
45093 +#endif
45094 +
45095 +/* ->init() method for this item plugin: sets number of entries in the item at @coord to 0; returns 0. */
45096 +int init_cde(coord_t * coord /* coord of item */ ,
45097 +            coord_t * from UNUSED_ARG, reiser4_item_data * data        /* structure used for insertion */
45098 +            UNUSED_ARG)
45099 +{
45100 +       put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
45101 +       return 0;
45102 +}
45103 +
45104 +/* ->lookup() method for this item plugin: positions @coord (->unit_pos, ->between) by @key within the item. */
45105 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
45106 +                        lookup_bias bias /* search bias */ ,
45107 +                        coord_t * coord /* coord of item to lookup in */ )
45108 +{
45109 +       cmp_t last_comp;        /* last comparison done by find() */
45110 +       int pos;
45111 +
45112 +       reiser4_key utmost_key; /* holds the key of the item itself */
45113 +
45114 +       assert("nikita-1293", coord != NULL);
45115 +       assert("nikita-1294", key != NULL);
45116 +
45117 +       CHECKME(coord);
45118 +       /* @key is smaller than the item key: it is before the first unit */
45119 +       if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
45120 +               coord->unit_pos = 0;
45121 +               coord->between = BEFORE_UNIT;
45122 +               return CBK_COORD_NOTFOUND;
45123 +       }
45124 +       pos = find(coord, key, &last_comp);
45125 +       if (pos >= 0) { /* unit at @pos is the first one not less than @key */
45126 +               coord->unit_pos = (int)pos;
45127 +               switch (last_comp) {
45128 +               case EQUAL_TO:
45129 +                       coord->between = AT_UNIT;
45130 +                       return CBK_COORD_FOUND;
45131 +               case GREATER_THAN:
45132 +                       coord->between = BEFORE_UNIT;
45133 +                       return RETERR(-ENOENT);
45134 +               case LESS_THAN:
45135 +               default:
45136 +                       impossible("nikita-1298", "Broken find");
45137 +                       return RETERR(-EIO);
45138 +               }
45139 +       } else {        /* all units are less than @key: position after the last one */
45140 +               coord->unit_pos = units(coord) - 1;
45141 +               coord->between = AFTER_UNIT;
45142 +               return (bias ==
45143 +                       FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
45144 +                   CBK_COORD_NOTFOUND;
45145 +       }
45146 +}
45147 +
45148 +/* ->paste() method for this item plugin: for each entry in @data->data calls expand() and then paste_entry(). */
45149 +int paste_cde(coord_t * coord /* coord of item */ ,
45150 +             reiser4_item_data * data  /* parameters of new unit being
45151 +                                        * inserted */ ,
45152 +             carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
45153 +{
45154 +       cde_entry_data *e;
45155 +       int result;
45156 +       int i;
45157 +
45158 +       CHECKME(coord);
45159 +       e = (cde_entry_data *) data->data;
45160 +
45161 +       result = 0;
45162 +       for (i = 0; i < e->num_of_entries; ++i) {
45163 +               int pos;                /* where expand() made room for the entry */
45164 +               int phantom_size;       /* passed to expand() as length of new data */
45165 +
45166 +               phantom_size = data->length;
45167 +               if (units(coord) == 0)  /* item holds nothing but its header yet */
45168 +                       phantom_size -= sizeof(cde_item_format);
45169 +
45170 +               result =
45171 +                   expand(coord, e->entry + i, phantom_size, &pos, data->arg);
45172 +               if (result != 0)
45173 +                       break;
45174 +               result = paste_entry(coord, e->entry + i, pos, data->arg);
45175 +               if (result != 0)
45176 +                       break;
45177 +       }
45178 +       CHECKME(coord);
45179 +       return result;
45180 +}
45181 +
45182 +/* size of an item made of entries [0 .. @idx] of @coord: item header plus
45183 +   unit headers and bodies of those entries. */
45184 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
45185 +                             int idx /* index of the last unit counted */ )
45186 +{
45187 +       assert("nikita-1299", coord != NULL);
45188 +       assert("nikita-1300", idx < (int)units(coord));
45189 +
45190 +       return sizeof(cde_item_format) +
45191 +           (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
45192 +                                                           idx + 1) -
45193 +           offset_of(coord, 0);
45194 +}
45195 +
45196 +/* how many but not more than @want units of item at @coord can be shifted
45197 +   into @target node. If pend == SHIFT_LEFT the first units of the item are
45198 +   tried, if pend == SHIFT_RIGHT - the last ones. @target node has
45199 +   @free_space bytes of free space. Number of units that fit is returned,
45200 +   their total size is stored in @size. @want larger than the number of
45201 +   units in the item is clamped to that number. */
45202 +int can_shift_cde(unsigned free_space /* free space in target, bytes */ ,
45203 +                 coord_t * coord /* coord of source item */ ,
45204 +                 znode * target /* target node */ ,
45205 +                 shift_direction pend /* shift direction */ ,
45206 +                 unsigned *size /* resulting number of shifted bytes */ ,
45207 +                 unsigned want /* maximal number of units to shift */ )
45208 +{
45209 +       int shift;      /* index of boundary unit inside loops, number of units to shift after them */
45210 +
45211 +       CHECKME(coord);
45212 +       if (want == 0) {
45213 +               *size = 0;
45214 +               return 0;
45215 +       }
45216 +
45217 +       /* pend == SHIFT_LEFT <==> shifting to the left */
45218 +       if (pend == SHIFT_LEFT) {
45219 +               for (shift = min((int)want, units(coord)) - 1; shift >= 0;
45220 +                    --shift) {
45221 +                       *size = part_size(coord, shift);        /* units [0 .. shift] */
45222 +                       if (target != NULL)
45223 +                               *size -= sizeof(cde_item_format);
45224 +                       if (*size <= free_space)
45225 +                               break;
45226 +               }
45227 +               shift = shift + 1;      /* index of last fitting unit -> number of units */
45228 +       } else {
45229 +               int total_size;
45230 +
45231 +               assert("nikita-1301", pend == SHIFT_RIGHT);
45232 +
45233 +               total_size = item_length_by_coord(coord);
45234 +               for (shift = units(coord) - min((int)want, units(coord)) - 1;
45235 +                    shift < units(coord) - 1; ++shift) {
45236 +                       *size = total_size - part_size(coord, shift);   /* units after @shift */
45237 +                       if (target == NULL)
45238 +                               *size += sizeof(cde_item_format);
45239 +                       if (*size <= free_space)
45240 +                               break;
45241 +               }
45242 +               shift = units(coord) - shift - 1;       /* number of units after the boundary */
45243 +       }
45244 +       if (shift == 0)
45245 +               *size = 0;
45246 +       CHECKME(coord);
45247 +       return shift;
45248 +}
45249 +
45250 +/* ->copy_units() method for this item plugin: copies units [@from, @from + @count) of @source into @target. */
45251 +void copy_units_cde(coord_t * target /* coord of target item */ ,
45252 +                   coord_t * source /* coord of source item */ ,
45253 +                   unsigned from /* starting unit */ ,
45254 +                   unsigned count /* how many units to copy */ ,
45255 +                   shift_direction where_is_free_space /* shift direction */ ,
45256 +                   unsigned free_space /* free space in item */ )
45257 +{
45258 +       char *header_from;
45259 +       char *header_to;
45260 +
45261 +       char *entry_from;
45262 +       char *entry_to;
45263 +
45264 +       int pos_in_target;
45265 +       int data_size;
45266 +       int data_delta;
45267 +       int i;
45268 +
45269 +       assert("nikita-1303", target != NULL);
45270 +       assert("nikita-1304", source != NULL);
45271 +       assert("nikita-1305", (int)from < units(source));
45272 +       assert("nikita-1307", (int)(from + count) <= units(source));
45273 +
45274 +       if (where_is_free_space == SHIFT_LEFT) {
45275 +               assert("nikita-1453", from == 0);
45276 +               pos_in_target = units(target);  /* append after the existing units */
45277 +       } else {
45278 +               assert("nikita-1309", (int)(from + count) == units(source));
45279 +               pos_in_target = 0;      /* new units go in front; old content is first slid to the item start */
45280 +               memmove(item_body_by_coord(target),
45281 +                       (char *)item_body_by_coord(target) + free_space,
45282 +                       item_length_by_coord(target) - free_space);
45283 +       }
45284 +
45285 +       CHECKME(target);
45286 +       CHECKME(source);
45287 +
45288 +       /* expand @target */
45289 +       data_size =
45290 +           offset_of(source, (int)(from + count)) - offset_of(source,
45291 +                                                              (int)from);
45292 +
45293 +       if (units(target) == 0)
45294 +               free_space -= sizeof(cde_item_format);
45295 +
45296 +       expand_item(target, pos_in_target, (int)count,
45297 +                   (int)(item_length_by_coord(target) - free_space),
45298 +                   (unsigned)data_size);
45299 +
45300 +       /* copy units [@from, @from + @count) of @source into @target */
45301 +       data_delta =
45302 +           offset_of(target, pos_in_target) - offset_of(source, (int)from);
45303 +
45304 +       /* copy entries */
45305 +       entry_from = (char *)entry_at(source, (int)from);
45306 +       entry_to = (char *)entry_at(source, (int)(from + count));
45307 +       memmove(entry_at(target, pos_in_target), entry_from,
45308 +               (unsigned)(entry_to - entry_from));
45309 +
45310 +       /* copy headers */
45311 +       header_from = (char *)header_at(source, (int)from);
45312 +       header_to = (char *)header_at(source, (int)(from + count));
45313 +       memmove(header_at(target, pos_in_target), header_from,
45314 +               (unsigned)(header_to - header_from));
45315 +
45316 +       /* update offsets: copied headers still hold offsets relative to @source */
45317 +       for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45318 +               adj_offset(target, i, data_delta);
45319 +       CHECKME(target);
45320 +       CHECKME(source);
45321 +}
45322 +
45323 +/* ->cut_units() method for this item plugin: removes units @from..@to (inclusive), returns the bytes removed. */
45324 +int cut_units_cde(coord_t * coord /* coord of item */ ,
45325 +                 pos_in_node_t from /* start unit pos */ ,
45326 +                 pos_in_node_t to /* stop unit pos */ ,
45327 +                 struct carry_cut_data *cdata UNUSED_ARG,
45328 +                 reiser4_key * smallest_removed, reiser4_key * new_first)
45329 +{
45330 +       char *header_from;
45331 +       char *header_to;
45332 +
45333 +       char *entry_from;
45334 +       char *entry_to;
45335 +
45336 +       int size;
45337 +       int entry_delta;
45338 +       int header_delta;
45339 +       int i;
45340 +
45341 +       unsigned count;
45342 +
45343 +       CHECKME(coord);
45344 +
45345 +       count = to - from + 1;
45346 +
45347 +       assert("nikita-1454", coord != NULL);
45348 +       assert("nikita-1455", (int)(from + count) <= units(coord));
45349 +
45350 +       if (smallest_removed)
45351 +               unit_key_by_coord(coord, smallest_removed);
45352 +
45353 +       if (new_first) {
45354 +               coord_t next;
45355 +
45356 +               /* not everything is cut from item head */
45357 +               assert("vs-1527", from == 0);
45358 +               assert("vs-1528", to < units(coord) - 1);
45359 +
45360 +               coord_dup(&next, coord);
45361 +               next.unit_pos++;
45362 +               unit_key_by_coord(&next, new_first);
45363 +       }
45364 +
45365 +       size = item_length_by_coord(coord);
45366 +       if (count == (unsigned)units(coord)) {
45367 +               return size;    /* every unit is cut: report the whole item length */
45368 +       }
45369 +
45370 +       header_from = (char *)header_at(coord, (int)from);
45371 +       header_to = (char *)header_at(coord, (int)(from + count));
45372 +
45373 +       entry_from = (char *)entry_at(coord, (int)from);
45374 +       entry_to = (char *)entry_at(coord, (int)(from + count));
45375 +
45376 +       /* move headers */
45377 +       memmove(header_from, header_to,
45378 +               (unsigned)(address(coord, size) - header_to));
45379 +
45380 +       header_delta = header_to - header_from;
45381 +
45382 +       entry_from -= header_delta;     /* everything after the cut headers just moved down by header_delta */
45383 +       entry_to -= header_delta;
45384 +       size -= header_delta;
45385 +
45386 +       /* copy entries */
45387 +       memmove(entry_from, entry_to,
45388 +               (unsigned)(address(coord, size) - entry_to));
45389 +
45390 +       entry_delta = entry_to - entry_from;
45391 +       size -= entry_delta;
45392 +
45393 +       /* update offsets */
45394 +
45395 +       for (i = 0; i < (int)from; ++i)
45396 +               adj_offset(coord, i, -header_delta);
45397 +
45398 +       for (i = from; i < units(coord) - (int)count; ++i)
45399 +               adj_offset(coord, i, -header_delta - entry_delta);
45400 +
45401 +       put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45402 +                     &formatted_at(coord)->num_of_entries);
45403 +
45404 +       if (from == 0) {
45405 +               /* entries were removed from the head - move the remaining ones to the right */
45406 +               memmove((char *)item_body_by_coord(coord) +
45407 +                       header_delta + entry_delta, item_body_by_coord(coord),
45408 +                       (unsigned)size);
45409 +               if (REISER4_DEBUG)
45410 +                       memset(item_body_by_coord(coord), 0,
45411 +                              (unsigned)header_delta + entry_delta);
45412 +       } else {
45413 +               /* freed space is already at the end of item */
45414 +               if (REISER4_DEBUG)
45415 +                       memset((char *)item_body_by_coord(coord) + size, 0,
45416 +                              (unsigned)header_delta + entry_delta);
45417 +       }
45418 +
45419 +       return header_delta + entry_delta;
45420 +}
45421 +
45422 +int kill_units_cde(coord_t * coord /* coord of item */ ,
45423 +                  pos_in_node_t from /* start unit pos */ ,
45424 +                  pos_in_node_t to /* stop unit pos */ ,
45425 +                  struct carry_kill_data *kdata UNUSED_ARG,
45426 +                  reiser4_key * smallest_removed, reiser4_key * new_first)
45427 +{      /* kill is a plain cut for this item: @kdata is not used and NULL is passed as the cut data */
45428 +       return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45429 +}
45430 +
45431 +/* ->s.dir.extract_key() method for this item plugin: fills @key from the id stored in the entry at @coord. */
45432 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
45433 +                   reiser4_key * key /* resulting key */ )
45434 +{
45435 +       directory_entry_format *dent;
45436 +
45437 +       assert("nikita-1155", coord != NULL);
45438 +       assert("nikita-1156", key != NULL);
45439 +
45440 +       dent = entry_at(coord, idx_of(coord));
45441 +       return extract_key_from_id(&dent->id, key);     /* result of the conversion is returned as is */
45442 +}
45443 +
45444 +int
45445 +update_key_cde(const coord_t * coord, const reiser4_key * key,
45446 +              lock_handle * lh UNUSED_ARG)
45447 +{
45448 +       directory_entry_format *dent;
45449 +       obj_key_id obj_id;
45450 +       int result;
45451 +
45452 +       assert("nikita-2344", coord != NULL);
45453 +       assert("nikita-2345", key != NULL);
45454 +       /* store the object id built from @key in the entry at @coord; @lh is unused */
45455 +       dent = entry_at(coord, idx_of(coord));
45456 +       result = build_obj_key_id(key, &obj_id);
45457 +       if (result == 0) {
45458 +               dent->id = obj_id;
45459 +               znode_make_dirty(coord->node);  /* the node content changed */
45460 +       }
45461 +       return result;  /* 0 on success; a failed id build is reported instead of being swallowed */
45462 +}
45463 +
45464 +/* ->s.dir.extract_name() method for this item plugin: name of the entry at @coord via extract_dent_name(). */
45465 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45466 +{
45467 +       directory_entry_format *dent;
45468 +
45469 +       assert("nikita-1157", coord != NULL);
45470 +
45471 +       dent = entry_at(coord, idx_of(coord));
45472 +       return extract_dent_name(coord, dent, buf);     /* @buf is handed through to the helper */
45473 +}
45474 +
45475 +static int cde_bytes(int pasting, const reiser4_item_data * data)
45476 +{
45477 +       int nbytes = data->length;
45478 +
45479 +       /* unless pasting, the size of cde_item_format is not counted */
45480 +       if (pasting)
45481 +               return nbytes;
45482 +       return nbytes - sizeof(cde_item_format);
45483 +}
45484 +
45485 +/* ->s.dir.add_entry() method for this item plugin: adds one entry for @name; returns 0, -EDQUOT or a tree error */
45486 +int add_entry_cde(struct inode *dir /* directory object */ ,
45487 +                 coord_t * coord /* coord of item */ ,
45488 +                 lock_handle * lh /* lock handle for insertion */ ,
45489 +                 const struct dentry *name /* name to insert */ ,
45490 +                 reiser4_dir_entry_desc * dir_entry    /* parameters of new
45491 +                                                        * directory entry */ )
45492 +{
45493 +       reiser4_item_data data;
45494 +       cde_entry entry;
45495 +       cde_entry_data edata;
45496 +       int result;
45497 +
45498 +       assert("nikita-1656", coord->node == lh->node);
45499 +       assert("nikita-1657", znode_is_write_locked(coord->node));
45500 +
45501 +       edata.num_of_entries = 1;       /* exactly one entry is described by &entry */
45502 +       edata.entry = &entry;
45503 +
45504 +       entry.dir = dir;
45505 +       entry.obj = dir_entry->obj;
45506 +       entry.name = &name->d_name;
45507 +
45508 +       data.data = (char *)&edata;
45509 +       data.user = 0;          /* &edata is not user space */
45510 +       data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45511 +       data.arg = dir_entry;
45512 +       assert("nikita-1302", data.iplug != NULL);
45513 +
45514 +       result = is_dot_key(&dir_entry->key);   /* non-zero selects insert_by_coord() below, zero selects resize_item() */
45515 +       data.length = estimate_cde(result ? coord : NULL, &data);
45516 +
45517 +       /* NOTE-NIKITA quota plugin? NOTE(review): the charge is not undone if insert/resize below fails - confirm */
45518 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45519 +               return RETERR(-EDQUOT);
45520 +
45521 +       if (result)
45522 +               result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45523 +       else
45524 +               result = resize_item(coord, &data, &dir_entry->key, lh, 0);
45525 +       return result;
45526 +}
45527 +
45528 +/* ->s.dir.rem_entry() */
45529 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
45530 +                 const struct qstr *name, coord_t * coord /* coord of item */ ,
45531 +                 lock_handle * lh UNUSED_ARG   /* lock handle for
45532 +                                                * removal */ ,
45533 +                 reiser4_dir_entry_desc * entry UNUSED_ARG     /* parameters of
45534 +                                                                * directory entry
45535 +                                                                * being removed */ )
45536 +{
45537 +       coord_t shadow;
45538 +       int result;
45539 +       int length;
45540 +       ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45541 +
45542 +       assert("nikita-2870", strlen(name->name) == name->len);
45543 +       assert("nikita-2869",
45544 +              !strcmp(name->name, extract_name_cde(coord, buf)));
45545 +
45546 +       length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45547 +       if (is_longname(name->name, name->len))
45548 +               length += name->len + 1;
45549 +
45550 +       if (inode_get_bytes(dir) < length) {
45551 +               warning("nikita-2628", "Dir is broke: %llu: %llu",
45552 +                       (unsigned long long)get_inode_oid(dir),
45553 +                       inode_get_bytes(dir));
45554 +
45555 +               return RETERR(-EIO);
45556 +       }
45557 +
45558 +       /* cut_node() is supposed to take pointers to _different_
45559 +          coords, because it will modify them without respect to
45560 +          possible aliasing. To work around this, create temporary copy
45561 +          of @coord.
45562 +        */
45563 +       coord_dup(&shadow, coord);
45564 +       result =
45565 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45566 +       if (result == 0) {
45567 +               /* NOTE-NIKITA quota plugin? */
45568 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
45569 +       }
45570 +       return result;
45571 +}
45572 +
45573 +/* ->s.dir.max_name_len() method for this item plugin: largest item size minus the fixed per-entry overhead */
45574 +int max_name_len_cde(const struct inode *dir /* directory */ )
45575 +{
45576 +       return
45577 +           tree_by_inode(dir)->nplug->max_item_size() -
45578 +           sizeof(directory_entry_format) - sizeof(cde_item_format) -
45579 +           sizeof(cde_unit_header) - 2;        /* NOTE(review): the extra 2 bytes are not explained here - confirm */
45580 +}
45581 +
45582 +/* Make Linus happy.
45583 +   Local variables:
45584 +   c-indentation-style: "K&R"
45585 +   mode-name: "LC"
45586 +   c-basic-offset: 8
45587 +   tab-width: 8
45588 +   fill-column: 120
45589 +   End:
45590 +*/
45591 diff --git a/fs/reiser4/plugin/item/cde.h b/fs/reiser4/plugin/item/cde.h
45592 new file mode 100644
45593 index 0000000..b92fd14
45594 --- /dev/null
45595 +++ b/fs/reiser4/plugin/item/cde.h
45596 @@ -0,0 +1,87 @@
45597 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45598 +
45599 +/* Compound directory item. See cde.c for description. */
45600 +
45601 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45602 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45603 +
45604 +#include "../../forward.h"
45605 +#include "../../kassign.h"
45606 +#include "../../dformat.h"
45607 +
45608 +#include <linux/fs.h>          /* for struct inode */
45609 +#include <linux/dcache.h>      /* for struct dentry, etc  */
45610 +
45611 +typedef struct cde_unit_header {        /* per-unit header; an array of these follows the entry count */
45612 +       de_id hash;     /* NOTE(review): presumably the key-derived id of the entry - confirm in cde.c */
45613 +       d16 offset;     /* NOTE(review): presumably what offset_of()/adj_offset() in cde.c read and adjust - confirm */
45614 +} cde_unit_header;
45615 +
45616 +typedef struct cde_item_format {        /* layout of the start of a compound directory item */
45617 +       d16 num_of_entries;     /* entry count; cut_units_cde() stores it with cpu_to_le16() and put_unaligned() */
45618 +       cde_unit_header entry[0];       /* zero-length array: the unit headers start right after the count */
45619 +} cde_item_format;
45620 +
45621 +typedef struct cde_entry {     /* in-memory description of one entry to add (filled by add_entry_cde()) */
45622 +       const struct inode *dir;        /* directory the entry is added to */
45623 +       const struct inode *obj;        /* object the entry refers to (dir_entry->obj in add_entry_cde()) */
45624 +       const struct qstr *name;        /* entry name (the d_name of the dentry in add_entry_cde()) */
45625 +} cde_entry;
45626 +
45627 +typedef struct cde_entry_data {        /* batch of entries handed to the item methods through reiser4_item_data.data */
45628 +       int num_of_entries;     /* number of elements in @entry (1 in add_entry_cde()) */
45629 +       cde_entry *entry;       /* array of entry descriptions; paste_cde() walks entry + i */
45630 +} cde_entry_data;
45631 +
45632 +/* plugin->u.item.b.* */
45633 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
45634 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
45635 +                       const reiser4_item_data *);
45636 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
45637 +pos_in_node_t nr_units_cde(const coord_t * coord);
45638 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
45639 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
45640 +void print_cde(const char *prefix, coord_t * coord);
45641 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
45642 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
45643 +                        coord_t * coord);
45644 +int paste_cde(coord_t * coord, reiser4_item_data * data,
45645 +             carry_plugin_info * info UNUSED_ARG);
45646 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
45647 +                 shift_direction pend, unsigned *size, unsigned want);
45648 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
45649 +                   unsigned count, shift_direction where_is_free_space,
45650 +                   unsigned free_space);
45651 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45652 +                 struct carry_cut_data *, reiser4_key * smallest_removed,
45653 +                 reiser4_key * new_first);
45654 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
45655 +                  struct carry_kill_data *, reiser4_key * smallest_removed,
45656 +                  reiser4_key * new_first);
45657 +void print_cde(const char *prefix, coord_t * coord);   /* NOTE(review): repeats the declaration made earlier in this list */
45658 +int check_cde(const coord_t * coord, const char **error);
45659 +
45660 +/* plugin->u.item.s.dir.* (directory-item methods, defined in cde.c) */
45661 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
45662 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
45663 +                  lock_handle * lh);
45664 +char *extract_name_cde(const coord_t * coord, char *buf);
45665 +int add_entry_cde(struct inode *dir, coord_t * coord,
45666 +                 lock_handle * lh, const struct dentry *name,
45667 +                 reiser4_dir_entry_desc * entry);
45668 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
45669 +                 lock_handle * lh, reiser4_dir_entry_desc * entry);
45670 +int max_name_len_cde(const struct inode *dir);
45671 +
45672 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
45673 +#endif
45674 +
45675 +/* Make Linus happy.
45676 +   Local variables:
45677 +   c-indentation-style: "K&R"
45678 +   mode-name: "LC"
45679 +   c-basic-offset: 8
45680 +   tab-width: 8
45681 +   fill-column: 120
45682 +   End:
45683 +*/
45684 diff --git a/fs/reiser4/plugin/item/ctail.c b/fs/reiser4/plugin/item/ctail.c
45685 new file mode 100644
45686 index 0000000..d962933
45687 --- /dev/null
45688 +++ b/fs/reiser4/plugin/item/ctail.c
45689 @@ -0,0 +1,1588 @@
45690 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45691 +
45692 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
45693 +
45694 +/* DESCRIPTION:
45695 +
45696 +Each cryptcompress object is stored on disk as a set of clusters sliced
45697 +into ctails.
45698 +
45699 +Internal on-disk structure:
45700 +
45701 +        HEADER   (1)  The disk cluster shift is stored here
45702 +       BODY
45703 +*/
45704 +
45705 +#include "../../forward.h"
45706 +#include "../../debug.h"
45707 +#include "../../dformat.h"
45708 +#include "../../kassign.h"
45709 +#include "../../key.h"
45710 +#include "../../coord.h"
45711 +#include "item.h"
45712 +#include "../node/node.h"
45713 +#include "../plugin.h"
45714 +#include "../object.h"
45715 +#include "../../znode.h"
45716 +#include "../../carry.h"
45717 +#include "../../tree.h"
45718 +#include "../../inode.h"
45719 +#include "../../super.h"
45720 +#include "../../context.h"
45721 +#include "../../page_cache.h"
45722 +#include "../cluster.h"
45723 +#include "../../flush.h"
45724 +#include "../../tree_walk.h"
45725 +
45726 +#include <linux/pagevec.h>
45727 +#include <linux/swap.h>
45728 +#include <linux/fs.h>
45729 +
45730 +/* return the body of the item at @coord typed as ctail_item_format; @coord must not be NULL */
45731 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
45732 +{
45733 +       assert("edward-60", coord != NULL);
45734 +       return item_body_by_coord(coord);
45735 +}
45736 +
45737 +int cluster_shift_by_coord(const coord_t * coord)
45738 +{      /* value of the cluster_shift field at the start of the ctail item at @coord (unaligned read) */
45739 +       return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
45740 +}
45741 +
45742 +static loff_t off_by_coord(const coord_t * coord)
45743 +{      /* offset component of the key of the item at @coord */
45744 +       reiser4_key key;        /* scratch storage for the item key */
45745 +       return get_key_offset(item_key_by_coord(coord, &key));
45746 +}
45747 +
45748 +static int coord_is_unprepped_ctail(const coord_t * coord)
45749 +{      /* non-zero if the cluster shift stored in the ctail at @coord equals UCTAIL_SHIFT */
45750 +       assert("edward-1233", coord != NULL);
45751 +       assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
45752 +       assert("edward-1235",
45753 +              ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
45754 +                   nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
45755 +
45756 +       return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
45757 +}
45758 +
45759 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
45760 +{      /* key offset of the item at @coord shifted right by the cluster shift */
45761 +       int shift;
45762 +
45763 +       if (inode != NULL) {
45764 +               shift = inode_cluster_shift(inode);     /* prefer the shift known to the inode */
45765 +               assert("edward-1236",
45766 +                      ergo(!coord_is_unprepped_ctail(coord),
45767 +                           shift == cluster_shift_by_coord(coord)));
45768 +       } else {
45769 +               assert("edward-1237", !coord_is_unprepped_ctail(coord));
45770 +               shift = cluster_shift_by_coord(coord);  /* no inode: use the shift stored in the item */
45771 +       }
45772 +       return off_by_coord(coord) >> shift;
45773 +}
45774 +
45775 +static int disk_cluster_size(const coord_t * coord)
45776 +{      /* 1 << (cluster shift stored in the ctail item at @coord) */
45777 +       assert("edward-1156",
45778 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45779 +       /* calculation of disk cluster size
45780 +          is meaningless if ctail is unprepped */
45781 +       assert("edward-1238", !coord_is_unprepped_ctail(coord));
45782 +
45783 +       return 1 << cluster_shift_by_coord(coord);
45784 +}
45785 +
45786 +/* true if the ctail at @coord is unprepped or the offset of @key is a multiple of the disk cluster size */
45787 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
45788 +{
45789 +       assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
45790 +
45791 +       return coord_is_unprepped_ctail(coord) ||
45792 +           ((get_key_offset(key) &
45793 +             ((loff_t) disk_cluster_size(coord) - 1)) == 0);
45794 +}
45795 +
45796 +static char *first_unit(coord_t * coord)
45797 +{
45798 +       /* first byte after the ctail_item_format; the char * cast avoids arithmetic on a void pointer */
45799 +       return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
45800 +}
45801 +
45802 +/* plugin->u.item.b.max_key_inside :
45803 +   tail_max_key_inside */
45804 +
45805 +/* plugin->u.item.b.can_contain_key: 1 if data with @key can be added to the item at @coord, else 0 */
45806 +int
45807 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
45808 +                     const reiser4_item_data * data)
45809 +{
45810 +       reiser4_key item_key;
45811 +
45812 +       if (item_plugin_by_coord(coord) != data->iplug)
45813 +               return 0;       /* item of a different plugin */
45814 +
45815 +       item_key_by_coord(coord, &item_key);
45816 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
45817 +           get_key_objectid(key) != get_key_objectid(&item_key))
45818 +               return 0;       /* different locality or objectid */
45819 +       if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
45820 +           get_key_offset(key))
45821 +               return 0;       /* @key does not continue right after the last unit */
45822 +       if (is_disk_cluster_key(key, coord))
45823 +               return 0;       /* is_disk_cluster_key() holds for @key */
45824 +       return 1;
45825 +}
45826 +
45827 +/* plugin->u.item.b.mergeable: 1 if @p2 directly continues @p1, else 0;
45828 +   c-tails of different clusters are not mergeable */
45829 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
45830 +{
45831 +       reiser4_key key1, key2;
45832 +
45833 +       assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
45834 +       assert("edward-61",
45835 +              item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
45836 +
45837 +       if (item_id_by_coord(p2) != CTAIL_ID) {
45838 +               /* second item is of another type */
45839 +               return 0;
45840 +       }
45841 +
45842 +       item_key_by_coord(p1, &key1);
45843 +       item_key_by_coord(p2, &key2);
45844 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
45845 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
45846 +           get_key_type(&key1) != get_key_type(&key2)) {
45847 +               /* items of different objects */
45848 +               return 0;
45849 +       }
45850 +       if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
45851 +               /*  not adjacent items */
45852 +               return 0;
45853 +       if (is_disk_cluster_key(&key2, p2))
45854 +               return 0;       /* is_disk_cluster_key() holds for the key of @p2 */
45855 +       return 1;
45856 +}
45857 +
45858 +/* plugin->u.item.b.nr_units: item length minus the size of the cluster_shift field */
45859 +pos_in_node_t nr_units_ctail(const coord_t * coord)
45860 +{
45861 +       return (item_length_by_coord(coord) -
45862 +               sizeof(ctail_formatted_at(coord)->cluster_shift));
45863 +}
45864 +
45865 +/* plugin->u.item.b.estimate:
45866 +   space needed to insert (@coord == NULL) or paste (@coord != NULL)
45867 +   @data->length bytes of ctail data */
45868 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
45869 +                  const reiser4_item_data *
45870 +                  data /* parameters for new item */ )
45871 +{
45872 +       /* insert: a new item also needs room for its ctail_item_format;
45873 +          paste: only the data bytes are added to the existing item */
45874 +       int header = (coord == NULL) ? sizeof(ctail_item_format) : 0;
45875 +
45876 +       /* header is 0 in the paste case */
45877 +       return header + data->length;
45878 +}
45879 +
45880 +/* ->init() method for this item plugin: stores the cluster shift in @to, from @data->arg or copied from @from. */
45881 +int init_ctail(coord_t * to /* coord of item */ ,
45882 +              coord_t * from /* old_item */ ,
45883 +              reiser4_item_data * data /* structure used for insertion */ )
45884 +{
45885 +       int cluster_shift;      /* cpu value to convert */
45886 +
45887 +       if (data) {
45888 +               assert("edward-463", data->length > sizeof(ctail_item_format));
45889 +               cluster_shift = *((int *)(data->arg));  /* @data->arg is read as a pointer to int */
45890 +               data->length -= sizeof(ctail_item_format);      /* @data->length is reduced by the header size */
45891 +       } else {
45892 +               assert("edward-464", from != NULL);
45893 +               assert("edward-855", ctail_ok(from));
45894 +               cluster_shift = (int)(cluster_shift_by_coord(from));
45895 +       }
45896 +       put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
45897 +       assert("edward-856", ctail_ok(to));
45898 +       return 0;
45899 +}
45900 +
45901 +/* plugin->u.item.b.lookup:
45902 +   NULL: We are looking for item keys only */
45903 +
45904 +#if REISER4_DEBUG
45905 +int ctail_ok(const coord_t * coord)
45906 +{      /* debug-only check: the ctail is unprepped or cluster_shift_ok() accepts its stored shift */
45907 +       return coord_is_unprepped_ctail(coord) ||
45908 +           cluster_shift_ok(cluster_shift_by_coord(coord));
45909 +}
45910 +
45911 +/* plugin->u.item.b.check: 0 if the ctail at @coord passes ctail_ok(), 1 otherwise */
45912 +int check_ctail(const coord_t * coord, const char **error)
45913 +{
45914 +       if (ctail_ok(coord))
45915 +               return 0;
45916 +       /* the reason is reported only when the caller supplied @error */
45917 +       if (error)
45918 +               *error = "bad cluster shift in ctail";
45919 +       return 1;
45920 +}
45921 +#endif
45922 +
45923 +/* plugin->u.item.b.paste: copies @data->length bytes of @data->data into the already expanded item; returns 0 */
45924 +int
45925 +paste_ctail(coord_t * coord, reiser4_item_data * data,
45926 +           carry_plugin_info * info UNUSED_ARG)
45927 +{
45928 +       unsigned old_nr_units;
45929 +
45930 +       assert("edward-268", data->data != NULL);
45931 +       /* copy only from kernel space */
45932 +       assert("edward-66", data->user == 0);
45933 +
45934 +       old_nr_units =
45935 +           item_length_by_coord(coord) - sizeof(ctail_item_format) -
45936 +           data->length;       /* current length minus the header and the bytes being pasted */
45937 +
45938 +       /* ctail items never get pasted in the middle */
45939 +
45940 +       if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
45941 +
45942 +               /* paste at the beginning when create new item */
45943 +               assert("edward-450",
45944 +                      item_length_by_coord(coord) ==
45945 +                      data->length + sizeof(ctail_item_format));
45946 +               assert("edward-451", old_nr_units == 0);
45947 +       } else if (coord->unit_pos == old_nr_units - 1
45948 +                  && coord->between == AFTER_UNIT) {
45949 +
45950 +               /* paste at the end */
45951 +               coord->unit_pos++;
45952 +       } else
45953 +               impossible("edward-453", "bad paste position");
45954 +
45955 +       memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
45956 +
45957 +       assert("edward-857", ctail_ok(coord));
45958 +
45959 +       return 0;
45960 +}
45961 +
45962 +/* plugin->u.item.b.fast_paste */
45963 +
45964 +/* plugin->u.item.b.can_shift
45965 +   number of units is returned via return value, number of bytes via @size. For
45966 +   ctail items they coincide, except that a new item (no @target) also pays for its header */
45967 +int
45968 +can_shift_ctail(unsigned free_space, coord_t * source,
45969 +               znode * target, shift_direction direction UNUSED_ARG,
45970 +               unsigned *size /* number of bytes */ , unsigned want)
45971 +{
45972 +       /* make sure that we do not want to shift more than we have */
45973 +       assert("edward-68", want > 0 && want <= nr_units_ctail(source));
45974 +
45975 +       *size = min(want, free_space);
45976 +
45977 +       if (!target) {
45978 +               /* new item will be created */
45979 +               if (*size <= sizeof(ctail_item_format)) {
45980 +                       *size = 0;      /* not even one unit fits beside the header */
45981 +                       return 0;
45982 +               }
45983 +               return *size - sizeof(ctail_item_format);
45984 +       }
45985 +       return *size;
45986 +}
45987 +
45988 +/* plugin->u.item.b.copy_units: copies @count units of @source, starting at @from, into @target;
45989 +   cooperates with ->can_shift() */
45990 +void
45991 +copy_units_ctail(coord_t * target, coord_t * source,
45992 +                unsigned from, unsigned count /* units */ ,
45993 +                shift_direction where_is_free_space,
45994 +                unsigned free_space /* bytes */ )
45995 +{
45996 +       /* make sure that item @target is expanded already */
45997 +       assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
45998 +       assert("edward-70", free_space == count || free_space == count + 1);
45999 +
46000 +       assert("edward-858", ctail_ok(source));
46001 +
46002 +       if (where_is_free_space == SHIFT_LEFT) {
46003 +               /* append item @target with @count first bytes of @source:
46004 +                  this restriction came from ordinary tails */
46005 +               assert("edward-71", from == 0);
46006 +               assert("edward-860", ctail_ok(target));
46007 +
46008 +               memcpy(first_unit(target) + nr_units_ctail(target) - count,
46009 +                      first_unit(source), count);
46010 +       } else {
46011 +               /* target item is moved to right already */
46012 +               reiser4_key key;
46013 +
46014 +               assert("edward-72", nr_units_ctail(source) == from + count);
46015 +
46016 +               if (free_space == count) {
46017 +                       init_ctail(target, source, NULL);       /* copy the cluster shift of @source into @target */
46018 +               } else {
46019 +                       /* new item has been created */
46020 +                       assert("edward-862", ctail_ok(target));
46021 +               }
46022 +               memcpy(first_unit(target), first_unit(source) + from, count);
46023 +
46024 +               assert("edward-863", ctail_ok(target));
46025 +
46026 +               /* new units are inserted before first unit in an item,
46027 +                  therefore, we have to update item key */
46028 +               item_key_by_coord(source, &key);
46029 +               set_key_offset(&key, get_key_offset(&key) + from);
46030 +
46031 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
46032 +                                                                  NULL /*info */);
46033 +       }
46034 +}
46035 +
46036 +/* plugin->u.item.b.create_hook: marks the node of @coord as convertible; @arg is not used; always returns 0 */
46037 +int create_hook_ctail(const coord_t * coord, void *arg)
46038 +{
46039 +       assert("edward-864", znode_is_loaded(coord->node));
46040 +
46041 +       znode_set_convertible(coord->node);
46042 +       return 0;
46043 +}
46044 +
46045 +/* plugin->u.item.b.kill_hook: truncates the page cluster of the killed item when needed; @count is not used; always returns 0 */
46046 +int
46047 +kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
46048 +               carry_kill_data * kdata)
46049 +{
46050 +       struct inode *inode;
46051 +
46052 +       assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
46053 +       assert("edward-291", znode_is_write_locked(coord->node));
46054 +
46055 +       inode = kdata->inode;
46056 +       if (inode) {    /* without an inode there is nothing to truncate */
46057 +               reiser4_key key;
46058 +               item_key_by_coord(coord, &key);
46059 +
46060 +               if (from == 0 && is_disk_cluster_key(&key, coord)) {
46061 +                       /* kill starts at unit 0 of an item accepted by is_disk_cluster_key() */
46062 +                       cloff_t start =
46063 +                           off_to_clust(get_key_offset(&key), inode);
46064 +                       truncate_page_cluster(inode, start);
46065 +               }
46066 +       }
46067 +       return 0;
46068 +}
46068 +
46069 +/* for shift_hook_ctail(),
46070 +   return true if the item is an unprepped ctail, or if the jnode found by jlookup() for it has JNODE_DIRTY set
46071 +*/
46072 +static int ctail_convertible(const coord_t * coord)
46073 +{
46074 +       int result;
46075 +       reiser4_key key;
46076 +       jnode *child = NULL;
46077 +
46078 +       assert("edward-477", coord != NULL);
46079 +       assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
46080 +
46081 +       if (coord_is_unprepped_ctail(coord))
46082 +               /* unprepped ctail should be converted */
46083 +               return 1;
46084 +
46085 +       item_key_by_coord(coord, &key);
46086 +       child = jlookup(current_tree,
46087 +                       get_key_objectid(&key),
46088 +                       off_to_pg(off_by_coord(coord)));
46089 +       if (!child)     /* no jnode found: nothing dirty to convert */
46090 +               return 0;
46091 +       result = JF_ISSET(child, JNODE_DIRTY);
46092 +       jput(child);    /* done with the jnode returned by jlookup() */
46093 +       return result;
46094 +}
46095 +
46096 +/* FIXME-EDWARD */
46097 +/* plugin->u.item.b.shift_hook: carries the convertible mark of @old_node over to the node of @item; always returns 0 */
46098 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
46099 +                    unsigned from UNUSED_ARG /* start unit */ ,
46100 +                    unsigned count UNUSED_ARG /* stop unit */ ,
46101 +                    znode * old_node /* old parent */ )
46102 +{
46103 +       assert("edward-479", item != NULL);
46104 +       assert("edward-480", item->node != old_node);
46105 +
46106 +       if (!znode_convertible(old_node) || znode_convertible(item->node))
46107 +               return 0;       /* nothing to carry over, or the new node is marked already */
46108 +       if (ctail_convertible(item))
46109 +               znode_set_convertible(item->node);
46110 +       return 0;
46111 +}
46112 +
46113 +static int      /* common body of cut_units_ctail() and kill_units_ctail(); returns the number of bytes removed */
46114 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46115 +                       int cut, void *p, reiser4_key * smallest_removed,
46116 +                       reiser4_key * new_first)
46117 +{
46118 +       pos_in_node_t count;    /* number of units to cut */
46119 +       char *item;
46120 +
46121 +       count = to - from + 1;
46122 +       item = item_body_by_coord(coord);
46123 +
46124 +       assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
46125 +
46126 +       if (smallest_removed) {
46127 +               /* store smallest key removed */
46128 +               item_key_by_coord(coord, smallest_removed);
46129 +               set_key_offset(smallest_removed,
46130 +                              get_key_offset(smallest_removed) + from);
46131 +       }
46132 +
46133 +       if (new_first) {        /* caller wants the key of the first unit that remains */
46134 +               assert("vs-1531", from == 0);
46135 +
46136 +               item_key_by_coord(coord, new_first);
46137 +               set_key_offset(new_first,
46138 +                              get_key_offset(new_first) + from + count);
46139 +       }
46140 +
46141 +       if (!cut)       /* kill mode: @p is the carry_kill_data handed to the kill hook */
46142 +               kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
46143 +
46144 +       if (from == 0) {
46145 +               if (count != nr_units_ctail(coord)) {
46146 +                       /* part of item is removed, so move free space at the beginning
46147 +                          of the item and update item key */
46148 +                       reiser4_key key;
46149 +                       memcpy(item + to + 1, item, sizeof(ctail_item_format));
46150 +                       item_key_by_coord(coord, &key);
46151 +                       set_key_offset(&key, get_key_offset(&key) + count);
46152 +                       node_plugin_by_node(coord->node)->update_item_key(coord,
46153 +                                                                         &key,
46154 +                                                                         NULL);
46155 +               } else {
46156 +                       /* cut_units should not be called to cut everything */
46157 +                       assert("vs-1532", ergo(cut, 0));
46158 +                       /* whole item is cut, so more than amount of space occupied
46159 +                          by units got freed */
46160 +                       count += sizeof(ctail_item_format);
46161 +               }
46162 +               if (REISER4_DEBUG)
46163 +                       memset(item, 0, count);
46164 +       } else if (REISER4_DEBUG)
46165 +               memset(item + sizeof(ctail_item_format) + from, 0, count);
46166 +       return count;
46167 +}
46168 +
46169 +/* plugin->u.item.b.cut_units: cut mode of cut_or_kill_ctail_units() (no kill hook is run); @cdata is not used */
46170 +int
46171 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
46172 +               carry_cut_data * cdata, reiser4_key * smallest_removed,
46173 +               reiser4_key * new_first)
46174 +{
46175 +       return cut_or_kill_ctail_units(item, from, to, 1, NULL,
46176 +                                      smallest_removed, new_first);
46177 +}
46178 +
46179 +/* plugin->u.item.b.kill_units: kill mode of cut_or_kill_ctail_units(); @kdata is passed on to kill_hook_ctail() */
46180 +int
46181 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
46182 +                struct carry_kill_data *kdata, reiser4_key * smallest_removed,
46183 +                reiser4_key * new_first)
46184 +{
46185 +       return cut_or_kill_ctail_units(item, from, to, 0, kdata,
46186 +                                      smallest_removed, new_first);
46187 +}
46188 +
46189 +/* plugin->u.item.s.file.read: copies all units of the ctail at the hinted coord into f->data and advances flow @f; always returns 0 */
46190 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
46191 +{
46192 +       uf_coord_t *uf_coord;
46193 +       coord_t *coord;
46194 +
46195 +       uf_coord = &hint->ext_coord;
46196 +       coord = &uf_coord->coord;
46197 +       assert("edward-127", f->user == 0);
46198 +       assert("edward-129", coord && coord->node);
46199 +       assert("edward-130", coord_is_existing_unit(coord));
46200 +       assert("edward-132", znode_is_loaded(coord->node));
46201 +
46202 +       /* start read only from the beginning of ctail */
46203 +       assert("edward-133", coord->unit_pos == 0);
46204 +       /* read only whole ctails */
46205 +       assert("edward-135", nr_units_ctail(coord) <= f->length);
46206 +
46207 +       assert("edward-136", schedulable());
46208 +       assert("edward-886", ctail_ok(coord));
46209 +
46210 +       if (f->data)    /* with no destination buffer the flow is only advanced */
46211 +               memcpy(f->data, (char *)first_unit(coord),
46212 +                      (size_t) nr_units_ctail(coord));
46213 +
46214 +       dclust_set_extension(hint);
46215 +       mark_page_accessed(znode_page(coord->node));
46216 +       move_flow_forward(f, nr_units_ctail(coord));
46217 +
46218 +       return 0;
46219 +}
46220 +
46221 +/* Reads a disk cluster consisting of ctail items,
46222 +   attaches a transform stream with plain text. Returns 0 or the error of a failed step */
46223 +int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
46224 +                           int write)
46225 +{
46226 +       int result;
46227 +       assert("edward-671", clust->hint != NULL);
46228 +       assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
46229 +       assert("edward-672", crc_inode_ok(inode));
46230 +
46231 +       /* set input stream */
46232 +       result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
46233 +       if (result)
46234 +               return result;
46235 +
46236 +       result = find_cluster(clust, inode, 1 /* read */ , write);
46237 +       assert("edward-1340", !result); /* NOTE(review): debug builds stop here before the error below can be returned */
46238 +       if (result)
46239 +               return result;
46240 +       if (!write)
46241 +               /* write still need the lock to insert unprepped
46242 +                  items, etc... */
46243 +               put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
46244 +
46245 +       assert("edward-673",
46246 +              ergo(write, znode_is_write_locked(clust->hint->lh.node)));
46247 +
46248 +       if (clust->dstat == FAKE_DISK_CLUSTER ||
46249 +           clust->dstat == UNPR_DISK_CLUSTER) {
46250 +               /* nothing to inflate for these states: the stream is marked uptodate as is */
46251 +               tfm_cluster_set_uptodate(&clust->tc);
46252 +               return 0;
46253 +       }
46254 +       result = grab_coa(&clust->tc, inode_compression_plugin(inode));
46255 +       if (result)
46256 +               return result;
46257 +       result = inflate_cluster(clust, inode);
46258 +       if (result)
46259 +               return result;
46260 +       tfm_cluster_set_uptodate(&clust->tc);
46261 +       return 0;
46262 +}
46262 +
46263 +/* read one locked page: fills @page from the cluster of @clust; returns 0, the error of ctail_read_disk_cluster(), or -EINVAL when pgcnt is 0 */
46264 +int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
46265 +                     struct page *page)
46266 +{
46267 +       int ret;
46268 +       unsigned cloff;
46269 +       char *data;
46270 +       size_t pgcnt;
46271 +       tfm_cluster_t *tc = &clust->tc;
46272 +
46273 +       assert("edward-212", PageLocked(page));
46274 +
46275 +       if (PageUptodate(page))
46276 +               goto exit;
46277 +
46278 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
46279 +               clust->index = pg_to_clust(page->index, inode);
46280 +               unlock_page(page);      /* the page lock is dropped while the disk cluster is read */
46281 +               ret = ctail_read_disk_cluster(clust, inode, 0 /* read */ );
46282 +               lock_page(page);
46283 +               if (ret)
46284 +                       return ret;
46285 +       }
46286 +       if (PageUptodate(page))
46287 +               /* races with another read/write */
46288 +               goto exit;
46289 +
46290 +       /* bytes in the page */
46291 +       pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
46292 +
46293 +       if (pgcnt == 0) {
46294 +               assert("edward-1290", 0);
46295 +               return RETERR(-EINVAL);
46296 +       }
46297 +       assert("edward-119", tfm_cluster_is_uptodate(tc));
46298 +
46299 +       switch (clust->dstat) {
46300 +       case UNPR_DISK_CLUSTER:
46301 +               assert("edward-1285", 0);
46302 +#if REISER4_DEBUG
46303 +               warning("edward-1168",
46304 +                       "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46305 +                       page->index, clust->index,
46306 +                       (unsigned long long)get_inode_oid(inode));
46307 +#endif
46308 +       case FAKE_DISK_CLUSTER: /* UNPR_DISK_CLUSTER falls through to here */
46309 +               /* fill the page by zeroes */
46310 +               data = kmap_atomic(page, KM_USER0);
46311 +
46312 +               memset(data, 0, PAGE_CACHE_SIZE);
46313 +               flush_dcache_page(page);
46314 +               kunmap_atomic(data, KM_USER0);
46315 +               SetPageUptodate(page);
46316 +               break;
46317 +       case PREP_DISK_CLUSTER:
46318 +               /* fill the page by transformed data */
46319 +               assert("edward-1058", !PageUptodate(page));
46320 +               assert("edward-120", tc->len <= inode_cluster_size(inode));
46321 +
46322 +               /* start page offset in the cluster */
46323 +               cloff = pg_to_off_to_cloff(page->index, inode);
46324 +
46325 +               data = kmap(page);
46326 +               memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46327 +               memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);      /* zero the rest of the page */
46328 +               flush_dcache_page(page);
46329 +               kunmap(page);
46330 +               SetPageUptodate(page);
46331 +               break;
46332 +       default:
46333 +               impossible("edward-1169", "bad disk cluster state");
46334 +       }
46335 +      exit:
46336 +       return 0;
46337 +}
46338 +
46339 +/* plugin->u.item.s.file.readpage: @vp is the cluster handle; reads @page with a temporary hint and unlocks it on the normal path */
46340 +int readpage_ctail(void *vp, struct page *page)
46341 +{
46342 +       int result;
46343 +       hint_t *hint;
46344 +       reiser4_cluster_t *clust = vp;
46345 +
46346 +       assert("edward-114", clust != NULL);
46347 +       assert("edward-115", PageLocked(page));
46348 +       assert("edward-116", !PageUptodate(page));
46349 +       assert("edward-117", !jprivate(page) && !PagePrivate(page));
46350 +       assert("edward-118", page->mapping && page->mapping->host);
46351 +       assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46352 +
46353 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
46354 +       if (hint == NULL)       /* NOTE(review): returns with @page still locked - confirm the caller unlocks */
46355 +               return RETERR(-ENOMEM);
46356 +       clust->hint = hint;
46357 +       result = load_file_hint(clust->file, hint);
46358 +       if (result) {   /* NOTE(review): same here, @page is not unlocked on this return */
46359 +               kfree(hint);
46360 +               return result;
46361 +       }
46362 +       assert("vs-25", hint->ext_coord.lh == &hint->lh);
46363 +       result = do_readpage_ctail(page->mapping->host, clust, page);
46364 +
46365 +       assert("edward-213", PageLocked(page));
46366 +       assert("edward-1163", ergo(!result, PageUptodate(page)));
46367 +       assert("edward-868",
46368 +              ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46369 +
46370 +       unlock_page(page);
46371 +       done_lh(&hint->lh);
46372 +       hint->ext_coord.valid = 0;      /* cleared before the hint is saved */
46373 +       save_file_hint(clust->file, hint);
46374 +       kfree(hint);
46375 +       tfm_cluster_clr_uptodate(&clust->tc);
46376 +
46377 +       return result;
46378 +}
46379 +
46380 +/* This unconditionally reads a disk cluster and fills every page of the page cluster.
46381 +   Helper function for ->readpages(). Returns 0 or the first error met */
46382 +static int
46383 +ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46384 +{
46385 +       int i;
46386 +       int result;
46387 +       assert("edward-779", clust != NULL);
46388 +       assert("edward-1059", clust->win == NULL);
46389 +       assert("edward-780", inode != NULL);
46390 +
46391 +       result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46392 +       if (result)
46393 +               return result;
46394 +       result = ctail_read_disk_cluster(clust, inode, 0 /* read */ );
46395 +       if (result)
46396 +               goto out;
46397 +       /* at this point stream with valid plain text is attached */
46398 +       assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46399 +
46400 +       for (i = 0; i < clust->nr_pages; i++) {
46401 +               struct page *page = clust->pages[i];
46402 +               lock_page(page);        /* do_readpage_ctail() asserts that the page is locked */
46403 +               result = do_readpage_ctail(inode, clust, page);
46404 +               unlock_page(page);
46405 +               if (result)
46406 +                       break;
46407 +       }
46408 +       tfm_cluster_clr_uptodate(&clust->tc);
46409 +      out:
46410 +       release_cluster_pages(clust);   /* reached on success and on failure after prepare_page_cluster() */
46411 +       return result;
46412 +}
46413 +
46414 +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /* page whose lru is the last entry of @head */
46415 +#define list_to_next_page(head) (list_entry((head)->prev->prev, struct page, lru)) /* page just before the last entry */
46416 +
46417 +#if REISER4_DEBUG /* check_order() is not referenced in the visible part of this file; readpages_ctail() open-codes the same assertion */
46418 +#define check_order(pages)                                                    \
46419 +assert("edward-214", ergo(!list_empty(pages) && pages->next != pages->prev,   \
46420 +       list_to_page(pages)->index < list_to_next_page(pages)->index))
46421 +#endif /* REISER4_DEBUG */
46422 +
46423 +/* plugin->u.item.s.file.readpages
46424 +   Populate an address space with some page clusters,
46425 +   and start reads against them. @vp is stored as the file of the cluster handle; pages left on @pages are released.
46426 +   FIXME-EDWARD: this function should return errors?
46427 +*/
46428 +void
46429 +readpages_ctail(void *vp, struct address_space *mapping,
46430 +               struct list_head *pages)
46431 +{
46432 +       int ret = 0;
46433 +       hint_t *hint;
46434 +       reiser4_cluster_t clust;
46435 +       struct page *page;
46436 +       struct pagevec lru_pvec;
46437 +       struct inode *inode = mapping->host;
46438 +       int progress = 0;
46439 +
46440 +       assert("edward-214", ergo(!list_empty(pages) &&
46441 +                                 pages->next != pages->prev,
46442 +                                 list_to_page(pages)->index <
46443 +                                 list_to_next_page(pages)->index));
46444 +       pagevec_init(&lru_pvec, 0);
46445 +       cluster_init_read(&clust, NULL);
46446 +       clust.file = vp;
46447 +       hint = kmalloc(sizeof(*hint), GFP_KERNEL);
46448 +       if (hint == NULL) {
46449 +               warning("vs-28", "failed to allocate hint");
46450 +               goto exit1;
46451 +       }
46452 +       clust.hint = hint;
46453 +       ret = load_file_hint(clust.file, hint);
46454 +       if (ret)
46455 +               goto exit2;
46456 +       ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46457 +       if (ret)
46458 +               goto exit3;
46459 +       assert("vs-26", hint->ext_coord.lh == &hint->lh);
46460 +
46461 +       /* address_space-level file readahead doesn't know about
46462 +          reiser4 concept of clustering, so we work around this
46463 +          fact: with each page of the list @pages address space
46464 +          will be populated with the whole page cluster.
46465 +       */
46466 +       while (!list_empty(pages)) {
46467 +               page = list_to_page(pages);
46468 +               list_del(&page->lru);
46469 +               if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
46470 +                       page_cache_release(page);
46471 +                       continue;
46472 +               }
46473 +               if (PageUptodate(page)) {
46474 +                       if (!pagevec_add(&lru_pvec, page))
46475 +                               __pagevec_lru_add(&lru_pvec);
46476 +                       unlock_page(page);
46477 +                       continue;
46478 +               }
46479 +               unlock_page(page);
46480 +
46481 +               move_cluster_forward(&clust, inode, page->index, &progress);
46482 +               ret = ctail_read_page_cluster(&clust, inode);
46483 +               if (ret)        /* NOTE(review): @page already left @pages and is not handed to lru_pvec here - confirm its reference is not leaked */
46484 +                       break;
46485 +               assert("edward-869", !tfm_cluster_is_uptodate(&clust.tc));
46486 +               lock_page(page);
46487 +
46488 +               ret = do_readpage_ctail(inode, &clust, page);
46489 +               if (!pagevec_add(&lru_pvec, page))
46490 +                       __pagevec_lru_add(&lru_pvec);
46491 +               if (ret) {
46492 +                       warning("edward-215", "do_readpage_ctail failed");
46493 +                       unlock_page(page);
46494 +                       break;
46495 +               }
46496 +               assert("edward-1061", PageUptodate(page));
46497 +
46498 +               unlock_page(page);
46499 +       }
46500 +       assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46501 + exit3:
46502 +       done_lh(&hint->lh);
46503 +       hint->ext_coord.valid = 0;      /* cleared before saving, as in readpage_ctail(); after the save it was a dead store */
46504 +       save_file_hint(clust.file, hint);
46505 + exit2:
46506 +       kfree(hint);
46507 + exit1:
46508 +       while (!list_empty(pages)) {
46509 +               /* release every page that was not processed */
46510 +               struct page *victim;
46511 +               victim = list_to_page(pages);
46512 +               list_del(&victim->lru);
46513 +               page_cache_release(victim);
46514 +       }
46515 +       put_cluster_handle(&clust);
46516 +       pagevec_lru_add(&lru_pvec);
46517 +       return;
46518 +}
46518 +
46519 +/*
46520 +   plugin->u.item.s.file.append_key
46521 +   key of the first item of the next disk cluster; the result is stored in @key, which is also returned
46522 +*/
46523 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46524 +{
46525 +       assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46526 +       assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46527 +
46528 +       item_key_by_coord(coord, key);
46529 +       set_key_offset(key,     /* offset = (cluster index + 1) << cluster shift */
46530 +                      ((__u64) (clust_by_coord(coord, NULL)) +
46531 +                       1) << cluster_shift_by_coord(coord));
46532 +       return key;
46533 +}
46534 +
46535 +static int      /* inserts a zero-filled ctail item of UCTAIL_NR_UNITS units at the hinted coord; returns the insert_by_coord() result */
46536 +insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46537 +{
46538 +       int result;
46539 +       char buf[UCTAIL_NR_UNITS];
46540 +       reiser4_item_data data;
46541 +       reiser4_key key;
46542 +       int shift = (int)UCTAIL_SHIFT;
46543 +
46544 +       memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46545 +       result = key_by_inode_cryptcompress(inode,      /* key at the file offset of cluster clust->index */
46546 +                                           clust_to_off(clust->index, inode),
46547 +                                           &key);
46548 +       if (result)
46549 +               return result;
46550 +       data.user = 0;
46551 +       data.iplug = item_plugin_by_id(CTAIL_ID);
46552 +       data.arg = &shift;      /* UCTAIL_SHIFT is passed as the item plugin argument */
46553 +       data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46554 +       data.data = buf;
46555 +
46556 +       result = insert_by_coord(&clust->hint->ext_coord.coord,
46557 +                                &data, &key, clust->hint->ext_coord.lh, 0);
46558 +       return result;
46559 +}
46560 +
46561 +static int      /* posts a COP_INSERT_FLOW carry operation for flow @f at @coord and runs carry(); returns its result or an error */
46562 +insert_crc_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46563 +               struct inode *inode)
46564 +{
46565 +       int result;
46566 +       carry_pool *pool;
46567 +       carry_level *lowest_level;
46568 +       reiser4_item_data *data;
46569 +       carry_op *op;
46570 +       int cluster_shift = inode_cluster_shift(inode);
46571 +
46572 +       pool =  /* one allocation: the pool, then 3 carry levels, then the item data */
46573 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46574 +                           sizeof(*data));
46575 +       if (IS_ERR(pool))
46576 +               return PTR_ERR(pool);
46577 +       lowest_level = (carry_level *) (pool + 1);
46578 +       init_carry_level(lowest_level, pool);
46579 +       data = (reiser4_item_data *) (lowest_level + 3);
46580 +
46581 +       assert("edward-466", coord->between == AFTER_ITEM
46582 +              || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46583 +              || coord->between == EMPTY_NODE
46584 +              || coord->between == BEFORE_UNIT);
46585 +
46586 +       if (coord->between == AFTER_UNIT) {
46587 +               /* AFTER_UNIT is turned into AFTER_ITEM before the operation is posted */
46588 +               coord->unit_pos = 0;
46589 +               coord->between = AFTER_ITEM;
46590 +       }
46591 +       op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46592 +                       0 /* operate directly on coord -> node */ );
46593 +       if (IS_ERR(op) || (op == NULL)) {
46594 +               done_carry_pool(pool);
46595 +               return RETERR(op ? PTR_ERR(op) : -EIO);
46596 +       }
46597 +       data->user = 0;
46598 +       data->iplug = item_plugin_by_id(CTAIL_ID);
46599 +       data->arg = &cluster_shift;
46600 +
46601 +       data->length = 0;
46602 +       data->data = NULL;
46603 +
46604 +       op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46605 +       op->u.insert_flow.insert_point = coord;
46606 +       op->u.insert_flow.flow = f;
46607 +       op->u.insert_flow.data = data;
46608 +       op->u.insert_flow.new_nodes = 0;
46609 +
46610 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
46611 +       lowest_level->tracked = lh;
46612 +
46613 +       result = carry(lowest_level, NULL);
46614 +       done_carry_pool(pool);
46615 +
46616 +       return result;
46617 +}
46617 +
46618 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion: inserts flow @f after the item at @coord, working on copies of @coord and @lh */
46619 +static int
46620 +insert_crc_flow_in_place(coord_t * coord, lock_handle * lh, flow_t * f,
46621 +                        struct inode *inode)
46622 +{
46623 +       int ret;
46624 +       coord_t pos;
46625 +       lock_handle lock;
46626 +
46627 +       assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46628 +       assert("edward-484", coord->between == AT_UNIT
46629 +              || coord->between == AFTER_ITEM);
46630 +       assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46631 +
46632 +       coord_dup(&pos, coord);
46633 +       pos.unit_pos = 0;
46634 +       pos.between = AFTER_ITEM;       /* insertion point: right after the current item */
46635 +
46636 +       init_lh(&lock);
46637 +       copy_lh(&lock, lh);
46638 +
46639 +       ret = insert_crc_flow(&pos, &lock, f, inode);
46640 +       done_lh(&lock); /* only the copy is released; @lh is asserted to stay write locked */
46641 +       assert("edward-1347", znode_is_write_locked(lh->node));
46642 +       assert("edward-1228", !ret);
46643 +       return ret;
46644 +}
46645 +
46646 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion: overwrites units of the item with flow data; always returns 0 */
46647 +static int overwrite_ctail(coord_t * coord, flow_t * f)
46648 +{
46649 +       unsigned count;
46650 +
46651 +       assert("edward-269", f->user == 0);
46652 +       assert("edward-270", f->data != NULL);
46653 +       assert("edward-271", f->length > 0);
46654 +       assert("edward-272", coord_is_existing_unit(coord));
46655 +       assert("edward-273", coord->unit_pos == 0);
46656 +       assert("edward-274", znode_is_write_locked(coord->node));
46657 +       assert("edward-275", schedulable());
46658 +       assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
46659 +       assert("edward-1243", ctail_ok(coord));
46660 +
46661 +       count = nr_units_ctail(coord);
46662 +
46663 +       if (count > f->length)  /* copy no more than the flow still holds */
46664 +               count = f->length;
46665 +       memcpy(first_unit(coord), f->data, count);
46666 +       move_flow_forward(f, count);
46667 +       coord->unit_pos += count;       /* leave the coord just past the overwritten units */
46668 +       return 0;
46669 +}
46670 +
46671 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
46672 +   cut ctail (part or whole) from coord->unit_pos up to the last unit; returns 0 if there is nothing to cut */
46673 +static int cut_ctail(coord_t * coord)
46674 +{
46675 +       coord_t stop;
46676 +
46677 +       assert("edward-435", coord->between == AT_UNIT &&
46678 +              coord->item_pos < coord_num_items(coord) &&
46679 +              coord->unit_pos <= coord_num_units(coord));
46680 +
46681 +       if (coord->unit_pos == coord_num_units(coord))
46682 +               /* nothing to cut */
46683 +               return 0;
46684 +       coord_dup(&stop, coord);
46685 +       stop.unit_pos = coord_last_unit_pos(coord);
46686 +
46687 +       return cut_node_content(coord, &stop, NULL, NULL, NULL);
46688 +}
46689 +
46690 +int     /* write-locks the place of a fake disk cluster and inserts an unprepped ctail item there; returns 0 or an error */
46691 +ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
46692 +{
46693 +       int result;
46694 +       assert("edward-1244", inode != NULL);
46695 +       assert("edward-1245", clust->hint != NULL);
46696 +       assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
46697 +       assert("edward-1247", clust->reserved == 1);
46698 +       assert("edward-1248", get_current_context()->grabbed_blocks ==
46699 +              estimate_insert_cluster(inode));
46700 +
46701 +       result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
46702 +       if (cbk_errored(result))
46703 +               return result;
46704 +       assert("edward-1249", result == CBK_COORD_NOTFOUND);    /* the cluster is expected not to exist yet */
46705 +       assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
46706 +
46707 +       assert("edward-1295",
46708 +              clust->hint->ext_coord.lh->node ==
46709 +              clust->hint->ext_coord.coord.node);
46710 +
46711 +       coord_set_between_clusters(&clust->hint->ext_coord.coord);
46712 +
46713 +       result = insert_unprepped_ctail(clust, inode);
46714 +       all_grabbed2free();     /* called whether or not the insertion succeeded */
46715 +
46716 +       assert("edward-1251", !result);
46717 +       assert("edward-1252", crc_inode_ok(inode));
46718 +       assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
46719 +       assert("edward-1254",
46720 +              reiser4_clustered_blocks(reiser4_get_current_sb()));
46721 +       assert("edward-1255",
46722 +              znode_convertible(clust->hint->ext_coord.coord.node));
46723 +
46724 +       return result;
46725 +}
46726 +
46727 +static int do_convert_ctail(flush_pos_t * pos, crc_write_mode_t mode)   /* runs one conversion step selected by @mode on the item at @pos */
46728 +{
46729 +       int result = 0;
46730 +       convert_item_info_t *info;
46731 +
46732 +       assert("edward-468", pos != NULL);
46733 +       assert("edward-469", pos->sq != NULL);
46734 +       assert("edward-845", item_convert_data(pos) != NULL);
46735 +
46736 +       info = item_convert_data(pos);
46737 +       assert("edward-679", info->flow.data != NULL);
46738 +
46739 +       switch (mode) {
46740 +       case CRC_APPEND_ITEM:
46741 +               assert("edward-1229", info->flow.length != 0);
46742 +               assert("edward-1256",
46743 +                      cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
46744 +               result =
46745 +                   insert_crc_flow_in_place(&pos->coord, &pos->lock,
46746 +                                            &info->flow, info->inode);
46747 +               break;
46748 +       case CRC_OVERWRITE_ITEM:
46749 +               assert("edward-1230", info->flow.length != 0);
46750 +               overwrite_ctail(&pos->coord, &info->flow);      /* return value ignored: it is always 0 */
46751 +               if (info->flow.length != 0)
46752 +                       break;
46753 +       case CRC_CUT_ITEM:      /* also reached by fall-through once the overwrite has used up the flow */
46754 +               assert("edward-1231", info->flow.length == 0);
46755 +               result = cut_ctail(&pos->coord);
46756 +               break;
46757 +       default:
46758 +               result = RETERR(-EIO);
46759 +               impossible("edward-244", "bad convert mode");
46760 +       }
46761 +       return result;
46762 +}
46763 +
46764 +/* plugin->u.item.f.scan: on a left scan makes the parent node dirty and convertible; returns 0, or -EAGAIN if the scanned jnode is not dirty */
46765 +int scan_ctail(flush_scan * scan)
46766 +{
46767 +       int result = 0;
46768 +       struct page *page;
46769 +       struct inode *inode;
46770 +       jnode *node = scan->node;
46771 +
46772 +       assert("edward-227", scan->node != NULL);
46773 +       assert("edward-228", jnode_is_cluster_page(scan->node));
46774 +       assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
46775 +
46776 +       page = jnode_page(node);
46777 +       inode = page->mapping->host;    /* @page and @inode are not used below */
46778 +
46779 +       if (!scanning_left(scan))
46780 +               return result;
46781 +       if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
46782 +               znode_make_dirty(scan->parent_lock.node);
46783 +
46784 +       if (!znode_convertible(scan->parent_lock.node)) {
46785 +               if (JF_ISSET(scan->node, JNODE_DIRTY))
46786 +                       znode_set_convertible(scan->parent_lock.node);
46787 +               else {
46788 +                       warning("edward-681",
46789 +                               "cluster page is already processed");
46790 +                       return -EAGAIN;
46791 +               }
46792 +       }
46793 +       return result;
46794 +}
46795 +
46796 +/* Returns non-zero, leaving the child jnode in pos->child, if convert data should be attached */
46797 +static int should_attach_convert_idata(flush_pos_t * pos)
46798 +{
46799 +       int result;
46800 +       assert("edward-431", pos != NULL);
46801 +       assert("edward-432", pos->child == NULL);
46802 +       assert("edward-619", znode_is_write_locked(pos->coord.node));
46803 +       assert("edward-470",
46804 +              item_plugin_by_coord(&pos->coord) ==
46805 +              item_plugin_by_id(CTAIL_ID));
46806 +
46807 +       /* check for leftmost child */
46808 +       utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
46809 +       /* no child jnode was found for this item: nothing to attach */
46810 +       if (!pos->child)
46811 +               return 0;
46812 +       spin_lock_jnode(pos->child);
46813 +       result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
46814 +                 pos->child->atom == ZJNODE(pos->coord.node)->atom);
46815 +       spin_unlock_jnode(pos->child);
46816 +       if (!result && pos->child) {
46817 +               /* child is clean or in another atom: drop it, nothing to attach */
46818 +               jput(pos->child);
46819 +               pos->child = NULL;
46820 +       }
46821 +       return result;
46822 +}
46823 +
46824 +/* plugin->init_convert_data(): bind @idata to @inode and reset its disk cluster state; always returns 0 */
46825 +static int
46826 +init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
46827 +{
46828 +       assert("edward-813", idata != NULL);
46829 +       assert("edward-814", inode != NULL);
46830 +       /* conversion starts at the first item; the state of the next item is not known yet */
46831 +       idata->inode = inode;
46832 +       idata->d_cur = DC_FIRST_ITEM;
46833 +       idata->d_next = DC_INVALID_STATE;
46834 +
46835 +       return 0;
46836 +}
46837 +/* allocate sq->itm; the memory is not initialized here. Returns 0 or RETERR(-ENOMEM) */
46838 +static int alloc_item_convert_data(convert_info_t * sq)
46839 +{
46840 +       assert("edward-816", sq != NULL);
46841 +       assert("edward-817", sq->itm == NULL);
46842 +
46843 +       sq->itm = kmalloc(sizeof(*sq->itm), GFP_KERNEL);
46844 +       if (sq->itm == NULL)
46845 +               return RETERR(-ENOMEM);
46846 +       return 0;
46847 +}
46848 +/* free sq->itm and reset the pointer to NULL; sq->itm and sq->iplug must be set */
46849 +static void free_item_convert_data(convert_info_t * sq)
46850 +{
46851 +       assert("edward-818", sq != NULL);
46852 +       assert("edward-819", sq->itm != NULL);
46853 +       assert("edward-820", sq->iplug != NULL);
46854 +
46855 +       kfree(sq->itm);
46856 +       sq->itm = NULL;
46857 +       return;
46858 +}
46859 +/* allocate zeroed convert info for @pos and init its cluster handle. Returns 0 or RETERR(-ENOMEM) */
46860 +static int alloc_convert_data(flush_pos_t * pos)
46861 +{
46862 +       assert("edward-821", pos != NULL);
46863 +       assert("edward-822", pos->sq == NULL);
46864 +
46865 +       pos->sq = kmalloc(sizeof(*pos->sq), GFP_KERNEL);
46866 +       if (!pos->sq)
46867 +               return RETERR(-ENOMEM);
46868 +       memset(pos->sq, 0, sizeof(*pos->sq));
46869 +       cluster_init_write(&pos->sq->clust, 0);
46870 +       return 0;
46871 +}
46872 +/* release what hangs off pos->sq (item data if any, cluster handle), free pos->sq and set it to NULL */
46873 +void free_convert_data(flush_pos_t * pos)
46874 +{
46875 +       convert_info_t *sq;
46876 +
46877 +       assert("edward-823", pos != NULL);
46878 +       assert("edward-824", pos->sq != NULL);
46879 +
46880 +       sq = pos->sq;
46881 +       if (sq->itm)
46882 +               free_item_convert_data(sq);
46883 +       put_cluster_handle(&sq->clust);
46884 +       kfree(pos->sq);
46885 +       pos->sq = NULL;
46886 +       return;
46887 +}
46888 +/* zero the already allocated item convert data of @pos and initialize it for @inode */
46889 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
46890 +{
46891 +       convert_info_t *sq;
46892 +
46893 +       assert("edward-825", pos != NULL);
46894 +       assert("edward-826", pos->sq != NULL);
46895 +       assert("edward-827", item_convert_data(pos) != NULL);
46896 +       assert("edward-828", inode != NULL);
46897 +
46898 +       sq = pos->sq;
46899 +
46900 +       memset(sq->itm, 0, sizeof(*sq->itm));
46901 +
46902 +       /* iplug->init_convert_data() */
46903 +       return init_convert_data_ctail(sq->itm, inode);
46904 +}
46905 +
46906 +/* create and attach disk cluster info used by the 'convert' phase of flush
46907 +   squalloc(); pos->child is put on both the success and the error path */
46908 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
46909 +{
46910 +       int ret = 0;
46911 +       convert_item_info_t *info;
46912 +       reiser4_cluster_t *clust;
46913 +       file_plugin *fplug = inode_file_plugin(inode);
46914 +       compression_plugin *cplug = inode_compression_plugin(inode);
46915 +
46916 +       assert("edward-248", pos != NULL);
46917 +       assert("edward-249", pos->child != NULL);
46918 +       assert("edward-251", inode != NULL);
46919 +       assert("edward-682", crc_inode_ok(inode));
46920 +       assert("edward-252", fplug == file_plugin_by_id(CRC_FILE_PLUGIN_ID));
46921 +       assert("edward-473",
46922 +              item_plugin_by_coord(&pos->coord) ==
46923 +              item_plugin_by_id(CTAIL_ID));
46924 +       /* allocate the convert info only if @pos does not carry one yet */
46925 +       if (!pos->sq) {
46926 +               ret = alloc_convert_data(pos);
46927 +               if (ret)
46928 +                       return ret;
46929 +       }
46930 +       clust = &pos->sq->clust;
46931 +       ret = grab_coa(&clust->tc, cplug);
46932 +       if (ret)
46933 +               goto err;
46934 +       ret = set_cluster_by_page(clust,
46935 +                                 jnode_page(pos->child),
46936 +                                 MAX_CLUSTER_NRPAGES);
46937 +       if (ret)
46938 +               goto err;
46939 +
46940 +       assert("edward-829", pos->sq != NULL);
46941 +       assert("edward-250", item_convert_data(pos) == NULL);
46942 +
46943 +       pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
46944 +
46945 +       ret = alloc_item_convert_data(pos->sq);
46946 +       if (ret)
46947 +               goto err;
46948 +       ret = init_item_convert_data(pos, inode);
46949 +       if (ret)
46950 +               goto err;
46951 +       info = item_convert_data(pos);
46952 +
46953 +       ret = flush_cluster_pages(clust, pos->child, inode);
46954 +       if (ret)
46955 +               goto err;
46956 +       /* NOTE(review): no result of deflate_cluster() or ->flow_by_inode() is examined below */
46957 +       deflate_cluster(clust, inode);
46958 +       inc_item_convert_count(pos);
46959 +
46960 +       /* make flow by transformed stream */
46961 +       fplug->flow_by_inode(info->inode,
46962 +                            (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
46963 +                            0 /* kernel space */ ,
46964 +                            clust->tc.len,
46965 +                            clust_to_off(clust->index, inode),
46966 +                            WRITE_OP, &info->flow);
46967 +       jput(pos->child);
46968 +
46969 +       assert("edward-683", crc_inode_ok(inode));
46970 +       return 0;
46971 +      err:     /* drops the child and all convert data, including a pos->sq that existed on entry */
46972 +       jput(pos->child);
46973 +       free_convert_data(pos);
46974 +       return ret;
46975 +}
46976 +
46977 +/* clear up disk cluster info: free the item convert data of @sq, whose flow is expected to be used up */
46978 +static void detach_convert_idata(convert_info_t * sq)
46979 +{
46980 +       convert_item_info_t *info;
46981 +
46982 +       assert("edward-253", sq != NULL);
46983 +       assert("edward-840", sq->itm != NULL);
46984 +
46985 +       info = sq->itm;
46986 +       assert("edward-255", info->inode != NULL);
46987 +       assert("edward-1212", info->flow.length == 0);
46988 +       /* only sq->itm is released; @sq itself and its cluster handle are left alone */
46989 +       free_item_convert_data(sq);
46990 +       return;
46991 +}
46992 +
46993 +/* plugin->u.item.f.utmost_child */
46994 +
46995 +/* This function sets @child to the leftmost child (the jnode returned by
46996 +   jlookup(), NULL if there is none) of a first cluster item, and to NULL
46997 +   in other cases. Always returns 0. NOTE-EDWARD: Do not call this for RIGHT_SIDE */
46998 +
46999 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
47000 +{
47001 +       reiser4_key key;
47002 +
47003 +       assert("edward-257", coord != NULL);
47004 +       assert("edward-258", child != NULL);
47005 +       assert("edward-259", side == LEFT_SIDE);
47006 +       assert("edward-260",
47007 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
47008 +
47009 +       /* read the key once, and only after @coord has been validated */
47010 +       item_key_by_coord(coord, &key);
47011 +       if (!is_disk_cluster_key(&key, coord))
47012 +               *child = NULL;
47013 +       else
47014 +               *child = jlookup(current_tree,
47015 +                                get_key_objectid(&key),
47016 +                                off_to_pg(get_key_offset(&key)));
47017 +
47018 +       return 0;
47019 +}
47020 +
47021 +/* Returns true if @p2 is the item that follows @p1 in the _same_ disk
47022 +   cluster; the check is delegated to mergeable_ctail().
47023 +   A disk cluster is a set of items. If ->clustered() != NULL, the whole
47024 +   disk cluster should be read/modified together with each of its items.
47025 +*/
47026 +static int clustered_ctail(const coord_t * p1, const coord_t * p2)
47027 +{
47028 +       return mergeable_ctail(p1, p2);
47029 +}
47030 +
47031 +/* Set d_next for the current item. It defaults to DC_AFTER_CLUSTER and
47032 +   becomes DC_CHAINED_ITEM only when the current item is the last one in its
47033 +   node and the first item of a right neighbor continues the same disk
47034 +   cluster. Empty right neighbors are skipped. Note that right neighbors
47035 +   may be not in the slum because of races; if so, they are made dirty and
47036 +   convertible. A missing right neighbor is not an error (0 is returned).
47037 +*/
47038 +static int next_item_dc_stat(flush_pos_t * pos)
47039 +{
47040 +       int ret = 0;
47041 +       int stop = 0;
47042 +       znode *cur;
47043 +       coord_t coord;
47044 +       lock_handle lh;
47045 +       lock_handle right_lock;
47046 +
47047 +       assert("edward-1232", !node_is_empty(pos->coord.node));
47048 +       assert("edward-1014",
47049 +              pos->coord.item_pos < coord_num_items(&pos->coord));
47050 +       assert("edward-1015", chaining_data_present(pos));
47051 +       assert("edward-1017",
47052 +              item_convert_data(pos)->d_next == DC_INVALID_STATE);
47053 +
47054 +       item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
47055 +       /* right neighbors are looked at only if we are not past the cluster and on the last item of the node */
47056 +       if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
47057 +               return ret;
47058 +       if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
47059 +               return ret;
47060 +
47061 +       /* check next slum item */
47062 +       init_lh(&right_lock);
47063 +       cur = pos->coord.node;
47064 +
47065 +       while (!stop) {
47066 +               init_lh(&lh);
47067 +               ret = reiser4_get_right_neighbor(&lh,
47068 +                                                cur,
47069 +                                                ZNODE_WRITE_LOCK,
47070 +                                                GN_CAN_USE_UPPER_LEVELS);
47071 +               if (ret)
47072 +                       break;
47073 +               ret = zload(lh.node);
47074 +               if (ret) {
47075 +                       done_lh(&lh);
47076 +                       break;
47077 +               }
47078 +               coord_init_before_first_item(&coord, lh.node);
47079 +
47080 +               if (node_is_empty(lh.node)) {
47081 +                       znode_make_dirty(lh.node);
47082 +                       znode_set_convertible(lh.node);
47083 +                       stop = 0;       /* keep going: look at the next neighbor */
47084 +               } else if (clustered_ctail(&pos->coord, &coord)) {
47085 +
47086 +                       item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
47087 +
47088 +                       if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
47089 +                               /*
47090 +                                  warning("edward-1024",
47091 +                                  "next slum item mergeable, "
47092 +                                  "but znode %p isn't dirty\n",
47093 +                                  lh.node);
47094 +                                */
47095 +                               znode_make_dirty(lh.node);
47096 +                       }
47097 +                       if (!znode_convertible(lh.node)) {
47098 +                               /*
47099 +                                  warning("edward-1272",
47100 +                                  "next slum item mergeable, "
47101 +                                  "but znode %p isn't convertible\n",
47102 +                                  lh.node);
47103 +                                */
47104 +                               znode_set_convertible(lh.node);
47105 +                       }
47106 +                       stop = 1;
47107 +               } else
47108 +                       stop = 1;
47109 +               zrelse(lh.node);
47110 +               done_lh(&right_lock);
47111 +               copy_lh(&right_lock, &lh);
47112 +               done_lh(&lh);
47113 +               cur = right_lock.node;
47114 +       }
47115 +       done_lh(&right_lock);
47116 +
47117 +       if (ret == -E_NO_NEIGHBOR)      /* having no right neighbor is not an error */
47118 +               ret = 0;
47119 +       return ret;
47120 +}
47121 +
47122 +/* Choose the write mode for the current item from its disk cluster state and
47123 +   from whether the flow still holds data. Returns 1 when there is nothing
47124 +   left to do (no data and the position is after the cluster), 0 otherwise. */
47125 +static int
47126 +assign_convert_mode(convert_item_info_t * idata, crc_write_mode_t * mode)
47127 +{
47128 +       int cluster_is_over = 0;
47129 +       int has_flow;
47130 +
47131 +       assert("edward-1025", idata != NULL);
47132 +
47133 +       has_flow = (idata->flow.length != 0);
47134 +
47135 +       switch (idata->d_cur) {
47136 +       case DC_FIRST_ITEM:
47137 +       case DC_CHAINED_ITEM:
47138 +               /* inside the cluster: overwrite while data remains, cut
47139 +                  otherwise */
47140 +               *mode = has_flow ? CRC_OVERWRITE_ITEM : CRC_CUT_ITEM;
47141 +               break;
47142 +       case DC_AFTER_CLUSTER:
47143 +               /* past the cluster: append the remaining data, if any */
47144 +               if (has_flow)
47145 +                       *mode = CRC_APPEND_ITEM;
47146 +               else
47147 +                       cluster_is_over = 1;
47148 +               break;
47149 +       default:
47150 +               if (has_flow) {
47151 +                       impossible("edward-1018", "wrong current item state");
47152 +               } else {
47153 +                       impossible("edward-1019", "wrong current item state");
47154 +               }
47155 +       }
47156 +       return cluster_is_over;
47157 +}
47158 +
47159 +/* plugin->u.item.f.convert */
47160 +/* write ctail in guessed mode; a freshly attached disk cluster is written with CRC_OVERWRITE_ITEM */
47161 +int convert_ctail(flush_pos_t * pos)
47162 +{
47163 +       int result;
47164 +       int nr_items;
47165 +       crc_write_mode_t mode = CRC_OVERWRITE_ITEM;
47166 +
47167 +       assert("edward-1020", pos != NULL);
47168 +       assert("edward-1213", coord_num_items(&pos->coord) != 0);
47169 +       assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
47170 +       assert("edward-1258", ctail_ok(&pos->coord));
47171 +       assert("edward-261", pos->coord.node != NULL);
47172 +
47173 +       nr_items = coord_num_items(&pos->coord);
47174 +       if (!chaining_data_present(pos)) {
47175 +               if (should_attach_convert_idata(pos)) {
47176 +                       /* attach convert item info */
47177 +                       struct inode *inode;
47178 +
47179 +                       assert("edward-264", pos->child != NULL);
47180 +                       assert("edward-265", jnode_page(pos->child) != NULL);
47181 +                       assert("edward-266",
47182 +                              jnode_page(pos->child)->mapping != NULL);
47183 +
47184 +                       inode = jnode_page(pos->child)->mapping->host;
47185 +
47186 +                       assert("edward-267", inode != NULL);
47187 +
47188 +                       /* attach item convert info by child; the child is put by the callee */
47189 +                       result = attach_convert_idata(pos, inode);
47190 +                       pos->child = NULL;
47191 +                       if (result == -E_REPEAT) {
47192 +                               /* jnode became clean, or there is no dirty
47193 +                                  pages (nothing to update in disk cluster) */
47194 +                               warning("edward-1021",
47195 +                                       "convert_ctail: nothing to attach");
47196 +                               return 0;
47197 +                       }
47198 +                       if (result != 0)
47199 +                               return result;
47200 +               } else
47201 +                       /* unconvertible */
47202 +                       return 0;
47203 +       } else {
47204 +               /* use old convert info */
47205 +
47206 +               convert_item_info_t *idata;
47207 +
47208 +               idata = item_convert_data(pos);
47209 +
47210 +               result = assign_convert_mode(idata, &mode);
47211 +               if (result) {
47212 +                       /* disk cluster is over,
47213 +                          nothing to update anymore */
47214 +                       detach_convert_idata(pos->sq);
47215 +                       return 0;
47216 +               }
47217 +       }
47218 +
47219 +       assert("edward-433", chaining_data_present(pos));
47220 +       assert("edward-1022",
47221 +              pos->coord.item_pos < coord_num_items(&pos->coord));
47222 +       /* NOTE(review): the error paths below detach while the flow may still hold data ("edward-1212") */
47223 +       result = next_item_dc_stat(pos);
47224 +       if (result) {
47225 +               detach_convert_idata(pos->sq);
47226 +               return result;
47227 +       }
47228 +       result = do_convert_ctail(pos, mode);
47229 +       if (result) {
47230 +               detach_convert_idata(pos->sq);
47231 +               return result;
47232 +       }
47233 +       switch (mode) {
47234 +       case CRC_CUT_ITEM:
47235 +               assert("edward-1214", item_convert_data(pos)->flow.length == 0);
47236 +               assert("edward-1215",
47237 +                      coord_num_items(&pos->coord) == nr_items ||
47238 +                      coord_num_items(&pos->coord) == nr_items - 1);
47239 +               if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
47240 +                       break;
47241 +               if (coord_num_items(&pos->coord) != nr_items) {
47242 +                       /* the item was killed, no more chained items */
47243 +                       detach_convert_idata(pos->sq);
47244 +                       if (!node_is_empty(pos->coord.node))
47245 +                               /* make sure the next item will be scanned */
47246 +                               coord_init_before_item(&pos->coord);
47247 +                       break;
47248 +               }       /* item survived and nothing is chained: fall through to detach */
47249 +       case CRC_APPEND_ITEM:
47250 +               assert("edward-434", item_convert_data(pos)->flow.length == 0);
47251 +               detach_convert_idata(pos->sq);
47252 +               break;
47253 +       case CRC_OVERWRITE_ITEM:
47254 +               if (coord_is_unprepped_ctail(&pos->coord)) {
47255 +                       /* convert unprepped ctail to prepped one */
47256 +                       int shift;
47257 +                       shift =
47258 +                           inode_cluster_shift(item_convert_data(pos)->inode);
47259 +                       assert("edward-1259", cluster_shift_ok(shift));
47260 +                       put_unaligned((d8)shift,
47261 +                               &ctail_formatted_at(&pos->coord)->
47262 +                               cluster_shift);
47263 +               }
47264 +               break;
47265 +       }
47266 +       return result;
47267 +}
47268 +
47269 +/* Make Linus happy.
47270 +   Local variables:
47271 +   c-indentation-style: "K&R"
47272 +   mode-name: "LC"
47273 +   c-basic-offset: 8
47274 +   tab-width: 8
47275 +   fill-column: 120
47276 +   End:
47277 +*/
47278 diff --git a/fs/reiser4/plugin/item/ctail.h b/fs/reiser4/plugin/item/ctail.h
47279 new file mode 100644
47280 index 0000000..906fe31
47281 --- /dev/null
47282 +++ b/fs/reiser4/plugin/item/ctail.h
47283 @@ -0,0 +1,89 @@
47284 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47285 +
47286 +#if !defined( __FS_REISER4_CTAIL_H__ )
47287 +#define __FS_REISER4_CTAIL_H__
47288 +
47289 +/* cryptcompress object item. See ctail.c for description. */
47290 +/* NOTE(review): UCTAIL_* presumably describe an unprepped ("U") ctail; confirm against ctail.c */
47291 +#define UCTAIL_NR_UNITS 1
47292 +#define UCTAIL_SHIFT 0xff
47293 +
47294 +typedef struct ctail_item_format {
47295 +       /* cluster shift; convert_ctail() stores the inode's cluster shift here */
47296 +       d8 cluster_shift;
47297 +       /* ctail body: variable-length data following cluster_shift */
47298 +       d8 body[0];
47299 +} __attribute__ ((packed)) ctail_item_format;
47300 +
47301 +/* The following is a set of various item states in a disk cluster.
47302 +   Disk cluster is a set of items whose keys belong to the interval
47303 +   [dc_key , dc_key + disk_cluster_size - 1] */
47304 +typedef enum {
47305 +       DC_INVALID_STATE = 0,   /* state is not determined yet */
47306 +       DC_FIRST_ITEM = 1,      /* first item of a disk cluster */
47307 +       DC_CHAINED_ITEM = 2,    /* item that continues the same disk cluster */
47308 +       DC_AFTER_CLUSTER = 3    /* position past the items of the disk cluster */
47309 +} dc_item_stat;
47310 +
47311 +typedef struct {
47312 +       int shift;              /* cpu-order copy of the cluster_shift field
47313 +                                  of ctail_item_format (see above) */
47314 +} ctail_coord_extension_t;
47315 +
47316 +struct cut_list;
47317 +
47318 +/* plugin->u.item.b.* */
47319 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47320 +                         const reiser4_item_data *);
47321 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47322 +pos_in_node_t nr_units_ctail(const coord_t * coord);
47323 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47324 +void print_ctail(const char *prefix, coord_t * coord);
47325 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47326 +
47327 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
47328 +               carry_plugin_info * info UNUSED_ARG);
47329 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47330 +int can_shift_ctail(unsigned free_space, coord_t * coord,
47331 +                   znode * target, shift_direction pend, unsigned *size,
47332 +                   unsigned want);
47333 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47334 +                     unsigned count, shift_direction where_is_free_space,
47335 +                     unsigned free_space);
47336 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47337 +                   carry_cut_data *, reiser4_key * smallest_removed,
47338 +                   reiser4_key * new_first);
47339 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47340 +                    carry_kill_data *, reiser4_key * smallest_removed,
47341 +                    reiser4_key * new_first);
47342 +int ctail_ok(const coord_t * coord);
47343 +int check_ctail(const coord_t * coord, const char **error);
47344 +
47345 +/* plugin->u.item.s.* */
47346 +int read_ctail(struct file *, flow_t *, hint_t *);
47347 +int readpage_ctail(void *, struct page *);
47348 +void readpages_ctail(void *, struct address_space *, struct list_head *);
47349 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47350 +int create_hook_ctail(const coord_t * coord, void *arg);
47351 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47352 +                   carry_kill_data *);
47353 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47354 +
47355 +/* plugin->u.item.f.* (utmost_child, scan, convert), followed by two ctail helpers */
47356 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
47357 +int scan_ctail(flush_scan *);
47358 +int convert_ctail(flush_pos_t *);
47359 +size_t inode_scaled_cluster_size(struct inode *);
47360 +int cluster_shift_by_coord(const coord_t * coord);
47361 +
47362 +#endif                         /* __FS_REISER4_CTAIL_H__ */
47363 +
47364 +/* Make Linus happy.
47365 +   Local variables:
47366 +   c-indentation-style: "K&R"
47367 +   mode-name: "LC"
47368 +   c-basic-offset: 8
47369 +   tab-width: 8
47370 +   fill-column: 120
47371 +   End:
47372 +*/
47373 diff --git a/fs/reiser4/plugin/item/extent.c b/fs/reiser4/plugin/item/extent.c
47374 new file mode 100644
47375 index 0000000..bb9af9b
47376 --- /dev/null
47377 +++ b/fs/reiser4/plugin/item/extent.c
47378 @@ -0,0 +1,197 @@
47379 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47380 +
47381 +#include "item.h"
47382 +#include "../../key.h"
47383 +#include "../../super.h"
47384 +#include "../../carry.h"
47385 +#include "../../inode.h"
47386 +#include "../../page_cache.h"
47387 +#include "../../flush.h"
47388 +#include "../object.h"
47389 +
47390 +/* fill @data so that it describes @nr_extents extent units at @ext_unit to be put into the tree; returns @data */
47391 +/* Audited by: green(2002.06.13) */
47392 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47393 +                                  int nr_extents)
47394 +{
47395 +       data->data = ext_unit;
47396 +       /* data->data is kernel space */
47397 +       data->user = 0;
47398 +       data->length = sizeof(reiser4_extent) * nr_extents;
47399 +       data->arg = NULL;
47400 +       data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47401 +       return data;
47402 +}
47403 +
47404 +/* number of bytes addressed by the first @nr extent units of the item at @coord */
47405 +reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr)
47406 +{
47407 +       reiser4_extent *unit = item_body_by_coord(coord);
47408 +       reiser4_extent *const end = unit + nr;
47409 +       reiser4_block_nr nr_blocks = 0;
47410 +
47411 +       assert("vs-263", nr <= nr_units_extent(coord));
47412 +
47413 +       /* accumulate the widths (in blocks) of units 0 .. @nr - 1 */
47414 +       while (unit != end) {
47415 +               nr_blocks += extent_get_width(unit);
47416 +               unit++;
47417 +       }
47418 +
47419 +       return nr_blocks * current_blocksize;
47420 +}
47421 +/* classify @ext by its start: 0 is a hole, 1 is unallocated, anything else is an allocated extent */
47422 +extent_state state_of_extent(reiser4_extent * ext)
47423 +{
47424 +       /* compare the start at full width: an (int) cast would make block
47425 +          numbers whose low 32 bits are 0 or 1 look like hole/unallocated */
47426 +       reiser4_block_nr start = extent_get_start(ext);
47427 +
47428 +       if (start == 0)
47429 +               return HOLE_EXTENT;
47430 +       if (start == 1)
47431 +               return UNALLOCATED_EXTENT;
47432 +       return ALLOCATED_EXTENT;
47433 +}
47434 +/* true if the extent that extent_by_coord() finds at @item is in the UNALLOCATED_EXTENT state */
47435 +int extent_is_unallocated(const coord_t * item)
47436 +{
47437 +       assert("jmacd-5133", item_is_extent(item));
47438 +
47439 +       return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47440 +}
47441 +
47442 +/* store @start and @width into the extent @ext through the extent_set_*() accessors */
47443 +void
47444 +set_extent(reiser4_extent * ext, reiser4_block_nr start, reiser4_block_nr width)
47445 +{
47446 +       extent_set_start(ext, start);
47447 +       extent_set_width(ext, width);
47448 +}
47449 +
47450 +
47451 +/**
47452 + * replace_extent - replace extent and paste 1 or 2 after it
47453 + * @h: replace handle: ->coord and ->lh locate the extent to overwrite,
47454 + *     ->overwrite is its new value, ->new_extents[0..->nr_new_extents - 1] are
47455 + *     the units to paste after it at key ->paste_key, ->flags is passed to
47456 + *     insert_into_item(); ->item, ->coord_after, ->lh_after, ->watch and
47457 + *     ->tmp are filled in by this function
47458 + * @return_inserted_position: selects what @h->coord and @h->lh refer to on
47459 + *     successful return, see below
47460 + *
47461 + * Overwrites one extent, pastes 1 or 2 more ones after overwritten one.  If
47462 + * @return_inserted_position is 1 - @h->coord and @h->lh are returned set to
47463 + * first of newly inserted units, if it is 0 - they are returned set to extent
47464 + * which was overwritten.  Returns 0 or an insert_into_item()/zload() error.
47465 + */
47466 +int replace_extent(struct replace_handle *h, int return_inserted_position)
47467 +{
47468 +       int result;
47469 +       znode *orig_znode;
47470 +       /*ON_DEBUG(reiser4_extent orig_ext);*/  /* this is for debugging */
47471 +
47472 +       assert("vs-990", coord_is_existing_unit(h->coord));
47473 +       assert("vs-1375", znode_is_write_locked(h->coord->node));
47474 +       assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47475 +       assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47476 +       assert("vs-1427", ergo(h->nr_new_extents == 2,
47477 +                              extent_get_width(&h->new_extents[1]) != 0));
47478 +       /* NOTE(review): the assertion label "vs-1427" is used twice above */
47479 +       /* compose structure for paste */
47480 +       init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47481 +       /* let a tap track the unit to be overwritten while the insertion below runs */
47482 +       coord_dup(&h->coord_after, h->coord);
47483 +       init_lh(&h->lh_after);
47484 +       copy_lh(&h->lh_after, h->lh);
47485 +       tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47486 +       tap_monitor(&h->watch);
47487 +
47488 +       ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47489 +       orig_znode = h->coord->node;
47490 +
47491 +#if REISER4_DEBUG
47492 +       /* make sure that key is set properly */
47493 +       unit_key_by_coord(h->coord, &h->tmp);
47494 +       set_key_offset(&h->tmp,
47495 +                      get_key_offset(&h->tmp) +
47496 +                      extent_get_width(&h->overwrite) * current_blocksize);
47497 +       assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47498 +#endif
47499 +
47500 +       /* set insert point after unit to be replaced */
47501 +       h->coord->between = AFTER_UNIT;
47502 +
47503 +       result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47504 +                                 &h->paste_key, &h->item, h->flags);
47505 +       if (!result) {
47506 +               /* now we have to replace the unit after which new units were
47507 +                  inserted. Its position is tracked by @watch */
47508 +               reiser4_extent *ext;
47509 +               znode *node;
47510 +
47511 +               node = h->coord_after.node;
47512 +               if (node != orig_znode) {       /* the tracked unit is in another node now */
47513 +                       coord_clear_iplug(&h->coord_after);
47514 +                       result = zload(node);
47515 +               }
47516 +
47517 +               if (likely(!result)) {
47518 +                       ext = extent_by_coord(&h->coord_after);
47519 +
47520 +                       assert("vs-987", znode_is_loaded(node));
47521 +                       assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47522 +
47523 +                       /* overwrite extent unit */
47524 +                       memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47525 +                       znode_make_dirty(node);
47526 +
47527 +                       if (node != orig_znode)
47528 +                               zrelse(node);
47529 +
47530 +                       if (return_inserted_position == 0) {
47531 +                               /* coord and lh are to be set to overwritten
47532 +                                  extent */
47533 +                               assert("vs-1662",
47534 +                                      WITH_DATA(node, !memcmp(&h->overwrite,
47535 +                                                              extent_by_coord(
47536 +                                                                      &h->coord_after),
47537 +                                                              sizeof(reiser4_extent))));
47538 +
47539 +                               *h->coord = h->coord_after;
47540 +                               done_lh(h->lh);
47541 +                               copy_lh(h->lh, &h->lh_after);
47542 +                       } else {
47543 +                               /* h->coord and h->lh are to be set to first of
47544 +                                  inserted units */
47545 +                               assert("vs-1663",
47546 +                                      WITH_DATA(h->coord->node,
47547 +                                                !memcmp(&h->new_extents[0],
47548 +                                                        extent_by_coord(h->coord),
47549 +                                                        sizeof(reiser4_extent))));
47550 +                               assert("vs-1664", h->lh->node == h->coord->node);
47551 +                       }
47552 +               }
47553 +       }
47554 +       tap_done(&h->watch);
47555 +
47556 +       return result;
47557 +}
47558 +
47559 +lock_handle *znode_lh(znode *node)
47560 +{      /* return the lock handle that is first on @node's list of lock owners */
47561 +       assert("vs-1371", znode_is_write_locked(node));
47562 +       assert("vs-1372", znode_is_wlocked_once(node)); /* presumably: exactly one owner, so "first" is unambiguous */
47563 +       return list_entry(node->lock.owners.next, lock_handle, owners_link);
47564 +}
47565 +
47566 +/*
47567 + * Local variables:
47568 + * c-indentation-style: "K&R"
47569 + * mode-name: "LC"
47570 + * c-basic-offset: 8
47571 + * tab-width: 8
47572 + * fill-column: 79
47573 + * scroll-step: 1
47574 + * End:
47575 + */
47576 diff --git a/fs/reiser4/plugin/item/extent.h b/fs/reiser4/plugin/item/extent.h
47577 new file mode 100644
47578 index 0000000..3a60bcd
47579 --- /dev/null
47580 +++ b/fs/reiser4/plugin/item/extent.h
47581 @@ -0,0 +1,228 @@
47582 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47583 +
47584 +#ifndef __REISER4_EXTENT_H__
47585 +#define __REISER4_EXTENT_H__
47586 +
47587 +/* on disk extent: both fields are 64-bit little-endian, see extent_{get,set}_* */
47588 +typedef struct {
47589 +       reiser4_dblock_nr start;
47590 +       reiser4_dblock_nr width;
47591 +} reiser4_extent;
47592 +
47593 +typedef struct extent_stat {
47594 +       int unallocated_units;
47595 +       int unallocated_blocks;
47596 +       int allocated_units;
47597 +       int allocated_blocks;
47598 +       int hole_units;
47599 +       int hole_blocks;
47600 +} extent_stat;
47601 +
47602 +/* extents in an extent item can be either holes, or unallocated or allocated
47603 +   extents */
47604 +typedef enum {
47605 +       HOLE_EXTENT,
47606 +       UNALLOCATED_EXTENT,
47607 +       ALLOCATED_EXTENT
47608 +} extent_state;
47609 +
47610 +#define HOLE_EXTENT_START 0
47611 +#define UNALLOCATED_EXTENT_START 1
47612 +#define UNALLOCATED_EXTENT_START2 2
47613 +
47614 +typedef struct {
47615 +       reiser4_block_nr pos_in_unit;
47616 +       reiser4_block_nr width; /* width of current unit */
47617 +       pos_in_node_t nr_units; /* number of units */
47618 +       int ext_offset;         /* offset from the beginning of zdata() */
47619 +       unsigned long expected_page;
47620 +#if REISER4_DEBUG
47621 +       reiser4_extent extent;
47622 +#endif
47623 +} extent_coord_extension_t;
47624 +
47625 +/* inline helpers to set/get fields of on-disk extent (stored little-endian) */
47626 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47627 +{
47628 +       return le64_to_cpu(ext->start); /* on-disk little-endian -> cpu order */
47629 +}
47630 +
47631 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47632 +{      /* return width field of @ext in cpu byte order */
47633 +       return le64_to_cpu(ext->width);
47634 +}
47635 +
47636 +extern __u64 reiser4_current_block_count(void);
47637 +
47638 +static inline void
47639 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47640 +{      /* store @start into @ext->start as a little-endian 64-bit value */
47641 +       cassert(sizeof(ext->start) == 8);
47642 +       assert("nikita-2510",   /* 0 and 1 are exempt: cf. HOLE_EXTENT_START, UNALLOCATED_EXTENT_START */
47643 +              ergo(start > 1, start < reiser4_current_block_count()));
47644 +       put_unaligned(cpu_to_le64(start), &ext->start); /* unaligned-safe store */
47645 +}
47646 +
47647 +static inline void
47648 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47649 +{      /* store @width into @ext->width as a little-endian 64-bit value */
47650 +       cassert(sizeof(ext->width) == 8);
47651 +       assert("", width > 0);  /* zero-width extent must never be stored */
47652 +       put_unaligned(cpu_to_le64(width), &ext->width);
47653 +       assert("nikita-2511",   /* extent with start > 1 must end within the block count */
47654 +              ergo(extent_get_start(ext) > 1,
47655 +                   extent_get_start(ext) + width <=
47656 +                   reiser4_current_block_count()));
47657 +}
47658 +
47659 +#define extent_item(coord)                                     \
47660 +({                                                             \
47661 +       assert("nikita-3143", item_is_extent(coord));           \
47662 +       ((reiser4_extent *)item_body_by_coord (coord));         \
47663 +})
47664 +
47665 +#define extent_by_coord(coord)                                 \
47666 +({                                                             \
47667 +       assert("nikita-3144", item_is_extent(coord));           \
47668 +       (extent_item (coord) + (coord)->unit_pos);              \
47669 +})
47670 +
47671 +#define width_by_coord(coord)                                  \
47672 +({                                                             \
47673 +       assert("nikita-3145", item_is_extent(coord));           \
47674 +       extent_get_width (extent_by_coord(coord));              \
47675 +})
47676 +
47677 +struct carry_cut_data;
47678 +struct carry_kill_data;
47679 +
47680 +/* plugin->u.item.b.* */
47681 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47682 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47683 +                          const reiser4_item_data *);
47684 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
47685 +pos_in_node_t nr_units_extent(const coord_t *);
47686 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47687 +void init_coord_extent(coord_t *);
47688 +int init_extent(coord_t *, reiser4_item_data *);
47689 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47690 +int can_shift_extent(unsigned free_space,
47691 +                    coord_t * source, znode * target, shift_direction,
47692 +                    unsigned *size, unsigned want);
47693 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47694 +                      unsigned count, shift_direction where_is_free_space,
47695 +                      unsigned free_space);
47696 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47697 +                    struct carry_kill_data *);
47698 +int create_hook_extent(const coord_t * coord, void *arg);
47699 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47700 +                    struct carry_cut_data *, reiser4_key * smallest_removed,
47701 +                    reiser4_key * new_first);
47702 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47703 +                     struct carry_kill_data *, reiser4_key * smallest_removed,
47704 +                     reiser4_key * new_first);
47705 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47706 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47707 +void print_extent(const char *, coord_t *);
47708 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47709 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47710 +                                  reiser4_block_nr * block);
47711 +void item_stat_extent(const coord_t * coord, void *vp);
47712 +int check_extent(const coord_t * coord, const char **error);
47713 +
47714 +/* plugin->u.item.s.file.* */
47715 +ssize_t write_extent(struct file *, const char __user *, size_t, loff_t *);
47716 +int read_extent(struct file *, flow_t *, hint_t *);
47717 +int readpage_extent(void *, struct page *);
47718 +void readpages_extent(void *, struct address_space *, struct list_head *pages);
47719 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47720 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47721 +int get_block_address_extent(const coord_t *, sector_t block,
47722 +                            sector_t * result);
47723 +
47724 +/* these are used in flush.c
47725 +   FIXME-VS: should they be somewhere in item_plugin? */
47726 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47727 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47728 +                            reiser4_key * stop_key);
47729 +
47730 +int extent_is_unallocated(const coord_t * item);       /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47731 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47732 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47733 +
47734 +/* plugin->u.item.f. */
47735 +int scan_extent(flush_scan * scan);
47736 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47737 +
47738 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47739 +                                  int nr_extents);
47740 +reiser4_block_nr extent_size(const coord_t * coord, pos_in_node_t nr);
47741 +extent_state state_of_extent(reiser4_extent * ext);
47742 +void set_extent(reiser4_extent *, reiser4_block_nr start,
47743 +               reiser4_block_nr width);
47744 +int update_extent(struct inode *, jnode *, loff_t pos, int *plugged_hole);
47745 +
47746 +#include "../../coord.h"
47747 +#include "../../lock.h"
47748 +#include "../../tap.h"
47749 +
47750 +struct replace_handle {
47751 +       /* these are to be set before calling replace_extent */
47752 +       coord_t *coord;
47753 +       lock_handle *lh;
47754 +       reiser4_key key;
47755 +       reiser4_key *pkey;
47756 +       reiser4_extent overwrite;
47757 +       reiser4_extent new_extents[2];
47758 +       int nr_new_extents;
47759 +       unsigned flags;
47760 +
47761 +       /* these are used by replace_extent */
47762 +       reiser4_item_data item;
47763 +       coord_t coord_after;
47764 +       lock_handle lh_after;
47765 +       tap_t watch;
47766 +       reiser4_key paste_key;
47767 +#if REISER4_DEBUG
47768 +       reiser4_extent orig_ext;
47769 +       reiser4_key tmp;
47770 +#endif
47771 +};
47772 +
47773 +/* this structure is kmalloced before calling make_extent to avoid excessive
47774 +   stack consumption on plug_hole->replace_extent */
47775 +struct make_extent_handle {
47776 +       uf_coord_t *uf_coord;
47777 +       reiser4_block_nr blocknr;
47778 +       int created;
47779 +       struct inode *inode;
47780 +       union {
47781 +               struct {
47782 +               } append;
47783 +               struct replace_handle replace;
47784 +       } u;
47785 +};
47786 +
47787 +int replace_extent(struct replace_handle *, int return_inserted_position);
47788 +lock_handle *znode_lh(znode *);
47789 +
47790 +/* the reiser4 repacker support */
47791 +struct repacker_cursor;
47792 +extern int process_extent_backward_for_repacking(tap_t *,
47793 +                                                struct repacker_cursor *);
47794 +extern int mark_extent_for_repacking(tap_t *, int);
47795 +
47796 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47797 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47798 +
47799 +/* __REISER4_EXTENT_H__ */
47800 +#endif
47801 +/*
47802 +   Local variables:
47803 +   c-indentation-style: "K&R"
47804 +   mode-name: "LC"
47805 +   c-basic-offset: 8
47806 +   tab-width: 8
47807 +   fill-column: 120
47808 +   End:
47809 +*/
47810 diff --git a/fs/reiser4/plugin/item/extent_file_ops.c b/fs/reiser4/plugin/item/extent_file_ops.c
47811 new file mode 100644
47812 index 0000000..6e1ac01
47813 --- /dev/null
47814 +++ b/fs/reiser4/plugin/item/extent_file_ops.c
47815 @@ -0,0 +1,1716 @@
47816 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47817 +
47818 +#include "item.h"
47819 +#include "../../inode.h"
47820 +#include "../../page_cache.h"
47821 +#include "../object.h"
47822 +
47823 +#include <linux/quotaops.h>
47824 +#include <linux/swap.h>
47825 +#include "../../../../mm/filemap.h"
47826 +
47827 +
47828 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
47829 +{
47830 +       reiser4_extent *ext;
47831 +       /* extent is located @offset bytes from the beginning of zdata(@node) */
47832 +       ext = (reiser4_extent *) (zdata(node) + offset);
47833 +       return ext;
47834 +}
47835 +
47836 +/**
47837 + * check_uf_coord - verify coord extension
47838 + * @uf_coord: coord with extent extension to verify
47839 + * @key: key @uf_coord has to correspond to, or NULL to skip the key check
47840 + *
47841 + * Debug-only (no-op unless REISER4_DEBUG): asserts that all fields of @uf_coord
47842 + * are set properly. If @key is specified - that @uf_coord is set to it as well.
47843 + */
47844 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
47845 +{
47846 +#if REISER4_DEBUG
47847 +       const coord_t *coord;
47848 +       const extent_coord_extension_t *ext_coord;
47849 +       reiser4_extent *ext;
47850 +
47851 +       coord = &uf_coord->coord;
47852 +       ext_coord = &uf_coord->extension.extent;
47853 +       ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
47854 +       /* cached extension fields must agree with what is in the node now */
47855 +       assert("",
47856 +              WITH_DATA(coord->node,
47857 +                        (uf_coord->valid == 1 &&
47858 +                         coord_is_iplug_set(coord) &&
47859 +                         item_is_extent(coord) &&
47860 +                         ext_coord->nr_units == nr_units_extent(coord) &&
47861 +                         ext == extent_by_coord(coord) &&
47862 +                         ext_coord->width == extent_get_width(ext) &&
47863 +                         coord->unit_pos < ext_coord->nr_units &&
47864 +                         ext_coord->pos_in_unit < ext_coord->width &&
47865 +                         memcmp(ext, &ext_coord->extent,
47866 +                                sizeof(reiser4_extent)) == 0)));
47867 +       if (key) {
47868 +               reiser4_key coord_key;
47869 +               /* expected key: key of unit advanced by pos_in_unit pages */
47870 +               unit_key_by_coord(&uf_coord->coord, &coord_key);
47871 +               set_key_offset(&coord_key,
47872 +                              get_key_offset(&coord_key) +
47873 +                              (uf_coord->extension.extent.
47874 +                               pos_in_unit << PAGE_CACHE_SHIFT));
47875 +               assert("", keyeq(key, &coord_key));
47876 +       }
47877 +#endif
47878 +}
47879 +
47880 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
47881 +{
47882 +       check_uf_coord(uf_coord, NULL);
47883 +       /* extent unit is found via the byte offset cached in the coord extension */
47884 +       return ext_by_offset(uf_coord->coord.node,
47885 +                            uf_coord->extension.extent.ext_offset);
47886 +}
47887 +
47888 +#if REISER4_DEBUG
47889 +
47890 +/**
47891 + * offset_is_in_unit - check whether a file offset falls into an extent unit
47892 + * @coord: coord set to an extent unit
47893 + * @off: file offset to check
47894 + *
47895 + */
47896 +/* return 1 if offset @off is inside of extent unit pointed to by @coord,
47897 +   0 otherwise. Nothing is modified: @coord is const */
47898 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
47899 +{
47900 +       reiser4_key unit_key;
47901 +       __u64 unit_off;
47902 +       reiser4_extent *ext;
47903 +
47904 +       ext = extent_by_coord(coord);
47905 +       /* unit covers [unit_off, unit_off + current_blocksize * width) */
47906 +       unit_key_extent(coord, &unit_key);
47907 +       unit_off = get_key_offset(&unit_key);
47908 +       if (off < unit_off)
47909 +               return 0;
47910 +       if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
47911 +               return 0;
47912 +       return 1;
47913 +}
47914 +
47915 +static int
47916 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
47917 +{
47918 +       reiser4_key item_key;
47919 +       /* @key must lie within the item: item key <= @key < append key */
47920 +       assert("vs-771", coord_is_existing_unit(coord));
47921 +       assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
47922 +       assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
47923 +       /* return 1 if offset of @key is inside of the unit @coord is set to */
47924 +       return offset_is_in_unit(coord, get_key_offset(key));
47925 +}
47926 +
47927 +#endif
47928 +
47929 +/**
47930 + * can_append - check whether @key is the append key of an extent item
47931 + * @key: key to check
47932 + * @coord: coord set to an extent item
47933 + *
47934 + * Returns 1 if @key is equal to an append key of item @coord is set to
47935 + */
47936 +static int can_append(const reiser4_key *key, const coord_t *coord)
47937 +{
47938 +       reiser4_key append_key;
47939 +
47940 +       return keyeq(key, append_key_extent(coord, &append_key));
47941 +}
47942 +
47943 +/**
47944 + * append_hole - append last extent item of a file with a hole
47945 + * @coord: coord set to extent item on twig level
47946 + * @lh: lock handle passed to insert_into_item
47947 + * @key: key the hole has to reach; must be >= append key of the item
47948 + * Returns 0 if last hole unit was widened, else result of insert_into_item.
47949 + */
47950 +static int append_hole(coord_t *coord, lock_handle *lh,
47951 +                      const reiser4_key *key)
47952 +{
47953 +       reiser4_key append_key;
47954 +       reiser4_block_nr hole_width;
47955 +       reiser4_extent *ext, new_ext;
47956 +       reiser4_item_data idata;
47957 +
47958 +       /* last item of file may have to be appended with hole */
47959 +       assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
47960 +       assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
47961 +
47962 +       /* key of first byte which is not addressed by this extent */
47963 +       append_key_extent(coord, &append_key);
47964 +
47965 +       assert("", keyle(&append_key, key));
47966 +
47967 +       /*
47968 +        * extent item has to be appended with hole. Calculate length of that
47969 +        * hole in blocks, rounding the byte distance up to a whole block
47970 +        */
47971 +       hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
47972 +                      current_blocksize - 1) >> current_blocksize_bits);
47973 +       assert("vs-954", hole_width > 0);
47974 +
47975 +       /* set coord after last unit */
47976 +       coord_init_after_item_end(coord);
47977 +
47978 +       /* get last extent in the item */
47979 +       ext = extent_by_coord(coord);
47980 +       if (state_of_extent(ext) == HOLE_EXTENT) {
47981 +               /*
47982 +                * last extent of a file is hole extent. Widen that extent by
47983 +                * @hole_width blocks. Note that we do not worry about
47984 +                * overflowing - extent width is 64 bits
47985 +                */
47986 +               set_extent(ext, HOLE_EXTENT_START,
47987 +                          extent_get_width(ext) + hole_width);
47988 +               znode_make_dirty(coord->node);
47989 +               return 0;
47990 +       }
47991 +
47992 +       /* append last item of the file with hole extent unit */
47993 +       assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
47994 +                         state_of_extent(ext) == UNALLOCATED_EXTENT));
47995 +
47996 +       set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
47997 +       init_new_extent(&idata, &new_ext, 1);
47998 +       return insert_into_item(coord, lh, &append_key, &idata, 0);
47999 +}
48000 +
48001 +/**
48002 + * check_jnodes - debug check that a twig node covers a range of pages
48003 + * @twig: longterm locked twig node
48004 + * @key: start key of the range; @count: number of PAGE_CACHE_SIZE pages in it
48005 + * Does nothing unless REISER4_DEBUG is set.
48006 + */
48007 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
48008 +{
48009 +#if REISER4_DEBUG
48010 +       coord_t c;
48011 +       reiser4_key node_key, jnode_key;
48012 +
48013 +       jnode_key = *key;
48014 +
48015 +       assert("", twig != NULL);
48016 +       assert("", znode_get_level(twig) == TWIG_LEVEL);
48017 +       assert("", znode_is_write_locked(twig));
48018 +
48019 +       zload(twig);    /* NOTE(review): return value of zload is not checked */
48020 +       /* get the smallest key in twig node */
48021 +       coord_init_first_unit(&c, twig);
48022 +       unit_key_by_coord(&c, &node_key);
48023 +       assert("", keyle(&node_key, &jnode_key));
48024 +       /* upper bound: key of last unit, or append key if the item plugin has one */
48025 +       coord_init_last_unit(&c, twig);
48026 +       unit_key_by_coord(&c, &node_key);
48027 +       if (item_plugin_by_coord(&c)->s.file.append_key)
48028 +               item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
48029 +       set_key_offset(&jnode_key,
48030 +                      get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
48031 +       assert("", keylt(&jnode_key, &node_key));
48032 +       zrelse(twig);
48033 +#endif
48034 +}
48035 +
48036 +/**
48037 + * append_last_extent - append last file item
48038 + * @uf_coord: coord to start insertion from
48039 + * @jnodes: array of jnodes
48040 + * @count: number of jnodes in the array
48041 + * @key: key to append at; if it is past the append key a hole is appended
48042 + * There is already at least one extent item of the file in the tree. Append the
48043 + * last of them with unallocated extent of width @count and assign fake block
48044 + * numbers to @jnodes. Returns @count, 0 if hole appended or no jnodes, or error.
48045 + */
48046 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48047 +                             jnode **jnodes, int count)
48048 +{
48049 +       int result;
48050 +       reiser4_extent new_ext;
48051 +       reiser4_item_data idata;
48052 +       coord_t *coord;
48053 +       extent_coord_extension_t *ext_coord;
48054 +       reiser4_extent *ext;
48055 +       reiser4_block_nr block;
48056 +       jnode *node;
48057 +       int i;
48058 +
48059 +       coord = &uf_coord->coord;
48060 +       ext_coord = &uf_coord->extension.extent;
48061 +       ext = ext_by_ext_coord(uf_coord);
48062 +
48063 +       /* check correctness of position in the item */
48064 +       assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
48065 +       assert("vs-1311", coord->between == AFTER_UNIT);
48066 +       assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
48067 +
48068 +       if (!can_append(key, coord)) {
48069 +               /* hole extent has to be inserted; no jnodes are handled on this path */
48070 +               result = append_hole(coord, uf_coord->lh, key);
48071 +               uf_coord->valid = 0;
48072 +               return result;
48073 +       }
48074 +
48075 +       if (count == 0)
48076 +               return 0;
48077 +
48078 +       assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
48079 +
48080 +       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
48081 +                                          count);
48082 +       BUG_ON(result != 0);
48083 +       /* NOTE(review): quota failure is fatal (BUG_ON) here - confirm intended */
48084 +       switch (state_of_extent(ext)) {
48085 +       case UNALLOCATED_EXTENT:
48086 +               /*
48087 +                * last extent unit of the file is unallocated one. Increase
48088 +                * its width by @count
48089 +                */
48090 +               set_extent(ext, UNALLOCATED_EXTENT_START,
48091 +                          extent_get_width(ext) + count);
48092 +               znode_make_dirty(coord->node);
48093 +
48094 +               /* update coord extension */
48095 +               ext_coord->width += count;
48096 +               ON_DEBUG(extent_set_width
48097 +                        (&uf_coord->extension.extent.extent,
48098 +                         ext_coord->width));
48099 +               break;
48100 +
48101 +       case HOLE_EXTENT:
48102 +       case ALLOCATED_EXTENT:
48103 +               /*
48104 +                * last extent unit of the file is either hole or allocated
48105 +                * one. Append one unallocated extent of width @count
48106 +                */
48107 +               set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
48108 +               init_new_extent(&idata, &new_ext, 1);
48109 +               result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
48110 +               uf_coord->valid = 0;
48111 +               if (result)
48112 +                       return result;  /* NOTE(review): quota charged above is not undone in this function */
48113 +               break;
48114 +
48115 +       default:
48116 +               return RETERR(-EIO);
48117 +       }
48118 +
48119 +       /*
48120 +        * make sure that we hold long term locked twig node containing all
48121 +        * jnodes we are about to capture
48122 +        */
48123 +       check_jnodes(uf_coord->lh->node, key, count);
48124 +
48125 +       /*
48126 +        * assign fake block numbers to all jnodes. FIXME: make sure whether
48127 +        * twig node containing inserted extent item is locked
48128 +        */
48129 +       block = fake_blocknr_unformatted(count);
48130 +       for (i = 0; i < count; i ++, block ++) {
48131 +               node = jnodes[i];
48132 +               spin_lock_jnode(node);
48133 +               JF_SET(node, JNODE_CREATED);
48134 +               jnode_set_block(node, &block);
48135 +               result = try_capture(node, ZNODE_WRITE_LOCK, 0);
48136 +               BUG_ON(result != 0);    /* capture failure is treated as fatal */
48137 +               jnode_make_dirty_locked(node);
48138 +               spin_unlock_jnode(node);
48139 +       }
48140 +       return count;
48141 +}
48142 +
48143 +/**
48144 + * insert_first_hole - insert hole extent into tree
48145 + * @coord: coord set between items, where the new item is to be inserted
48146 + * @lh: lock handle passed to insert_extent_by_coord
48147 + * @key: key the hole has to reach; the item itself is keyed at offset 0
48148 + *
48149 + * Returns result of insert_extent_by_coord.
48150 + */
48151 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
48152 +                            const reiser4_key *key)
48153 +{
48154 +       reiser4_extent new_ext;
48155 +       reiser4_item_data idata;
48156 +       reiser4_key item_key;
48157 +       reiser4_block_nr hole_width;
48158 +
48159 +       /* @coord must be set for inserting of new item */
48160 +       assert("vs-711", coord_is_between_items(coord));
48161 +
48162 +       item_key = *key;
48163 +       set_key_offset(&item_key, 0ull);
48164 +       /* width of hole in blocks: offset of @key rounded up to a whole block */
48165 +       hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
48166 +                     current_blocksize_bits);
48167 +       assert("vs-710", hole_width > 0);
48168 +
48169 +       /* compose body of hole extent and insert item into tree */
48170 +       set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
48171 +       init_new_extent(&idata, &new_ext, 1);
48172 +       return insert_extent_by_coord(coord, &idata, &item_key, lh);
48173 +}
48174 +
48175 +
48176 +/**
48177 + * insert_first_extent - insert first file item
48178 + * @uf_coord: coord to start insertion from
48179 + * @key: key to insert at; if its offset is not 0 only a hole is inserted
48180 + * @jnodes: array of jnodes
48181 + * @count: number of jnodes in the array
48182 + * @inode: inode of file
48183 + *
48184 + * There are no items of file @inode in the tree yet. Insert unallocated extent
48185 + * of width @count into tree or hole extent if writing not to the
48186 + * beginning. Assign fake block numbers to jnodes corresponding to the inserted
48187 + * unallocated extent. Returns number of jnodes or error code.
48188 + */
48189 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48190 +                              jnode **jnodes, int count,
48191 +                              struct inode *inode)
48192 +{
48193 +       int result;
48194 +       int i;
48195 +       reiser4_extent new_ext;
48196 +       reiser4_item_data idata;
48197 +       reiser4_block_nr block;
48198 +       unix_file_info_t *uf_info;
48199 +       jnode *node;
48200 +
48201 +       /* first extent insertion starts at leaf level */
48202 +       assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
48203 +       assert("vs-711", coord_is_between_items(&uf_coord->coord));
48204 +
48205 +       if (get_key_offset(key) != 0) { /* hole only: returns 0 on success, no jnodes handled */
48206 +               result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
48207 +               uf_coord->valid = 0;
48208 +               uf_info = unix_file_inode_data(inode);
48209 +
48210 +               /*
48211 +                * first item insertion is only possible when writing to empty
48212 +                * file or performing tail conversion
48213 +                */
48214 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
48215 +                           (inode_get_flag(inode, REISER4_PART_MIXED) &&
48216 +                            inode_get_flag(inode, REISER4_PART_IN_CONV))));
48217 +
48218 +               /* if file was empty - update its state */
48219 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
48220 +                       uf_info->container = UF_CONTAINER_EXTENTS;
48221 +               return result;
48222 +       }
48223 +
48224 +       if (count == 0)
48225 +               return 0;
48226 +
48227 +       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
48228 +       BUG_ON(result != 0);    /* NOTE(review): quota failure is fatal here - confirm intended */
48229 +
48230 +       /*
48231 +        * prepare for tree modification: compose body of item and item data
48232 +        * structure needed for insertion
48233 +        */
48234 +       set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
48235 +       init_new_extent(&idata, &new_ext, 1);
48236 +
48237 +       /* insert extent item into the tree */
48238 +       result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
48239 +                                       uf_coord->lh);
48240 +       if (result)
48241 +               return result;  /* NOTE(review): quota charged above is not undone in this function */
48242 +
48243 +       /*
48244 +        * make sure that we hold long term locked twig node containing all
48245 +        * jnodes we are about to capture
48246 +        */
48247 +       check_jnodes(uf_coord->lh->node, key, count);
48248 +       /*
48249 +        * assign fake block numbers to all jnodes, capture and mark them dirty
48250 +        */
48251 +       block = fake_blocknr_unformatted(count);
48252 +       for (i = 0; i < count; i ++, block ++) {
48253 +               node = jnodes[i];
48254 +               spin_lock_jnode(node);
48255 +               JF_SET(node, JNODE_CREATED);
48256 +               jnode_set_block(node, &block);
48257 +               result = try_capture(node, ZNODE_WRITE_LOCK, 0);
48258 +               BUG_ON(result != 0);    /* capture failure is treated as fatal */
48259 +               jnode_make_dirty_locked(node);
48260 +               spin_unlock_jnode(node);
48261 +       }
48262 +
48263 +       /*
48264 +        * invalidate coordinate, research must be performed to continue
48265 +        * because write will continue on twig level
48266 +        */
48267 +       uf_coord->valid = 0;
48268 +       return count;
48269 +}
48270 +
48271 +/**
48272 + * plug_hole - replace hole extent with unallocated and holes
48273 + * @uf_coord:
48274 + * @key:
48275 + * @node:
48276 + * @h: structure containing coordinate, lock handle, key, etc
48277 + *
48278 + * Creates an unallocated extent of width 1 within a hole. In worst case two
48279 + * additional extents can be created.
48280 + */
48281 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
48282 +{
48283 +       struct replace_handle rh;
48284 +       reiser4_extent *ext;
48285 +       reiser4_block_nr width, pos_in_unit;
48286 +       coord_t *coord;
48287 +       extent_coord_extension_t *ext_coord;
48288 +       int return_inserted_position;
48289 +
48290 +       check_uf_coord(uf_coord, key);
48291 +
48292 +       rh.coord = coord_by_uf_coord(uf_coord);
48293 +       rh.lh = uf_coord->lh;
48294 +       rh.flags = 0;
48295 +
48296 +       coord = coord_by_uf_coord(uf_coord);
48297 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
48298 +       ext = ext_by_ext_coord(uf_coord);
48299 +
48300 +       width = ext_coord->width;
48301 +       pos_in_unit = ext_coord->pos_in_unit;
48302 +
48303 +       *how = 0; /* set below to 1..6, telling which case was taken */
48304 +       if (width == 1) {
48305 +               set_extent(ext, UNALLOCATED_EXTENT_START, 1);
48306 +               znode_make_dirty(coord->node);
48307 +               /* keep debug copy of the extent in uf_coord in sync */
48308 +               ON_DEBUG(ext_coord->extent = *ext);
48309 +               *how = 1;
48310 +               return 0;
48311 +       } else if (pos_in_unit == 0) {
48312 +               /* the block to plug is the first block of the hole */
48313 +               if (coord->unit_pos) {
48314 +                       /* there is an extent unit to the left */
48315 +                       if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
48316 +                               /*
48317 +                                * left neighboring unit is an unallocated
48318 +                                * extent. Grow it by one block and shrink
48319 +                                * the hole by one block
48320 +                                */
48321 +                               extent_set_width(ext - 1,
48322 +                                                extent_get_width(ext - 1) + 1);
48323 +                               extent_set_width(ext, width - 1);
48324 +                               znode_make_dirty(coord->node);
48325 +
48326 +                               /* point coord at last block of left unit */
48327 +                               coord->unit_pos--;
48328 +                               ext_coord->width = extent_get_width(ext - 1);
48329 +                               ext_coord->pos_in_unit = ext_coord->width - 1;
48330 +                               ext_coord->ext_offset -= sizeof(reiser4_extent);
48331 +                               ON_DEBUG(ext_coord->extent =
48332 +                                        *extent_by_coord(coord));
48333 +                               *how = 2;
48334 +                               return 0;
48335 +                       }
48336 +               }
48337 +               /* the hole unit becomes unallocated extent of width 1 */
48338 +               set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
48339 +               /* rest of the hole, to be inserted */
48340 +               set_extent(&rh.new_extents[0], HOLE_EXTENT_START, width - 1);
48341 +               rh.nr_new_extents = 1;
48342 +
48343 +               /* make replace_extent return with @coord and @uf_coord->lh
48344 +                  set to the unit which was replaced */
48345 +               return_inserted_position = 0;
48346 +               *how = 3;
48347 +       } else if (pos_in_unit == width - 1) {
48348 +               /* the block to plug is the last block of the hole */
48349 +               if (coord->unit_pos < nr_units_extent(coord) - 1) {
48350 +                       /* there is an extent unit to the right */
48351 +                       if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
48352 +                               /*
48353 +                                * right neighboring unit is an unallocated
48354 +                                * extent. Grow it by one block and shrink
48355 +                                * the hole by one block
48356 +                                */
48357 +                               extent_set_width(ext + 1,
48358 +                                                extent_get_width(ext + 1) + 1);
48359 +                               extent_set_width(ext, width - 1);
48360 +                               znode_make_dirty(coord->node);
48361 +
48362 +                               /* point coord at first block of right unit */
48363 +                               coord->unit_pos++;
48364 +                               ext_coord->width = extent_get_width(ext + 1);
48365 +                               ext_coord->pos_in_unit = 0;
48366 +                               ext_coord->ext_offset += sizeof(reiser4_extent);
48367 +                               ON_DEBUG(ext_coord->extent =
48368 +                                        *extent_by_coord(coord));
48369 +                               *how = 4;
48370 +                               return 0;
48371 +                       }
48372 +               }
48373 +               /* the hole unit is shortened by one block */
48374 +               set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
48375 +               /* unallocated extent of width 1, to be inserted */
48376 +               set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
48377 +               rh.nr_new_extents = 1;
48378 +
48379 +               /* make replace_extent return with @coord and @uf_coord->lh
48380 +                  set to the unit which was inserted */
48381 +               return_inserted_position = 1;
48382 +               *how = 5;
48383 +       } else {
48384 +               /* the hole unit keeps only the blocks before the plugged one */
48385 +               set_extent(&rh.overwrite, HOLE_EXTENT_START, pos_in_unit);
48386 +               /* plugged block and the rest of the hole, to be inserted */
48387 +               set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
48388 +               set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
48389 +                          width - pos_in_unit - 1);
48390 +               rh.nr_new_extents = 2;
48391 +
48392 +               /* make replace_extent return with @coord and @uf_coord->lh
48393 +                  set to the first of the units which were inserted */
48394 +               return_inserted_position = 1;
48395 +               *how = 6;
48396 +       }
48397 +       unit_key_by_coord(coord, &rh.paste_key);
48398 +       set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
48399 +                      extent_get_width(&rh.overwrite) * current_blocksize);
48400 +
48401 +       uf_coord->valid = 0; /* coord extension has to be set up again */
48402 +       return replace_extent(&rh, return_inserted_position);
48403 +}
48404 +
48405 +/**
48406 + * overwrite_one_block - assign a block number to a jnode being overwritten
48407 + * @uf_coord: coord of the extent unit covering @node, must be AT_UNIT
48408 + * @key: key of the block, handed to plug_hole()
48409 + * @node: jnode which gets the block number
48410 + * @hole_plugged: if not NULL, set to 1 when a hole had to be plugged
48411 + *
48412 + * Allocated extent: use its block. Hole: charge quota, plug it, use a fake
48413 + * block number. Returns 0, -EDQUOT, -EIO or the error of plug_hole().
48414 + */
48415 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
48416 +                              jnode *node, int *hole_plugged)
48417 +{
48418 +       int result;
48419 +       extent_coord_extension_t *ext_coord;
48420 +       reiser4_extent *ext;
48421 +       reiser4_block_nr block;
48422 +       int how;
48423 +
48424 +       assert("vs-1312", uf_coord->coord.between == AT_UNIT);
48425 +
48426 +       result = 0;
48427 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
48428 +       ext = ext_by_ext_coord(uf_coord);
48429 +       assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
48430 +
48431 +       switch (state_of_extent(ext)) {
48432 +       case ALLOCATED_EXTENT:
48433 +               block = extent_get_start(ext) + ext_coord->pos_in_unit;
48434 +               break;
48435 +
48436 +       case HOLE_EXTENT:
48437 +               if (DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1))
48438 +                       return RETERR(-EDQUOT); /* over quota is not a BUG */
48439 +               result = plug_hole(uf_coord, key, &how);
48440 +               if (result)
48441 +                       return result;
48442 +               block = fake_blocknr_unformatted(1);
48443 +               if (hole_plugged)
48444 +                       *hole_plugged = 1;
48445 +               JF_SET(node, JNODE_CREATED);
48446 +               break;
48447 +
48448 +       default:
48449 +               return RETERR(-EIO);
48450 +       }
48451 +
48452 +       jnode_set_block(node, &block);
48453 +       return 0;
48454 +}
48455 +
48456 +/**
48457 + * move_coord - move coordinate forward
48458 + * @uf_coord: coord to advance; marked invalid when the end of item is reached
48459 + *
48460 + * Move coordinate one data block pointer forward. Return 1 if coord is set to
48461 + * the last one already or is invalid, 0 if the coord was moved.
48462 + */
48463 +static int move_coord(uf_coord_t *uf_coord)
48464 +{
48465 +       extent_coord_extension_t *ext_coord;
48466 +
48467 +       if (uf_coord->valid == 0)
48468 +               return 1;
48469 +       ext_coord = &uf_coord->extension.extent;
48470 +       ext_coord->pos_in_unit ++;
48471 +       if (ext_coord->pos_in_unit < ext_coord->width)
48472 +               /* coordinate moved within the unit */
48473 +               return 0;
48474 +
48475 +       /* end of unit is reached. Try to move to the next unit */
48476 +       ext_coord->pos_in_unit = 0;
48477 +       uf_coord->coord.unit_pos ++;
48478 +       if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48479 +               /* coordinate moved to next unit: reload its offset and width */
48480 +               ext_coord->ext_offset += sizeof(reiser4_extent);
48481 +               ext_coord->width =
48482 +                       extent_get_width(ext_by_offset
48483 +                                        (uf_coord->coord.node,
48484 +                                         ext_coord->ext_offset));
48485 +               ON_DEBUG(ext_coord->extent =
48486 +                        *ext_by_offset(uf_coord->coord.node,
48487 +                                       ext_coord->ext_offset));
48488 +               return 0;
48489 +       }
48490 +       /* end of item is reached: the coord can not be used anymore */
48491 +       uf_coord->valid = 0;
48492 +       return 1;
48493 +}
48494 +
48495 +/**
48496 + * overwrite_extent - give block numbers to jnodes and capture them
48497 + * @uf_coord, @key: position of the first jnode; @jnodes, @count: the jnodes
48498 + * @plugged_hole: handed to overwrite_one_block() for jnodes without a block
48499 + * Returns number of handled jnodes, or error code of overwrite_one_block().
48500 + */
48501 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48502 +                           jnode **jnodes, int count, int *plugged_hole)
48503 +{
48504 +       int result;
48505 +       reiser4_key k;
48506 +       int i;
48507 +       jnode *node;
48508 +
48509 +       k = *key;
48510 +       for (i = 0; i < count; i ++) {
48511 +               node = jnodes[i];
48512 +               if (*jnode_get_block(node) == 0) {
48513 +                       result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48514 +                       if (result)
48515 +                               return result;
48516 +               }
48517 +               /*
48518 +                * make sure that we hold long term locked twig node containing
48519 +                * the jnode we are about to capture
48520 +                */
48521 +               check_jnodes(uf_coord->lh->node, &k, 1);
48522 +               /*
48523 +                * the jnode has a block number by now (possibly a fake one
48524 +                * assigned above): capture it and mark it dirty
48525 +                */
48526 +               spin_lock_jnode(node);
48527 +               result = try_capture(node, ZNODE_WRITE_LOCK, 0);
48528 +               BUG_ON(result != 0);
48529 +               jnode_make_dirty_locked(node);
48530 +               spin_unlock_jnode(node);
48531 +
48532 +               if (uf_coord->valid == 0)
48533 +                       return i + 1;
48534 +
48535 +               check_uf_coord(uf_coord, &k);
48536 +
48537 +               if (move_coord(uf_coord)) {
48538 +                       /*
48539 +                        * failed to move to the next node pointer. Either end
48540 +                        * of file or end of twig node is reached. In the latter
48541 +                        * case we might go to the right neighbor.
48542 +                        */
48543 +                       uf_coord->valid = 0;
48544 +                       return i + 1;
48545 +               }
48546 +               set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48547 +       }
48548 +
48549 +       return count;
48550 +}
48551 +
48552 +/**
48553 + * update_extent - make the extent item ready for writing of one jnode
48554 + * @inode: file the jnode belongs to
48555 + * @node: jnode to be handled
48556 + * @pos: offset in the file, used to build the key to look for
48557 + * @plugged_hole: handed to overwrite_extent() on the overwrite path
48558 + * Returns 0 if the jnode was handled, negative error code otherwise.
48559 + */
48560 +int update_extent(struct inode *inode, jnode *node, loff_t pos,
48561 +                 int *plugged_hole)
48562 +{
48563 +       int result;
48564 +       znode *loaded;
48565 +       uf_coord_t uf_coord;
48566 +       coord_t *coord;
48567 +       lock_handle lh;
48568 +       reiser4_key key;
48569 +
48570 +       assert("", lock_counters()->d_refs == 0);
48571 +
48572 +       key_by_inode_and_offset_common(inode, pos, &key);
48573 +
48574 +       init_uf_coord(&uf_coord, &lh);
48575 +       coord = &uf_coord.coord;
48576 +       result = find_file_item_nohint(coord, &lh, &key,
48577 +                                      ZNODE_WRITE_LOCK, inode);
48578 +       if (IS_CBKERR(result)) {
48579 +               assert("", lock_counters()->d_refs == 0);
48580 +               return result;
48581 +       }
48582 +
48583 +       result = zload(coord->node);
48584 +       BUG_ON(result != 0);
48585 +       loaded = coord->node;
48586 +
48587 +       if (coord->between == AFTER_UNIT) {
48588 +               /*
48589 +                * append existing extent item with unallocated extent for the
48590 +                * single jnode
48591 +                */
48592 +               init_coord_extension_extent(&uf_coord,
48593 +                                           get_key_offset(&key));
48594 +               result = append_last_extent(&uf_coord, &key,
48595 +                                           &node, 1);
48596 +       } else if (coord->between == AT_UNIT) {
48597 +               /*
48598 +                * overwrite
48599 +                * not optimal yet. Will be optimized if new write will show
48600 +                * performance win.
48601 +                */
48602 +               init_coord_extension_extent(&uf_coord,
48603 +                                           get_key_offset(&key));
48604 +               result = overwrite_extent(&uf_coord, &key,
48605 +                                         &node, 1, plugged_hole);
48606 +       } else {
48607 +               /*
48608 +                * there are no items of this file in the tree yet. Create
48609 +                * first item of the file inserting one unallocated extent for
48610 +                * the single jnode
48611 +                */
48612 +               result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48613 +       }
48614 +       assert("", result == 1 || result < 0);
48615 +       zrelse(loaded);
48616 +       done_lh(&lh);
48617 +       assert("", lock_counters()->d_refs == 0);
48618 +       return (result == 1) ? 0 : result;
48619 +}
48620 +
48621 +/**
48622 + * update_extents - make extent items ready for writing of several jnodes
48623 + * @file: file being written, its hint is loaded and saved here
48624 + * @jnodes: jnodes to be handled
48625 + * @count: number of jnodes, 0 is the expanding truncate case
48626 + * @pos: file offset, used only when @count is 0
48627 + * Returns negative error code, or the result of the last iteration.
48628 + */
48629 +static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48630 +{
48631 +       struct inode *inode;
48632 +       struct hint hint;
48633 +       reiser4_key key;
48634 +       int result;
48635 +       znode *loaded;
48636 +
48637 +       result = load_file_hint(file, &hint);
48638 +       BUG_ON(result != 0);
48639 +
48640 +       inode = file->f_dentry->d_inode;
48641 +       if (count != 0)
48642 +               /*
48643 +                * count == 0 is special case: expanding truncate
48644 +                */
48645 +               pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
48646 +       key_by_inode_and_offset_common(inode, pos, &key);
48647 +
48648 +       assert("", lock_counters()->d_refs == 0);
48649 +
48650 +       do {
48651 +               result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
48652 +               if (IS_CBKERR(result)) {
48653 +                       assert("", lock_counters()->d_refs == 0);
48654 +                       return result;
48655 +               }
48656 +
48657 +               result = zload(hint.ext_coord.coord.node);
48658 +               BUG_ON(result != 0);
48659 +               loaded = hint.ext_coord.coord.node;
48660 +
48661 +               if (hint.ext_coord.coord.between == AFTER_UNIT) {
48662 +                       /*
48663 +                        * append existing extent item with unallocated extent
48664 +                        * for the remaining jnodes
48665 +                        */
48666 +                       if (hint.ext_coord.valid == 0)
48667 +                               /* NOTE: get statistics on this */
48668 +                               init_coord_extension_extent(&hint.ext_coord,
48669 +                                                           get_key_offset(&key));
48670 +                       result = append_last_extent(&hint.ext_coord, &key,
48671 +                                                   jnodes, count);
48672 +               } else if (hint.ext_coord.coord.between == AT_UNIT) {
48673 +                       /*
48674 +                        * overwrite
48675 +                        * not optimal yet. Will be optimized if new write will
48676 +                        * show performance win.
48677 +                        */
48678 +                       if (hint.ext_coord.valid == 0)
48679 +                               /* NOTE: get statistics on this */
48680 +                               init_coord_extension_extent(&hint.ext_coord,
48681 +                                                           get_key_offset(&key));
48682 +                       result = overwrite_extent(&hint.ext_coord, &key,
48683 +                                                 jnodes, count, NULL);
48684 +               } else {
48685 +                       /*
48686 +                        * there are no items of this file in the tree
48687 +                        * yet. Create first item of the file inserting one
48688 +                        * unallocated extent for the remaining jnodes
48689 +                        */
48690 +                       result = insert_first_extent(&hint.ext_coord, &key,
48691 +                                                    jnodes, count, inode);
48692 +               }
48693 +               zrelse(loaded);
48694 +               if (result < 0) {
48695 +                       done_lh(hint.ext_coord.lh);
48696 +                       break;
48697 +               }
48698 +
48699 +               jnodes += result;
48700 +               count -= result;
48701 +               set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
48702 +
48703 +               /* keep the hint if the coord is still valid, drop it otherwise */
48704 +               if (hint.ext_coord.valid)
48705 +                       set_hint(&hint, &key, ZNODE_WRITE_LOCK);
48706 +               else
48707 +                       unset_hint(&hint);
48708 +
48709 +       } while (count > 0);
48710 +
48711 +       save_file_hint(file, &hint);
48712 +       assert("", lock_counters()->d_refs == 0);
48713 +       return result;
48714 +}
48715 +
48716 +/**
48717 + * write_extent_reserve_space - reserve space for extent write operation
48718 + * @inode: inode of the file to be written, used to find the tree
48719 + *
48720 + * Estimates and reserves space which may be required for writing
48721 + * WRITE_GRANULARITY pages of file. Returns what reiser4_grab_space() returns.
48722 + */
48723 +static int write_extent_reserve_space(struct inode *inode)
48724 +{
48725 +       __u64 count;
48726 +       reiser4_tree *tree;
48727 +
48728 +       /*
48729 +        * to write WRITE_GRANULARITY pages to a file by extents we have to
48730 +        * reserve disk space for:
48731 +
48732 +        * 1. find_file_item may have to insert empty node to the tree (empty
48733 +        * leaf node between two extent items). This requires 1 block and
48734 +        * number of blocks which are necessary to perform insertion of an
48735 +        * internal item into twig level.
48736 +
48737 +        * 2. for each of written pages there might be needed 1 block and
48738 +        * number of blocks which might be necessary to perform insertion of or
48739 +        * paste to an extent item.
48740 +
48741 +        * 3. stat data update
48742 +        */
48743 +       tree = tree_by_inode(inode);
48744 +       count = estimate_one_insert_item(tree) +
48745 +               WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
48746 +               estimate_one_insert_item(tree);
48747 +       grab_space_enable();
48748 +       return reiser4_grab_space(count, 0 /* flags */);
48749 +}
48750 +
48751 +/**
48752 + * write_extent - write method of extent item plugin
48753 + * @file: file to write to
48754 + * @buf: address of user-space buffer
48755 + * @count: number of bytes to write, 0 is the expanding truncate case
48756 + * @pos: position in file to write to, not advanced here
48757 + *
48758 + * Returns number of bytes copied from @buf or, if none, negative error code.
48759 + */
48760 +ssize_t write_extent(struct file *file, const char __user *buf, size_t count,
48761 +                    loff_t *pos)
48762 +{
48763 +       int have_to_update_extent;
48764 +       int nr_pages, nr_dirty;
48765 +       struct page *page;
48766 +       jnode *jnodes[WRITE_GRANULARITY + 1];
48767 +       struct inode *inode;
48768 +       unsigned long index;
48769 +       unsigned long end;
48770 +       int i;
48771 +       int to_page, page_off;
48772 +       size_t left, written;
48773 +       int result;
48774 +
48775 +       inode = file->f_dentry->d_inode;
48776 +       if (write_extent_reserve_space(inode))
48777 +               return RETERR(-ENOSPC);
48778 +
48779 +       if (count == 0) {
48780 +               /* truncate case */
48781 +               update_extents(file, jnodes, 0, *pos);
48782 +               return 0;
48783 +       }
48784 +
48785 +       BUG_ON(get_current_context()->trans->atom != NULL);
48786 +
48787 +       result = 0;
48788 +       left = count;
48789 +       nr_dirty = 0;
48790 +       index = *pos >> PAGE_CACHE_SHIFT;
48791 +       /* calculate number of pages which are to be written */
48792 +       end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
48793 +       nr_pages = end - index + 1;
48794 +       assert("", nr_pages <= WRITE_GRANULARITY + 1);
48795 +
48796 +       /* get pages and jnodes */
48797 +       for (i = 0; i < nr_pages; i ++) {
48798 +               page = find_or_create_page(inode->i_mapping, index + i, get_gfp_mask());
48799 +               if (page == NULL) {
48800 +                       nr_pages = i;
48801 +                       result = RETERR(-ENOMEM);
48802 +                       goto out;
48803 +               }
48804 +
48805 +               jnodes[i] = jnode_of_page(page);
48806 +               if (IS_ERR(jnodes[i])) {
48807 +                       unlock_page(page);
48808 +                       page_cache_release(page);
48809 +                       nr_pages = i;
48810 +                       result = RETERR(-ENOMEM);
48811 +                       goto out;
48812 +               }
48813 +               /* prevent jnode and page from disconnecting */
48814 +               JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
48815 +               unlock_page(page);
48816 +       }
48817 +
48818 +       BUG_ON(get_current_context()->trans->atom != NULL);
48819 +
48820 +       have_to_update_extent = 0;
48821 +       page_off = (*pos & (PAGE_CACHE_SIZE - 1));
48822 +       for (i = 0; i < nr_pages; i ++) {
48823 +               to_page = PAGE_CACHE_SIZE - page_off;
48824 +               if (to_page > left)
48825 +                       to_page = left;
48826 +               page = jnode_page(jnodes[i]);
48827 +               if (((loff_t)page->index << PAGE_CACHE_SHIFT) < inode->i_size &&
48828 +                   !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48829 +                       /*
48830 +                        * the above is not optimal for partial write to last
48831 +                        * page of file when file size is not at boundary of
48832 +                        * page
48833 +                        */
48834 +                       lock_page(page);
48835 +                       if (!PageUptodate(page)) {
48836 +                               result = readpage_unix_file(NULL, page);
48837 +                               BUG_ON(result != 0);
48838 +                               /* wait for read completion */
48839 +                               lock_page(page);
48840 +                               BUG_ON(!PageUptodate(page));
48841 +                       } else
48842 +                               result = 0;
48843 +                       /* the page is locked on both branches */
48844 +                       unlock_page(page);
48845 +               }
48846 +
48847 +               BUG_ON(get_current_context()->trans->atom != NULL);
48848 +               fault_in_pages_readable(buf, to_page);
48849 +               BUG_ON(get_current_context()->trans->atom != NULL);
48850 +
48851 +               lock_page(page);
48852 +               if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
48853 +                       void *kaddr;
48854 +
48855 +                       kaddr = kmap_atomic(page, KM_USER0);
48856 +                       memset(kaddr, 0, page_off);
48857 +                       memset(kaddr + page_off + to_page, 0,
48858 +                              PAGE_CACHE_SIZE - (page_off + to_page));
48859 +                       flush_dcache_page(page);
48860 +                       kunmap_atomic(kaddr, KM_USER0);
48861 +               }
48862 +
48863 +               written = filemap_copy_from_user(page, page_off, buf, to_page);
48864 +               if (written != to_page) {
48865 +                       unlock_page(page);
48866 +                       result = RETERR(-EFAULT);
48867 +                       break;
48868 +               }
48869 +               flush_dcache_page(page);
48870 +               set_page_dirty_internal(page);
48871 +               unlock_page(page);
48872 +               mark_page_accessed(page);
48873 +               SetPageUptodate(page);
48874 +               nr_dirty ++;
48875 +
48876 +               if (jnodes[i]->blocknr == 0)
48877 +                       have_to_update_extent ++;
48878 +
48879 +               page_off = 0;
48880 +               buf += to_page;
48881 +               left -= to_page;
48882 +               BUG_ON(get_current_context()->trans->atom != NULL);
48883 +       }
48884 +
48885 +       if (have_to_update_extent) {
48886 +               update_extents(file, jnodes, nr_dirty, *pos);
48887 +       } else {
48888 +               for (i = 0; i < nr_dirty; i ++) {
48889 +                       int ret;
48890 +
48891 +                       spin_lock_jnode(jnodes[i]);
48892 +                       ret = try_capture(jnodes[i], ZNODE_WRITE_LOCK, 0);
48893 +                       BUG_ON(ret != 0);
48894 +                       jnode_make_dirty_locked(jnodes[i]);
48895 +                       spin_unlock_jnode(jnodes[i]);
48896 +               }
48897 +       }
48898 +out:
48899 +       /* drop page and jnode references taken while getting the pages */
48900 +       for (i = 0; i < nr_pages; i ++) {
48901 +               page_cache_release(jnode_page(jnodes[i]));
48902 +               JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
48903 +               jput(jnodes[i]);
48904 +       }
48905 +
48906 +       /* if nothing was written report why: -EFAULT or -ENOMEM */
48907 +       return (count - left) ? (count - left) : result;
48908 +}
48909 +
48910 +static inline void zero_page(struct page *page)
48911 +{
48912 +       char *kaddr = kmap_atomic(page, KM_USER0);
48913 +
48914 +       memset(kaddr, 0, PAGE_CACHE_SIZE); /* clear the whole page */
48915 +       flush_dcache_page(page);
48916 +       kunmap_atomic(kaddr, KM_USER0);
48917 +       SetPageUptodate(page); /* zeroes are the valid contents */
48918 +       unlock_page(page); /* presumably locked by the caller - confirm */
48919 +}
48920 +
48921 +static int
48922 +do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
48923 +                  struct page *page)
48924 +{
48925 +       jnode *j;
48926 +       struct address_space *mapping;
48927 +       unsigned long index;
48928 +       oid_t oid;
48929 +       reiser4_block_nr block;
48930 +
48931 +       mapping = page->mapping;
48932 +       oid = get_inode_oid(mapping->host);
48933 +       index = page->index;
48934 +
48935 +       switch (state_of_extent(ext)) {
48936 +       case HOLE_EXTENT:
48937 +               /*
48938 +                * it is possible to have hole page with jnode, if page was
48939 +                * eflushed previously. Without a jnode the page is zeroed.
48940 +                */
48941 +               j = jfind(mapping, index);
48942 +               if (j == NULL) {
48943 +                       zero_page(page);
48944 +                       return 0;
48945 +               }
48946 +               spin_lock_jnode(j);
48947 +               if (!jnode_page(j)) {
48948 +                       jnode_attach_page(j, page);
48949 +               } else {
48950 +                       BUG_ON(jnode_page(j) != page);
48951 +                       assert("vs-1504", jnode_page(j) == page);
48952 +               }
48953 +               block = *jnode_get_io_block(j);
48954 +               spin_unlock_jnode(j);
48955 +               if (block == 0) { /* nothing to read from: zero the page */
48956 +                       zero_page(page);
48957 +                       jput(j);
48958 +                       return 0;
48959 +               }
48960 +               break;
48961 +
48962 +       case ALLOCATED_EXTENT:
48963 +               j = jnode_of_page(page);
48964 +               if (IS_ERR(j))
48965 +                       return PTR_ERR(j);
48966 +               if (*jnode_get_block(j) == 0) {
48967 +                       reiser4_block_nr blocknr;
48968 +
48969 +                       blocknr = extent_get_start(ext) + pos;
48970 +                       jnode_set_block(j, &blocknr);
48971 +               } else
48972 +                       assert("vs-1403",
48973 +                              j->blocknr == extent_get_start(ext) + pos);
48974 +               break;
48975 +
48976 +       case UNALLOCATED_EXTENT:
48977 +               j = jfind(mapping, index);
48978 +               assert("nikita-2688", j);
48979 +               assert("vs-1426", jnode_page(j) == NULL);
48980 +
48981 +               spin_lock_jnode(j);
48982 +               jnode_attach_page(j, page);
48983 +               spin_unlock_jnode(j);
48984 +               break;
48985 +
48986 +       default:
48987 +               warning("vs-957", "wrong extent\n");
48988 +               return RETERR(-EIO);
48989 +       }
48990 +
48991 +       BUG_ON(j == 0);
48992 +       page_io(page, j, READ, get_gfp_mask()); /* result is not checked */
48993 +       jput(j);
48994 +       return 0;
48995 +}
48996 +
48997 +static int
48998 +move_coord_pages(coord_t * coord, extent_coord_extension_t * ext_coord,
48999 +                unsigned count)
49000 +{
49001 +       reiser4_extent *ext;
49002 +
49003 +       ext_coord->expected_page += count;
49004 +
49005 +       ext = ext_by_offset(coord->node, ext_coord->ext_offset);
49006 +
49007 +       do {
49008 +               if (ext_coord->pos_in_unit + count < ext_coord->width) {
49009 +                       ext_coord->pos_in_unit += count; /* target is in this unit */
49010 +                       break;
49011 +               }
49012 +
49013 +               if (coord->unit_pos == ext_coord->nr_units - 1) {
49014 +                       coord->between = AFTER_UNIT; /* ran off the last unit */
49015 +                       return 1;
49016 +               }
49017 +
49018 +               /* skip the rest of this unit and shift to the next one */
49019 +               count -= (ext_coord->width - ext_coord->pos_in_unit);
49020 +               coord->unit_pos++;
49021 +               ext_coord->pos_in_unit = 0;
49022 +               ext_coord->ext_offset += sizeof(reiser4_extent);
49023 +               ext++;
49024 +               ON_DEBUG(ext_coord->extent = *ext);
49025 +               ext_coord->width = extent_get_width(ext);
49026 +       } while (1);
49027 +
49028 +       return 0;
49029 +}
49030 +
49031 +static int readahead_readpage_extent(void *vp, struct page *page)
49032 +{
49033 +       int result;
49034 +       uf_coord_t *uf_coord;
49035 +       coord_t *coord;
49036 +       extent_coord_extension_t *ext_coord;
49037 +
49038 +       uf_coord = vp; /* @vp is used as uf_coord_t pointer */
49039 +       coord = &uf_coord->coord;
49040 +
49041 +       if (coord->between != AT_UNIT) {
49042 +               unlock_page(page);
49043 +               return RETERR(-EINVAL);
49044 +       }
49045 +
49046 +       ext_coord = &uf_coord->extension.extent;
49047 +       if (ext_coord->expected_page != page->index) {
49048 +               /* read_cache_pages skipped a few pages. Try to adjust coord to page */
49049 +               assert("vs-1269", page->index > ext_coord->expected_page);
49050 +               if (move_coord_pages
49051 +                   (coord, ext_coord,
49052 +                    page->index - ext_coord->expected_page)) {
49053 +                       /* extent pointing to this page is not in this item */
49054 +                       unlock_page(page);
49055 +                       return RETERR(-EINVAL);
49056 +               }
49057 +
49058 +               assert("vs-1274", offset_is_in_unit(coord,
49059 +                                                   (loff_t) page->
49060 +                                                   index << PAGE_CACHE_SHIFT));
49061 +               ext_coord->expected_page = page->index;
49062 +       }
49063 +
49064 +       assert("vs-1281", page->index == ext_coord->expected_page);
49065 +       result =
49066 +           do_readpage_extent(ext_by_ext_coord(uf_coord),
49067 +                              ext_coord->pos_in_unit, page);
49068 +       if (!result)
49069 +               move_coord_pages(coord, ext_coord, 1); /* expect the next page */
49070 +       return result;
49071 +}
49072 +
49073 +static int move_coord_forward(uf_coord_t *ext_coord)
49074 +{
49075 +       coord_t *coord;
49076 +       extent_coord_extension_t *extension;
49077 +
49078 +       check_uf_coord(ext_coord, NULL);
49079 +
49080 +       extension = &ext_coord->extension.extent;
49081 +       extension->pos_in_unit++;       /* advance by one block within the current unit */
49082 +       if (extension->pos_in_unit < extension->width)
49083 +               /* still within the same extent unit: return 0 */
49084 +               return 0;
49085 +
49086 +       coord = &ext_coord->coord;
49087 +
49088 +       /* the current unit is exhausted: try to move to the next extent unit */
49089 +       coord->unit_pos++;
49090 +       if (coord->unit_pos < extension->nr_units) {
49091 +               /* moved to the next extent unit; refresh the cached unit data */
49092 +               reiser4_extent *ext;
49093 +
49094 +               extension->pos_in_unit = 0;
49095 +               extension->ext_offset += sizeof(reiser4_extent);
49096 +               ext = ext_by_offset(coord->node, extension->ext_offset);
49097 +               ON_DEBUG(extension->extent = *ext);
49098 +               extension->width = extent_get_width(ext);
49099 +               return 0;
49100 +       }
49101 +
49102 +       /* there are no more units in this item: return 1 */
49103 +       return 1;
49104 +}
49105 +
49106 +/* called by read_cache_pages for each readahead page; @data is the hint_t of the read */
49107 +static int extent_readpage_filler(void *data, struct page *page)
49108 +{
49109 +       hint_t *hint;
49110 +       loff_t offset;
49111 +       reiser4_key key;
49112 +       uf_coord_t *ext_coord;
49113 +       int result;
49114 +
49115 +       offset = (loff_t) page->index << PAGE_CACHE_SHIFT;
49116 +       key_by_inode_and_offset_common(page->mapping->host, offset, &key);
49117 +
49118 +       hint = (hint_t *) data;
49119 +       ext_coord = &hint->ext_coord;
49120 +
49121 +       BUG_ON(PageUptodate(page));
49122 +       unlock_page(page);      /* page stays unlocked during the lookup; relocked below */
49123 +
49124 +       if (hint_validate(hint, &key, 1 /* check key */ , ZNODE_READ_LOCK) != 0) {
49125 +               result = coord_by_key(current_tree, &key, &ext_coord->coord,
49126 +                                     ext_coord->lh, ZNODE_READ_LOCK,
49127 +                                     FIND_EXACT, TWIG_LEVEL,
49128 +                                     TWIG_LEVEL, CBK_UNIQUE, NULL);
49129 +               if (result != CBK_COORD_FOUND) {
49130 +                       unset_hint(hint);
49131 +                       goto out;
49132 +               }
49133 +               ext_coord->valid = 0;   /* extension is re-initialized below */
49134 +       }
49135 +
49136 +       if (zload(ext_coord->coord.node)) {
49137 +               unset_hint(hint);
49138 +               result = RETERR(-EIO);
49139 +               goto out;
49140 +       }
49141 +       if (!item_is_extent(&ext_coord->coord)) {
49142 +               /* tail conversion is running in parallel */
49143 +               zrelse(ext_coord->coord.node);
49144 +               unset_hint(hint);
49145 +               result = RETERR(-EIO);
49146 +               goto out;
49147 +       }
49148 +
49149 +       if (ext_coord->valid == 0)
49150 +               init_coord_extension_extent(ext_coord, offset);
49151 +
49152 +       check_uf_coord(ext_coord, &key);
49153 +
49154 +       lock_page(page);
49155 +       if (!PageUptodate(page)) {
49156 +               result = do_readpage_extent(ext_by_ext_coord(ext_coord),
49157 +                                           ext_coord->extension.extent.
49158 +                                           pos_in_unit, page);
49159 +               if (result)
49160 +                       unlock_page(page);
49161 +       } else {        /* page became up to date while it was unlocked */
49162 +               unlock_page(page);
49163 +               result = 0;
49164 +       }
49165 +       if (!result && move_coord_forward(ext_coord) == 0) {
49166 +               set_key_offset(&key, offset + PAGE_CACHE_SIZE); /* hint now refers to the next page */
49167 +               set_hint(hint, &key, ZNODE_READ_LOCK);
49168 +       } else
49169 +               unset_hint(hint);
49170 +       zrelse(ext_coord->coord.node);
49171 +
49172 +out:
49173 +       /* Calls to this function may be intermingled with VM writeback. */
49174 +       txn_restart_current();
49175 +       return result;
49176 +}
49177 +
49178 +/* called by reiser4_readpages; feeds @pages to read_cache_pages with extent_readpage_filler and @data */
49179 +static void
49180 +extent_readpages_hook(struct address_space *mapping, struct list_head *pages,
49181 +                     void *data)
49182 +{
49183 +       /* FIXME: try whether having reiser4_read_cache_pages improves anything */
49184 +       read_cache_pages(mapping, pages, extent_readpage_filler, data);
49185 +}
49186 +
49187 +static int
49188 +call_page_cache_readahead(struct address_space *mapping, struct file *file,
49189 +                         hint_t * hint,
49190 +                         unsigned long page_nr,
49191 +                         unsigned long ra_pages, struct file_ra_state *ra)
49192 +{
49193 +       reiser4_file_fsdata *fsdata;
49194 +       int result;
49195 +
49196 +       fsdata = reiser4_get_file_fsdata(file);
49197 +       if (IS_ERR(fsdata))
49198 +               return page_nr; /* cannot install the hook: hand @page_nr back unchanged */
49199 +       fsdata->ra2.data = hint;        /* passed to the hook as its @data argument (presumably) */
49200 +       fsdata->ra2.readpages = extent_readpages_hook;
49201 +
49202 +       result = page_cache_readahead(mapping, ra, file, page_nr, ra_pages);
49203 +       fsdata->ra2.readpages = NULL;   /* uninstall the hook once readahead has returned */
49204 +       return result;
49205 +}
49206 +
49207 +/* called for a cached page that readahead did not bring up to date; returns 0 or an error */
49208 +static int call_readpage(struct file *file, struct page *page)
49209 +{
49210 +       int result;
49211 +
49212 +       result = readpage_unix_file_nolock(file, page);
49213 +       if (result)
49214 +               return result;
49215 +
49216 +       lock_page(page);        /* recheck the page state under the page lock */
49217 +       if (!PageUptodate(page)) {
49218 +               unlock_page(page);
49219 +               page_detach_jnode(page, page->mapping, page->index);
49220 +               warning("jmacd-97178", "page is not up to date");
49221 +               return RETERR(-EIO);
49222 +       }
49223 +       unlock_page(page);
49224 +       return 0;
49225 +}
49226 +
49227 +static int filler(void *vp, struct page *page)
49228 +{
49229 +       return readpage_unix_file_nolock(vp, page);     /* @vp is the struct file given to read_cache_page */
49230 +}
49231 +
49232 +/* Implements plugin->u.item.s.file.read operation for extent items. */
49233 +int read_extent(struct file *file, flow_t *flow, hint_t *hint)
49234 +{
49235 +       int result;
49236 +       struct page *page;
49237 +       unsigned long cur_page, next_page;
49238 +       unsigned long page_off, count;
49239 +       struct address_space *mapping;
49240 +       loff_t file_off;
49241 +       uf_coord_t *uf_coord;
49242 +       coord_t *coord;
49243 +       extent_coord_extension_t *ext_coord;
49244 +       unsigned long nr_pages, prev_page;
49245 +       struct file_ra_state ra;
49246 +       char *kaddr;
49247 +
49248 +       assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
49249 +       assert("vs-572", flow->user == 1);
49250 +       assert("vs-1351", flow->length > 0);
49251 +
49252 +       uf_coord = &hint->ext_coord;
49253 +
49254 +       check_uf_coord(uf_coord, NULL);
49255 +       assert("vs-33", uf_coord->lh == &hint->lh);
49256 +
49257 +       coord = &uf_coord->coord;
49258 +       assert("vs-1119", znode_is_rlocked(coord->node));
49259 +       assert("vs-1120", znode_is_loaded(coord->node));
49260 +       assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
49261 +
49262 +       mapping = file->f_dentry->d_inode->i_mapping;
49263 +       ext_coord = &uf_coord->extension.extent;
49264 +
49265 +       /* offset in a file to start read from */
49266 +       file_off = get_key_offset(&flow->key);
49267 +       /* offset within the page to start read from */
49268 +       page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
49269 +       /* bytes which can be read from the page which contains file_off */
49270 +       count = PAGE_CACHE_SIZE - page_off;
49271 +
49272 +       /* index of page containing offset read is to start from */
49273 +       cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
49274 +       next_page = cur_page;
49275 +       /* number of pages flow spans over */
49276 +       nr_pages =
49277 +           ((file_off + flow->length + PAGE_CACHE_SIZE -
49278 +             1) >> PAGE_CACHE_SHIFT) - cur_page;
49279 +
49280 +       /* we start having twig node read locked. However, we do not want to
49281 +          keep that lock all the time readahead works. So, set a seal and
49282 +          release the twig node. */
49283 +       set_hint(hint, &flow->key, ZNODE_READ_LOCK);
49284 +       /* &hint->lh is done-ed */
49285 +
49286 +       ra = file->f_ra;        /* work on a local copy; stored back only on success */
49287 +       prev_page = ra.prev_page;
49288 +       do {
49289 +               txn_restart_current();
49290 +               if (next_page == cur_page)
49291 +                       next_page =
49292 +                           call_page_cache_readahead(mapping, file, hint,
49293 +                                                     cur_page, nr_pages, &ra);
49294 +
49295 +               page = find_get_page(mapping, cur_page);
49296 +               if (unlikely(page == NULL)) {
49297 +                       handle_ra_miss(mapping, &ra, cur_page);
49298 +                       page = read_cache_page(mapping, cur_page, filler, file);
49299 +                       if (IS_ERR(page))
49300 +                               return PTR_ERR(page);
49301 +                       lock_page(page);
49302 +                       if (!PageUptodate(page)) {
49303 +                               unlock_page(page);
49304 +                               page_detach_jnode(page, mapping, cur_page);
49305 +                               page_cache_release(page);
49306 +                               warning("jmacd-97178",
49307 +                                       "extent_read: page is not up to date");
49308 +                               return RETERR(-EIO);
49309 +                       }
49310 +                       unlock_page(page);
49311 +               } else {
49312 +                       if (!PageUptodate(page)) {
49313 +                               lock_page(page);
49314 +
49315 +                               assert("", page->mapping == mapping);
49316 +                               if (PageUptodate(page))
49317 +                                       unlock_page(page);
49318 +                               else {
49319 +                                       result = call_readpage(file, page);
49320 +                                       if (result) {
49321 +                                               page_cache_release(page);
49322 +                                               return RETERR(result);
49323 +                                       }
49324 +                               }
49325 +                       }
49326 +                       if (prev_page != cur_page)
49327 +                               mark_page_accessed(page);
49328 +                       prev_page = cur_page;
49329 +               }
49330 +
49331 +               /* If users can be writing to this page using arbitrary virtual
49332 +                  addresses, take care about potential aliasing before reading
49333 +                  the page on the kernel side.
49334 +                */
49335 +               if (mapping_writably_mapped(mapping))
49336 +                       flush_dcache_page(page);
49337 +
49338 +               assert("nikita-3034", schedulable());
49339 +
49340 +               /* number of bytes which are to be read from the page */
49341 +               if (count > flow->length)
49342 +                       count = flow->length;
49343 +
49344 +               result = fault_in_pages_writeable(flow->data, count);
49345 +               if (result) {
49346 +                       page_cache_release(page);
49347 +                       return RETERR(-EFAULT);
49348 +               }
49349 +
49350 +               kaddr = kmap_atomic(page, KM_USER0);
49351 +               result = __copy_to_user_inatomic(flow->data,
49352 +                                              kaddr + page_off, count);
49353 +               kunmap_atomic(kaddr, KM_USER0);
49354 +               if (result != 0) {
49355 +                       kaddr = kmap(page);
49356 +                       result = __copy_to_user(flow->data, kaddr + page_off, count);
49357 +                       kunmap(page);
49358 +               }
49359 +               /* drop the page reference before the error check so that -EFAULT does not leak it */
49360 +               page_cache_release(page);
49361 +               if (unlikely(result))
49362 +                       return RETERR(-EFAULT);
49363 +
49364 +               /* increase key (flow->key), update user area pointer (flow->data) */
49365 +               move_flow_forward(flow, count);
49366 +
49367 +               page_off = 0;
49368 +               cur_page ++;
49369 +               count = PAGE_CACHE_SIZE;
49370 +               nr_pages--;
49371 +       } while (flow->length);
49372 +
49373 +       file->f_ra = ra;
49374 +       return 0;
49375 +}
49376 +
49377 +/*
49378 +  plugin->u.item.s.file.readpages: feeds @pages to read_cache_pages, with @vp as the filler's argument
49379 +*/
49380 +void
49381 +readpages_extent(void *vp, struct address_space *mapping,
49382 +                struct list_head *pages)
49383 +{
49384 +       assert("vs-1739", 0);   /* asserts a constant 0: this path is apparently not expected to run */
49385 +       if (vp)
49386 +               read_cache_pages(mapping, pages, readahead_readpage_extent, vp);
49387 +}
49388 +
49389 +/*
49390 +   plugin->s.file.readpage.  Call chains leading here:
49391 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
49392 +   or
49393 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
49394 +
49395 +   On entry: coord->node is read locked and zloaded, the page is locked and not up to date, and
49396 +   coord is set to an existing unit inside an extent item (coord does not have to match page->index)
49397 +*/
49398 +int readpage_extent(void *vp, struct page *page)
49399 +{
49400 +       uf_coord_t *uf_coord = vp;
49401 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
49402 +       ON_DEBUG(reiser4_key key);
49403 +
49404 +       assert("vs-1040", PageLocked(page));
49405 +       assert("vs-1050", !PageUptodate(page));
49406 +       assert("vs-1039", page->mapping && page->mapping->host);
49407 +
49408 +       assert("vs-1044", znode_is_loaded(coord->node));
49409 +       assert("vs-758", item_is_extent(coord));
49410 +       assert("vs-1046", coord_is_existing_unit(coord));
49411 +       assert("vs-1045", znode_is_rlocked(coord->node));
49412 +       assert("vs-1047",
49413 +              page->mapping->host->i_ino ==
49414 +              get_key_objectid(item_key_by_coord(coord, &key)));
49415 +       check_uf_coord(uf_coord, NULL);
49416 +
49417 +       return do_readpage_extent(ext_by_ext_coord(uf_coord),
49418 +                                 uf_coord->extension.extent.pos_in_unit, page);
49419 +}
49420 +
49421 +/**
49422 + * get_block_address_extent - map a file block to the block number stored in an extent unit
49423 + * @coord: coord of the extent unit; -EINVAL is returned if it is not an existing unit
49424 + * @block: block index within the file; asserted to lie inside the unit at @coord
49425 + * @result: receives the extent start plus the offset of @block in the unit, or 0
49426 + *
49427 + * Returns 0 on success. *@result is set to 0 when the extent is not allocated.
49428 + */
49429 +int get_block_address_extent(const coord_t *coord, sector_t block,
49430 +                            sector_t *result)
49431 +{
49432 +       reiser4_extent *ext;
49433 +
49434 +       if (!coord_is_existing_unit(coord))
49435 +               return RETERR(-EINVAL);
49436 +
49437 +       ext = extent_by_coord(coord);
49438 +
49439 +       if (state_of_extent(ext) != ALLOCATED_EXTENT)
49440 +               /* FIXME: bad things may happen if it is unallocated extent */
49441 +               *result = 0;
49442 +       else {
49443 +               reiser4_key key;
49444 +
49445 +               unit_key_by_coord(coord, &key); /* key offset is the file offset of the unit's first byte */
49446 +               assert("vs-1645",
49447 +                      block >= get_key_offset(&key) >> current_blocksize_bits);
49448 +               assert("vs-1646",
49449 +                      block <
49450 +                      (get_key_offset(&key) >> current_blocksize_bits) +
49451 +                      extent_get_width(ext));
49452 +               *result =
49453 +                   extent_get_start(ext) + (block -
49454 +                                            (get_key_offset(&key) >>
49455 +                                             current_blocksize_bits));
49456 +       }
49457 +       return 0;
49458 +}
49459 +
49460 +/*
49461 +  plugin->u.item.s.file.append_key
49462 +  stores in @key the key of the first byte past the last byte addressed by this extent item; returns @key
49463 +*/
49464 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
49465 +{
49466 +       item_key_by_coord(coord, key);
49467 +       set_key_offset(key,
49468 +                      get_key_offset(key) + extent_size(coord,
49469 +                                                        nr_units_extent
49470 +                                                        (coord)));
49471 +
49472 +       assert("vs-610", get_key_offset(key)
49473 +              && (get_key_offset(key) & (current_blocksize - 1)) == 0);
49474 +       return key;
49475 +}
49476 +
49477 +/* plugin->u.item.s.file.init_coord_extension: fill uf_coord->extension.extent for offset @lookuped */
49478 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
49479 +{
49480 +       coord_t *coord;
49481 +       extent_coord_extension_t *ext_coord;
49482 +       reiser4_key key;
49483 +       loff_t offset;
49484 +
49485 +       assert("vs-1295", uf_coord->valid == 0);
49486 +
49487 +       coord = &uf_coord->coord;
49488 +       assert("vs-1288", coord_is_iplug_set(coord));
49489 +       assert("vs-1327", znode_is_loaded(coord->node));
49490 +
49491 +       if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
49492 +               return; /* extension is left untouched and uf_coord->valid stays 0 */
49493 +
49494 +       ext_coord = &uf_coord->extension.extent;
49495 +       ext_coord->nr_units = nr_units_extent(coord);
49496 +       ext_coord->ext_offset =
49497 +           (char *)extent_by_coord(coord) - zdata(coord->node);
49498 +       ext_coord->width = extent_get_width(extent_by_coord(coord));
49499 +       ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
49500 +       uf_coord->valid = 1;
49501 +
49502 +       /* pos_in_unit is the only field of the extended coord not yet initialized */
49503 +       if (coord->between == AFTER_UNIT) {
49504 +               assert("vs-1330",
49505 +                      coord->unit_pos == nr_units_extent(coord) - 1);
49506 +
49507 +               ext_coord->pos_in_unit = ext_coord->width - 1;
49508 +       } else {
49509 +               /* AT_UNIT: position is the block distance of @lookuped from the unit start */
49510 +               unit_key_by_coord(coord, &key);
49511 +               offset = get_key_offset(&key);
49512 +
49513 +               assert("vs-1328", offset <= lookuped);
49514 +               assert("vs-1329",
49515 +                      lookuped <
49516 +                      offset + ext_coord->width * current_blocksize);
49517 +               ext_coord->pos_in_unit =
49518 +                   ((lookuped - offset) >> current_blocksize_bits);
49519 +       }
49520 +}
49521 +
49522 +/*
49523 + * Local variables:
49524 + * c-indentation-style: "K&R"
49525 + * mode-name: "LC"
49526 + * c-basic-offset: 8
49527 + * tab-width: 8
49528 + * fill-column: 79
49529 + * scroll-step: 1
49530 + * End:
49531 + */
49532 diff --git a/fs/reiser4/plugin/item/extent_flush_ops.c b/fs/reiser4/plugin/item/extent_flush_ops.c
49533 new file mode 100644
49534 index 0000000..66d7c3d
49535 --- /dev/null
49536 +++ b/fs/reiser4/plugin/item/extent_flush_ops.c
49537 @@ -0,0 +1,1018 @@
49538 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49539 +
49540 +#include "item.h"
49541 +#include "../../tree.h"
49542 +#include "../../jnode.h"
49543 +#include "../../super.h"
49544 +#include "../../flush.h"
49545 +#include "../../carry.h"
49546 +#include "../object.h"
49547 +
49548 +#include <linux/pagemap.h>
49549 +
49550 +static reiser4_block_nr extent_unit_start(const coord_t * item);
49551 +
49552 +/* Return either the first or the last extent (depending on @side) of the item
49553 +   @coord is set to. Set *@pos_in_unit to the first (0) or to the last
49554 +   (width - 1) block of that extent, respectively. */
49555 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
49556 +                                        reiser4_block_nr * pos_in_unit)
49557 +{
49558 +       reiser4_extent *ext;
49559 +
49560 +       if (side == LEFT_SIDE) {
49561 +               /* get first extent of item */
49562 +               ext = extent_item(coord);
49563 +               *pos_in_unit = 0;
49564 +       } else {
49565 +               /* get last extent of item and last position within it */
49566 +               assert("vs-363", side == RIGHT_SIDE);
49567 +               ext = extent_item(coord) + coord_last_unit_pos(coord);
49568 +               *pos_in_unit = extent_get_width(ext) - 1;
49569 +       }
49570 +
49571 +       return ext;
49572 +}
49573 +
49574 +/* item_plugin->f.utmost_child */
49575 +/* Return the child in *@childp (NULL for a hole). Coord is set to extent item. Look up the jnode
49576 +   of either the first or the last unformatted node pointed to by the item. Always returns 0. */
49577 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
49578 +{
49579 +       reiser4_extent *ext;
49580 +       reiser4_block_nr pos_in_unit;
49581 +
49582 +       ext = extent_utmost_ext(coord, side, &pos_in_unit);
49583 +
49584 +       switch (state_of_extent(ext)) {
49585 +       case HOLE_EXTENT:
49586 +               *childp = NULL;
49587 +               return 0;
49588 +       case ALLOCATED_EXTENT:
49589 +       case UNALLOCATED_EXTENT:
49590 +               break;
49591 +       default:
49592 +               /* this should never happen; control continues to the lookup below */
49593 +               assert("vs-1417", 0);
49594 +       }
49595 +
49596 +       {
49597 +               reiser4_key key;
49598 +               reiser4_tree *tree;
49599 +               unsigned long index;
49600 +
49601 +               if (side == LEFT_SIDE) {
49602 +                       /* get key of first byte addressed by the extent */
49603 +                       item_key_by_coord(coord, &key);
49604 +               } else {
49605 +                       /* get key of the byte which follows the last byte addressed by the extent */
49606 +                       append_key_extent(coord, &key);
49607 +               }
49608 +
49609 +               assert("vs-544",
49610 +                      (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
49611 +               /* index of first or last (depending on @side) page addressed
49612 +                  by the extent */
49613 +               index =
49614 +                   (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
49615 +               if (side == RIGHT_SIDE)
49616 +                       index--;        /* key is one past the end: step back to the last page */
49617 +
49618 +               tree = coord->node->zjnode.tree;
49619 +               *childp = jlookup(tree, get_key_objectid(&key), index);
49620 +       }
49621 +
49622 +       return 0;
49623 +}
49624 +
49625 +/* item_plugin->f.utmost_child_real_block */
49626 +/* Store in *@block the first or last (per @side) block of the unit at @coord if allocated, else 0. */
49627 +int
49628 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
49629 +                              reiser4_block_nr * block)
49630 +{
49631 +       reiser4_extent *ext;
49632 +
49633 +       ext = extent_by_coord(coord);
49634 +
49635 +       switch (state_of_extent(ext)) {
49636 +       case ALLOCATED_EXTENT:
49637 +               *block = extent_get_start(ext);
49638 +               if (side == RIGHT_SIDE)
49639 +                       *block += extent_get_width(ext) - 1;
49640 +               break;
49641 +       case HOLE_EXTENT:
49642 +       case UNALLOCATED_EXTENT:
49643 +               *block = 0;
49644 +               break;
49645 +       default:
49646 +               /* this should never happen; NOTE(review): *block is not written on this path */
49647 +               assert("vs-1418", 0);
49648 +       }
49649 +
49650 +       return 0;
49651 +}
49652 +
49653 +/* item_plugin->f.scan */
49654 +/* Performs scanning (leftward or rightward) starting from an unformatted node and its parent
49655 +   coordinate.  This scan continues, advancing the parent coordinate, until either it
49656 +   stops early (scan->stop is set) or it finishes scanning the extent units at hand.
49657 +
49658 +   If unallocated, the entire extent must be dirty and in the same atom.  (Actually, I'm
49659 +   not sure that this last property (same atom) is enforced, but it should be the case since
49660 +   one atom must write the parent and the others must read the parent, thus fusing?).  In
49661 +   any case, the code below asserts that the jnode found has a fake block number.  Unallocated
49662 +   extents are thus optimized because we can skip to the endpoint when scanning.
49663 +
49664 +   It returns control to its caller, which handles these terminating conditions, e.g., by
49665 +   loading the next twig.
49666 +*/
49667 +int scan_extent(flush_scan * scan)
49668 +{
49669 +       coord_t coord;
49670 +       jnode *neighbor;
49671 +       unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
49672 +       reiser4_block_nr unit_start;
49673 +       __u64 oid;
49674 +       reiser4_key key;
49675 +       int ret = 0, allocated, incr;
49676 +       reiser4_tree *tree;
49677 +
49678 +       if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
49679 +               scan->stop = 1;
49680 +               return 0;       /* Race with truncate, this node is already
49681 +                                * truncated. */
49682 +       }
49683 +
49684 +       coord_dup(&coord, &scan->parent_coord);
49685 +
49686 +       assert("jmacd-1404", !scan_finished(scan));
49687 +       assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
49688 +       assert("jmacd-1406", jnode_is_unformatted(scan->node));
49689 +
49690 +       /* The scan_index variable corresponds to the current page index of the
49691 +          unformatted block scan position. */
49692 +       scan_index = index_jnode(scan->node);
49693 +
49694 +       assert("jmacd-7889", item_is_extent(&coord));
49695 +
49696 +      repeat:
49697 +       /* objectid of file */
49698 +       oid = get_key_objectid(item_key_by_coord(&coord, &key));
49699 +
49700 +       allocated = !extent_is_unallocated(&coord);
49701 +       /* Get the values of this extent unit: */
49702 +       unit_index = extent_unit_index(&coord);
49703 +       unit_width = extent_unit_width(&coord);
49704 +       unit_start = extent_unit_start(&coord);
49705 +
49706 +       assert("jmacd-7187", unit_width > 0);
49707 +       assert("jmacd-7188", scan_index >= unit_index);
49708 +       assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
49709 +
49710 +       /* Depending on the scan direction, set the bound for scan_index (scan_max), the number
49711 +          of nodes passed if the scan goes the entire way (scan_dist) and the step of scan_index
49712 +          (incr).  NOTE(review): the rightward scan_dist is measured from unit_index, not from
49713 +          scan_index as in the leftward case -- confirm that this is intended. */
49714 +       if (scanning_left(scan)) {
49715 +               scan_max = unit_index;
49716 +               scan_dist = scan_index - unit_index;
49717 +               incr = -1;
49718 +       } else {
49719 +               scan_max = unit_index + unit_width - 1;
49720 +               scan_dist = scan_max - unit_index;
49721 +               incr = +1;
49722 +       }
49723 +
49724 +       tree = coord.node->zjnode.tree;
49725 +
49726 +       /* If the extent is allocated we have to check each of its blocks.  If the extent
49727 +          is unallocated we can skip to the scan_max. */
49728 +       if (allocated) {
49729 +               do {
49730 +                       neighbor = jlookup(tree, oid, scan_index);
49731 +                       if (neighbor == NULL)
49732 +                               goto stop_same_parent;
49733 +
49734 +                       if (scan->node != neighbor
49735 +                           && !scan_goto(scan, neighbor)) {
49736 +                               /* @neighbor was jput() by scan_goto(). */
49737 +                               goto stop_same_parent;
49738 +                       }
49739 +
49740 +                       ret = scan_set_current(scan, neighbor, 1, &coord);
49741 +                       if (ret != 0) {
49742 +                               goto exit;
49743 +                       }
49744 +
49745 +                       /* reference to @neighbor is stored in @scan, no need
49746 +                          to jput(). */
49747 +                       scan_index += incr;
49748 +
49749 +               } while (incr + scan_max != scan_index);        /* until one step past scan_max */
49750 +
49751 +       } else {
49752 +               /* Optimized case for unallocated extents, skip to the end. */
49753 +               neighbor = jlookup(tree, oid, scan_max /*index */ );
49754 +               if (neighbor == NULL) {
49755 +                       /* Race with truncate */
49756 +                       scan->stop = 1;
49757 +                       ret = 0;
49758 +                       goto exit;
49759 +               }
49760 +
49761 +               assert("zam-1043", blocknr_is_fake(jnode_get_block(neighbor)));
49762 +
49763 +               ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49764 +               if (ret != 0) {
49765 +                       goto exit;
49766 +               }
49767 +       }
49768 +
49769 +       if (coord_sideof_unit(&coord, scan->direction) == 0
49770 +           && item_is_extent(&coord)) {
49771 +               /* Continue as long as there are more extent units. */
49772 +
49773 +               scan_index =
49774 +                   extent_unit_index(&coord) +
49775 +                   (scanning_left(scan) ? extent_unit_width(&coord) - 1 : 0);
49776 +               goto repeat;
49777 +       }
49778 +
49779 +       if (0) {        /* this body is entered only through the stop_same_parent label */
49780 +             stop_same_parent:
49781 +
49782 +               /* If we are scanning left and we stop in the middle of an allocated
49783 +                  extent, we know the preceder immediately.. */
49784 +               /* middle of extent is (scan_index - unit_index) != 0. */
49785 +               if (scanning_left(scan) && (scan_index - unit_index) != 0) {
49786 +                       /* FIXME(B): Someone should step-through and verify that this preceder
49787 +                          calculation is indeed correct. */
49788 +                       /* @unit_start is starting block (number) of extent
49789 +                          unit. Flush stopped at the @scan_index block from
49790 +                          the beginning of the file, which is (scan_index -
49791 +                          unit_index) block within extent.
49792 +                        */
49793 +                       if (unit_start) {
49794 +                               /* skip preceder update when we are at hole */
49795 +                               scan->preceder_blk =
49796 +                                   unit_start + scan_index - unit_index;
49797 +                               check_preceder(scan->preceder_blk);
49798 +                       }
49799 +               }
49800 +
49801 +               /* In this case, we leave coord set to the parent of scan->node. */
49802 +               scan->stop = 1;
49803 +
49804 +       } else {
49805 +               /* In this case, we are still scanning, coord is set to the next item which is
49806 +                  either off-the-end of the node or not an extent. */
49807 +               assert("jmacd-8912", scan->stop == 0);
49808 +               assert("jmacd-7812",
49809 +                      (coord_is_after_sideof_unit(&coord, scan->direction)
49810 +                       || !item_is_extent(&coord)));
49811 +       }
49812 +
49813 +       ret = 0;
49814 +      exit:
49815 +       return ret;
49816 +}
49817 +
49818 +/* ask block allocator for @wanted_count blocks; results go to *@first_allocated and *@allocated */
49819 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49820 +                                  reiser4_block_nr wanted_count,
49821 +                                  reiser4_block_nr *first_allocated,
49822 +                                  reiser4_block_nr *allocated,
49823 +                                  block_stage_t block_stage)
49824 +{
49825 +       *allocated = wanted_count;
49826 +       preceder->max_dist = 0; /* scan whole disk, if needed */
49827 +
49828 +       /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49829 +       preceder->block_stage = block_stage;
49830 +
49831 +       /* FIXME: we do not handle errors here now */
49832 +       check_me("vs-420",
49833 +                reiser4_alloc_blocks(preceder, first_allocated, allocated,
49834 +                                     BA_PERMANENT) == 0);
49835 +       /* update the preceder hint to the last allocated block number */
49836 +       preceder->blk = *first_allocated + *allocated - 1;
49837 +}
49838 +
49839 +/* When, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one unallocated
49840 +   extent has to be replaced with a set of allocated extents. In this case insert_into_item will be called, which may have
49841 +   to add new nodes into the tree. Space for that is taken from the inviolable reserve (5%). */
49842 +static reiser4_block_nr reserve_replace(void)
49843 +{
49844 +       reiser4_block_nr grabbed, needed;
49845 +
49846 +       grabbed = get_current_context()->grabbed_blocks;
49847 +       needed = estimate_one_insert_into_item(current_tree);
49848 +       check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49849 +       return grabbed;
49850 +}
49851 +
49852 +static void free_replace_reserved(reiser4_block_nr grabbed)
49853 +{
49854 +       reiser4_context *ctx;
49855 +
49856 +       ctx = get_current_context();
49857 +       grabbed2free(ctx, get_super_private(ctx->super),
49858 +                    ctx->grabbed_blocks - grabbed);
49859 +}
49860 +
49861 +/* Block offset of first block addressed by unit */
49862 +__u64 extent_unit_index(const coord_t * item)
49863 +{
49864 +       reiser4_key key;
49865 +
49866 +       assert("vs-648", coord_is_existing_unit(item));
49867 +       unit_key_by_coord(item, &key);
49868 +       return get_key_offset(&key) >> current_blocksize_bits;
49869 +}
49870 +
49871 +/* AUDIT shouldn't return value be of reiser4_block_nr type?
49872 +   Josh's answer: who knows?  Is a "number of blocks" the same type as "block offset"? */
49873 +__u64 extent_unit_width(const coord_t * item)
49874 +{
49875 +       assert("vs-649", coord_is_existing_unit(item));
49876 +       return width_by_coord(item);
49877 +}
49878 +
49879 +/* Starting block location of this unit */
49880 +static reiser4_block_nr extent_unit_start(const coord_t * item)
49881 +{
49882 +       return extent_get_start(extent_by_coord(item));
49883 +}
49884 +
49885 +/**
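49886 + * split_allocated_extent - split an allocated extent unit in two
49887 + * @coord: coord of the allocated extent unit to split
49888 + * @pos_in_unit: width, in blocks, of the first of the two resulting extents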
49889 + *
49890 + * replace allocated extent with two allocated extents
49891 + */
49892 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49893 +{
49894 +       int result;
49895 +       struct replace_handle *h;
49896 +       reiser4_extent *ext;
49897 +       reiser4_block_nr grabbed;
49898 +
49899 +       ext = extent_by_coord(coord);
49900 +       assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49901 +       assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49902 +
49903 +       h = kmalloc(sizeof(*h), get_gfp_mask());
49904 +       if (h == NULL)
49905 +               return RETERR(-ENOMEM);
49906 +       h->coord = coord;
49907 +       h->lh = znode_lh(coord->node);
49908 +       h->pkey = &h->key;
49909 +       unit_key_by_coord(coord, h->pkey);
49910 +       set_key_offset(h->pkey,
49911 +                      (get_key_offset(h->pkey) +
49912 +                       pos_in_unit * current_blocksize));
49913 +       set_extent(&h->overwrite, extent_get_start(ext), pos_in_unit);
49914 +       set_extent(&h->new_extents[0], extent_get_start(ext) + pos_in_unit,
49915 +                  extent_get_width(ext) - pos_in_unit);
49916 +       h->nr_new_extents = 1;
49917 +       h->flags = COPI_DONT_SHIFT_LEFT;
49918 +       h->paste_key = h->key;
49919 +
49920 +       /* reserve space for extent unit paste, @grabbed is reserved before */
49921 +       grabbed = reserve_replace();
49922 +       result = replace_extent(h, 0 /* leave @coord set to overwritten
49923 +                                       extent */);
49924 +       /* restore reserved */
49925 +       free_replace_reserved(grabbed);
49926 +       kfree(h);
49927 +       return result;
49928 +}
49929 +
49930 +/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
49931 +   one). Return 1 if it succeeded, 0 - otherwise */
49932 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
49933 +                      reiser4_extent *replace)
49934 +{
49935 +       assert("vs-1415", extent_by_coord(coord) == ext);
49936 +
49937 +       if (coord->unit_pos == 0
49938 +           || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
49939 +               /* left neighbor either does not exist or is not allocated extent */
49940 +               return 0;
49941 +       if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
49942 +           extent_get_start(replace))
49943 +               return 0;
49944 +
49945 +       /* we can glue, widen previous unit */
49946 +       extent_set_width(ext - 1,
49947 +                        extent_get_width(ext - 1) + extent_get_width(replace));
49948 +
49949 +       if (extent_get_width(ext) != extent_get_width(replace)) {
49950 +               /* make current extent narrower */
49951 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
49952 +                       extent_set_start(ext,
49953 +                                        extent_get_start(ext) +
49954 +                                        extent_get_width(replace));
49955 +               extent_set_width(ext,
49956 +                                extent_get_width(ext) -
49957 +                                extent_get_width(replace));
49958 +       } else {
49959 +               /* current extent completely glued with its left neighbor, remove it */
49960 +               coord_t from, to;
49961 +
49962 +               coord_dup(&from, coord);
49963 +               from.unit_pos = nr_units_extent(coord) - 1;
49964 +               coord_dup(&to, &from);
49965 +
49966 +               /* currently cut from extent can cut either from the beginning or from the end. Move place which got
49967 +                  freed after unit removal to end of item */
49968 +               memmove(ext, ext + 1,
49969 +                       (from.unit_pos -
49970 +                        coord->unit_pos) * sizeof(reiser4_extent));
49971 +               /* wipe part of item which is going to be cut, so that node_check will not be confused */
49972 +               cut_node_content(&from, &to, NULL, NULL, NULL);
49973 +       }
49974 +       znode_make_dirty(coord->node);
49975 +       /* move coord back */
49976 +       coord->unit_pos--;
49977 +       return 1;
49978 +}
49979 +
49980 +/**
49981 + * conv_extent - overwrite extent, padding it with a second one if needed
49982 + * @coord: coordinate of extent to be replaced
49983 + * @replace: extent to overwrite the one @coord is set to
49984 + *
49985 + * Overwrites extent @coord is set to and pastes one extent unit after
49986 + * overwritten one if @replace is shorter than initial extent
49987 + */
49988 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
49989 +{
49990 +       int result;
49991 +       struct replace_handle *h;
49992 +       reiser4_extent *ext;
49993 +       reiser4_block_nr start, width, new_width;
49994 +       reiser4_block_nr grabbed;
49995 +       extent_state state;
49996 +
49997 +       ext = extent_by_coord(coord);
49998 +       state = state_of_extent(ext);
49999 +       start = extent_get_start(ext);
50000 +       width = extent_get_width(ext);
50001 +       new_width = extent_get_width(replace);
50002 +
50003 +       assert("vs-1458", (state == UNALLOCATED_EXTENT ||
50004 +                          state == ALLOCATED_EXTENT));
50005 +       assert("vs-1459", width >= new_width);
50006 +
50007 +       if (try_to_merge_with_left(coord, ext, replace)) {
50008 +               /* merged @replace with left neighbor. Current unit is either
50009 +                  removed or narrowed */
50010 +               return 0;
50011 +       }
50012 +
50013 +       if (width == new_width) {
50014 +               /* replace current extent with @replace */
50015 +               *ext = *replace;
50016 +               znode_make_dirty(coord->node);
50017 +               return 0;
50018 +       }
50019 +
50020 +       h = kmalloc(sizeof(*h), get_gfp_mask());
50021 +       if (h == NULL)
50022 +               return RETERR(-ENOMEM);
50023 +       h->coord = coord;
50024 +       h->lh = znode_lh(coord->node);
50025 +       h->pkey = &h->key;
50026 +       unit_key_by_coord(coord, h->pkey);
50027 +       set_key_offset(h->pkey,
50028 +                      (get_key_offset(h->pkey) + new_width * current_blocksize));
50029 +       h->overwrite = *replace;
50030 +
50031 +       /* replace @ext with @replace and padding extent */
50032 +       set_extent(&h->new_extents[0],
50033 +                  (state == ALLOCATED_EXTENT) ? (start + new_width) : UNALLOCATED_EXTENT_START,
50034 +                  width - new_width);
50035 +       h->nr_new_extents = 1;
50036 +       h->flags = COPI_DONT_SHIFT_LEFT;
50037 +       h->paste_key = h->key;
50038 +
50039 +       /* reserve space for extent unit paste, @grabbed is reserved before */
50040 +       grabbed = reserve_replace();
50041 +       result = replace_extent(h, 0 /* leave @coord set to overwritten
50042 +                                       extent */);
50043 +
50044 +       /* restore reserved */
50045 +       free_replace_reserved(grabbed);
50046 +       kfree(h);
50047 +       return result;
50048 +}
50049 +
50050 +/**
50051 + * assign_real_blocknrs - give jnodes their newly allocated block numbers
50052 + * @flush_pos: flush position; its flush queue (->fq) is used for the jnodes
50053 + * @oid: objectid of the file the jnodes belong to
50054 + * @index: first jnode on the range
50055 + * @count: number of jnodes to assign block numbers to
50056 + * @first: start of allocated block range
50057 + *
50058 + * Assigns block numbers to each of @count jnodes. Index of first jnode is
50059 + * @index. Jnodes are looked up with jlookup.
50060 + */
50061 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
50062 +                                unsigned long index, reiser4_block_nr count,
50063 +                                reiser4_block_nr first)
50064 +{
50065 +       unsigned long i;
50066 +       reiser4_tree *tree;
50067 +       txn_atom *atom;
50068 +       int nr;
50069 +
50070 +       atom = atom_locked_by_fq(flush_pos->fq);
50071 +       assert("vs-1468", atom);
50072 +       BUG_ON(atom == NULL);
50073 +
50074 +       nr = 0;
50075 +       tree = current_tree;
50076 +       for (i = 0; i < count; ++i, ++index) {
50077 +               jnode *node;
50078 +
50079 +               node = jlookup(tree, oid, index);
50080 +               assert("", node != NULL);
50081 +               BUG_ON(node == NULL);
50082 +
50083 +               spin_lock_jnode(node);
50084 +               assert("", !jnode_is_flushprepped(node));
50085 +               assert("vs-1475", node->atom == atom);
50086 +               assert("vs-1476", atomic_read(&node->x_count) > 0);
50087 +
50088 +               JF_CLR(node, JNODE_FLUSH_RESERVED);
50089 +               jnode_set_block(node, &first);
50090 +               unformatted_make_reloc(node, flush_pos->fq);
50091 +               ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
50092 +                                    FQ_LIST, 0));
50093 +               spin_unlock_jnode(node);
50094 +               first++;
50095 +
50096 +               atomic_dec(&node->x_count);
50097 +               nr ++;
50098 +       }
50099 +
50100 +       spin_unlock_atom(atom);
50101 +       return;
50102 +}
50103 +
50104 +/**
50105 + * make_node_ovrwr - assign node to overwrite set
50106 + * @jnodes: overwrite set list head
50107 + * @node: jnode to belong to overwrite set
50108 + *
50109 + * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
50110 + * which is an accumulator for nodes before they get to overwrite set list of
50111 + * atom.
50112 + */
50113 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
50114 +{
50115 +       spin_lock_jnode(node);
50116 +
50117 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
50118 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
50119 +
50120 +       JF_SET(node, JNODE_OVRWR);
50121 +       list_move_tail(&node->capture_link, jnodes);
50122 +       ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
50123 +
50124 +       spin_unlock_jnode(node);
50125 +}
50126 +
50127 +/**
50128 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
50129 + * @flush_pos: flush position
50130 + * @oid: objectid of file jnodes belong to
50131 + * @index: starting index
50132 + * @width: extent width
50133 + *
50134 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
50135 + * overwrite set, starting from the one with index @index. If end of slum is
50136 + * detected (node is not found, is flushprepped or is in another atom) - stop
50137 + * iterating and set flush position's state to POS_INVALID.
50138 + */
50139 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
50140 +                                 unsigned long index, reiser4_block_nr width)
50141 +{
50142 +       unsigned long i;
50143 +       reiser4_tree *tree;
50144 +       jnode *node;
50145 +       txn_atom *atom;
50146 +       LIST_HEAD(jnodes);
50147 +
50148 +       tree = current_tree;
50149 +
50150 +       atom = atom_locked_by_fq(pos_fq(flush_pos));
50151 +       assert("vs-1478", atom);
50152 +
50153 +       for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
50154 +               node = jlookup(tree, oid, index);
50155 +               if (!node) {
50156 +                       flush_pos->state = POS_INVALID;
50157 +                       break;
50158 +               }
50159 +               if (jnode_check_flushprepped(node)) {
50160 +                       flush_pos->state = POS_INVALID;
50161 +                       atomic_dec(&node->x_count);
50162 +                       break;
50163 +               }
50164 +               if (node->atom != atom) {
50165 +                       flush_pos->state = POS_INVALID;
50166 +                       atomic_dec(&node->x_count);
50167 +                       break;
50168 +               }
50169 +               make_node_ovrwr(&jnodes, node);
50170 +               atomic_dec(&node->x_count);
50171 +       }
50172 +
50173 +       list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
50174 +       spin_unlock_atom(atom);
50175 +}
50176 +
50177 +/**
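50178 + * allocated_extent_slum_size - measure slum at start of a jnode range
50179 + * @flush_pos: flush position; its flush queue determines the atom to match
50180 + * @oid: objectid of the file the jnodes belong to
50181 + * @index: index of the first jnode to check
50182 + * @count: maximal number of jnodes to check
50183 + *
50184 + * Returns number of leading jnodes found, not flushprepped and in fq's atom.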
50185 + */
50186 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
50187 +                                     unsigned long index, unsigned long count)
50188 +{
50189 +       unsigned long i;
50190 +       reiser4_tree *tree;
50191 +       txn_atom *atom;
50192 +       int nr;
50193 +
50194 +       atom = atom_locked_by_fq(pos_fq(flush_pos));
50195 +       assert("vs-1468", atom);
50196 +
50197 +       nr = 0;
50198 +       tree = current_tree;
50199 +       for (i = 0; i < count; ++i, ++index) {
50200 +               jnode *node;
50201 +
50202 +               node = jlookup(tree, oid, index);
50203 +               if (!node)
50204 +                       break;
50205 +
50206 +               if (jnode_check_flushprepped(node)) {
50207 +                       atomic_dec(&node->x_count);
50208 +                       break;
50209 +               }
50210 +
50211 +               if (node->atom != atom) {
50212 +                       /*
50213 +                        * this is possible on overwrite: extent_write may
50214 +                        * capture several unformatted nodes without capturing
50215 +                        * any formatted nodes.
50216 +                        */
50217 +                       atomic_dec(&node->x_count);
50218 +                       break;
50219 +               }
50220 +
50221 +               assert("vs-1476", atomic_read(&node->x_count) > 1);
50222 +               atomic_dec(&node->x_count);
50223 +               nr ++;
50224 +       }
50225 +
50226 +       spin_unlock_atom(atom);
50227 +       return nr;
50228 +}
50229 +
50230 +/**
50231 + * alloc_extent - prepare slum of an extent unit for flushing
50232 + * @flush_pos: flush position; ->coord is set to the extent unit to process
50233 + *
50234 + * this is called by handle_pos_on_twig to process the extent unit
50235 + * flush_pos->coord is set to. It is to prepare for flushing sequence of not
50236 + * flushprepped nodes (slum). It supposes that slum starts at
50237 + * flush_pos->pos_in_unit position within the extent. Slum gets to relocate set
50238 + * if flush_pos->leaf_relocate is set or the extent is unallocated, and to
50239 + * overwrite set otherwise
50240 + */
50241 +int alloc_extent(flush_pos_t *flush_pos)
50242 +{
50243 +       coord_t *coord;
50244 +       reiser4_extent *ext;
50245 +       reiser4_extent replace_ext;
50246 +       oid_t oid;
50247 +       reiser4_block_nr protected;
50248 +       reiser4_block_nr start;
50249 +       __u64 index;
50250 +       __u64 width;
50251 +       extent_state state;
50252 +       int result;
50253 +       reiser4_block_nr first_allocated;
50254 +       __u64 allocated;
50255 +       reiser4_key key;
50256 +       block_stage_t block_stage;
50257 +
50258 +       assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
50259 +       assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
50260 +              && item_is_extent(&flush_pos->coord));
50261 +
50262 +       coord = &flush_pos->coord;
50263 +
50264 +       ext = extent_by_coord(coord);
50265 +       state = state_of_extent(ext);
50266 +       if (state == HOLE_EXTENT) {
50267 +               flush_pos->state = POS_INVALID;
50268 +               return 0;
50269 +       }
50270 +
50271 +       item_key_by_coord(coord, &key);
50272 +       oid = get_key_objectid(&key);
50273 +       index = extent_unit_index(coord) + flush_pos->pos_in_unit;
50274 +       start = extent_get_start(ext);
50275 +       width = extent_get_width(ext);
50276 +
50277 +       assert("vs-1457", width > flush_pos->pos_in_unit);
50278 +
50279 +       if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
50280 +               /* relocate */
50281 +               if (flush_pos->pos_in_unit) {
50282 +                       /* split extent unit into two */
50283 +                       result =
50284 +                           split_allocated_extent(coord,
50285 +                                                  flush_pos->pos_in_unit);
50286 +                       flush_pos->pos_in_unit = 0;
50287 +                       return result;
50288 +               }
50289 +
50290 +               /* limit number of nodes to allocate */
50291 +               if (flush_pos->nr_to_write < width)
50292 +                       width = flush_pos->nr_to_write;
50293 +
50294 +               if (state == ALLOCATED_EXTENT) {
50295 +                       /*
50296 +                        * all protected nodes are not flushprepped, therefore
50297 +                        * they are counted as flush_reserved
50298 +                        */
50299 +                       block_stage = BLOCK_FLUSH_RESERVED;
50300 +                       protected = allocated_extent_slum_size(flush_pos, oid,
50301 +                                                              index, width);
50302 +                       if (protected == 0) {
50303 +                               flush_pos->state = POS_INVALID;
50304 +                               flush_pos->pos_in_unit = 0;
50305 +                               return 0;
50306 +                       }
50307 +               } else {
50308 +                       block_stage = BLOCK_UNALLOCATED;
50309 +                       protected = width;
50310 +               }
50311 +
50312 +               /*
50313 +                * look at previous unit if possible. If it is allocated, make
50314 +                * preceder more precise
50315 +                */
50316 +               if (coord->unit_pos &&
50317 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50318 +                       pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
50319 +                               extent_get_width(ext - 1);
50320 +
50321 +               /* allocate new block numbers for protected nodes */
50322 +               extent_allocate_blocks(pos_hint(flush_pos), protected,
50323 +                                      &first_allocated, &allocated,
50324 +                                      block_stage);
50325 +
50326 +               if (state == ALLOCATED_EXTENT)
50327 +                       /*
50328 +                        * on relocating - free nodes which are going to be
50329 +                        * relocated
50330 +                        */
50331 +                       reiser4_dealloc_blocks(&start, &allocated,
50332 +                                              BLOCK_ALLOCATED, BA_DEFER);
50333 +
50334 +               /* assign new block numbers to protected nodes */
50335 +               assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
50336 +
50337 +
50338 +               /* prepare extent which will replace current one */
50339 +               set_extent(&replace_ext, first_allocated, allocated);
50340 +
50341 +               /* adjust extent item */
50342 +               result = conv_extent(coord, &replace_ext);
50343 +               if (result != 0 && result != -ENOMEM) {
50344 +                       warning("vs-1461",
50345 +                               "Failed to allocate extent. Should not happen\n");
50346 +                       return result;
50347 +               }
50348 +
50349 +               /*
50350 +                * break flush: we prepared for flushing as many blocks as we
50351 +                * were asked for
50352 +                */
50353 +               if (flush_pos->nr_to_write == allocated)
50354 +                       flush_pos->state = POS_INVALID;
50355 +       } else {
50356 +               /* overwrite */
50357 +               mark_jnodes_overwrite(flush_pos, oid, index, width);
50358 +       }
50359 +       flush_pos->pos_in_unit = 0;
50360 +       return 0;
50361 +}
50362 +
50363 +/* returns 0 if @key is glueable to the item @coord is set to, 1 otherwise */
50364 +static int must_insert(const coord_t *coord, const reiser4_key *key)
50365 +{
50366 +       reiser4_key last;
50367 +
50368 +       if (item_id_by_coord(coord) == EXTENT_POINTER_ID
50369 +           && keyeq(append_key_extent(coord, &last), key))
50370 +               return 0;
50371 +       return 1;
50372 +}
50373 +
50374 +/* copy extent @copy_ext to the end of @node. It may have to either insert new item after the last one, or append to last item,
50375 +   or modify last unit of last item to have greater width */
50376 +static int put_unit_to_end(znode *node, const reiser4_key *key,
50377 +                          reiser4_extent *copy_ext)
50378 +{
50379 +       int result;
50380 +       coord_t coord;
50381 +       cop_insert_flag flags;
50382 +       reiser4_extent *last_ext;
50383 +       reiser4_item_data data;
50384 +
50385 +       /* set coord after last unit in an item */
50386 +       coord_init_last_unit(&coord, node);
50387 +       coord.between = AFTER_UNIT;
50388 +
50389 +       flags =
50390 +           COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
50391 +       if (must_insert(&coord, key)) {
50392 +               result =
50393 +                   insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
50394 +                                   key, NULL /*lh */ , flags);
50395 +
50396 +       } else {
50397 +               /* try to glue with last unit */
50398 +               last_ext = extent_by_coord(&coord);
50399 +               if (state_of_extent(last_ext) &&
50400 +                   extent_get_start(last_ext) + extent_get_width(last_ext) ==
50401 +                   extent_get_start(copy_ext)) {
50402 +                       /* widen last unit of node */
50403 +                       extent_set_width(last_ext,
50404 +                                        extent_get_width(last_ext) +
50405 +                                        extent_get_width(copy_ext));
50406 +                       znode_make_dirty(node);
50407 +                       return 0;
50408 +               }
50409 +
50410 +               /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
50411 +               result =
50412 +                   insert_into_item(&coord, NULL /*lh */ , key,
50413 +                                    init_new_extent(&data, copy_ext, 1),
50414 +                                    flags);
50415 +       }
50416 +
50417 +       assert("vs-438", result == 0 || result == -E_NODE_FULL);
50418 +       return result;
50419 +}
50420 +
50421 +/* @coord is set to extent unit */
50422 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
50423 +                              flush_pos_t *flush_pos,
50424 +                              reiser4_key *stop_key)
50425 +{
50426 +       reiser4_extent *ext;
50427 +       __u64 index;
50428 +       __u64 width;
50429 +       reiser4_block_nr start;
50430 +       extent_state state;
50431 +       oid_t oid;
50432 +       reiser4_block_nr first_allocated;
50433 +       __u64 allocated;
50434 +       __u64 protected;
50435 +       reiser4_extent copy_extent;
50436 +       reiser4_key key;
50437 +       int result;
50438 +       block_stage_t block_stage;
50439 +
50440 +       assert("vs-1457", flush_pos->pos_in_unit == 0);
50441 +       assert("vs-1467", coord_is_leftmost_unit(coord));
50442 +       assert("vs-1467", item_is_extent(coord));
50443 +
50444 +       ext = extent_by_coord(coord);
50445 +       index = extent_unit_index(coord);
50446 +       start = extent_get_start(ext);
50447 +       width = extent_get_width(ext);
50448 +       state = state_of_extent(ext);
50449 +       unit_key_by_coord(coord, &key);
50450 +       oid = get_key_objectid(&key);
50451 +
50452 +       if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
50453 +           (state == UNALLOCATED_EXTENT)) {
50454 +               /* relocate */
50455 +               if (state == ALLOCATED_EXTENT) {
50456 +                       /* all protected nodes are not flushprepped, therefore
50457 +                        * they are counted as flush_reserved */
50458 +                       block_stage = BLOCK_FLUSH_RESERVED;
50459 +                       protected = allocated_extent_slum_size(flush_pos, oid,
50460 +                                                              index, width);
50461 +                       if (protected == 0) {
50462 +                               flush_pos->state = POS_INVALID;
50463 +                               flush_pos->pos_in_unit = 0;
50464 +                               return 0;
50465 +                       }
50466 +               } else {
50467 +                       block_stage = BLOCK_UNALLOCATED;
50468 +                       protected = width;
50469 +               }
50470 +
50471 +               /*
50472 +                * look at previous unit if possible. If it is allocated, make
50473 +                * preceder more precise
50474 +                */
50475 +               if (coord->unit_pos &&
50476 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50477 +                       pos_hint(flush_pos)->blk = extent_get_start(ext - 1) +
50478 +                               extent_get_width(ext - 1);
50479 +
50480 +               /* allocate new block numbers for protected nodes */
50481 +               extent_allocate_blocks(pos_hint(flush_pos), protected,
50482 +                                      &first_allocated, &allocated,
50483 +                                      block_stage);
50484 +
50485 +               /* prepare extent which will be copied to left */
50486 +               set_extent(&copy_extent, first_allocated, allocated);
50487 +
50488 +               result = put_unit_to_end(left, &key, &copy_extent);
50489 +               if (result == -E_NODE_FULL) {
50490 +                       int target_block_stage;
50491 +
50492 +                       /* free blocks which were just allocated */
50493 +                       target_block_stage =
50494 +                           (state ==
50495 +                            ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
50496 +                           BLOCK_UNALLOCATED;
50497 +                       reiser4_dealloc_blocks(&first_allocated, &allocated,
50498 +                                              target_block_stage,
50499 +                                              BA_PERMANENT);
50500 +
50501 +                       /* rewind the preceder. */
50502 +                       flush_pos->preceder.blk = first_allocated;
50503 +                       check_preceder(flush_pos->preceder.blk);
50504 +
50505 +                       return SQUEEZE_TARGET_FULL;
50506 +               }
50507 +
50508 +               if (state == ALLOCATED_EXTENT) {
50509 +                       /* free nodes which were relocated */
50510 +                       reiser4_dealloc_blocks(&start, &allocated,
50511 +                                              BLOCK_ALLOCATED, BA_DEFER);
50512 +               }
50513 +
50514 +               /* assign new block numbers to protected nodes */
50515 +               assign_real_blocknrs(flush_pos, oid, index, allocated,
50516 +                                    first_allocated);
50517 +
50518 +               set_key_offset(&key,
50519 +                              get_key_offset(&key) +
50520 +                              (allocated << current_blocksize_bits));
50521 +       } else {
50522 +               /*
50523 +                * overwrite: try to copy unit as it is to left neighbor and
50524 +                * make all first not flushprepped nodes overwrite nodes
50525 +                */
50526 +               set_extent(&copy_extent, start, width);
50527 +               result = put_unit_to_end(left, &key, &copy_extent);
50528 +               if (result == -E_NODE_FULL)
50529 +                       return SQUEEZE_TARGET_FULL;
50530 +
50531 +               if (state != HOLE_EXTENT)
50532 +                       mark_jnodes_overwrite(flush_pos, oid, index, width);
50533 +               set_key_offset(&key,
50534 +                              get_key_offset(&key) +
50535 +                              (width << current_blocksize_bits));
50536 +       }
50537 +       *stop_key = key;
50538 +       return SQUEEZE_CONTINUE;
50539 +}
50540 +
50541 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
50542 +{
50543 +       return key_by_inode_and_offset_common(inode, off, key);
50544 +}
50545 +
50546 +/*
50547 + * Local variables:
50548 + * c-indentation-style: "K&R"
50549 + * mode-name: "LC"
50550 + * c-basic-offset: 8
50551 + * tab-width: 8
50552 + * fill-column: 79
50553 + * scroll-step: 1
50554 + * End:
50555 + */
50556 diff --git a/fs/reiser4/plugin/item/extent_item_ops.c b/fs/reiser4/plugin/item/extent_item_ops.c
50557 new file mode 100644
50558 index 0000000..3ffce65
50559 --- /dev/null
50560 +++ b/fs/reiser4/plugin/item/extent_item_ops.c
50561 @@ -0,0 +1,882 @@
50562 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50563 +
50564 +#include "item.h"
50565 +#include "../../inode.h"
50566 +#include "../../tree_walk.h"   /* check_sibling_list() */
50567 +#include "../../page_cache.h"
50568 +#include "../../carry.h"
50569 +
50570 +#include <linux/quotaops.h>
50571 +
50572 +/* item_plugin->b.max_key_inside: fills @key with the key of item @coord, replaces its offset with the offset of max_key() and returns @key */
50573 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50574 +{
50575 +       item_key_by_coord(coord, key);
50576 +       set_key_offset(key, get_key_offset(max_key()));
50577 +       return key;
50578 +}
50579 +
50580 +/* item_plugin->b.can_contain_key
50581 +   returns 1 if item at @coord has the plugin @data->iplug and @key has the same locality, objectid and ordering as the item key; returns 0 otherwise */
50582 +int
50583 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50584 +                      const reiser4_item_data * data)
50585 +{
50586 +       reiser4_key item_key;
50587 +
50588 +       if (item_plugin_by_coord(coord) != data->iplug)
50589 +               return 0;
50590 +
50591 +       item_key_by_coord(coord, &item_key);
50592 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
50593 +           get_key_objectid(key) != get_key_objectid(&item_key) ||
50594 +           get_key_ordering(key) != get_key_ordering(&item_key))
50595 +               return 0;
50596 +
50597 +       return 1;
50598 +}
50599 +
50600 +/* item_plugin->b.mergeable: returns 1 if @p2 is an extent item whose key has the same locality, objectid, ordering and type as the key of @p1 and whose key offset equals key offset of @p1 plus extent_size() of all units of @p1; returns 0 otherwise.
50601 +   first item (@p1) is of extent type */
50602 +/* Audited by: green(2002.06.13) */
50603 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
50604 +{
50605 +       reiser4_key key1, key2;
50606 +
50607 +       assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50608 +       /* FIXME-VS: Which is it? Assert or return 0 */
50609 +       if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50610 +               return 0;
50611 +       }
50612 +
50613 +       item_key_by_coord(p1, &key1);
50614 +       item_key_by_coord(p2, &key2);
50615 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
50616 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
50617 +           get_key_ordering(&key1) != get_key_ordering(&key2) ||
50618 +           get_key_type(&key1) != get_key_type(&key2))
50619 +               return 0;
50620 +       if (get_key_offset(&key1) + extent_size(p1, nr_units_extent(p1)) !=     /* @p2 must start exactly where @p1 ends */
50621 +           get_key_offset(&key2))
50622 +               return 0;
50623 +       return 1;
50624 +}
50625 +
50626 +/* item_plugin->b.nr_units: number of reiser4_extent units in item @coord, i.e. item length divided by sizeof(reiser4_extent) */
50627 +pos_in_node_t nr_units_extent(const coord_t * coord)
50628 +{
50629 +       /* length of extent item has to be multiple of extent size */
50630 +       assert("vs-1424",
50631 +              (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50632 +       return item_length_by_coord(coord) / sizeof(reiser4_extent);
50633 +}
50634 +
50635 +/* item_plugin->b.lookup: sets @coord->unit_pos and @coord->between to AT_UNIT of the first unit whose end offset exceeds the offset of @key, or to AFTER_UNIT of the last unit if there is none; always returns CBK_COORD_FOUND */
50636 +lookup_result
50637 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50638 +             coord_t * coord)
50639 +{                              /* znode and item_pos are
50640 +                                  set to an extent item to
50641 +                                  look through */
50642 +       reiser4_key item_key;
50643 +       reiser4_block_nr lookuped, offset;
50644 +       unsigned i, nr_units;
50645 +       reiser4_extent *ext;
50646 +       unsigned blocksize;
50647 +       unsigned char blocksize_bits;
50648 +
50649 +       item_key_by_coord(coord, &item_key);
50650 +       offset = get_key_offset(&item_key);
50651 +
50652 +       /* key we are looking for must be greater than key of item @coord */
50653 +       assert("vs-414", keygt(key, &item_key));
50654 +
50655 +       assert("umka-99945",    /* note: this overwrites item_key; its offset was already read above */
50656 +              !keygt(key, max_key_inside_extent(coord, &item_key)));
50657 +
50658 +       ext = extent_item(coord);
50659 +       assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50660 +
50661 +       blocksize = current_blocksize;  /* NOTE(review): assigned but never read in this function */
50662 +       blocksize_bits = current_blocksize_bits;
50663 +
50664 +       /* offset we are looking for */
50665 +       lookuped = get_key_offset(key);
50666 +
50667 +       nr_units = nr_units_extent(coord);
50668 +       /* go through all extents until the one which addresses the given offset */
50669 +       for (i = 0; i < nr_units; i++, ext++) {
50670 +               offset += (extent_get_width(ext) << blocksize_bits);    /* offset is now the end of unit i */
50671 +               if (offset > lookuped) {
50672 +                       /* desired byte is somewhere in this extent */
50673 +                       coord->unit_pos = i;
50674 +                       coord->between = AT_UNIT;
50675 +                       return CBK_COORD_FOUND;
50676 +               }
50677 +       }
50678 +
50679 +       /* set coord after last unit */
50680 +       coord->unit_pos = nr_units - 1;
50681 +       coord->between = AFTER_UNIT;
50682 +       return CBK_COORD_FOUND;
50683 +}
50684 +
50685 +/* item_plugin->b.paste
50686 +   the item @coord is set to has already been extended by @data->length bytes
50687 +   of free space. data->data contains data to be pasted into the item at position
50688 +   @coord->unit_pos. It must fit into that free space.
50689 +   @coord must be set between units (or AT_UNIT 0 of a new, empty item). Always returns 0.
50690 +*/
50691 +int
50692 +paste_extent(coord_t * coord, reiser4_item_data * data,
50693 +            carry_plugin_info * info UNUSED_ARG)
50694 +{
50695 +       unsigned old_nr_units;
50696 +       reiser4_extent *ext;
50697 +       int item_length;
50698 +
50699 +       ext = extent_item(coord);
50700 +       item_length = item_length_by_coord(coord);
50701 +       old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50702 +
50703 +       /* this is also used to copy extent into newly created item, so
50704 +          old_nr_units could be 0 */
50705 +       assert("vs-260", item_length >= data->length);
50706 +
50707 +       /* make sure that coord is set properly */
50708 +       assert("vs-35",
50709 +              ((!coord_is_existing_unit(coord))
50710 +               || (!old_nr_units && !coord->unit_pos)));
50711 +
50712 +       /* first unit to be moved */
50713 +       switch (coord->between) {
50714 +       case AFTER_UNIT:
50715 +               coord->unit_pos++;      /* fall through */
50716 +       case BEFORE_UNIT:
50717 +               coord->between = AT_UNIT;
50718 +               break;
50719 +       case AT_UNIT:
50720 +               assert("vs-331", !old_nr_units && !coord->unit_pos);
50721 +               break;
50722 +       default:
50723 +               impossible("vs-330", "coord is set improperly");
50724 +       }
50725 +
50726 +       /* prepare space for new units: shift the old units at and after unit_pos towards the end of the item */
50727 +       memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50728 +               ext + coord->unit_pos,
50729 +               (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50730 +
50731 +       /* copy new data from kernel space */
50732 +       assert("vs-556", data->user == 0);
50733 +       memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50734 +
50735 +       /* after paste @coord is set to first of pasted units */
50736 +       assert("vs-332", coord_is_existing_unit(coord));
50737 +       assert("vs-333",
50738 +              !memcmp(data->data, extent_by_coord(coord),
50739 +                      (unsigned)data->length));
50740 +       return 0;
50741 +}
50742 +
50743 +/* item_plugin->b.can_shift: stores in *@size the number of bytes of item @source that may be shifted (whole units only, limited by @free_space and by @want units) and returns that amount expressed in units */
50744 +int
50745 +can_shift_extent(unsigned free_space, coord_t * source,
50746 +                znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50747 +                unsigned *size, unsigned want)
50748 +{
50749 +       *size = item_length_by_coord(source);
50750 +       if (*size > free_space)
50751 +               /* never split a unit of extent item */
50752 +               *size = free_space - free_space % sizeof(reiser4_extent);
50753 +
50754 +       /* we can shift *size bytes, calculate how many do we want to shift */
50755 +       if (*size > want * sizeof(reiser4_extent))
50756 +               *size = want * sizeof(reiser4_extent);
50757 +
50758 +       if (*size % sizeof(reiser4_extent) != 0)
50759 +               impossible("vs-119", "Wrong extent size: %u %zu", *size,        /* *size is unsigned, sizeof yields size_t */
50760 +                          sizeof(reiser4_extent));
50761 +       return *size / sizeof(reiser4_extent);
50762 +
50763 +}
50764 +
50765 +/* item_plugin->b.copy_units: copies @free_space bytes (asserted to equal @count units) from item @source into item @target. On SHIFT_LEFT the first units of @source go to the tail of @target; otherwise the last units of @source go to the head of @target and the item key of @target is set to the key of unit @from */
50766 +void
50767 +copy_units_extent(coord_t * target, coord_t * source,
50768 +                 unsigned from, unsigned count,
50769 +                 shift_direction where_is_free_space, unsigned free_space)
50770 +{
50771 +       char *from_ext, *to_ext;
50772 +
50773 +       assert("vs-217", free_space == count * sizeof(reiser4_extent));
50774 +
50775 +       from_ext = item_body_by_coord(source);
50776 +       to_ext = item_body_by_coord(target);
50777 +
50778 +       if (where_is_free_space == SHIFT_LEFT) {
50779 +               assert("vs-215", from == 0);
50780 +
50781 +               /* At this moment, item length was already updated in the item
50782 +                  header by shifting code, hence nr_units_extent() will
50783 +                  return "new" number of units---one we obtain after copying
50784 +                  units.
50785 +                */
50786 +               to_ext +=
50787 +                   (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50788 +       } else {
50789 +               reiser4_key key;
50790 +               coord_t coord;
50791 +
50792 +               assert("vs-216",
50793 +                      from + count == coord_last_unit_pos(source) + 1);
50794 +
50795 +               from_ext += item_length_by_coord(source) - free_space;  /* start of the last @count units of @source */
50796 +
50797 +               /* new units are inserted before first unit in an item,
50798 +                  therefore, we have to update item key */
50799 +               coord = *source;
50800 +               coord.unit_pos = from;
50801 +               unit_key_extent(&coord, &key);
50802 +
50803 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
50804 +                                                                  NULL /*info */);
50805 +       }
50806 +
50807 +       memcpy(to_ext, from_ext, free_space);
50808 +}
50809 +
50810 +/* item_plugin->b.create_hook
50811 +   @arg, when not NULL, is used as a coord_t whose node is asserted to be on the leaf level; the right delimiting key of that node or of its left neighbor is set to the key of item @coord and, if the node is right-connected, its right sibling link is broken. Always returns 0 */
50812 +int create_hook_extent(const coord_t * coord, void *arg)
50813 +{
50814 +       coord_t *child_coord;
50815 +       znode *node;
50816 +       reiser4_key key;
50817 +       reiser4_tree *tree;
50818 +
50819 +       if (!arg)
50820 +               return 0;
50821 +
50822 +       child_coord = arg;
50823 +       tree = znode_get_tree(coord->node);
50824 +
50825 +       assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50826 +
50827 +       write_lock_tree(tree);
50828 +       write_lock_dk(tree);
50829 +       /* find the node (child_coord->node or its left neighbor) for which
50830 +          the right delimiting key has to be updated */
50831 +       if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50832 +               assert("vs-411", znode_is_left_connected(child_coord->node));
50833 +               node = child_coord->node->left;
50834 +       } else {
50835 +               assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50836 +               node = child_coord->node;
50837 +               assert("nikita-3314", node != NULL);
50838 +       }
50839 +
50840 +       if (node != NULL) {
50841 +               znode_set_rd_key(node, item_key_by_coord(coord, &key));
50842 +
50843 +               assert("nikita-3282", check_sibling_list(node));
50844 +               /* break sibling links */
50845 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50846 +                       ON_DEBUG(node->right->left_version =
50847 +                                atomic_inc_return(&delim_key_version);
50848 +                                node->right_version =
50849 +                                atomic_inc_return(&delim_key_version););
50850 +
50851 +                       node->right->left = NULL;
50852 +                       node->right = NULL;
50853 +               }
50854 +       }
50855 +       write_unlock_dk(tree);
50856 +       write_unlock_tree(tree);
50857 +       return 0;
50858 +}
50859 +
50860 +#define ITEM_TAIL_KILLED 0     /* kill_hook_extent() result: tail of item is removed */
50861 +#define ITEM_HEAD_KILLED 1     /* kill_hook_extent() result: head of item is removed */
50862 +#define ITEM_KILLED 2          /* kill_hook_extent() result: item is removed completely */
50863 +
50864 +/* item_plugin->b.kill_hook
50865 +   this is called when @count units starting from @from-th one are going to be removed
50866 +   returns ITEM_KILLED, ITEM_TAIL_KILLED or ITEM_HEAD_KILLED depending on which part of the item is removed */
50867 +int
50868 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50869 +                struct carry_kill_data *kdata)
50870 +{
50871 +       reiser4_extent *ext;
50872 +       reiser4_block_nr start, length;
50873 +       const reiser4_key *pfrom_key, *pto_key;
50874 +       struct inode *inode;
50875 +       reiser4_tree *tree;
50876 +       pgoff_t from_off, to_off, offset, skip;
50877 +       int retval;
50878 +
50879 +       /* these are located in memory kmalloc-ed by kill_node_content */
50880 +       reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50881 +       coord_t *dup, *next;
50882 +
50883 +       assert("zam-811", znode_is_write_locked(coord->node));
50884 +       assert("nikita-3315", kdata != NULL);
50885 +       assert("vs-34", kdata->buf != NULL);
50886 +
50887 +       /* map structures to kdata->buf: five keys followed by two coords */
50888 +       min_item_key = (reiser4_key *) (kdata->buf);
50889 +       max_item_key = min_item_key + 1;
50890 +       from_key = max_item_key + 1;
50891 +       to_key = from_key + 1;
50892 +       key = to_key + 1;
50893 +       dup = (coord_t *) (key + 1);
50894 +       next = dup + 1;
50895 +
50896 +       item_key_by_coord(coord, min_item_key);
50897 +       max_item_key_by_coord(coord, max_item_key);
50898 +
50899 +       if (kdata->params.from_key) {
50900 +               pfrom_key = kdata->params.from_key;
50901 +               pto_key = kdata->params.to_key;
50902 +       } else {
50903 +               assert("vs-1549", from == coord->unit_pos);
50904 +               unit_key_by_coord(coord, from_key);
50905 +               pfrom_key = from_key;
50906 +
50907 +               coord_dup(dup, coord);
50908 +               dup->unit_pos = from + count - 1;
50909 +               max_unit_key_by_coord(dup, to_key);
50910 +               pto_key = to_key;
50911 +       }
50912 +
50913 +       if (!keylt(pto_key, max_item_key)) {
50914 +               if (!keygt(pfrom_key, min_item_key)) {
50915 +                       znode *left, *right;
50916 +
50917 +                       /* item is to be removed completely */
50918 +                       assert("nikita-3316", kdata->left != NULL
50919 +                              && kdata->right != NULL);
50920 +
50921 +                       left = kdata->left->node;
50922 +                       right = kdata->right->node;
50923 +
50924 +                       tree = current_tree;
50925 +                       /* we have to do two things:
50926 +                        *
50927 +                        *     1. link left and right formatted neighbors of
50928 +                        *        extent being removed, and
50929 +                        *
50930 +                        *     2. update their delimiting keys.
50931 +                        *
50932 +                        * atomicity of these operations is protected by
50933 +                        * taking dk-lock and tree-lock.
50934 +                        */
50935 +                       /* if neighbors of item being removed are znodes -
50936 +                        * link them */
50937 +                       write_lock_tree(tree);
50938 +                       write_lock_dk(tree);
50939 +                       link_left_and_right(left, right);
50940 +                       if (left) {
50941 +                               /* update right delimiting key of left
50942 +                                * neighbor of extent item */
50943 +                               /* @next and @key are not locals: they point
50944 +                                  into kdata->buf (mapped above) */
50945 +
50946 +                               coord_dup(next, coord);
50947 +
50948 +                               if (coord_next_item(next))
50949 +                                       *key = *znode_get_rd_key(coord->node);
50950 +                               else
50951 +                                       item_key_by_coord(next, key);
50952 +                               znode_set_rd_key(left, key);
50953 +                       }
50954 +                       write_unlock_dk(tree);
50955 +                       write_unlock_tree(tree);
50956 +
50957 +                       from_off =
50958 +                           get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
50959 +                       to_off =
50960 +                           (get_key_offset(max_item_key) +
50961 +                            1) >> PAGE_CACHE_SHIFT;
50962 +                       retval = ITEM_KILLED;
50963 +               } else {
50964 +                       /* tail of item is to be removed */
50965 +                       from_off =
50966 +                           (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
50967 +                            1) >> PAGE_CACHE_SHIFT;
50968 +                       to_off =
50969 +                           (get_key_offset(max_item_key) +
50970 +                            1) >> PAGE_CACHE_SHIFT;
50971 +                       retval = ITEM_TAIL_KILLED;
50972 +               }
50973 +       } else {
50974 +               /* head of item is to be removed */
50975 +               assert("vs-1571", keyeq(pfrom_key, min_item_key));
50976 +               assert("vs-1572",
50977 +                      (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
50978 +                      0);
50979 +               assert("vs-1573",
50980 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
50981 +                                                        1)) == 0);
50982 +
50983 +               if (kdata->left->node) {
50984 +                       /* update right delimiting key of left neighbor of extent item */
50985 +                       /* @key points into kdata->buf (mapped above) */
50986 +
50987 +                       *key = *pto_key;
50988 +                       set_key_offset(key, get_key_offset(pto_key) + 1);
50989 +
50990 +                       write_lock_dk(current_tree);
50991 +                       znode_set_rd_key(kdata->left->node, key);
50992 +                       write_unlock_dk(current_tree);
50993 +               }
50994 +
50995 +               from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
50996 +               to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
50997 +               retval = ITEM_HEAD_KILLED;
50998 +       }
50999 +
51000 +       inode = kdata->inode;
51001 +       assert("vs-1545", inode != NULL);
51002 +       if (inode != NULL)
51003 +               /* take care of pages and jnodes corresponding to part of item being killed */
51004 +               reiser4_invalidate_pages(inode->i_mapping, from_off,
51005 +                                        to_off - from_off,
51006 +                                        kdata->params.truncate);
51007 +
51008 +       ext = extent_item(coord) + from;
51009 +       offset =
51010 +           (get_key_offset(min_item_key) +
51011 +            extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
51012 +
51013 +       assert("vs-1551", from_off >= offset);
51014 +       assert("vs-1552", from_off - offset <= extent_get_width(ext));
51015 +       skip = from_off - offset;       /* leading part of unit @from that lies before the killed range */
51016 +       offset = from_off;
51017 +
51018 +       while (offset < to_off) {
51019 +               length = extent_get_width(ext) - skip;
51020 +               if (state_of_extent(ext) == HOLE_EXTENT) {
51021 +                       skip = 0;
51022 +                       offset += length;
51023 +                       ext++;
51024 +                       continue;
51025 +               }
51026 +
51027 +               if (offset + length > to_off) {
51028 +                       length = to_off - offset;
51029 +               }
51030 +
51031 +               DQUOT_FREE_BLOCK_NODIRTY(inode, length);        /* NOTE(review): unlike the call above, not guarded by inode != NULL */
51032 +
51033 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51034 +                       /* some jnodes corresponding to this unallocated extent */
51035 +                       fake_allocated2free(length, 0 /* unformatted */ );
51036 +
51037 +                       skip = 0;
51038 +                       offset += length;
51039 +                       ext++;
51040 +                       continue;
51041 +               }
51042 +
51043 +               assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
51044 +
51045 +               if (length != 0) {
51046 +                       start = extent_get_start(ext) + skip;
51047 +
51048 +                       /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
51049 +                          immediately */
51050 +                       reiser4_dealloc_blocks(&start, &length,
51051 +                                              0 /* not used */ ,
51052 +                                              BA_DEFER
51053 +                                              /* unformatted with defer */ );
51054 +               }
51055 +               skip = 0;
51056 +               offset += length;
51057 +               ext++;
51058 +       }
51059 +       return retval;
51060 +}
51061 +
51062 +/* item_plugin->b.kill_units: calls kill_hook_extent() for units @from..@to, shrinks a boundary unit that is only partially killed, stores the smallest removed key in *@smallest_removed and (when the item head is cut) the new item key in *@new_first if those pointers are not NULL; returns the size in bytes of the units to be removed entirely */
51063 +int
51064 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51065 +                 struct carry_kill_data *kdata, reiser4_key * smallest_removed,
51066 +                 reiser4_key * new_first)
51067 +{
51068 +       reiser4_extent *ext;
51069 +       reiser4_key item_key;
51070 +       pos_in_node_t count;
51071 +       reiser4_key from_key, to_key;
51072 +       const reiser4_key *pfrom_key, *pto_key;
51073 +       loff_t off;
51074 +       int result;
51075 +
51076 +       assert("vs-1541",
51077 +              ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
51078 +               || (kdata->params.from_key != NULL
51079 +                   && kdata->params.to_key != NULL)));
51080 +
51081 +       if (kdata->params.from_key) {
51082 +               pfrom_key = kdata->params.from_key;
51083 +               pto_key = kdata->params.to_key;
51084 +       } else {
51085 +               coord_t dup;
51086 +
51087 +               /* calculate key range of kill */
51088 +               assert("vs-1549", from == coord->unit_pos);
51089 +               unit_key_by_coord(coord, &from_key);
51090 +               pfrom_key = &from_key;
51091 +
51092 +               coord_dup(&dup, coord);
51093 +               dup.unit_pos = to;
51094 +               max_unit_key_by_coord(&dup, &to_key);
51095 +               pto_key = &to_key;
51096 +       }
51097 +
51098 +       item_key_by_coord(coord, &item_key);
51099 +
51100 +#if REISER4_DEBUG
51101 +       {
51102 +               reiser4_key max_item_key;
51103 +
51104 +               max_item_key_by_coord(coord, &max_item_key);
51105 +
51106 +               if (new_first) {
51107 +                       /* head of item is to be cut */
51108 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
51109 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
51110 +               } else {
51111 +                       /* tail of item is to be cut */
51112 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
51113 +                       assert("vs-1543", !keylt(pto_key, &max_item_key));
51114 +               }
51115 +       }
51116 +#endif
51117 +
51118 +       if (smallest_removed)
51119 +               *smallest_removed = *pfrom_key;
51120 +
51121 +       if (new_first) {
51122 +               /* item head is cut. Item key will change. This new key is calculated here */
51123 +               assert("vs-1556",
51124 +                      (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
51125 +                      (PAGE_CACHE_SIZE - 1));
51126 +               *new_first = *pto_key;
51127 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
51128 +       }
51129 +
51130 +       count = to - from + 1;
51131 +       result = kill_hook_extent(coord, from, count, kdata);
51132 +       if (result == ITEM_TAIL_KILLED) {
51133 +               assert("vs-1553",
51134 +                      get_key_offset(pfrom_key) >=
51135 +                      get_key_offset(&item_key) + extent_size(coord, from));
51136 +               off =
51137 +                   get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
51138 +                                                extent_size(coord, from));
51139 +               if (off) {
51140 +                       /* unit @from is to be cut partially. Its width decreases */
51141 +                       ext = extent_item(coord) + from;
51142 +                       extent_set_width(ext,
51143 +                                        (off + PAGE_CACHE_SIZE -
51144 +                                         1) >> PAGE_CACHE_SHIFT);
51145 +                       count--;        /* unit @from stays in the item */
51146 +               }
51147 +       } else {
51148 +               __u64 max_to_offset;
51149 +               __u64 rest;
51150 +
51151 +               assert("vs-1575", result == ITEM_HEAD_KILLED);
51152 +               assert("", from == 0);
51153 +               assert("",
51154 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
51155 +                                                        1)) == 0);
51156 +               assert("",
51157 +                      get_key_offset(pto_key) + 1 >
51158 +                      get_key_offset(&item_key) + extent_size(coord, to));
51159 +               max_to_offset =
51160 +                   get_key_offset(&item_key) + extent_size(coord, to + 1) - 1;
51161 +               assert("", get_key_offset(pto_key) <= max_to_offset);
51162 +
51163 +               rest =
51164 +                   (max_to_offset -
51165 +                    get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
51166 +               if (rest) {
51167 +                       /* unit @to is to be cut partially */
51168 +                       ext = extent_item(coord) + to;
51169 +
51170 +                       assert("", extent_get_width(ext) > rest);
51171 +
51172 +                       if (state_of_extent(ext) == ALLOCATED_EXTENT)
51173 +                               extent_set_start(ext,
51174 +                                                extent_get_start(ext) +
51175 +                                                (extent_get_width(ext) -
51176 +                                                 rest));
51177 +
51178 +                       extent_set_width(ext, rest);
51179 +                       count--;        /* unit @to stays in the item */
51180 +               }
51181 +       }
51182 +       return count * sizeof(reiser4_extent);
51183 +}
51184 +
51185 +/* item_plugin->b.cut_units
51186 +   this is too similar to kill_units_extent: it trims partially cut boundary units and returns the size in bytes of the units to be removed entirely, but does not call kill_hook_extent() */
51187 +int
51188 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51189 +                struct carry_cut_data *cdata, reiser4_key * smallest_removed,
51190 +                reiser4_key * new_first)
51191 +{
51192 +       reiser4_extent *ext;
51193 +       reiser4_key item_key;
51194 +       pos_in_node_t count;
51195 +       reiser4_key from_key, to_key;
51196 +       const reiser4_key *pfrom_key, *pto_key;
51197 +       loff_t off;
51198 +
51199 +       assert("vs-1541",
51200 +              ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
51201 +               || (cdata->params.from_key != NULL
51202 +                   && cdata->params.to_key != NULL)));
51203 +
51204 +       if (cdata->params.from_key) {
51205 +               pfrom_key = cdata->params.from_key;
51206 +               pto_key = cdata->params.to_key;
51207 +       } else {
51208 +               coord_t dup;
51209 +
51210 +               /* calculate key range of cut */
51211 +               coord_dup(&dup, coord);
51212 +               dup.unit_pos = from;
51213 +               unit_key_by_coord(&dup, &from_key);
51214 +
51215 +               dup.unit_pos = to;
51216 +               max_unit_key_by_coord(&dup, &to_key);
51217 +
51218 +               pfrom_key = &from_key;
51219 +               pto_key = &to_key;
51220 +       }
51221 +
51222 +       assert("vs-1555",
51223 +              (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
51224 +       assert("vs-1556",
51225 +              (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
51226 +              (PAGE_CACHE_SIZE - 1));
51227 +
51228 +       item_key_by_coord(coord, &item_key);
51229 +
51230 +#if REISER4_DEBUG
51231 +       {
51232 +               reiser4_key max_item_key;
51233 +
51234 +               assert("vs-1584",
51235 +                      get_key_locality(pfrom_key) ==
51236 +                      get_key_locality(&item_key));
51237 +               assert("vs-1585",
51238 +                      get_key_type(pfrom_key) == get_key_type(&item_key));
51239 +               assert("vs-1586",
51240 +                      get_key_objectid(pfrom_key) ==
51241 +                      get_key_objectid(&item_key));
51242 +               assert("vs-1587",
51243 +                      get_key_ordering(pfrom_key) ==
51244 +                      get_key_ordering(&item_key));
51245 +
51246 +               max_item_key_by_coord(coord, &max_item_key);
51247 +
51248 +               if (new_first != NULL) {
51249 +                       /* head of item is to be cut */
51250 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
51251 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
51252 +               } else {
51253 +                       /* tail of item is to be cut */
51254 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
51255 +                       assert("vs-1543", keyeq(pto_key, &max_item_key));
51256 +               }
51257 +       }
51258 +#endif
51259 +
51260 +       if (smallest_removed)
51261 +               *smallest_removed = *pfrom_key;
51262 +
51263 +       if (new_first) {
51264 +               /* item head is cut. Item key will change. This new key is calculated here */
51265 +               *new_first = *pto_key;
51266 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
51267 +       }
51268 +
51269 +       count = to - from + 1;
51270 +
51271 +       assert("vs-1553",
51272 +              get_key_offset(pfrom_key) >=
51273 +              get_key_offset(&item_key) + extent_size(coord, from));
51274 +       off =
51275 +           get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
51276 +                                        extent_size(coord, from));
51277 +       if (off) {
51278 +               /* tail of unit @from is to be cut partially. Its width decreases */
51279 +               assert("vs-1582", new_first == NULL);
51280 +               ext = extent_item(coord) + from;
51281 +               extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
51282 +               count--;        /* unit @from stays in the item */
51283 +       }
51284 +
51285 +       assert("vs-1554",
51286 +              get_key_offset(pto_key) <=
51287 +              get_key_offset(&item_key) + extent_size(coord, to + 1) - 1);
51288 +       off =
51289 +           (get_key_offset(&item_key) + extent_size(coord, to + 1) - 1) -
51290 +           get_key_offset(pto_key);
51291 +       if (off) {
51292 +               /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
51293 +                  and width decreased. */
51294 +               assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
51295 +               ext = extent_item(coord) + to;
51296 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
51297 +                       extent_set_start(ext,
51298 +                                        extent_get_start(ext) +
51299 +                                        (extent_get_width(ext) -
51300 +                                         (off >> PAGE_CACHE_SHIFT)));
51301 +
51302 +               extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
51303 +               count--;        /* unit @to stays in the item */
51304 +       }
51305 +       return count * sizeof(reiser4_extent);
51306 +}
51307 +
51308 +/* item_plugin->b.unit_key */
51309 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
51310 +{
51311 +       assert("vs-300", coord_is_existing_unit(coord));
51312 +
51313 +       item_key_by_coord(coord, key);
51314 +       set_key_offset(key,
51315 +                      (get_key_offset(key) +
51316 +                       extent_size(coord, coord->unit_pos)));
51317 +
51318 +       return key;
51319 +}
51320 +
51321 +/* item_plugin->b.max_unit_key */
51322 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
51323 +{
51324 +       assert("vs-300", coord_is_existing_unit(coord));
51325 +
51326 +       item_key_by_coord(coord, key);
51327 +       set_key_offset(key,
51328 +                      (get_key_offset(key) +
51329 +                       extent_size(coord, coord->unit_pos + 1) - 1));
51330 +       return key;
51331 +}
51332 +
51333 +/* item_plugin->b.estimate
51334 +   item_plugin->b.item_data_by_flow */
51335 +
51336 +#if REISER4_DEBUG
51337 +
51338 +/* item_plugin->b.check
51339 +   used for debugging, every item should have here the most complete
51340 +   possible check of the consistency of the item that the inventor can
51341 +   construct
51342 +*/
51343 +int check_extent(const coord_t * coord /* coord of item to check */ ,
51344 +                const char **error /* where to store error message */ )
51345 +{
51346 +       reiser4_extent *ext, *first;
51347 +       unsigned i, j;
51348 +       reiser4_block_nr start, width, blk_cnt;
51349 +       unsigned num_units;
51350 +       reiser4_tree *tree;
51351 +       oid_t oid;
51352 +       reiser4_key key;
51353 +       coord_t scan;
51354 +
51355 +       assert("vs-933", REISER4_DEBUG);
51356 +
51357 +       if (znode_get_level(coord->node) != TWIG_LEVEL) {
51358 +               *error = "Extent on the wrong level";
51359 +               return -1;
51360 +       }
51361 +       if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
51362 +               *error = "Wrong item size";
51363 +               return -1;
51364 +       }
51365 +       ext = first = extent_item(coord);
51366 +       blk_cnt = reiser4_block_count(reiser4_get_current_sb());
51367 +       num_units = coord_num_units(coord);
51368 +       tree = znode_get_tree(coord->node);
51369 +       item_key_by_coord(coord, &key);
51370 +       oid = get_key_objectid(&key);
51371 +       coord_dup(&scan, coord);
51372 +
51373 +       for (i = 0; i < num_units; ++i, ++ext) {
51374 +               __u64 index;
51375 +
51376 +               scan.unit_pos = i;
51377 +               index = extent_unit_index(&scan);
51378 +
51379 +#if 0
51380 +               /* check that all jnodes are present for the unallocated
51381 +                * extent */
51382 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51383 +                       for (j = 0; j < extent_get_width(ext); j++) {
51384 +                               jnode *node;
51385 +
51386 +                               node = jlookup(tree, oid, index + j);
51387 +                               if (node == NULL) {
51388 +                                       print_coord("scan", &scan, 0);
51389 +                                       *error = "Jnode missing";
51390 +                                       return -1;
51391 +                               }
51392 +                               jput(node);
51393 +                       }
51394 +               }
51395 +#endif
51396 +
51397 +               start = extent_get_start(ext);
51398 +               if (start < 2)
51399 +                       continue;
51400 +               /* extent is allocated one */
51401 +               width = extent_get_width(ext);
51402 +               if (start >= blk_cnt) {
51403 +                       *error = "Start too large";
51404 +                       return -1;
51405 +               }
51406 +               if (start + width > blk_cnt) {
51407 +                       *error = "End too large";
51408 +                       return -1;
51409 +               }
51410 +               /* make sure that this extent does not overlap with any
51411 +                  previously scanned allocated extent */
51412 +               for (j = 0; j < i; j++) {
51413 +                       if (state_of_extent(first + j) != ALLOCATED_EXTENT)
51414 +                               continue;
51415 +                       if (!
51416 +                           ((extent_get_start(ext) >=
51417 +                             extent_get_start(first + j) +
51418 +                             extent_get_width(first + j))
51419 +                            || (extent_get_start(ext) +
51420 +                                extent_get_width(ext) <=
51421 +                                extent_get_start(first + j)))) {
51422 +                               *error = "Extent overlaps with others";
51423 +                               return -1;
51424 +                       }
51425 +               }
51426 +
51427 +       }
51428 +
51429 +       return 0;
51430 +}
51431 +
51432 +#endif                         /* REISER4_DEBUG */
51433 +
51434 +/*
51435 +   Local variables:
51436 +   c-indentation-style: "K&R"
51437 +   mode-name: "LC"
51438 +   c-basic-offset: 8
51439 +   tab-width: 8
51440 +   fill-column: 120
51441 +   scroll-step: 1
51442 +   End:
51443 +*/
51444 diff --git a/fs/reiser4/plugin/item/internal.c b/fs/reiser4/plugin/item/internal.c
51445 new file mode 100644
51446 index 0000000..54e1bac
51447 --- /dev/null
51448 +++ b/fs/reiser4/plugin/item/internal.c
51449 @@ -0,0 +1,392 @@
51450 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51451 +
51452 +/* Implementation of internal-item plugin methods. */
51453 +
51454 +#include "../../forward.h"
51455 +#include "../../debug.h"
51456 +#include "../../dformat.h"
51457 +#include "../../key.h"
51458 +#include "../../coord.h"
51459 +#include "internal.h"
51460 +#include "item.h"
51461 +#include "../node/node.h"
51462 +#include "../plugin.h"
51463 +#include "../../jnode.h"
51464 +#include "../../znode.h"
51465 +#include "../../tree_walk.h"
51466 +#include "../../tree_mod.h"
51467 +#include "../../tree.h"
51468 +#include "../../super.h"
51469 +#include "../../block_alloc.h"
51470 +
51471 +/* see internal.h for explanation */
51472 +
51473 +/* plugin->u.item.b.mergeable */
51474 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51475 +                      const coord_t * p2 UNUSED_ARG /* second item */ )
51476 +{
51477 +       /* internal items are not mergeable */
51478 +       return 0;
51479 +}
51480 +
51481 +/* ->lookup() method for internal items */
51482 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51483 +                             lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51484 +                             coord_t * coord /* coord of item */ )
51485 +{
51486 +       reiser4_key ukey;
51487 +
51488 +       switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51489 +       default:
51490 +               impossible("", "keycmp()?!");   /* falls through */
51491 +       case LESS_THAN:
51492 +               /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
51493 +                  item plugin can not be taken using coord set this way */
51494 +               assert("vs-681", coord->unit_pos == 0);
51495 +               coord->between = AFTER_UNIT;    /* falls through to EQUAL_TO */
51496 +       case EQUAL_TO:
51497 +               return CBK_COORD_FOUND;
51498 +       case GREATER_THAN:
51499 +               return CBK_COORD_NOTFOUND;
51500 +       }
51501 +}
51502 +
51503 +/* return body of internal item at @coord */
51504 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
51505 +                                                                * item */ )
51506 +{
51507 +       assert("nikita-607", coord != NULL);
51508 +       assert("nikita-1650",
51509 +              item_plugin_by_coord(coord) ==
51510 +              item_plugin_by_id(NODE_POINTER_ID));
51511 +       return (internal_item_layout *) item_body_by_coord(coord);
51512 +}
51513 +
51514 +void update_internal(const coord_t * coord, const reiser4_block_nr * blocknr)
51515 +{
51516 +       internal_item_layout *item = internal_at(coord);
51517 +       assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51518 +
51519 +       put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51520 +}
51521 +
51522 +/* return child block number stored in the internal item at @coord */
51523 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51524 +{
51525 +       assert("nikita-608", coord != NULL);
51526 +       return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51527 +}
51528 +
51529 +/* get znode pointed to by internal @item */
51530 +static znode *znode_at(const coord_t * item /* coord of item */ ,
51531 +                      znode * parent /* parent node */ )
51532 +{
51533 +       return child_znode(item, parent, 1, 0);
51534 +}
51535 +
51536 +/* store pointer from internal item into "block". Implementation of
51537 +    ->down_link() method */
51538 +void down_link_internal(const coord_t * coord /* coord of item */ ,
51539 +                       const reiser4_key * key UNUSED_ARG      /* key to get
51540 +                                                                * pointer for */ ,
51541 +                       reiser4_block_nr * block /* resulting block number */ )
51542 +{
51543 +       ON_DEBUG(reiser4_key item_key);
51544 +
51545 +       assert("nikita-609", coord != NULL);
51546 +       assert("nikita-611", block != NULL);
51547 +       assert("nikita-612", (key == NULL) ||
51548 +              /* twig horrors */
51549 +              (znode_get_level(coord->node) == TWIG_LEVEL)
51550 +              || keyle(item_key_by_coord(coord, &item_key), key));
51551 +
51552 +       *block = pointer_at(coord);
51553 +       assert("nikita-2960", reiser4_blocknr_is_sane(block));
51554 +}
51555 +
51556 +/* Get the child's block number, or 0 if the block is unallocated. */
51557 +int
51558 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51559 +                                reiser4_block_nr * block)
51560 +{
51561 +       assert("jmacd-2059", coord != NULL);
51562 +
51563 +       *block = pointer_at(coord);
51564 +       assert("nikita-2961", reiser4_blocknr_is_sane(block));
51565 +
51566 +       if (blocknr_is_fake(block)) {
51567 +               *block = 0;
51568 +       }
51569 +
51570 +       return 0;
51571 +}
51572 +
51573 +/* Return the child. */
51574 +int
51575 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51576 +                     jnode ** childp)
51577 +{
51578 +       reiser4_block_nr block = pointer_at(coord);
51579 +       znode *child;
51580 +
51581 +       assert("jmacd-2059", childp != NULL);
51582 +       assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51583 +
51584 +       child = zlook(znode_get_tree(coord->node), &block);
51585 +
51586 +       if (IS_ERR(child)) {
51587 +               return PTR_ERR(child);
51588 +       }
51589 +
51590 +       *childp = ZJNODE(child);
51591 +
51592 +       return 0;
51593 +}
51594 +
51595 +static void check_link(znode * left, znode * right)
51596 +{
51597 +       znode *scan;
51598 +
51599 +       for (scan = left; scan != right; scan = scan->right) {
51600 +               if (ZF_ISSET(scan, JNODE_RIP))
51601 +                       break;
51602 +               if (znode_is_right_connected(scan) && scan->right != NULL) {
51603 +                       if (ZF_ISSET(scan->right, JNODE_RIP))
51604 +                               break;
51605 +                       assert("nikita-3285",
51606 +                              znode_is_left_connected(scan->right));
51607 +                       assert("nikita-3265",
51608 +                              ergo(scan != left,
51609 +                                   ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51610 +                       assert("nikita-3284", scan->right->left == scan);
51611 +               } else
51612 +                       break;
51613 +       }
51614 +}
51615 +
51616 +int check__internal(const coord_t * coord, const char **error)
51617 +{
51618 +       reiser4_block_nr blk;
51619 +       znode *child;
51620 +       coord_t cpy;
51621 +       /* the stored child block number must pass the sanity check */
51622 +       blk = pointer_at(coord);
51623 +       if (!reiser4_blocknr_is_sane(&blk)) {
51624 +               *error = "Invalid pointer";
51625 +               return -1;
51626 +       }
51627 +       coord_dup(&cpy, coord);
51628 +       child = znode_at(&cpy, cpy.node);
51629 +       if (child != NULL && !IS_ERR(child)) {
51630 +               znode *left_child;
51631 +               znode *right_child;
51632 +
51633 +               left_child = right_child = NULL;
51634 +               /* neighbours that are NULL or error pointers are skipped */
51635 +               assert("nikita-3256", znode_invariant(child));
51636 +               if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51637 +                       left_child = znode_at(&cpy, cpy.node);
51638 +                       if (left_child != NULL && !IS_ERR(left_child)) {
51639 +                               read_lock_tree(znode_get_tree(child));
51640 +                               check_link(left_child, child);
51641 +                               read_unlock_tree(znode_get_tree(child));
51642 +                               zput(left_child);
51643 +                       }
51644 +               }
51645 +               coord_dup(&cpy, coord);
51646 +               if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51647 +                       right_child = znode_at(&cpy, cpy.node);
51648 +                       if (right_child != NULL && !IS_ERR(right_child)) {
51649 +                               read_lock_tree(znode_get_tree(child));
51650 +                               check_link(child, right_child);
51651 +                               read_unlock_tree(znode_get_tree(child));
51652 +                               zput(right_child);
51653 +                       }
51654 +               }
51655 +               zput(child);
51656 +       }
51657 +       return 0;
51658 +}
51659 +
51660 +/* return true only if this item really points to "block" */
51661 +/* Audited by: green(2002.06.14) */
51662 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51663 +                           const reiser4_block_nr * block      /* block number to
51664 +                                                                * check */ )
51665 +{
51666 +       assert("nikita-613", coord != NULL);
51667 +       assert("nikita-614", block != NULL);
51668 +
51669 +       return pointer_at(coord) == *block;
51670 +}
51671 +
51672 +/* hook called by ->create_item() method of node plugin after new internal
51673 +   item was just created.
51674 +
51675 +   This is point where pointer to new node is inserted into tree. Initialize
51676 +   parent pointer in child znode, insert child into sibling list and slum.
51677 +
51678 +*/
51679 +int create_hook_internal(const coord_t * item /* coord of item */ ,
51680 +                        void *arg /* child's left neighbor, if any */ )
51681 +{
51682 +       znode *child;
51683 +       __u64 child_ptr;
51684 +
51685 +       assert("nikita-1252", item != NULL);
51686 +       assert("nikita-1253", item->node != NULL);
51687 +       assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51688 +       assert("nikita-1450", item->unit_pos == 0);
51689 +
51690 +       /*
51691 +        * preparing to item insertion build_child_ptr_data sets pointer to
51692 +        * data to be inserted to jnode's blocknr which is in cpu byte
51693 +        * order. Node's create_item simply copied those data. As result we
51694 +        * have child pointer in cpu's byte order. Convert content of internal
51695 +        * item to little endian byte order.
51696 +        */
51697 +       child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51698 +       update_internal(item, &child_ptr);
51699 +
51700 +       child = znode_at(item, item->node);
51701 +       if (child != NULL && !IS_ERR(child)) {
51702 +               znode *left;
51703 +               int result = 0;
51704 +               reiser4_tree *tree;
51705 +
51706 +               left = arg;
51707 +               tree = znode_get_tree(item->node);
51708 +               write_lock_tree(tree);
51709 +               write_lock_dk(tree);
51710 +               assert("nikita-1400", (child->in_parent.node == NULL)
51711 +                      || (znode_above_root(child->in_parent.node)));
51712 +               ++item->node->c_count;
51713 +               coord_to_parent_coord(item, &child->in_parent);
51714 +               sibling_list_insert_nolock(child, left);
51715 +
51716 +               assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51717 +               ZF_CLR(child, JNODE_ORPHAN);
51718 +
51719 +               if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51720 +                                            znode_get_rd_key(child))) {
51721 +                       znode_set_rd_key(child, znode_get_rd_key(left));
51722 +               }
51723 +               write_unlock_dk(tree);
51724 +               write_unlock_tree(tree);
51725 +               zput(child);
51726 +               return result;
51727 +       } else {
51728 +               if (child == NULL)
51729 +                       child = ERR_PTR(-EIO);
51730 +               return PTR_ERR(child);
51731 +       }
51732 +}
51733 +
51734 +/* hook called by ->cut_and_kill() method of node plugin just before an internal
51735 +   item is removed.
51736 +
51737 +   This is the point where an empty node is removed from the tree. Clear the
51738 +   parent pointer in the child, and mark the node for pending deletion.
51739 +
51740 +   The node will actually be deleted later, in several stages:
51741 +
51742 +    . when the last lock on this node is released, the node will be removed
51743 +    from the sibling list and its lock will be invalidated
51744 +
51745 +    . when the last reference to this node is dropped, the bitmap will be
51746 +    updated and the node will actually be removed from memory.
51747 +
51748 +
51749 +*/
51750 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
51751 +                      pos_in_node_t from UNUSED_ARG /* start unit, must be 0 */ ,
51752 +                      pos_in_node_t count UNUSED_ARG /* number of units, must be 1 */ ,
51753 +                      struct carry_kill_data *p UNUSED_ARG)
51754 +{
51755 +       znode *child;
51756 +
51757 +       assert("nikita-1222", item != NULL);
51758 +       assert("nikita-1224", from == 0);
51759 +       assert("nikita-1225", count == 1);
51760 +       /* NOTE(review): a NULL child is not handled below - confirm it cannot occur */
51761 +       child = znode_at(item, item->node);
51762 +       if (IS_ERR(child))
51763 +               return PTR_ERR(child);
51764 +       else if (node_is_empty(child)) {
51765 +               reiser4_tree *tree;
51766 +
51767 +               assert("nikita-1397", znode_is_write_locked(child));
51768 +               assert("nikita-1398", child->c_count == 0);
51769 +               assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51770 +               /* reset child's parent coord and drop parent's c_count under tree write lock */
51771 +               tree = znode_get_tree(item->node);
51772 +               write_lock_tree(tree);
51773 +               init_parent_coord(&child->in_parent, NULL);
51774 +               --item->node->c_count;
51775 +               write_unlock_tree(tree);
51776 +               zput(child);
51777 +               return 0;
51778 +       } else {
51779 +               warning("nikita-1223",
51780 +                       "Cowardly refuse to remove link to non-empty node");
51781 +               zput(child);
51782 +               return RETERR(-EIO);
51783 +       }
51784 +}
51785 +
51786 +/* hook called by ->shift() node plugin method when an internal item was just
51787 +   moved from one node to another.
51788 +
51789 +   Update parent pointer in child and c_counts in old and new parent
51790 +
51791 +*/
51792 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
51793 +                       unsigned from UNUSED_ARG /* start unit, must be 0 */ ,
51794 +                       unsigned count UNUSED_ARG /* number of units, must be 1 */ ,
51795 +                       znode * old_node /* old parent */ )
51796 +{
51797 +       znode *child;
51798 +       znode *new_node;
51799 +       reiser4_tree *tree;
51800 +
51801 +       assert("nikita-1276", item != NULL);
51802 +       assert("nikita-1277", from == 0);
51803 +       assert("nikita-1278", count == 1);
51804 +       assert("nikita-1451", item->unit_pos == 0);
51805 +
51806 +       new_node = item->node;  /* item already sits in its new parent */
51807 +       assert("nikita-2132", new_node != old_node);
51808 +       tree = znode_get_tree(item->node);
51809 +       child = child_znode(item, old_node, 1, 0);
51810 +       if (child == NULL)
51811 +               return 0;       /* no child znode returned: nothing to update */
51812 +       if (!IS_ERR(child)) {
51813 +               write_lock_tree(tree);
51814 +               ++new_node->c_count;
51815 +               assert("nikita-1395", znode_parent(child) == old_node);
51816 +               assert("nikita-1396", old_node->c_count > 0);
51817 +               coord_to_parent_coord(item, &child->in_parent);
51818 +               assert("nikita-1781", znode_parent(child) == new_node);
51819 +               assert("nikita-1782",
51820 +                      check_tree_pointer(item, child) == NS_FOUND);
51821 +               --old_node->c_count;
51822 +               write_unlock_tree(tree);
51823 +               zput(child);
51824 +               return 0;
51825 +       } else
51826 +               return PTR_ERR(child);
51827 +}
51828 +
51829 +/* plugin->u.item.b.max_key_inside - not defined */
51830 +
51831 +/* plugin->u.item.b.nr_units - item.c:single_unit */
51832 +
51833 +/* Make Linus happy.
51834 +   Local variables:
51835 +   c-indentation-style: "K&R"
51836 +   mode-name: "LC"
51837 +   c-basic-offset: 8
51838 +   tab-width: 8
51839 +   fill-column: 120
51840 +   End:
51841 +*/
51842 diff --git a/fs/reiser4/plugin/item/internal.h b/fs/reiser4/plugin/item/internal.h
51843 new file mode 100644
51844 index 0000000..ca4c13f
51845 --- /dev/null
51846 +++ b/fs/reiser4/plugin/item/internal.h
51847 @@ -0,0 +1,57 @@
51848 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51849 +/* Internal item contains down-link to the child of the internal/twig
51850 +   node in a tree. It is internal items that are actually used during
51851 +   tree traversal. */
51852 +
51853 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51854 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51855 +
51856 +#include "../../forward.h"
51857 +#include "../../dformat.h"
51858 +
51859 +/* on-disk layout of internal item */
51860 +typedef struct internal_item_layout {
51861 +       /*  0 */ reiser4_dblock_nr pointer;
51862 +       /*  8 */
51863 +} internal_item_layout;
51864 +
51865 +struct cut_list;
51866 +
51867 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
51868 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51869 +                             coord_t * coord);
51870 +/* store pointer from internal item into "block". Implementation of
51871 +    ->down_link() method */
51872 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51873 +                              reiser4_block_nr * block);
51874 +extern int has_pointer_to_internal(const coord_t * coord,
51875 +                                  const reiser4_block_nr * block);
51876 +extern int create_hook_internal(const coord_t * item, void *arg);
51877 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51878 +                             pos_in_node_t count, struct carry_kill_data *);
51879 +extern int shift_hook_internal(const coord_t * item, unsigned from,
51880 +                              unsigned count, znode * old_node);
51881 +extern void print_internal(const char *prefix, coord_t * coord);
51882 +
51883 +extern int utmost_child_internal(const coord_t * coord, sideof side,
51884 +                                jnode ** child);
51885 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51886 +                                    reiser4_block_nr * block);
51887 +
51888 +extern void update_internal(const coord_t * coord,
51889 +                           const reiser4_block_nr * blocknr);
51890 +/* FIXME: reiserfs has check_internal */
51891 +extern int check__internal(const coord_t * coord, const char **error);
51892 +
51893 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51894 +#endif
51895 +
51896 +/* Make Linus happy.
51897 +   Local variables:
51898 +   c-indentation-style: "K&R"
51899 +   mode-name: "LC"
51900 +   c-basic-offset: 8
51901 +   tab-width: 8
51902 +   fill-column: 120
51903 +   End:
51904 +*/
51905 diff --git a/fs/reiser4/plugin/item/item.c b/fs/reiser4/plugin/item/item.c
51906 new file mode 100644
51907 index 0000000..62836c4
51908 --- /dev/null
51909 +++ b/fs/reiser4/plugin/item/item.c
51910 @@ -0,0 +1,727 @@
51911 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51912 +
51913 +/* definition of item plugins. */
51914 +
51915 +#include "../../forward.h"
51916 +#include "../../debug.h"
51917 +#include "../../key.h"
51918 +#include "../../coord.h"
51919 +#include "../plugin_header.h"
51920 +#include "sde.h"
51921 +#include "internal.h"
51922 +#include "item.h"
51923 +#include "static_stat.h"
51924 +#include "../plugin.h"
51925 +#include "../../znode.h"
51926 +#include "../../tree.h"
51927 +#include "../../context.h"
51928 +#include "ctail.h"
51929 +
51930 +/* compute offset of item body within node data and cache it in @coord->offset */
51931 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
51932 +{
51933 +       assert("nikita-324", coord != NULL);
51934 +       assert("nikita-325", coord->node != NULL);
51935 +       assert("nikita-326", znode_is_loaded(coord->node));
51936 +       assert("nikita-3200", coord->offset == INVALID_OFFSET);
51937 +
51938 +       coord->offset =
51939 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
51940 +           zdata(coord->node);
51941 +       ON_DEBUG(coord->body_v = coord->node->times_locked);
51942 +}
51943 +
51944 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
51945 +{
51946 +       return zdata(coord->node) + coord->offset;
51947 +}
51948 +
51949 +#if REISER4_DEBUG
51950 +
51951 +int item_body_is_valid(const coord_t * coord)
51952 +{
51953 +       return
51954 +           coord->offset ==
51955 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
51956 +           zdata(coord->node);
51957 +}
51958 +
51959 +#endif
51960 +
51961 +/* return length of item at @coord */
51962 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
51963 +{
51964 +       int len;
51965 +
51966 +       assert("nikita-327", coord != NULL);
51967 +       assert("nikita-328", coord->node != NULL);
51968 +       assert("nikita-329", znode_is_loaded(coord->node));
51969 +
51970 +       len = node_plugin_by_node(coord->node)->length_by_coord(coord);
51971 +       return len;
51972 +}
51973 +
51974 +void obtain_item_plugin(const coord_t * coord)
51975 +{
51976 +       assert("nikita-330", coord != NULL);
51977 +       assert("nikita-331", coord->node != NULL);
51978 +       assert("nikita-332", znode_is_loaded(coord->node));
51979 +
51980 +       coord_set_iplug((coord_t *) coord,
51981 +                       node_plugin_by_node(coord->node)->
51982 +                       plugin_by_coord(coord));
51983 +       assert("nikita-2479",
51984 +              coord_iplug(coord) ==
51985 +              node_plugin_by_node(coord->node)->plugin_by_coord(coord));
51986 +}
51987 +
51988 +/* return type of item at @coord */
51989 +item_type_id item_type_by_coord(const coord_t * coord /* coord to query */ )
51990 +{
51991 +       assert("nikita-333", coord != NULL);
51992 +       assert("nikita-334", coord->node != NULL);
51993 +       assert("nikita-335", znode_is_loaded(coord->node));
51994 +       assert("nikita-336", item_plugin_by_coord(coord) != NULL);
51995 +
51996 +       return item_plugin_by_coord(coord)->b.item_type;
51997 +}
51998 +
51999 +/* return id of item */
52000 +/* Audited by: green(2002.06.15) */
52001 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
52002 +{
52003 +       assert("vs-539", coord != NULL);
52004 +       assert("vs-538", coord->node != NULL);
52005 +       assert("vs-537", znode_is_loaded(coord->node));
52006 +       assert("vs-536", item_plugin_by_coord(coord) != NULL);
52007 +       assert("vs-540",
52008 +              item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
52009 +
52010 +       return item_id_by_plugin(item_plugin_by_coord(coord));
52011 +}
52012 +
52013 +/* return key of item at @coord */
52014 +/* Audited by: green(2002.06.15) */
52015 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
52016 +                              reiser4_key * key /* result */ )
52017 +{
52018 +       assert("nikita-338", coord != NULL);
52019 +       assert("nikita-339", coord->node != NULL);
52020 +       assert("nikita-340", znode_is_loaded(coord->node));
52021 +
52022 +       return node_plugin_by_node(coord->node)->key_at(coord, key);
52023 +}
52024 +
52025 +/* this returns max key in the item */
52026 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
52027 +                                  reiser4_key * key /* result */ )
52028 +{
52029 +       coord_t last;
52030 +
52031 +       assert("nikita-338", coord != NULL);
52032 +       assert("nikita-339", coord->node != NULL);
52033 +       assert("nikita-340", znode_is_loaded(coord->node));
52034 +
52035 +       /* make coord pointing to last item's unit */
52036 +       coord_dup(&last, coord);
52037 +       last.unit_pos = coord_num_units(&last) - 1;
52038 +       assert("vs-1560", coord_is_existing_unit(&last));
52039 +
52040 +       max_unit_key_by_coord(&last, key);
52041 +       return key;
52042 +}
52043 +
52044 +/* return key of unit at @coord */
52045 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
52046 +                              reiser4_key * key /* result */ )
52047 +{
52048 +       assert("nikita-772", coord != NULL);
52049 +       assert("nikita-774", coord->node != NULL);
52050 +       assert("nikita-775", znode_is_loaded(coord->node));
52051 +
52052 +       if (item_plugin_by_coord(coord)->b.unit_key != NULL)
52053 +               return item_plugin_by_coord(coord)->b.unit_key(coord, key);
52054 +       else
52055 +               return item_key_by_coord(coord, key);
52056 +}
52057 +
52058 +/* return the biggest key contained in the unit @coord */
52059 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
52060 +                                  reiser4_key * key /* result */ )
52061 +{
52062 +       assert("nikita-772", coord != NULL);
52063 +       assert("nikita-774", coord->node != NULL);
52064 +       assert("nikita-775", znode_is_loaded(coord->node));
52065 +
52066 +       if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
52067 +               return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
52068 +       else
52069 +               return unit_key_by_coord(coord, key);
52070 +}
52071 +
52072 +/* ->max_key_inside() method for items consisting of exactly one key (like
52073 +    stat-data): hands back whatever unit_key_by_coord(@coord, @result) yields */
52074 +static reiser4_key *max_key_inside_single_key(const coord_t *
52075 +                                             coord /* coord of item */ ,
52076 +                                             reiser4_key *
52077 +                                             result /* resulting key */ )
52078 +{
52079 +       assert("nikita-604", coord != NULL);
52080 +
52081 +       /* coord -> key is the starting key of this item and it has to be
52082 +          filled in already; for a one-key item it is also the maximal key */
52083 +       return unit_key_by_coord(coord, result);
52084 +}
52085 +
52086 +/* ->nr_units() method for items that always consist of exactly one unit */
52087 +static pos_in_node_t
52088 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
52089 +{
52090 +       return 1;       /* constant answer, @coord is never looked at */
52091 +}
52092 +
52093 +static int      /* ->paste() method of the "sd" entry in item_plugins[] */
52094 +paste_no_paste(coord_t * coord UNUSED_ARG,
52095 +              reiser4_item_data * data UNUSED_ARG,
52096 +              carry_plugin_info * info UNUSED_ARG)
52097 +{
52098 +       return 0;       /* does nothing: all arguments are unused, result is always 0 */
52099 +}
52100 +
52101 +/* default ->fast_paste() method: unconditionally answers 1 */
52102 +static int
52103 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
52104 +{
52105 +       return 1;       /* @coord is not inspected */
52106 +}
52107 +
52108 +/* non-zero if the item at @item is able to hold @key */
52109 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
52110 +                        const reiser4_key * key /* key to check */ ,
52111 +                        const reiser4_item_data * data /* parameters of item
52112 +                                                        * being created */ )
52113 +{
52114 +       item_plugin *plug;
52115 +       reiser4_key lowest;
52116 +       reiser4_key highest;
52117 +
52118 +       assert("nikita-1658", item != NULL);
52119 +       assert("nikita-1659", key != NULL);
52120 +
52121 +       /* a plugin-specific ->can_contain_key(), when present, decides */
52122 +       plug = item_plugin_by_coord(item);
52123 +       if (plug->b.can_contain_key != NULL)
52124 +               return plug->b.can_contain_key(item, key, data);
52125 +
52126 +       /* generic rule: @key fits if it lies in the closed range from the
52127 +          key of the item up to ->max_key_inside(), which is therefore
52128 +          mandatory for plugins without ->can_contain_key() */
52129 +       assert("nikita-1681", plug->b.max_key_inside != NULL);
52130 +       item_key_by_coord(item, &lowest);
52131 +       plug->b.max_key_inside(item, &highest);
52132 +
52133 +       return keyle(&lowest, key)
52134 +           && keyle(key, &highest);
52135 +}
52136 +
52137 +/* ->mergeable() method for non-mergeable items: unconditionally answers 0 */
52138 +static int
52139 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
52140 +{
52141 +       return 0;       /* neither coord is inspected */
52142 +}
52143 +
52144 +/* return 0 if items at @i1 and @i2 are not mergeable, !0 - otherwise */
52145 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
52146 +                       const coord_t * i2 /* coord of second item */ )
52147 +{
52148 +       item_plugin *iplug;
52149 +       reiser4_key k1;
52150 +       reiser4_key k2;
52151 +
52152 +       assert("nikita-1336", i1 != NULL);
52153 +       assert("nikita-1337", i2 != NULL);
52154 +
52155 +       iplug = item_plugin_by_coord(i1);
52156 +       assert("nikita-1338", iplug != NULL);
52157 +
52158 +       /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
52159 +          shifting code when nodes are in "suspended" state. */
52160 +       assert("nikita-1663",
52161 +              keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
52162 +
52163 +       if (iplug->b.mergeable != NULL) {       /* plugin of @i1 decides */
52164 +               return iplug->b.mergeable(i1, i2);
52165 +       } else if (iplug->b.max_key_inside != NULL) {
52166 +               /* mergeable if ->max_key_inside() >= key of i2. Both keys
52167 +                  are obtained by the calls inside the comparison, so
52168 +                  they are not computed a second time beforehand. */
52169 +
52170 +               return keyge(iplug->b.max_key_inside(i1, &k1),
52171 +                            item_key_by_coord(i2, &k2));
52172 +       } else {
52173 +               item_key_by_coord(i1, &k1);
52174 +               item_key_by_coord(i2, &k2);
52175 +               /* no method to ask: same locality, objectid and item plugin */
52176 +               return
52177 +                   (get_key_locality(&k1) == get_key_locality(&k2)) &&
52178 +                   (get_key_objectid(&k1) == get_key_objectid(&k2))
52179 +                   && (iplug == item_plugin_by_coord(i2));
52180 +       }
52181 +}
52182 +
52183 +int item_is_extent(const coord_t * item)        /* 1 if item id is EXTENT_POINTER_ID, else 0 */
52184 +{
52185 +       assert("vs-482", coord_is_existing_item(item));
52186 +       return item_id_by_coord(item) == EXTENT_POINTER_ID;
52187 +}
52188 +
52189 +int item_is_tail(const coord_t * item)  /* 1 if item id is FORMATTING_ID, else 0 */
52190 +{
52191 +       assert("vs-482", coord_is_existing_item(item)); /* NOTE(review): same label as in item_is_extent() */
52192 +       return item_id_by_coord(item) == FORMATTING_ID;
52193 +}
52194 +
52195 +int item_is_statdata(const coord_t * item)      /* 1 if item type is STAT_DATA_ITEM_TYPE, else 0 */
52196 +{
52197 +       assert("vs-516", coord_is_existing_item(item));
52198 +       return item_type_by_coord(item) == STAT_DATA_ITEM_TYPE; /* compares the item type, not the item id */
52199 +}
52200 +
52201 +int item_is_ctail(const coord_t * item) /* 1 if item id is CTAIL_ID, else 0 */
52202 +{
52203 +       assert("edward-xx", coord_is_existing_item(item));      /* NOTE(review): "xx" looks like a placeholder label - confirm */
52204 +       return item_id_by_coord(item) == CTAIL_ID;
52205 +}
52206 +
52207 +static int change_item(struct inode *inode, reiser4_plugin * plugin)    /* ->change of item_plugin_ops */
52208 +{
52209 +       /* cannot change constituent item (sd, or dir_item): neither argument
52209 is looked at, the call always fails */
52210 +       return RETERR(-EINVAL);
52211 +}
52212 +
52213 +static reiser4_plugin_ops item_plugin_ops = {   /* ->pops of the "sd", "de" and "cde" entries of item_plugins[] */
52214 +       .init = NULL,
52215 +       .load = NULL,
52216 +       .save_len = NULL,
52217 +       .save = NULL,
52218 +       .change = change_item   /* the only operation provided */
52219 +};
52220 +
52221 +item_plugin item_plugins[LAST_ITEM_ID] = {      /* one entry per item plugin, indexed by item id */
52222 +       [STATIC_STAT_DATA_ID] = {       /* stat-data: single key, single unit, never merged */
52223 +               .h = {
52224 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52225 +                       .id = STATIC_STAT_DATA_ID,
52226 +                       .pops = &item_plugin_ops,
52227 +                       .label = "sd",
52228 +                       .desc = "stat-data",
52229 +                       .linkage = {NULL, NULL}
52230 +               },
52231 +               .b = {
52232 +                       .item_type = STAT_DATA_ITEM_TYPE,
52233 +                       .max_key_inside = max_key_inside_single_key,
52234 +                       .can_contain_key = NULL,
52235 +                       .mergeable = not_mergeable,
52236 +                       .nr_units = nr_units_single_unit,
52237 +                       .lookup = NULL,
52238 +                       .init = NULL,
52239 +                       .paste = paste_no_paste,
52240 +                       .fast_paste = NULL,
52241 +                       .can_shift = NULL,
52242 +                       .copy_units = NULL,
52243 +                       .create_hook = NULL,
52244 +                       .kill_hook = NULL,
52245 +                       .shift_hook = NULL,
52246 +                       .cut_units = NULL,
52247 +                       .kill_units = NULL,
52248 +                       .unit_key = NULL,
52249 +                       .max_unit_key = NULL,
52250 +                       .estimate = NULL,
52251 +                       .item_data_by_flow = NULL,
52252 +#if REISER4_DEBUG
52253 +                       .check = NULL
52254 +#endif
52255 +               },
52256 +               .f = {
52257 +                       .utmost_child = NULL,
52258 +                       .utmost_child_real_block = NULL,
52259 +                       .update = NULL,
52260 +                       .scan = NULL,
52261 +                       .convert = NULL
52262 +               },
52263 +               .s = {
52264 +                       .sd = {
52265 +                               .init_inode = init_inode_static_sd,
52266 +                               .save_len = save_len_static_sd,
52267 +                               .save = save_static_sd
52268 +                       }
52269 +               }
52270 +       },
52271 +       [SIMPLE_DIR_ENTRY_ID] = {       /* directory entry, one unit per item */
52272 +               .h = {
52273 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52274 +                       .id = SIMPLE_DIR_ENTRY_ID,
52275 +                       .pops = &item_plugin_ops,
52276 +                       .label = "de",
52277 +                       .desc = "directory entry",
52278 +                       .linkage = {NULL, NULL}
52279 +               },
52280 +               .b = {
52281 +                       .item_type = DIR_ENTRY_ITEM_TYPE,
52282 +                       .max_key_inside = max_key_inside_single_key,
52283 +                       .can_contain_key = NULL,
52284 +                       .mergeable = NULL,      /* are_items_mergeable() then relies on ->max_key_inside */
52285 +                       .nr_units = nr_units_single_unit,
52286 +                       .lookup = NULL,
52287 +                       .init = NULL,
52288 +                       .paste = NULL,
52289 +                       .fast_paste = NULL,
52290 +                       .can_shift = NULL,
52291 +                       .copy_units = NULL,
52292 +                       .create_hook = NULL,
52293 +                       .kill_hook = NULL,
52294 +                       .shift_hook = NULL,
52295 +                       .cut_units = NULL,
52296 +                       .kill_units = NULL,
52297 +                       .unit_key = NULL,
52298 +                       .max_unit_key = NULL,
52299 +                       .estimate = NULL,
52300 +                       .item_data_by_flow = NULL,
52301 +#if REISER4_DEBUG
52302 +                       .check = NULL
52303 +#endif
52304 +               },
52305 +               .f = {
52306 +                       .utmost_child = NULL,
52307 +                       .utmost_child_real_block = NULL,
52308 +                       .update = NULL,
52309 +                       .scan = NULL,
52310 +                       .convert = NULL
52311 +               },
52312 +               .s = {
52313 +                       .dir = {
52314 +                               .extract_key = extract_key_de,
52315 +                               .update_key = update_key_de,
52316 +                               .extract_name = extract_name_de,
52317 +                               .extract_file_type = extract_file_type_de,
52318 +                               .add_entry = add_entry_de,
52319 +                               .rem_entry = rem_entry_de,
52320 +                               .max_name_len = max_name_len_de
52321 +                       }
52322 +               }
52323 +       },
52324 +       [COMPOUND_DIR_ID] = {   /* compressed directory entry */
52325 +               .h = {
52326 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52327 +                       .id = COMPOUND_DIR_ID,
52328 +                       .pops = &item_plugin_ops,
52329 +                       .label = "cde",
52330 +                       .desc = "compressed directory entry",
52331 +                       .linkage = {NULL, NULL}
52332 +               },
52333 +               .b = {
52334 +                       .item_type = DIR_ENTRY_ITEM_TYPE,
52335 +                       .max_key_inside = max_key_inside_cde,
52336 +                       .can_contain_key = can_contain_key_cde,
52337 +                       .mergeable = mergeable_cde,
52338 +                       .nr_units = nr_units_cde,
52339 +                       .lookup = lookup_cde,
52340 +                       .init = init_cde,
52341 +                       .paste = paste_cde,
52342 +                       .fast_paste = agree_to_fast_op,
52343 +                       .can_shift = can_shift_cde,
52344 +                       .copy_units = copy_units_cde,
52345 +                       .create_hook = NULL,
52346 +                       .kill_hook = NULL,
52347 +                       .shift_hook = NULL,
52348 +                       .cut_units = cut_units_cde,
52349 +                       .kill_units = kill_units_cde,
52350 +                       .unit_key = unit_key_cde,
52351 +                       .max_unit_key = unit_key_cde,   /* same method as ->unit_key */
52352 +                       .estimate = estimate_cde,
52353 +                       .item_data_by_flow = NULL,
52354 +#if REISER4_DEBUG
52355 +                       .check = check_cde
52356 +#endif
52357 +               },
52358 +               .f = {
52359 +                       .utmost_child = NULL,
52360 +                       .utmost_child_real_block = NULL,
52361 +                       .update = NULL,
52362 +                       .scan = NULL,
52363 +                       .convert = NULL
52364 +               },
52365 +               .s = {
52366 +                       .dir = {
52367 +                               .extract_key = extract_key_cde,
52368 +                               .update_key = update_key_cde,
52369 +                               .extract_name = extract_name_cde,
52370 +                               .extract_file_type = extract_file_type_de,      /* shared with the "de" entry */
52371 +                               .add_entry = add_entry_cde,
52372 +                               .rem_entry = rem_entry_cde,
52373 +                               .max_name_len = max_name_len_cde
52374 +                       }
52375 +               }
52376 +       },
52377 +       [NODE_POINTER_ID] = {   /* internal item, single unit */
52378 +               .h = {
52379 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52380 +                       .id = NODE_POINTER_ID,
52381 +                       .pops = NULL,
52382 +                       .label = "internal",
52383 +                       .desc = "internal item",
52384 +                       .linkage = {NULL, NULL}
52385 +               },
52386 +               .b = {
52387 +                       .item_type = INTERNAL_ITEM_TYPE,
52388 +                       .max_key_inside = NULL, /* NOTE(review): ->can_contain_key is NULL too, so item_can_contain_key() would trip "nikita-1681" here */
52389 +                       .can_contain_key = NULL,
52390 +                       .mergeable = mergeable_internal,
52391 +                       .nr_units = nr_units_single_unit,
52392 +                       .lookup = lookup_internal,
52393 +                       .init = NULL,
52394 +                       .paste = NULL,
52395 +                       .fast_paste = NULL,
52396 +                       .can_shift = NULL,
52397 +                       .copy_units = NULL,
52398 +                       .create_hook = create_hook_internal,
52399 +                       .kill_hook = kill_hook_internal,
52400 +                       .shift_hook = shift_hook_internal,
52401 +                       .cut_units = NULL,
52402 +                       .kill_units = NULL,
52403 +                       .unit_key = NULL,
52404 +                       .max_unit_key = NULL,
52405 +                       .estimate = NULL,
52406 +                       .item_data_by_flow = NULL,
52407 +#if REISER4_DEBUG
52408 +                       .check = check__internal
52409 +#endif
52410 +               },
52411 +               .f = {
52412 +                       .utmost_child = utmost_child_internal,
52413 +                       .utmost_child_real_block =
52414 +                       utmost_child_real_block_internal,
52415 +                       .update = update_internal,
52416 +                       .scan = NULL,
52417 +                       .convert = NULL
52418 +               },
52419 +               .s = {
52420 +                       .internal = {
52421 +                               .down_link = down_link_internal,
52422 +                               .has_pointer_to = has_pointer_to_internal
52423 +                       }
52424 +               }
52425 +       },
52426 +       [EXTENT_POINTER_ID] = { /* extent item */
52427 +               .h = {
52428 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52429 +                       .id = EXTENT_POINTER_ID,
52430 +                       .pops = NULL,
52431 +                       .label = "extent",
52432 +                       .desc = "extent item",
52433 +                       .linkage = {NULL, NULL}
52434 +               },
52435 +               .b = {
52436 +                       .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52437 +                       .max_key_inside = max_key_inside_extent,
52438 +                       .can_contain_key = can_contain_key_extent,
52439 +                       .mergeable = mergeable_extent,
52440 +                       .nr_units = nr_units_extent,
52441 +                       .lookup = lookup_extent,
52442 +                       .init = NULL,
52443 +                       .paste = paste_extent,
52444 +                       .fast_paste = agree_to_fast_op,
52445 +                       .can_shift = can_shift_extent,
52446 +                       .create_hook = create_hook_extent,
52447 +                       .copy_units = copy_units_extent,
52448 +                       .kill_hook = kill_hook_extent,
52449 +                       .shift_hook = NULL,
52450 +                       .cut_units = cut_units_extent,
52451 +                       .kill_units = kill_units_extent,
52452 +                       .unit_key = unit_key_extent,
52453 +                       .max_unit_key = max_unit_key_extent,
52454 +                       .estimate = NULL,
52455 +                       .item_data_by_flow = NULL,
52456 +#if REISER4_DEBUG
52457 +                       .check = check_extent
52458 +#endif
52459 +               },
52460 +               .f = {
52461 +                       .utmost_child = utmost_child_extent,
52462 +                       .utmost_child_real_block =
52463 +                       utmost_child_real_block_extent,
52464 +                       .update = NULL,
52465 +                       .scan = scan_extent,
52466 +                       .convert = NULL,
52467 +                       .key_by_offset = key_by_offset_extent   /* the only entry of this table that sets ->key_by_offset */
52468 +               },
52469 +               .s = {
52470 +                       .file = {
52471 +                               .write = write_extent,
52472 +                               .read = read_extent,
52473 +                               .readpage = readpage_extent,
52474 +                               .get_block = get_block_address_extent,
52475 +                               .readpages = readpages_extent,
52476 +                               .append_key = append_key_extent,
52477 +                               .init_coord_extension =
52478 +                               init_coord_extension_extent
52479 +                       }
52480 +               }
52481 +       },
52482 +       [FORMATTING_ID] = {     /* the item id tested by item_is_tail() */
52483 +               .h = {
52484 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52485 +                       .id = FORMATTING_ID,
52486 +                       .pops = NULL,
52487 +                       .label = "body",
52488 +                       .desc = "body (or tail?) item",
52489 +                       .linkage = {NULL, NULL}
52490 +               },
52491 +               .b = {
52492 +                       .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52493 +                       .max_key_inside = max_key_inside_tail,
52494 +                       .can_contain_key = can_contain_key_tail,
52495 +                       .mergeable = mergeable_tail,
52496 +                       .nr_units = nr_units_tail,
52497 +                       .lookup = lookup_tail,
52498 +                       .init = NULL,
52499 +                       .paste = paste_tail,
52500 +                       .fast_paste = agree_to_fast_op,
52501 +                       .can_shift = can_shift_tail,
52502 +                       .create_hook = NULL,
52503 +                       .copy_units = copy_units_tail,
52504 +                       .kill_hook = kill_hook_tail,
52505 +                       .shift_hook = NULL,
52506 +                       .cut_units = cut_units_tail,
52507 +                       .kill_units = kill_units_tail,
52508 +                       .unit_key = unit_key_tail,
52509 +                       .max_unit_key = unit_key_tail,  /* same method as ->unit_key */
52510 +                       .estimate = NULL,
52511 +                       .item_data_by_flow = NULL,
52512 +#if REISER4_DEBUG
52513 +                       .check = NULL
52514 +#endif
52515 +               },
52516 +               .f = {
52517 +                       .utmost_child = NULL,
52518 +                       .utmost_child_real_block = NULL,
52519 +                       .update = NULL,
52520 +                       .scan = NULL,
52521 +                       .convert = NULL
52522 +               },
52523 +               .s = {
52524 +                       .file = {
52525 +                               .write = write_tail,
52526 +                               .read = read_tail,
52527 +                               .readpage = readpage_tail,
52528 +                               .get_block = NULL,
52529 +                               .readpages = NULL,
52530 +                               .append_key = append_key_tail,
52531 +                               .init_coord_extension =
52532 +                               init_coord_extension_tail
52533 +                       }
52534 +               }
52535 +       },
52536 +       [CTAIL_ID] = {  /* cryptcompress tail item; reuses several *_tail methods */
52537 +               .h = {
52538 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52539 +                       .id = CTAIL_ID,
52540 +                       .pops = NULL,
52541 +                       .label = "ctail",
52542 +                       .desc = "cryptcompress tail item",
52543 +                       .linkage = {NULL, NULL}
52544 +               },
52545 +               .b = {
52546 +                       .item_type = UNIX_FILE_METADATA_ITEM_TYPE,
52547 +                       .max_key_inside = max_key_inside_tail,  /* shared with FORMATTING_ID */
52548 +                       .can_contain_key = can_contain_key_ctail,
52549 +                       .mergeable = mergeable_ctail,
52550 +                       .nr_units = nr_units_ctail,
52551 +                       .lookup = NULL,
52552 +                       .init = init_ctail,
52553 +                       .paste = paste_ctail,
52554 +                       .fast_paste = agree_to_fast_op,
52555 +                       .can_shift = can_shift_ctail,
52556 +                       .create_hook = create_hook_ctail,
52557 +                       .copy_units = copy_units_ctail,
52558 +                       .kill_hook = kill_hook_ctail,
52559 +                       .shift_hook = shift_hook_ctail,
52560 +                       .cut_units = cut_units_ctail,
52561 +                       .kill_units = kill_units_ctail,
52562 +                       .unit_key = unit_key_tail,      /* shared with FORMATTING_ID */
52563 +                       .max_unit_key = unit_key_tail,  /* same method as ->unit_key */
52564 +                       .estimate = estimate_ctail,
52565 +                       .item_data_by_flow = NULL,
52566 +#if REISER4_DEBUG
52567 +                       .check = check_ctail
52568 +#endif
52569 +               },
52570 +               .f = {
52571 +                       .utmost_child = utmost_child_ctail,
52572 +                       /* FIXME-EDWARD: write this */
52573 +                       .utmost_child_real_block = NULL,
52574 +                       .update = NULL,
52575 +                       .scan = scan_ctail,
52576 +                       .convert = convert_ctail
52577 +               },
52578 +               .s = {
52579 +                       .file = {
52580 +                               .write = NULL,
52581 +                               .read = read_ctail,
52582 +                               .readpage = readpage_ctail,
52583 +                               .get_block = get_block_address_tail,
52584 +                               .readpages = readpages_ctail,
52585 +                               .append_key = append_key_ctail,
52586 +                               .init_coord_extension =
52587 +                               init_coord_extension_tail
52588 +                       }
52589 +               }
52590 +       },
52591 +       [BLACK_BOX_ID] = {      /* only .h and .b are listed; .f and .s are left zero-initialized */
52592 +               .h = {
52593 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
52594 +                       .id = BLACK_BOX_ID,
52595 +                       .pops = NULL,
52596 +                       .label = "blackbox",
52597 +                       .desc = "black box item",
52598 +                       .linkage = {NULL, NULL}
52599 +               },
52600 +               .b = {
52601 +                       .item_type = OTHER_ITEM_TYPE,
52602 +                       .max_key_inside = NULL,
52603 +                       .can_contain_key = NULL,
52604 +                       .mergeable = not_mergeable,
52605 +                       .nr_units = nr_units_single_unit,
52606 +                       /* no need for ->lookup method */
52607 +                       .lookup = NULL,
52608 +                       .init = NULL,
52609 +                       .paste = NULL,
52610 +                       .fast_paste = NULL,
52611 +                       .can_shift = NULL,
52612 +                       .copy_units = NULL,
52613 +                       .create_hook = NULL,
52614 +                       .kill_hook = NULL,
52615 +                       .shift_hook = NULL,
52616 +                       .cut_units = NULL,
52617 +                       .kill_units = NULL,
52618 +                       .unit_key = NULL,
52619 +                       .max_unit_key = NULL,
52620 +                       .estimate = NULL,
52621 +                       .item_data_by_flow = NULL,
52622 +#if REISER4_DEBUG
52623 +                       .check = NULL
52624 +#endif
52625 +               }
52626 +       }
52627 +};
52628 +
52629 +/* Make Linus happy.
52630 +   Local variables:
52631 +   c-indentation-style: "K&R"
52632 +   mode-name: "LC"
52633 +   c-basic-offset: 8
52634 +   tab-width: 8
52635 +   fill-column: 120
52636 +   End:
52637 +*/
52638 diff --git a/fs/reiser4/plugin/item/item.h b/fs/reiser4/plugin/item/item.h
52639 new file mode 100644
52640 index 0000000..9865be3
52641 --- /dev/null
52642 +++ b/fs/reiser4/plugin/item/item.h
52643 @@ -0,0 +1,399 @@
52644 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52645 +
52646 +/* first read balance.c comments before reading this */
52647 +
52648 +/* An item_plugin implements all of the operations required for
52649 +   balancing that are item specific. */
52650 +
52651 +/* an item plugin also implements other operations that are specific to that
52652 +   item.  These go into the item specific operations portion of the item
52653 +   handler, and all of the item specific portions of the item handler are put
52654 +   into a union. */
52655 +
52656 +#if !defined( __REISER4_ITEM_H__ )
52657 +#define __REISER4_ITEM_H__
52658 +
52659 +#include "../../forward.h"
52660 +#include "../plugin_header.h"
52661 +#include "../../dformat.h"
52662 +#include "../../seal.h"
52663 +#include "../../plugin/file/file.h"
52664 +
52665 +#include <linux/fs.h>          /* for struct file, struct inode  */
52666 +#include <linux/mm.h>          /* for struct page */
52667 +#include <linux/dcache.h>      /* for struct dentry */
52668 +
52669 +typedef enum {  /* value of ->b.item_type in each item plugin */
52670 +       STAT_DATA_ITEM_TYPE,    /* "sd" in item_plugins[] */
52671 +       DIR_ENTRY_ITEM_TYPE,    /* "de" and "cde" */
52672 +       INTERNAL_ITEM_TYPE,     /* "internal" */
52673 +       UNIX_FILE_METADATA_ITEM_TYPE,   /* "extent", "body" and "ctail" */
52674 +       OTHER_ITEM_TYPE /* "blackbox" */
52675 +} item_type_id;
52676 +
52677 +/* this is the part of each item plugin that all items are expected to
52678 +   support or at least explicitly fail to support by setting the
52679 +   pointer to null. */
52680 +typedef struct {
52681 +       item_type_id item_type;
52682 +
52683 +       /* operations called by balancing
52684 +
52685 +          It is interesting to consider that some of these item
52686 +          operations could be given sources or targets that are not
52687 +          really items in nodes.  This could be ok/useful.
52688 +
52689 +        */
52690 +       /* maximal key that can _possibly_ be occupied by this item
52691 +
52692 +          When inserting, and node ->lookup() method (called by
52693 +          coord_by_key()) reaches an item after binary search,
52694 +          the  ->max_key_inside() item plugin method is used to determine
52695 +          whether new item should be pasted into existing item
52696 +          (new_key<=max_key_inside()) or new item has to be created
52697 +          (new_key>max_key_inside()).
52698 +
52699 +          For items that occupy exactly one key (like stat-data)
52700 +          this method should return this key. For items that can
52701 +          grow indefinitely (extent, directory item) this should
52702 +          return max_key().
52703 +
52704 +          For example extent with the key
52705 +
52706 +          (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52707 +
52708 +          ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff), and
52709 +        */
52710 +       reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52711 +
52712 +       /* true if item @coord can merge data at @key. */
52713 +       int (*can_contain_key) (const coord_t *, const reiser4_key *,
52714 +                               const reiser4_item_data *);
52715 +       /* mergeable() - check items for mergeability
52716 +
52717 +          Optional method. Returns true if two items can be merged.
52718 +
52719 +        */
52720 +       int (*mergeable) (const coord_t *, const coord_t *);
52721 +
52722 +       /* number of atomic things in an item */
52723 +        pos_in_node_t(*nr_units) (const coord_t *);
52724 +
52725 +       /* search within item for a unit within the item, and return a
52726 +          pointer to it.  This can be used to calculate how many
52727 +          bytes to shrink an item if you use pointer arithmetic and
52728 +          compare to the start of the item body if the item's data
52729 +          are continuous in the node, if the item's data are not
52730 +          continuous in the node, all sorts of other things are maybe
52731 +          going to break as well. */
52732 +        lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52733 +       /* method called by node_plugin->create_item() to initialise new
52734 +          item */
52735 +       int (*init) (coord_t * target, coord_t * from,
52736 +                    reiser4_item_data * data);
52737 +       /* method called (e.g., by resize_item()) to place new data into
52738 +          item when it grows */
52739 +       int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52740 +       /* return true if paste into @coord is allowed to skip
52741 +          carry. That is, if such paste would require any changes
52742 +          at the parent level
52743 +        */
52744 +       int (*fast_paste) (const coord_t *);
52745 +       /* how many but not more than @want units of @source can be
52746 +          shifted into @target node. If pend == append - we try to
52747 +          append last item of @target by first units of @source. If
52748 +          pend == prepend - we try to "prepend" first item in @target
52749 +          by last units of @source. @target node has @free_space
52750 +          bytes of free space. Total size of those units are returned
52751 +          via @size.
52752 +
52753 +          @target is not NULL if shifting to the mergeable item and
52754 +          NULL if new item will be created during shifting.
52755 +        */
52756 +       int (*can_shift) (unsigned free_space, coord_t *,
52757 +                         znode *, shift_direction, unsigned *size,
52758 +                         unsigned want);
52759 +
52760 +       /* starting off @from-th unit of item @source append or
52761 +          prepend @count units to @target. @target has been already
52762 +          expanded by @free_space bytes. That must be exactly what is
52763 +          needed for those items in @target. If @where_is_free_space
52764 +          == SHIFT_LEFT - free space is at the end of @target item,
52765 +          otherwise - it is in the beginning of it. */
52766 +       void (*copy_units) (coord_t *, coord_t *,
52767 +                           unsigned from, unsigned count,
52768 +                           shift_direction where_is_free_space,
52769 +                           unsigned free_space);
52770 +
52771 +       int (*create_hook) (const coord_t *, void *);
52772 +       /* do whatever is necessary to do when @count units starting
52773 +          from @from-th one are removed from the tree */
52774 +       /* FIXME-VS: this is used to be here for, in particular,
52775 +          extents and items of internal type to free blocks they point
52776 +          to at the same time with removing items from a
52777 +          tree. Problems start, however, when dealloc_block fails due
52778 +          to some reason. Item gets removed, but blocks it pointed to
52779 +          are not freed. It is not clear how to fix this for items of
52780 +          internal type because a need to remove internal item may
52781 +          appear in the middle of balancing, and there is no way to
52782 +          undo changes made. OTOH, if space allocator involves
52783 +          balancing to perform dealloc_block - this will probably
52784 +          break balancing due to deadlock issues
52785 +        */
52786 +       int (*kill_hook) (const coord_t *, pos_in_node_t from,
52787 +                         pos_in_node_t count, struct carry_kill_data *);
52788 +       int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52789 +                          znode * _node);
52790 +
52791 +       /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
52792 +          including boundaries. When units are cut from item beginning - move space which gets freed to head of
52793 +          item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
52794 +          item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
52795 +          @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
52796 +        */
52797 +       int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52798 +                         struct carry_cut_data *,
52799 +                         reiser4_key * smallest_removed,
52800 +                         reiser4_key * new_first_key);
52801 +
52802 +       /* like cut_units, except that these units are removed from the
52803 +          tree, not only from a node */
52804 +       int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52805 +                          struct carry_kill_data *,
52806 +                          reiser4_key * smallest_removed,
52807 +                          reiser4_key * new_first);
52808 +
52809 +       /* if @key_of_coord == 1 - returned key of coord, otherwise -
52810 +          key of unit is returned. If @coord is not set to certain
52811 +          unit - ERR_PTR(-ENOENT) is returned */
52812 +       reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52813 +       reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52814 +       /* estimate how much space is needed for paste @data into item at
52815 +          @coord. if @coord==0 - estimate insertion, otherwise - estimate
52816 +          pasting
52817 +        */
52818 +       int (*estimate) (const coord_t *, const reiser4_item_data *);
52819 +
52820 +       /* converts flow @f to item data. @coord == 0 on insert */
52821 +       int (*item_data_by_flow) (const coord_t *, const flow_t *,
52822 +                                 reiser4_item_data *);
52823 +
52824 +       /*void (*show) (struct seq_file *, coord_t *); */
52825 +
52826 +#if REISER4_DEBUG
52827 +       /* used for debugging, every item should have here the most
52828 +          complete possible check of the consistency of the item that
52829 +          the inventor can construct */
52830 +       int (*check) (const coord_t *, const char **error);
52831 +#endif
52832 +
52833 +} balance_ops;
52834 +
52835 +typedef struct {        /* item methods used during flush */
52836 +       /* return the right or left child of @coord, only if it is in memory */
52837 +       int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52838 +
52839 +       /* return whether the right or left child of @coord has a non-fake
52840 +          block number. */
52841 +       int (*utmost_child_real_block) (const coord_t *, sideof side,
52842 +                                       reiser4_block_nr *);
52843 +       /* relocate child at @coord to the block given by the second argument */
52844 +       void (*update) (const coord_t *, const reiser4_block_nr *);
52845 +       /* count unformatted nodes per item for leave relocation policy, etc. */
52846 +       int (*scan) (flush_scan * scan);
52847 +       /* convert item by flush */
52848 +       int (*convert) (flush_pos_t * pos);
52849 +       /* backward mapping from jnode offset to a key. */
52850 +       int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52851 +} flush_ops;
52852 +
52853 +/* operations specific to the directory item */
52854 +typedef struct {
52855 +       /* extract stat-data key from directory entry at @coord and place it
52856 +          into @key. */
52857 +       int (*extract_key) (const coord_t *, reiser4_key * key);
52858 +       /* update object key in item. */
52859 +       int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52860 +       /* extract name from directory entry at @coord and return it (possibly in @buf) */
52861 +       char *(*extract_name) (const coord_t *, char *buf);
52862 +       /* extract file type (DT_* stuff) from directory entry at @coord and
52863 +          return it */
52864 +       unsigned (*extract_file_type) (const coord_t *);
52865 +       int (*add_entry) (struct inode * dir,    /* directory of item */
52866 +                         coord_t *, lock_handle *,
52867 +                         const struct dentry * name,    /* name to add */
52868 +                         reiser4_dir_entry_desc * entry);       /* new entry parameters */
52869 +       int (*rem_entry) (struct inode * dir, const struct qstr * name,
52870 +                         coord_t *, lock_handle *,
52871 +                         reiser4_dir_entry_desc * entry);       /* entry being removed */
52872 +       int (*max_name_len) (const struct inode * dir);
52873 +} dir_entry_ops;
52874 +
52875 +/* operations specific to items regular (unix) file metadata are built of */
52876 +typedef struct {
52877 +       int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52878 +       int (*read) (struct file *, flow_t *, hint_t *);
52879 +       int (*readpage) (void *, struct page *);
52880 +       int (*get_block) (const coord_t *, sector_t, sector_t *);
52881 +       void (*readpages) (void *, struct address_space *,
52882 +                          struct list_head * pages);
52883 +       /*
52884 +        * ->append_key(): key of the first byte which is not addressed by
52885 +        * the item @coord is set to.
52886 +        * For example, for extent item with the key
52887 +        *
52888 +        * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52889 +        *
52890 +        * ->append_key is
52891 +        *
52892 +        * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52893 +        */
52894 +       reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52895 +
52896 +       void (*init_coord_extension) (uf_coord_t *, loff_t);
52897 +} file_ops;
52898 +
52899 +/* operations specific to items of stat data type */
52900 +typedef struct {
52901 +       int (*init_inode) (struct inode * inode, char *sd, int len);     /* on-disk stat-data -> inode */
52902 +       int (*save_len) (struct inode * inode);  /* size of stat-data required to store inode */
52903 +       int (*save) (struct inode * inode, char **area); /* inode -> stat-data at *area */
52904 +} sd_ops;
52905 +
52906 +/* operations specific to internal item */
52907 +typedef struct {
52908 +       /* all that tree traversal wants to know from an internal item is
52909 +          where to go next. */
52910 +       void (*down_link) (const coord_t * coord,
52911 +                          const reiser4_key * key, reiser4_block_nr * block);
52912 +       /* check that given internal item contains given pointer. */
52913 +       int (*has_pointer_to) (const coord_t * coord,
52914 +                              const reiser4_block_nr * block);
52915 +} internal_item_ops;
52916 +
52917 +struct item_plugin {
52918 +       /* generic fields; h.id is the item id (see item_id_by_plugin()) */
52919 +       plugin_header h;
52920 +
52921 +       /* methods common for all item types */
52922 +       balance_ops b;
52923 +       /* methods used during flush */
52924 +       flush_ops f;
52925 +
52926 +       /* methods specific to particular type of item */
52927 +       union {
52928 +               dir_entry_ops dir;      /* directory items */
52929 +               file_ops file;  /* items of regular (unix) files */
52930 +               sd_ops sd;      /* stat data items */
52931 +               internal_item_ops internal;     /* internal items */
52932 +       } s;
52933 +
52934 +};
52935 +
52936 +static inline item_id item_id_by_plugin(item_plugin * plugin)
52937 +{
52938 +       return plugin->h.id;    /* item id is kept in the plugin header */
52939 +}
52940 +
52941 +static inline char get_iplugid(item_plugin * iplug)
52942 +{
52943 +       assert("nikita-2838", iplug != NULL);
52944 +       assert("nikita-2839", iplug->h.id < 0xff);       /* id is narrowed to char below */
52945 +       return (char)item_id_by_plugin(iplug);
52946 +}
52947 +
52948 +extern unsigned long znode_times_locked(const znode * z);
52949 +
52950 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
52951 +{
52952 +       assert("nikita-2837", coord != NULL);
52953 +       assert("nikita-2838", iplug != NULL);
52954 +       coord->iplugid = get_iplugid(iplug);     /* only the plugin id is stored in @coord */
52955 +       ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));       /* checked by coord_iplug() */
52956 +}
52957 +
52958 +static inline item_plugin *coord_iplug(const coord_t * coord)
52959 +{
52960 +       assert("nikita-2833", coord != NULL);
52961 +       assert("nikita-2834", coord->iplugid != INVALID_PLUGID); /* plugin id must be set */
52962 +       assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
52963 +       return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
52964 +                                           coord->iplugid);     /* id -> item plugin */
52965 +}
52966 +
52967 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
52968 +                               const reiser4_item_data *);
52969 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
52970 +extern int item_is_extent(const coord_t *);
52971 +extern int item_is_tail(const coord_t *);
52972 +extern int item_is_statdata(const coord_t * item);
52973 +extern int item_is_ctail(const coord_t *);
52974 +
52975 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
52976 +extern item_type_id item_type_by_coord(const coord_t * coord);
52977 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
52978 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
52979 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
52980 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
52981 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
52982 +                                         reiser4_key * key);
52983 +
52984 +extern void obtain_item_plugin(const coord_t * coord);
52985 +
52986 +#if defined(REISER4_DEBUG)
52987 +extern int znode_is_loaded(const znode * node);
52988 +#endif
52989 +
52990 +/* return plugin of item at @coord */
52991 +static inline item_plugin *item_plugin_by_coord(const coord_t *
52992 +                                               coord /* coord to query */ )
52993 +{
52994 +       assert("nikita-330", coord != NULL);
52995 +       assert("nikita-331", coord->node != NULL);
52996 +       assert("nikita-332", znode_is_loaded(coord->node));
52997 +
52998 +       if (unlikely(!coord_is_iplug_set(coord)))        /* plugin id not set in @coord yet */
52999 +               obtain_item_plugin(coord);
53000 +       return coord_iplug(coord);
53001 +}
53002 +
53003 +/* this returns true if item is of internal type */
53004 +static inline int item_is_internal(const coord_t * item)
53005 +{
53006 +       assert("vs-483", coord_is_existing_item(item));  /* @item must be an existing item */
53007 +       return item_type_by_coord(item) == INTERNAL_ITEM_TYPE;
53008 +}
53009 +
53010 +extern void item_body_by_coord_hard(coord_t * coord);
53011 +extern void *item_body_by_coord_easy(const coord_t * coord);
53012 +#if REISER4_DEBUG
53013 +extern int item_body_is_valid(const coord_t * coord);
53014 +#endif
53015 +
53016 +/* return pointer to item body */
53017 +static inline void *item_body_by_coord(const coord_t *
53018 +                                      coord /* coord to query */ )
53019 +{
53020 +       assert("nikita-324", coord != NULL);
53021 +       assert("nikita-325", coord->node != NULL);
53022 +       assert("nikita-326", znode_is_loaded(coord->node));
53023 +       /* offset not known yet: the _hard variant takes a non-const coord */
53024 +       if (coord->offset == INVALID_OFFSET)
53025 +               item_body_by_coord_hard((coord_t *) coord);
53026 +       assert("nikita-3201", item_body_is_valid(coord));
53027 +       assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
53028 +       return item_body_by_coord_easy(coord);
53029 +}
53030 +
53031 +/* __REISER4_ITEM_H__ */
53032 +#endif
53033 +/* Make Linus happy.
53034 +   Local variables:
53035 +   c-indentation-style: "K&R"
53036 +   mode-name: "LC"
53037 +   c-basic-offset: 8
53038 +   tab-width: 8
53039 +   fill-column: 120
53040 +   scroll-step: 1
53041 +   End:
53042 +*/
53043 diff --git a/fs/reiser4/plugin/item/sde.c b/fs/reiser4/plugin/item/sde.c
53044 new file mode 100644
53045 index 0000000..07a5212
53046 --- /dev/null
53047 +++ b/fs/reiser4/plugin/item/sde.c
53048 @@ -0,0 +1,190 @@
53049 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53050 +
53051 +/* Directory entry implementation */
53052 +#include "../../forward.h"
53053 +#include "../../debug.h"
53054 +#include "../../dformat.h"
53055 +#include "../../kassign.h"
53056 +#include "../../coord.h"
53057 +#include "sde.h"
53058 +#include "item.h"
53059 +#include "../plugin.h"
53060 +#include "../../znode.h"
53061 +#include "../../carry.h"
53062 +#include "../../tree.h"
53063 +#include "../../inode.h"
53064 +
53065 +#include <linux/fs.h>          /* for struct inode */
53066 +#include <linux/dcache.h>      /* for struct dentry */
53067 +#include <linux/quotaops.h>
53068 +
53069 +/* ->extract_key() method of simple directory item plugin. */
53070 +int extract_key_de(const coord_t * coord /* coord of item */ ,
53071 +                  reiser4_key * key /* resulting key */ )
53072 +{
53073 +       directory_entry_format *entry_body;
53074 +
53075 +       assert("nikita-1458", coord != NULL);
53076 +       assert("nikita-1459", key != NULL);
53077 +       /* the object key is rebuilt from the id stored in the entry body */
53078 +       entry_body = item_body_by_coord(coord);
53079 +       assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *entry_body);
53080 +       return extract_key_from_id(&entry_body->id, key);
53081 +}
53082 +
53083 +int
53084 +update_key_de(const coord_t * coord, const reiser4_key * key,
53085 +             lock_handle * lh UNUSED_ARG)
53086 +{
53087 +       directory_entry_format *dent;
53088 +       obj_key_id obj_id;
53089 +       int result;
53090 +
53091 +       assert("nikita-2342", coord != NULL);
53092 +       assert("nikita-2343", key != NULL);
53093 +       /* store id built from @key in the entry; entry is untouched on error */
53094 +       dent = (directory_entry_format *) item_body_by_coord(coord);
53095 +       result = build_obj_key_id(key, &obj_id);
53096 +       if (result == 0) {
53097 +               dent->id = obj_id;
53098 +               znode_make_dirty(coord->node);
53099 +       }
53100 +       return result;  /* 0, or the error of build_obj_key_id() */
53101 +}
53102 +
53103 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
53104 +                       char *buf)
53105 +{
53106 +       reiser4_key entry_key;
53107 +
53108 +       unit_key_by_coord(coord, &entry_key);
53109 +       if (get_key_type(&entry_key) != KEY_FILE_NAME_MINOR)
53110 +               reiser4_print_address("oops", znode_get_block(coord->node));
53111 +       /* long name: taken from the entry body */
53112 +       if (is_longname_key(&entry_key))
53113 +               return (char *)dent->name;
53114 +       /* otherwise the name is recovered from the key; "." is special */
53115 +       if (is_dot_key(&entry_key))
53116 +               return (char *)".";
53117 +       return extract_name_from_key(&entry_key, buf);
53118 +}
53119 +
53120 +/* ->extract_name() method of simple directory item plugin. */
53121 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
53122 +{
53123 +       void *body;
53124 +
53125 +       assert("nikita-1460", coord != NULL);
53126 +       /* item body is the directory entry itself */
53127 +       body = item_body_by_coord(coord);
53128 +       return extract_dent_name(coord, (directory_entry_format *) body, buf);
53129 +}
53130 +
53131 +/* ->extract_file_type() method of simple directory item plugin. */
53132 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
53133 +                                                                * item */ )
53134 +{
53135 +       assert("nikita-1764", coord != NULL);
53136 +       /* we don't store file type in the directory entry yet, so
53137 +          DT_UNKNOWN is returned for every entry.
53138 +          But see comments at kassign.h:obj_key_id
53139 +        */
53140 +       return DT_UNKNOWN;
53141 +}
53142 +
53143 +int add_entry_de(struct inode *dir /* directory of item */ ,
53144 +                coord_t * coord /* coord of item */ ,
53145 +                lock_handle * lh /* insertion lock handle */ ,
53146 +                const struct dentry *de /* name to add */ ,
53147 +                reiser4_dir_entry_desc * entry /* parameters of new directory
53148 +                                                * entry */ )
53149 +{
53150 +       reiser4_item_data data;
53151 +       directory_entry_format *dent;
53152 +       int result;
53153 +       const char *name;
53154 +       int len;
53155 +       int longname;
53156 +
53157 +       name = de->d_name.name;
53158 +       len = de->d_name.len;
53159 +       assert("nikita-1163", strlen(name) == len);
53160 +       longname = is_longname(name, len);
53161 +
53162 +       data.length = sizeof *dent;
53163 +       if (longname)
53164 +               data.length += len + 1;
53165 +       data.data = NULL;
53166 +       data.user = 0;
53167 +       data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
53168 +
53169 +       /* NOTE-NIKITA quota plugin */
53170 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
53171 +               return -EDQUOT;
53172 +       /* insert an empty item, then fill it in; undo the charge on failure */
53173 +       result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
53174 +       if (result != 0) {
53175 +               DQUOT_FREE_SPACE_NODIRTY(dir, data.length);
53176 +               return result;
53177 +       }
53178 +       dent = (directory_entry_format *) item_body_by_coord(coord);
53179 +       build_inode_key_id(entry->obj, &dent->id);
53180 +       if (longname) {
53181 +               memcpy(dent->name, name, len);
53182 +               put_unaligned(0, &dent->name[len]);
53183 +       }
53184 +       return 0;
53185 +}
53186 +
53187 +int rem_entry_de(struct inode *dir /* directory of item */ ,
53188 +                const struct qstr *name UNUSED_ARG,
53189 +                coord_t * coord /* coord of item */ ,
53190 +                lock_handle * lh UNUSED_ARG    /* lock handle for
53191 +                                                * removal */ ,
53192 +                reiser4_dir_entry_desc * entry UNUSED_ARG      /* parameters of
53193 +                                                                * directory entry
53194 +                                                                * being removed */ )
53195 +{
53196 +       coord_t shadow;
53197 +       int result;
53198 +       int length;
53199 +
53200 +       length = item_length_by_coord(coord);    /* bytes to give back to quota */
53201 +       if (inode_get_bytes(dir) < length) {
53202 +               warning("nikita-2627", "Dir is broke: %llu: %llu",
53203 +                       (unsigned long long)get_inode_oid(dir),
53204 +                       inode_get_bytes(dir));
53205 +
53206 +               return RETERR(-EIO);
53207 +       }
53208 +
53209 +       /* cut_node() is supposed to take pointers to _different_
53210 +          coords, because it will modify them without respect to
53211 +          possible aliasing. To work around this, create temporary copy
53212 +          of @coord (@shadow) and pass both to kill_node_content().
53213 +        */
53214 +       coord_dup(&shadow, coord);
53215 +       result =
53216 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
53217 +       if (result == 0) {
53218 +               /* NOTE-NIKITA quota plugin: release the bytes the item occupied */
53219 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
53220 +       }
53221 +       return result;
53222 +}
53223 +
53224 +int max_name_len_de(const struct inode *dir)
53225 +{
53226 +       return tree_by_inode(dir)->nplug->max_item_size() -      /* largest item of the node plugin */
53227 +           sizeof(directory_entry_format) - 2;  /* NOTE(review): meaning of the extra 2 bytes is not shown here - confirm */
53228 +}
53229 +
53230 +/* Make Linus happy.
53231 +   Local variables:
53232 +   c-indentation-style: "K&R"
53233 +   mode-name: "LC"
53234 +   c-basic-offset: 8
53235 +   tab-width: 8
53236 +   fill-column: 120
53237 +   End:
53238 +*/
53239 diff --git a/fs/reiser4/plugin/item/sde.h b/fs/reiser4/plugin/item/sde.h
53240 new file mode 100644
53241 index 0000000..f26762a
53242 --- /dev/null
53243 +++ b/fs/reiser4/plugin/item/sde.h
53244 @@ -0,0 +1,66 @@
53245 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53246 +
53247 +/* Directory entry. */
53248 +
53249 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
53250 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
53251 +
53252 +#include "../../forward.h"
53253 +#include "../../dformat.h"
53254 +#include "../../kassign.h"
53255 +#include "../../key.h"
53256 +
53257 +#include <linux/fs.h>
53258 +#include <linux/dcache.h>      /* for struct dentry */
53259 +
53260 +typedef struct directory_entry_format {
53261 +       /* key of object stat-data. It's not necessary to store whole
53262 +          key here, because it's always key of stat-data, so minor
53263 +          packing locality and offset can be omitted here. But this
53264 +          relies on particular key allocation scheme for stat-data, so,
53265 +          for extensibility sake, whole key can be stored here.
53266 +
53267 +          We store key as array of bytes, because we don't want 8-byte
53268 +          alignment of dir entries.
53269 +        */
53270 +       obj_key_id id;
53271 +       /* file name. Null terminated string. Written only for long names. */
53272 +       d8 name[0];
53273 +} directory_entry_format;
53274 +
53275 +void print_de(const char *prefix, coord_t * coord);
53276 +int extract_key_de(const coord_t * coord, reiser4_key * key);
53277 +int update_key_de(const coord_t * coord, const reiser4_key * key,
53278 +                 lock_handle * lh);
53279 +char *extract_name_de(const coord_t * coord, char *buf);
53280 +unsigned extract_file_type_de(const coord_t * coord);
53281 +int add_entry_de(struct inode *dir, coord_t * coord,
53282 +                lock_handle * lh, const struct dentry *name,
53283 +                reiser4_dir_entry_desc * entry);
53284 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
53285 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
53286 +int max_name_len_de(const struct inode *dir);
53287 +
53288 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
53289 +
53290 +char *extract_dent_name(const coord_t * coord,
53291 +                       directory_entry_format * dent, char *buf);
53292 +
53293 +#if REISER4_LARGE_KEY
53294 +#define DE_NAME_BUF_LEN (24)
53295 +#else
53296 +#define DE_NAME_BUF_LEN (16)
53297 +#endif
53298 +
53299 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
53300 +#endif
53301 +
53302 +/* Make Linus happy.
53303 +   Local variables:
53304 +   c-indentation-style: "K&R"
53305 +   mode-name: "LC"
53306 +   c-basic-offset: 8
53307 +   tab-width: 8
53308 +   fill-column: 120
53309 +   End:
53310 +*/
53311 diff --git a/fs/reiser4/plugin/item/static_stat.c b/fs/reiser4/plugin/item/static_stat.c
53312 new file mode 100644
53313 index 0000000..f53a5aa
53314 --- /dev/null
53315 +++ b/fs/reiser4/plugin/item/static_stat.c
53316 @@ -0,0 +1,1038 @@
53317 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53318 +
53319 +/* stat data manipulation. */
53320 +
53321 +#include "../../forward.h"
53322 +#include "../../super.h"
53323 +#include "../../vfs_ops.h"
53324 +#include "../../inode.h"
53325 +#include "../../debug.h"
53326 +#include "../../dformat.h"
53327 +#include "../object.h"
53328 +#include "../plugin.h"
53329 +#include "../plugin_header.h"
53330 +#include "static_stat.h"
53331 +#include "item.h"
53332 +
53333 +#include <linux/types.h>
53334 +#include <linux/fs.h>
53335 +
53336 +/* see static_stat.h for explanation */
53337 +
53338 +/* helper function used while we are dumping/loading inode/plugin state
53339 +    to/from the stat-data. */
53340 +
53341 +static void move_on(int *length /* space remaining in stat-data */ ,
53342 +                   char **area /* current coord in stat data */ ,
53343 +                   int size_of /* how many bytes to move forward */ )
53344 +{
53345 +       assert("nikita-615", length != NULL);
53346 +       assert("nikita-616", area != NULL);
53347 +
53348 +       *area = *area + size_of;
53349 +       *length = *length - size_of;
53350 +       /* callers must not step past the end of stat-data */
53351 +       assert("nikita-617", *length >= 0);
53352 +}
53353 +
53354 +/* helper function used while loading inode/plugin state from stat-data.
53355 +    Complain if there is less space in stat-data than was expected.
53356 +    Can only happen on disk corruption. */
53357 +static int not_enough_space(struct inode *inode /* object being processed */ ,
53358 +                           const char *where /* error message */ )
53359 +{
53360 +       assert("nikita-618", inode != NULL);
53361 +
53362 +       warning("nikita-619", "Not enough space in %llu while loading %s",
53363 +               (unsigned long long)get_inode_oid(inode), where);
53364 +
53365 +       return RETERR(-EINVAL); /* always fails: callers return this value */
53366 +}
53367 +
53368 +/* helper function used while loading inode/plugin state from
53369 +    stat-data. Call it if invalid plugin id was found. */
53370 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
53371 +                         struct inode *inode /* object being processed */ )
53372 +{
53373 +       warning("nikita-620", "Unknown plugin %i in %llu",
53374 +               id, (unsigned long long)get_inode_oid(inode));
53375 +
53376 +       return RETERR(-EINVAL); /* always fails with -EINVAL after logging */
53377 +}
53378 +
53379 +/* this is installed as ->init_inode() method of
53380 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
53381 +    Copies on-disk stat-data into inode, handling stat-data extensions.
53382 +    Returns 0, -EINVAL on malformed stat-data, or an extension method's error. */
53383 +/* was sd_load */
53384 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
53385 +                        char *sd /* stat-data body */ ,
53386 +                        int len /* length of stat-data */ )
53387 +{
53388 +       int result;
53389 +       int bit;
53390 +       int chunk;
53391 +       __u16 mask;
53392 +       __u64 bigmask;
53393 +       reiser4_stat_data_base *sd_base;
53394 +       reiser4_inode *state;
53395 +
53396 +       assert("nikita-625", inode != NULL);
53397 +       assert("nikita-626", sd != NULL);
53398 +
53399 +       result = 0;
53400 +       sd_base = (reiser4_stat_data_base *) sd;
53401 +       state = reiser4_inode_data(inode);
53402 +       mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
53403 +       bigmask = mask;
53404 +       inode_set_flag(inode, REISER4_SDLEN_KNOWN);
53405 +
53406 +       move_on(&len, &sd, sizeof *sd_base);
53407 +       for (bit = 0, chunk = 0;
53408 +            mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
53409 +            ++bit, mask >>= 1) {
53410 +               if (((bit + 1) % 16) != 0) {
53411 +                       /* handle extension */
53412 +                       sd_ext_plugin *sdplug;
53413 +
53414 +                       if (bit >= LAST_SD_EXTENSION) {
53415 +                               warning("vpf-1904",
53416 +                                       "No such extension %i in inode %llu",
53417 +                                       bit,
53418 +                                       (unsigned long long)
53419 +                                       get_inode_oid(inode));
53420 +
53421 +                               result = RETERR(-EINVAL);
53422 +                               break;
53423 +                       }
53424 +
53425 +                       sdplug = sd_ext_plugin_by_id(bit);
53426 +                       if (sdplug == NULL) {
53427 +                               warning("nikita-627",
53428 +                                       "No such extension %i in inode %llu",
53429 +                                       bit,
53430 +                                       (unsigned long long)
53431 +                                       get_inode_oid(inode));
53432 +
53433 +                               result = RETERR(-EINVAL);
53434 +                               break;
53435 +                       }
53436 +                       if (mask & 1) {
53437 +                               assert("nikita-628", sdplug->present);
53438 +                               /* alignment is not supported in node layout
53439 +                                  plugin yet.
53440 +                                  result = align( inode, &len, &sd,
53441 +                                  sdplug -> alignment );
53442 +                                  if( result != 0 )
53443 +                                  return result; */
53444 +                               result = sdplug->present(inode, &sd, &len);
53445 +                       } else if (sdplug->absent != NULL)
53446 +                               result = sdplug->absent(inode);
53447 +                       if (result)
53448 +                               break;
53449 +                       /* else, we are looking at the last bit in 16-bit
53450 +                          portion of bitmask */
53451 +               } else if (mask & 1) {
53452 +                       /* next portion of bitmask */
53453 +                       if (len < (int)sizeof(d16)) {
53454 +                               warning("nikita-629",
53455 +                                       "No space for bitmap in inode %llu",
53456 +                                       (unsigned long long)
53457 +                                       get_inode_oid(inode));
53458 +
53459 +                               result = RETERR(-EINVAL);
53460 +                               break;
53461 +                       }
53462 +                       mask = le16_to_cpu(get_unaligned((d16 *)sd));
53463 +                       bigmask <<= 16;
53464 +                       bigmask |= mask;
53465 +                       move_on(&len, &sd, sizeof(d16));
53466 +                       ++chunk;
53467 +                       if (chunk == 3) {
53468 +                               if (!(mask & 0x8000)) {
53469 +                                       /* clear last bit */
53470 +                                       mask &= ~0x8000;
53471 +                                       continue;
53472 +                               }
53473 +                               /* too much */
53474 +                               warning("nikita-630",
53475 +                                       "Too many extensions in %llu",
53476 +                                       (unsigned long long)
53477 +                                       get_inode_oid(inode));
53478 +
53479 +                               result = RETERR(-EINVAL);
53480 +                               break;
53481 +                       }
53482 +               } else
53483 +                       /* bitmask exhausted */
53484 +                       break;
53485 +       }
53486 +       state->extmask = bigmask;
53487 +       if (len - (int)(bit / 16 * sizeof(d16)) > 0) {
53488 +               /* alignment in save_len_static_sd() is taken into account
53489 +                  -edward; compared as int so a short stat-data cannot wrap */
53490 +               warning("nikita-631", "unused space in inode %llu",
53491 +                       (unsigned long long)get_inode_oid(inode));
53492 +       }
53493 +
53494 +       return result;
53495 +}
53496 +
53497 +/* Estimates size of stat-data required to store @inode: base stat-data plus
53498 +    ->save_len() of each extension set in extmask. Installed as ->save_len() of
53499 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53500 +/* was sd_len */
53501 +int save_len_static_sd(struct inode *inode /* object being processed */ )
53502 +{
53503 +       unsigned int result;
53504 +       __u64 mask;
53505 +       int bit;
53506 +
53507 +       assert("nikita-632", inode != NULL);
53508 +
53509 +       result = sizeof(reiser4_stat_data_base);
53510 +       mask = reiser4_inode_data(inode)->extmask;
53511 +       for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53512 +               if (mask & 1) {
53513 +                       sd_ext_plugin *sdplug;
53514 +
53515 +                       sdplug = sd_ext_plugin_by_id(bit);
53516 +                       assert("nikita-633", sdplug != NULL);
53517 +                       /* no alignment support
53518 +                          result +=
53519 +                          round_up( result, sdplug -> alignment ) - result; */
53520 +                       result += sdplug->save_len(inode);
53521 +               }
53522 +       }
53523 +       result += bit / 16 * sizeof(d16); /* one d16 per 16 mask bits scanned */
53524 +       return result;
53525 +}
53526 +
53527 +/* Saves @inode into stat-data at *area: the base stat-data first, then each
53528 +    extension set in extmask; *area is advanced past the data written.
53529 +    Installed as ->save() of item_plugins[ STATIC_STAT_DATA_IT ] (item.c). */
53530 +/* was sd_save */
53531 +int save_static_sd(struct inode *inode /* object being processed */ ,
53532 +                  char **area /* where to save stat-data */ )
53533 +{
53534 +       int result;
53535 +       __u64 emask;
53536 +       int bit;
53537 +       unsigned int len;
53538 +       reiser4_stat_data_base *sd_base;
53539 +
53540 +       assert("nikita-634", inode != NULL);
53541 +       assert("nikita-635", area != NULL);
53542 +
53543 +       result = 0;
53544 +       emask = reiser4_inode_data(inode)->extmask;
53545 +       sd_base = (reiser4_stat_data_base *) * area;
53546 +       put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53547 +       /* low 16 bits of the extension mask live in the base stat-data */
53548 +
53549 +       *area += sizeof *sd_base;
53550 +       len = 0xffffffffu;
53551 +       for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53552 +               if (emask & 1) {
53553 +                       if ((bit + 1) % 16 != 0) {
53554 +                               sd_ext_plugin *sdplug;
53555 +                               sdplug = sd_ext_plugin_by_id(bit);
53556 +                               assert("nikita-636", sdplug != NULL);
53557 +                               /* no alignment support yet
53558 +                                  align( inode, &len, area,
53559 +                                  sdplug -> alignment ); */
53560 +                               result = sdplug->save(inode, area);
53561 +                               if (result)
53562 +                                       break;
53563 +                       } else {
53564 +                               put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53565 +                                             (d16 *)(*area));
53566 +                               /* every 16th bit, when set, stores a further
53567 +                                  d16 of mask here instead of an extension */
53568 +                               *area += sizeof(d16);
53569 +                       }
53570 +               }
53571 +       }
53572 +       return result;
53573 +}
53574 +
53575 +/* stat-data extension handling functions. */
53576 +
53577 +static int present_lw_sd(struct inode *inode /* inode to load lw stat into */ ,
53578 +                        char **area /* position in stat-data */ ,
53579 +                        int *len /* remaining length */ )
53580 +{
53581 +       if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53582 +               reiser4_light_weight_stat *sd_lw;
53583 +
53584 +               sd_lw = (reiser4_light_weight_stat *) * area;
53585 +               /* file type S_IFREG | S_IFIFO on disk means REISER4_PART_MIXED */
53586 +               inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53587 +               inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53588 +               inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53589 +               if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53590 +                       inode->i_mode &= ~S_IFIFO;
53591 +                       warning("", "partially converted file is encountered");
53592 +                       inode_set_flag(inode, REISER4_PART_MIXED);
53593 +               }
53594 +               move_on(len, area, sizeof *sd_lw);
53595 +               return 0;
53596 +       } else
53597 +               return not_enough_space(inode, "lw sd");
53598 +}
53599 +
53600 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG       /* unused: size
53601 +                                                                * is constant */ )
53602 +{
53603 +       return sizeof(reiser4_light_weight_stat);
53604 +}
53605 +
53606 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
53607 +                     char **area /* position in stat-data */ )
53608 +{
53609 +       reiser4_light_weight_stat *sd;
53610 +       mode_t delta;
53611 +
53612 +       assert("nikita-2705", inode != NULL);
53613 +       assert("nikita-2706", area != NULL);
53614 +       assert("nikita-2707", *area != NULL);
53615 +
53616 +       sd = (reiser4_light_weight_stat *) * area;
53617 +       /* REISER4_PART_MIXED is recorded on disk as S_IFIFO or-ed into mode */
53618 +       delta = (inode_get_flag(inode, REISER4_PART_MIXED) ? S_IFIFO : 0);
53619 +       put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53620 +       put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53621 +       put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53622 +       *area += sizeof *sd;
53623 +       return 0;
53624 +}
53625 +
53626 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
53627 +                          char **area /* position in stat-data */ ,
53628 +                          int *len /* remaining length */ )
53629 +{
53630 +       assert("nikita-637", inode != NULL);
53631 +       assert("nikita-638", area != NULL);
53632 +       assert("nikita-639", *area != NULL);
53633 +       assert("nikita-640", len != NULL);
53634 +       assert("nikita-641", *len > 0);
53635 +
53636 +       if (*len >= (int)sizeof(reiser4_unix_stat)) {
53637 +               reiser4_unix_stat *sd;
53638 +
53639 +               sd = (reiser4_unix_stat *) * area;
53640 +               /* load uid, gid, times (seconds only) and rdev or bytes */
53641 +               inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53642 +               inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53643 +               inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53644 +               inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53645 +               inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53646 +               if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53647 +                       inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53648 +               else
53649 +                       inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53650 +               move_on(len, area, sizeof *sd);
53651 +               return 0;
53652 +       } else
53653 +               return not_enough_space(inode, "unix sd");
53654 +}
53655 +
53656 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
53657 +{
53658 +       inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53659 +       inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53660 +       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53661 +       inode_set_bytes(inode, inode->i_size);
53662 +       /* values above are defaults. Mark inode as lightweight, so that caller
53663 +          (reiser4_lookup) will complete it by copying [ug]id from a parent. */
53664 +       inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53665 +       return 0;
53666 +}
53667 +
53668 +/* Length of unix extension is constant. Audited by: green(2002.06.14) */
53669 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG     /* object being
53670 +                                                                * processed */ )
53671 +{
53672 +       return sizeof(reiser4_unix_stat);
53673 +}
53674 +
53675 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
53676 +                       char **area /* position in stat-data */ )
53677 +{
53678 +       reiser4_unix_stat *sd;
53679 +
53680 +       assert("nikita-642", inode != NULL);
53681 +       assert("nikita-643", area != NULL);
53682 +       assert("nikita-644", *area != NULL);
53683 +       /* times are stored as 32-bit seconds; tv_nsec is not saved here */
53684 +       sd = (reiser4_unix_stat *) * area;
53685 +       put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53686 +       put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53687 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53688 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53689 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53690 +       if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53691 +               put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53692 +       else
53693 +               put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53694 +       *area += sizeof *sd;
53695 +       return 0;
53696 +}
53697 +
53698 +static int
53699 +present_large_times_sd(struct inode *inode /* object being processed */ ,
53700 +                      char **area /* position in stat-data */ ,
53701 +                      int *len /* remaining length */ )
53702 +{
53703 +       if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53704 +               reiser4_large_times_stat *sd_lt;
53705 +
53706 +               sd_lt = (reiser4_large_times_stat *) * area;
53707 +               /* this extension supplies only the tv_nsec part of the times */
53708 +               inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53709 +               inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53710 +               inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53711 +
53712 +               move_on(len, area, sizeof *sd_lt);
53713 +               return 0;
53714 +       } else
53715 +               return not_enough_space(inode, "large times sd");
53716 +}
53717 +
53718 +static int
53719 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
53720 +                       /* unused: length of this extension is constant */ )
53721 +{
53722 +       return sizeof(reiser4_large_times_stat);
53723 +}
53724 +
53725 +static int
53726 +save_large_times_sd(struct inode *inode /* object being processed */ ,
53727 +                   char **area /* position in stat-data */ )
53728 +{
53729 +       reiser4_large_times_stat *sd;
53730 +
53731 +       assert("nikita-2817", inode != NULL);
53732 +       assert("nikita-2818", area != NULL);
53733 +       assert("nikita-2819", *area != NULL);
53734 +
53735 +       sd = (reiser4_large_times_stat *) * area;
53736 +       /* store tv_nsec of each time as 32-bit little-endian value */
53737 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53738 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53739 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53740 +
53741 +       *area += sizeof *sd;
53742 +       return 0;
53743 +}
53744 +
53745 +/* symlink stat data extension */
53746 +
53747 +/* copy @len bytes of @target to a new 0-terminated buffer in inode->i_private */
53748 +static int
53749 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
53750 +{
53751 +       assert("vs-845", inode->i_private == NULL);
53752 +       assert("vs-846", !inode_get_flag(inode, REISER4_GENERIC_PTR_USED));
53753 +
53754 +       /* FIXME-VS: this is prone to deadlock. Not more than other similar
53755 +          places, though */
53756 +       inode->i_private = kmalloc((size_t) len + 1, get_gfp_mask());
53757 +       if (!inode->i_private)
53758 +               return RETERR(-ENOMEM);
53759 +       /* REISER4_GENERIC_PTR_USED is set only once the copy is in place */
53760 +       memcpy((char *)(inode->i_private), target, (size_t) len);
53761 +       ((char *)(inode->i_private))[len] = 0;
53762 +       inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53763 +       return 0;
53764 +}
53765 +
53766 +/* this is called on read_inode. Checks that the symlink body fits in the
53767 +   item and is zero terminated, then copies it to inode->i_private */
53768 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
53769 +{
53770 +       int result;
53771 +       int length;
53772 +       reiser4_symlink_stat *sd;
53773 +
53774 +       length = (int)inode->i_size;
53775 +       /*
53776 +        * *len is number of bytes in stat data item from *area to the end of
53777 +        * item. It must be not less than size of symlink + 1 for ending 0
53778 +        */
53779 +       if (length < 0 || length >= *len)
53780 +               return not_enough_space(inode, "symlink");
53781 +
53782 +       if (*(*area + length) != 0) {
53783 +               warning("vs-840", "Symlink is not zero terminated");
53784 +               return RETERR(-EIO);
53785 +       }
53786 +
53787 +       sd = (reiser4_symlink_stat *) * area;
53788 +       result = symlink_target_to_inode(inode, sd->body, length);
53789 +
53790 +       move_on(len, area, length + 1);
53791 +       return result;
53792 +}
53793 +
53794 +static int save_len_symlink_sd(struct inode *inode)
53795 +{
53796 +       return inode->i_size + 1; /* symlink body plus the ending 0 */
53797 +}
53798 +
53799 +/* this is called on create and update stat data. On create copies the target
53800 +   into inode and into stat data; on update does nothing but move @area */
53801 +static int save_symlink_sd(struct inode *inode, char **area)
53802 +{
53803 +       int result;
53804 +       int length;
53805 +       reiser4_symlink_stat *sd;
53806 +
53807 +       length = (int)inode->i_size;
53808 +       /* inode->i_size must be set already */
53809 +       assert("vs-841", length);
53810 +
53811 +       result = 0;
53812 +       sd = (reiser4_symlink_stat *) * area;
53813 +       if (!inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53814 +               const char *target;
53815 +
53816 +               target = (const char *)(inode->i_private);
53817 +               inode->i_private = NULL;
53818 +               /* NOTE(review): on -ENOMEM the body is still written below */
53819 +               result = symlink_target_to_inode(inode, target, length);
53820 +
53821 +               /* copy symlink to stat data */
53822 +               memcpy(sd->body, target, (size_t) length);
53823 +               (*area)[length] = 0;
53824 +       } else {
53825 +               /* there is nothing to do in update but move area */
53826 +               assert("vs-844",
53827 +                      !memcmp(inode->i_private, sd->body,
53828 +                              (size_t) length + 1));
53829 +       }
53830 +
53831 +       *area += (length + 1);
53832 +       return result;
53833 +}
53834 +
53835 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
53836 +                           char **area /* position in stat-data */ ,
53837 +                           int *len /* remaining length */ )
53838 +{
53839 +       assert("nikita-645", inode != NULL);
53840 +       assert("nikita-646", area != NULL);
53841 +       assert("nikita-647", *area != NULL);
53842 +       assert("nikita-648", len != NULL);
53843 +       assert("nikita-649", *len > 0);
53844 +       /* extension holds one 32-bit word that is loaded into i_flags */
53845 +       if (*len >= (int)sizeof(reiser4_flags_stat)) {
53846 +               reiser4_flags_stat *sd;
53847 +
53848 +               sd = (reiser4_flags_stat *) * area;
53849 +               inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53850 +               move_on(len, area, sizeof *sd);
53851 +               return 0;
53852 +       } else
53853 +               return not_enough_space(inode, "generation and attrs");
53854 +}
53855 +
53856 +/* Length of flags extension is constant. Audited by: green(2002.06.14) */
53857 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG    /* object being
53858 +                                                                * processed */ )
53859 +{
53860 +       return sizeof(reiser4_flags_stat);
53861 +}
53862 +
53863 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
53864 +                        char **area /* position in stat-data */ )
53865 +{
53866 +       reiser4_flags_stat *sd;
53867 +
53868 +       assert("nikita-650", inode != NULL);
53869 +       assert("nikita-651", area != NULL);
53870 +       assert("nikita-652", *area != NULL);
53871 +       /* store i_flags as 32-bit little-endian word and step over it */
53872 +       sd = (reiser4_flags_stat *) * area;
53873 +       put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53874 +       *area += sizeof *sd;
53875 +       return 0;
53876 +}
53877 +
53878 +static int absent_plugin_sd(struct inode *inode);
53879 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53880 +                            char **area /* position in stat-data */ ,
53881 +                            int *len /* remaining length */ )
53882 +{
53883 +       reiser4_plugin_stat *sd;
53884 +       reiser4_plugin *plugin;
53885 +       int i;
53886 +       __u16 mask;
53887 +       int result;
53888 +       int num_of_plugins;
53889 +       /* load plugin extension: a slot count, then one slot per plugin */
53890 +       assert("nikita-653", inode != NULL);
53891 +       assert("nikita-654", area != NULL);
53892 +       assert("nikita-655", *area != NULL);
53893 +       assert("nikita-656", len != NULL);
53894 +       assert("nikita-657", *len > 0);
53895 +
53896 +       if (*len < (int)sizeof(reiser4_plugin_stat))
53897 +               return not_enough_space(inode, "plugin");
53898 +
53899 +       sd = (reiser4_plugin_stat *) * area;
53900 +
53901 +       mask = 0;
53902 +       num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
53903 +       move_on(len, area, sizeof *sd);
53904 +       result = 0;
53905 +       for (i = 0; i < num_of_plugins; ++i) {
53906 +               reiser4_plugin_slot *slot;
53907 +               reiser4_plugin_type type;
53908 +               pset_member memb;
53909 +
53910 +               slot = (reiser4_plugin_slot *) * area;
53911 +               if (*len < (int)sizeof *slot)
53912 +                       return not_enough_space(inode, "additional plugin");
53913 +
53914 +               memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
53915 +               type = pset_member_to_type_unsafe(memb);
53916 +               if (type == REISER4_PLUGIN_TYPES) {
53917 +                       warning("nikita-3502",
53918 +                               "wrong pset member (%i) for %llu", memb,
53919 +                               (unsigned long long)get_inode_oid(inode));
53920 +                       return RETERR(-EINVAL);
53921 +               }
53922 +               plugin = plugin_by_disk_id(tree_by_inode(inode),
53923 +                                          type, &slot->id);
53924 +               if (plugin == NULL)
53925 +                       return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
53926 +
53927 +               /* plugin is loaded into inode, mark this into inode's
53928 +                  bitmask of loaded non-standard plugins */
53929 +               if (!(mask & (1 << memb))) {
53930 +                       mask |= (1 << memb);
53931 +               } else {
53932 +                       warning("nikita-658", "duplicate plugin for %llu",
53933 +                               (unsigned long long)get_inode_oid(inode));
53934 +                       return RETERR(-EINVAL);
53935 +               }
53936 +               move_on(len, area, sizeof *slot);
53937 +               /* load plugin data, if any; stop at the first failure */
53938 +               if (plugin->h.pops != NULL && plugin->h.pops->load)
53939 +                       result = plugin->h.pops->load(inode, plugin, area, len);
53940 +               else
53941 +                       result = grab_plugin_from(inode, memb, plugin);
53942 +               if (result != 0)
53943 +                       return result;
53944 +       }
53945 +       /* if object plugin wasn't loaded from stat-data, guess it by
53946 +          mode bits */
53947 +       plugin = file_plugin_to_plugin(inode_file_plugin(inode));
53948 +       if (plugin == NULL)
53949 +               result = absent_plugin_sd(inode);
53950 +
53951 +       reiser4_inode_data(inode)->plugin_mask = mask;
53952 +       return result;
53953 +}
53954 +
53955 +/* Determine object plugin for @inode based on i_mode.
53956 +
53957 +   Many objects in reiser4 file system are controlled by standard object
53958 +   plugins that emulate traditional unix objects: unix file, directory,
53959 +   symlink, fifo, and so on. For such files we don't explicitly store plugin
53960 +   id in object stat data. Rather, the required plugin is guessed from mode
53961 +   bits, where file "type" is encoded (see stat(2)). Directories also get
53962 +   the hashed directory plugin. Returns 0, or -EIO on unknown file type.
53963 +*/
53964 +static int
53965 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
53966 +{
53967 +       int fplug_id;
53968 +       int dplug_id;
53969 +       reiser4_inode *info;
53970 +
53971 +       assert("nikita-736", inode != NULL);
53972 +
53973 +       dplug_id = fplug_id = -1;
53974 +
53975 +       switch (inode->i_mode & S_IFMT) {
53976 +       case S_IFSOCK:
53977 +       case S_IFBLK:
53978 +       case S_IFCHR:
53979 +       case S_IFIFO:
53980 +               fplug_id = SPECIAL_FILE_PLUGIN_ID;
53981 +               break;
53982 +       case S_IFLNK:
53983 +               fplug_id = SYMLINK_FILE_PLUGIN_ID;
53984 +               break;
53985 +       case S_IFDIR:
53986 +               fplug_id = DIRECTORY_FILE_PLUGIN_ID;
53987 +               dplug_id = HASHED_DIR_PLUGIN_ID;
53988 +               break;
53989 +       default:
53990 +               warning("nikita-737", "wrong file mode: %o", inode->i_mode);
53991 +               return RETERR(-EIO);
53992 +       case S_IFREG:
53993 +               fplug_id = UNIX_FILE_PLUGIN_ID;
53994 +               break;
53995 +       }
53996 +       info = reiser4_inode_data(inode);
53997 +       plugin_set_file(&info->pset,
53998 +                       (fplug_id >= 0) ? file_plugin_by_id(fplug_id) : NULL);
53999 +       plugin_set_dir(&info->pset,
54000 +                      (dplug_id >= 0) ? dir_plugin_by_id(dplug_id) : NULL);
54001 +       return 0;
54002 +}
54003 +
54004 +/* Audited by: green(2002.06.14) */
54005 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
54006 +{
54007 +       int result;
54008 +
54009 +       assert("nikita-659", inode != NULL);
54010 +
54011 +       result = guess_plugin_by_mode(inode);
54012 +       /* NOTE(review): if mode was wrong, guess_plugin_by_mode() returns -EIO
54013 +          (it does not fall back to "regular file"); setup_inode_ops() is said
54014 +          to call make_bad_inode() - confirm. Another, more logical but bit
54015 +          more complex solution is to add "bad-file plugin". */
54016 +       /* FIXME-VS: activate was called here */
54017 +       return result;
54018 +}
54019 +
54020 +/* helper function for save_len_plugin_sd(): returns @len increased by space
54021 +    required to save state of given plugin, if @memb is set in plugin_mask */
54022 +/* Audited by: green(2002.06.14) */
54023 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
54024 +                  struct inode *inode /* object being processed */ ,
54025 +                  pset_member memb, int len)
54026 +{
54027 +       reiser4_inode *info;
54028 +       assert("nikita-661", inode != NULL);
54029 +
54030 +       info = reiser4_inode_data(inode);
54031 +       if (plugin != NULL && (info->plugin_mask & (1 << memb))) {
54032 +               len += sizeof(reiser4_plugin_slot);
54033 +               if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
54034 +                       /* non-standard plugin, call method */
54035 +                       /* commented as it is incompatible with alignment
54036 +                        * policy in save_plug() -edward */
54037 +                       /* len = round_up(len, plugin->h.pops->alignment); */
54038 +                       len += plugin->h.pops->save_len(inode, plugin);
54039 +               }
54040 +       }
54041 +       return len;
54042 +}
54043 +
54044 +/* calculate how much space is required to save state of all plugins,
54045 +    associated with inode; returns 0 when plugin_mask is empty */
54046 +static int save_len_plugin_sd(struct inode *inode /* object being processed */ )
54047 +{
54048 +       int len;
54049 +       reiser4_inode *state;
54050 +       pset_member memb;
54051 +
54052 +       assert("nikita-663", inode != NULL);
54053 +
54054 +       state = reiser4_inode_data(inode);
54055 +       /* common case: no non-standard plugins */
54056 +       if (state->plugin_mask == 0)
54057 +               return 0;
54058 +       len = sizeof(reiser4_plugin_stat);
54059 +       for (memb = 0; memb < PSET_LAST; ++memb)
54060 +               len = len_for(pset_get(state->pset, memb), inode, memb, len);
54061 +       assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
54062 +       return len;
54063 +}
54064 +
54065 +/* helper function for save_plugin_sd(): save slot (and plugin data, if any)
54066 +    of plugin associated with inode; no-op if @memb is not in plugin_mask. */
54067 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
54068 +                    struct inode *inode /* object being processed */ ,
54069 +                    pset_member memb /* what element of pset is saved */ ,
54070 +                    char **area /* position in stat-data */ ,
54071 +                    int *count /* incremented if plugin were actually
54072 +                                * saved. */ )
54073 +{
54074 +       reiser4_plugin_slot *slot;
54075 +       int fake_len;
54076 +       int result;
54077 +
54078 +       assert("nikita-665", inode != NULL);
54079 +       assert("nikita-666", area != NULL);
54080 +       assert("nikita-667", *area != NULL);
54081 +
54082 +       if (plugin == NULL)
54083 +               return 0;
54084 +       if (!(reiser4_inode_data(inode)->plugin_mask & (1 << memb)))
54085 +               return 0;
54086 +       slot = (reiser4_plugin_slot *) * area;
54087 +       put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
54088 +       put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
54089 +       fake_len = (int)0xffff;
54090 +       move_on(&fake_len, area, sizeof *slot);
54091 +       ++*count;
54092 +       result = 0;
54093 +       if (plugin->h.pops != NULL) {
54094 +               if (plugin->h.pops->save != NULL)
54095 +                       result = plugin->h.pops->save(inode, plugin, area);
54096 +       }
54097 +       return result;
54098 +}
54099 +
54100 +/* save state of all non-standard plugins associated with inode */
54101 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
54102 +                         char **area /* position in stat-data */ )
54103 +{
54104 +       int result = 0;
54105 +       int num_of_plugins;
54106 +       reiser4_plugin_stat *sd;
54107 +       reiser4_inode *state;
54108 +       int fake_len;
54109 +       pset_member memb;
54110 +
54111 +       assert("nikita-669", inode != NULL);
54112 +       assert("nikita-670", area != NULL);
54113 +       assert("nikita-671", *area != NULL);
54114 +
54115 +       state = reiser4_inode_data(inode);
54116 +       if (state->plugin_mask == 0)
54117 +               return 0;
54118 +       sd = (reiser4_plugin_stat *) * area;
54119 +       fake_len = (int)0xffff;
54120 +       move_on(&fake_len, area, sizeof *sd);
54121 +       /* slots come first; the header is filled in after they are counted */
54122 +       num_of_plugins = 0;
54123 +       for (memb = 0; memb < PSET_LAST; ++memb) {
54124 +               result = save_plug(pset_get(state->pset, memb),
54125 +                                  inode, memb, area, &num_of_plugins);
54126 +               if (result != 0)
54127 +                       break;
54128 +       }
54129 +       /* the count is stored even when saving stopped on an error */
54130 +       put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
54131 +       return result;
54132 +}
54133 +
54134 +/* helper function for present_crypto_sd().
54135 +   Allocates crypto stat, fills keysize and keyid from @sd, attaches it to inode */
54136 +static int extract_crypto_stat (struct inode * inode,
54137 +                               reiser4_crypto_stat * sd)
54138 +{
54139 +       crypto_stat_t * info;
54140 +       assert("edward-11", !inode_crypto_stat(inode));
54141 +       assert("edward-1413",
54142 +              !inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
54143 +       /* create and attach a crypto-stat without secret key loaded */
54144 +       info = alloc_crypto_stat(inode);
54145 +       if (IS_ERR(info))
54146 +               return PTR_ERR(info);
54147 +       info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
54148 +       memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
54149 +       attach_crypto_stat(inode, info);
54150 +       inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
54151 +       return 0;
54152 +}
54153 +
54154 +/* crypto stat-data extension */
54155 +
54156 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
54157 +{
54158 +       int result;
54159 +       reiser4_crypto_stat *sd;
54160 +       digest_plugin *dplug = inode_digest_plugin(inode);
54161 +
54162 +       assert("edward-06", dplug != NULL);
54163 +       assert("edward-684", dplug->fipsize);
54164 +       assert("edward-07", area != NULL);
54165 +       assert("edward-08", *area != NULL);
54166 +       assert("edward-09", len != NULL);
54167 +       assert("edward-10", *len > 0);
54168 +
54169 +       /* *len is number of bytes in stat data item from *area to the end of
54170 +          item. It must be not less than size of this extension: the fixed
54171 +          header plus fipsize bytes of key id. Check it at run time so that
54172 +          extract_crypto_stat() cannot copy the key id from beyond the item */
54173 +       if (*len < (int)(sizeof(*sd) + dplug->fipsize))
54174 +               return not_enough_space(inode, "crypto-sd");
54175 +       /* attach keysize and key id to the inode, then skip the extension */
54176 +       sd = (reiser4_crypto_stat *) * area;
54177 +       result = extract_crypto_stat(inode, sd);
54178 +       move_on(len, area, sizeof(*sd) + dplug->fipsize);
54179 +
54180 +       return result;
54181 +}
54182 +
54183 +static int save_len_crypto_sd(struct inode *inode)
54184 +{
54185 +       return sizeof(reiser4_crypto_stat) + /* fixed part */
54186 +               inode_digest_plugin(inode)->fipsize; /* key id bytes */
54187 +}
54188 +
54189 +static int save_crypto_sd(struct inode *inode, char **area)
54190 +{
54191 +       int result = 0;
54192 +       reiser4_crypto_stat *sd;
54193 +       crypto_stat_t * info = inode_crypto_stat(inode);
54194 +       digest_plugin *dplug = inode_digest_plugin(inode);
54195 +
54196 +       assert("edward-12", dplug != NULL);
54197 +       assert("edward-13", area != NULL);
54198 +       assert("edward-14", *area != NULL);
54199 +       assert("edward-15", info != NULL);
54200 +       assert("edward-1414", info->keyid != NULL);
54201 +       assert("edward-1415", info->keysize != 0);
54202 +       assert("edward-76", reiser4_inode_data(inode) != NULL);
54203 +       /* fields are written only while REISER4_CRYPTO_STAT_LOADED is clear */
54204 +       if (!inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
54205 +               /* file is just created */
54206 +               sd = (reiser4_crypto_stat *) *area;
54207 +               /* copy everything but private key to the disk stat-data */
54208 +               put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
54209 +               memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
54210 +               inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
54211 +       }
54212 +       *area += (sizeof(*sd) + dplug->fipsize);
54213 +       return result;
54214 +}
54215 +
54216 +static int eio(struct inode *inode, char **area, int *len)
54217 +{
54218 +       return RETERR(-EIO); /* always fails; arguments are not used */
54219 +}
54220 +
54221 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { /* indexed by sd_ext_bits extension number */
54222 +       [LIGHT_WEIGHT_STAT] = {
54223 +               .h = {
54224 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54225 +                       .id = LIGHT_WEIGHT_STAT,
54226 +                       .pops = NULL,
54227 +                       .label = "light-weight sd",
54228 +                       .desc = "sd for light-weight files",
54229 +                       .linkage = {NULL,NULL}
54230 +               },
54231 +               .present = present_lw_sd,
54232 +               .absent = NULL,
54233 +               .save_len = save_len_lw_sd,
54234 +               .save = save_lw_sd,
54235 +               .alignment = 8
54236 +       },
54237 +       [UNIX_STAT] = {
54238 +               .h = {
54239 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54240 +                       .id = UNIX_STAT,
54241 +                       .pops = NULL,
54242 +                       .label = "unix-sd",
54243 +                       .desc = "unix stat-data fields",
54244 +                       .linkage = {NULL,NULL}
54245 +               },
54246 +               .present = present_unix_sd,
54247 +               .absent = absent_unix_sd,
54248 +               .save_len = save_len_unix_sd,
54249 +               .save = save_unix_sd,
54250 +               .alignment = 8
54251 +       },
54252 +       [LARGE_TIMES_STAT] = {
54253 +               .h = {
54254 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54255 +                       .id = LARGE_TIMES_STAT,
54256 +                       .pops = NULL,
54257 +                       .label = "64time-sd",
54258 +                       .desc = "nanosecond resolution for times",
54259 +                       .linkage = {NULL,NULL}
54260 +               },
54261 +               .present = present_large_times_sd,
54262 +               .absent = NULL,
54263 +               .save_len = save_len_large_times_sd,
54264 +               .save = save_large_times_sd,
54265 +               .alignment = 8
54266 +       },
54267 +       [SYMLINK_STAT] = {
54268 +               /* stat data of symlink has this extension */
54269 +               .h = {
54270 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54271 +                       .id = SYMLINK_STAT,
54272 +                       .pops = NULL,
54273 +                       .label = "symlink-sd",
54274 +                       .desc =
54275 +                       "stat data is appended with symlink name",
54276 +                       .linkage = {NULL,NULL}
54277 +               },
54278 +               .present = present_symlink_sd,
54279 +               .absent = NULL,
54280 +               .save_len = save_len_symlink_sd,
54281 +               .save = save_symlink_sd,
54282 +               .alignment = 8
54283 +       },
54284 +       [PLUGIN_STAT] = {
54285 +               .h = {
54286 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54287 +                       .id = PLUGIN_STAT,
54288 +                       .pops = NULL,
54289 +                       .label = "plugin-sd",
54290 +                       .desc = "plugin stat-data fields",
54291 +                       .linkage = {NULL,NULL}
54292 +               },
54293 +               .present = present_plugin_sd,
54294 +               .absent = absent_plugin_sd,
54295 +               .save_len = save_len_plugin_sd,
54296 +               .save = save_plugin_sd,
54297 +               .alignment = 8
54298 +       },
54299 +       [FLAGS_STAT] = {
54300 +               .h = {
54301 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54302 +                       .id = FLAGS_STAT,
54303 +                       .pops = NULL,
54304 +                       .label = "flags-sd",
54305 +                       .desc = "inode bit flags",
54306 +                       .linkage = {NULL, NULL}
54307 +               },
54308 +               .present = present_flags_sd,
54309 +               .absent = NULL,
54310 +               .save_len = save_len_flags_sd,
54311 +               .save = save_flags_sd,
54312 +               .alignment = 8
54313 +       },
54314 +       [CAPABILITIES_STAT] = {
54315 +               .h = {
54316 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54317 +                       .id = CAPABILITIES_STAT,
54318 +                       .pops = NULL,
54319 +                       .label = "capabilities-sd",
54320 +                       .desc = "capabilities",
54321 +                       .linkage = {NULL, NULL}
54322 +               },
54323 +               .present = eio, /* eio() always reports -EIO */
54324 +               .absent = NULL,
54325 +               .save_len = save_len_flags_sd, /* NOTE(review): flags-sd save handlers reused here; confirm intended */
54326 +               .save = save_flags_sd,
54327 +               .alignment = 8
54328 +       },
54329 +       [CRYPTO_STAT] = {
54330 +               .h = {
54331 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54332 +                       .id = CRYPTO_STAT,
54333 +                       .pops = NULL,
54334 +                       .label = "crypto-sd",
54335 +                       .desc = "secret key size and id",
54336 +                       .linkage = {NULL, NULL}
54337 +               },
54338 +               .present = present_crypto_sd,
54339 +               .absent = NULL,
54340 +               .save_len = save_len_crypto_sd,
54341 +               .save = save_crypto_sd,
54342 +               .alignment = 8
54343 +       }
54344 +};
54345 +
54346 +/* Make Linus happy.
54347 +   Local variables:
54348 +   c-indentation-style: "K&R"
54349 +   mode-name: "LC"
54350 +   c-basic-offset: 8
54351 +   tab-width: 8
54352 +   fill-column: 120
54353 +   End:
54354 +*/
54355 diff --git a/fs/reiser4/plugin/item/static_stat.h b/fs/reiser4/plugin/item/static_stat.h
54356 new file mode 100644
54357 index 0000000..f5dec11
54358 --- /dev/null
54359 +++ b/fs/reiser4/plugin/item/static_stat.h
54360 @@ -0,0 +1,219 @@
54361 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54362 +
54363 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
54364 +
54365 +In the case where each file has not less than the fields needed by the
54366 +stat() syscall, it is more compact to store those fields in this
54367 +struct.
54368 +
54369 +If this item does not exist, then all stats are dynamically resolved.
54370 +At the moment, we either resolve all stats dynamically or all of them
54371 +statically.  If you think this is not fully optimal, and the rest of
54372 +reiser4 is working, then fix it...:-)
54373 +
54374 +*/
54375 +
54376 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
54377 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
54378 +
54379 +#include "../../forward.h"
54380 +#include "../../dformat.h"
54381 +
54382 +#include <linux/fs.h>          /* for struct inode */
54383 +
54384 +/* Stat data layout: goals and implementation.
54385 +
54386 +   We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
54387 +   them, including not having semantic metadata attached to them.
54388 +
54389 +   There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
54390 +   want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
54391 +   sized structure because the statically sized structure knows without recording it what the names and lengths of the
54392 +   attributes are.
54393 +
54394 +   This leads to a natural compromise, which is to special case those files which have simply the standard unix file
54395 +   attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
54396 +   file in their use of file attributes.
54397 +
54398 +   Yet this compromise deserves to be compromised a little.
54399 +
54400 +   We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54401 +   bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
54402 +
54403 +   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
54404 +   from parent directory (as uid, gid) or initialised to some sane values.
54405 +
54406 +   To capitalize on existing code infrastructure, extensions are
54407 +   implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54408 +   Each stat-data extension plugin implements four methods:
54409 +
54410 +    ->present() called by sd_load() when this extension is found in stat-data
54411 +    ->absent() called by sd_load() when this extension is not found in stat-data
54412 +    ->save_len() called by sd_len() to calculate total length of stat-data
54413 +    ->save() called by sd_save() to store extension data into stat-data
54414 +
54415 +    Implementation is in fs/reiser4/plugin/item/static_stat.c
54416 +*/
54417 +
54418 +/* stat-data extension. Please order this by presumed frequency of use */
54419 +typedef enum {
54420 +       /* support for light-weight files */
54421 +       LIGHT_WEIGHT_STAT,
54422 +       /* data required to implement unix stat(2) call. Layout is in
54423 +          reiser4_unix_stat. If this is not present, file is light-weight */
54424 +       UNIX_STAT,
54425 +       /* this contains additional set of 32bit [amc]time fields to implement
54426 +          nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
54427 +          of this extension is governed by 32bittimes mount option. */
54428 +       LARGE_TIMES_STAT,
54429 +       /* stat data has link name included */
54430 +       SYMLINK_STAT,
54431 +       /* if this is present, file is controlled by non-standard
54432 +          plugin (that is, plugin that cannot be deduced from file
54433 +          mode bits), for example, aggregation, interpolation etc. */
54434 +       PLUGIN_STAT,
54435 +       /* this extension contains persistent inode flags. These flags are
54436 +          single bits: immutable, append-only, etc. Layout is in
54437 +          reiser4_flags_stat. */
54438 +       FLAGS_STAT,
54439 +       /* this extension contains capabilities sets, associated with this
54440 +          file. Layout is in reiser4_capabilities_stat */
54441 +       CAPABILITIES_STAT,
54442 +       /* this extension contains size and public id of the secret key.
54443 +          Layout is in reiser4_crypto_stat */
54444 +       CRYPTO_STAT,
54445 +       LAST_SD_EXTENSION,
54446 +       /*
54447 +        * init_inode_static_sd() iterates over extension mask until all
54448 +        * non-zero bits are processed. This means that neither ->present()
54449 +        * nor ->absent() methods will be called for stat-data extensions that
54450 +        * go after last present extension. But for some basic extensions we want
54451 +        * either ->absent() or ->present() method to be called, because these
54452 +        * extensions set up something in inode even when they are not
54453 +        * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
54454 +        * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
54455 +        * ->present(), or ->absent() method will be called, independently of
54456 +        * what other extensions are present.
54457 +        */
54458 +       LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT,
54459 +} sd_ext_bits;
54460 +
54461 +/* minimal stat-data: only the extension bitmask. This allows to support light-weight files. */
54462 +typedef struct reiser4_stat_data_base {
54463 +       /*  0 */ __le16 extmask;
54464 +       /*  2 */
54465 +} PACKED reiser4_stat_data_base;
54466 +
54467 +typedef struct reiser4_light_weight_stat {
54468 +       /*  0 */ __le16 mode;
54469 +       /*  2 */ __le32 nlink;
54470 +       /*  6 */ __le64 size;
54471 +       /* size in bytes. Offsets assume PACKED leaves no padding (TODO confirm) */
54472 +       /* 14 */
54473 +} PACKED reiser4_light_weight_stat;
54474 +
54475 +typedef struct reiser4_unix_stat { /* layout of the UNIX_STAT extension */
54476 +       /* owner id */
54477 +       /*  0 */ __le32 uid;
54478 +       /* group id */
54479 +       /*  4 */ __le32 gid;
54480 +       /* access time */
54481 +       /*  8 */ __le32 atime;
54482 +       /* modification time */
54483 +       /* 12 */ __le32 mtime;
54484 +       /* change time */
54485 +       /* 16 */ __le32 ctime;
54486 +       union { /* rdev and bytes overlay each other at offset 20 */
54487 +               /* minor:major for device files */
54488 +               /* 20 */ __le64 rdev;
54489 +               /* bytes used by file */
54490 +               /* 20 */ __le64 bytes;
54491 +       } u;
54492 +       /* 28 */
54493 +} PACKED reiser4_unix_stat;
54494 +
54495 +/* symlink stored as part of inode: body[] marks where the link name bytes start */
54496 +typedef struct reiser4_symlink_stat {
54497 +       char body[0];
54498 +} PACKED reiser4_symlink_stat;
54499 +
54500 +typedef struct reiser4_plugin_slot { /* element type of reiser4_plugin_stat.slot[] */
54501 +       /*  0 */ __le16 pset_memb;
54502 +       /*  2 */ __le16 id;
54503 +       /*  4 *//* here plugin stores its persistent state */
54504 +} PACKED reiser4_plugin_slot;
54505 +
54506 +/* stat-data extension for files with non-standard plugin. */
54507 +typedef struct reiser4_plugin_stat {
54508 +       /* number of additional plugins, associated with this object */
54509 +       /*  0 */ __le16 plugins_no;
54510 +       /*  2 */ reiser4_plugin_slot slot[0];
54511 +       /*  2 *//* slot[] is zero-length here; presumably plugins_no slots follow (confirm) */
54512 +} PACKED reiser4_plugin_stat;
54513 +
54514 +/* stat-data extension for inode flags. Currently it is just a fixed-width 32
54515 + * bit mask. If the need arises, this can be replaced with a variable-width
54516 + * bitmask. */
54517 +typedef struct reiser4_flags_stat {
54518 +       /*  0 */ __le32 flags;
54519 +       /*  4 */
54520 +} PACKED reiser4_flags_stat;
54521 +
54522 +typedef struct reiser4_capabilities_stat {
54523 +       /*  0 */ __le32 effective;
54524 +       /*  4 */ __le32 permitted;
54525 +       /*  8 */
54526 +} PACKED reiser4_capabilities_stat;
54527 +
54528 +typedef struct reiser4_cluster_stat {
54529 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
54530 +       /* 0 */ d8 cluster_shift;
54531 +       /* 1 */
54532 +} PACKED reiser4_cluster_stat;
54533 +
54534 +typedef struct reiser4_crypto_stat {
54535 +       /* secret key size, bits */
54536 +       /*  0 */ d16 keysize;
54537 +       /* secret key id; save_crypto_sd() copies the digest plugin's fipsize bytes here */
54538 +       /*  2 */ d8 keyid[0];
54539 +       /* 2 */
54540 +} PACKED reiser4_crypto_stat;
54541 +
54542 +typedef struct reiser4_large_times_stat {
54543 +       /* access time */
54544 +       /*  0 */ d32 atime;
54545 +       /* modification time */
54546 +       /*  4 */ d32 mtime;
54547 +       /* change time */
54548 +       /*  8 */ d32 ctime;
54549 +       /* 12 */
54550 +} PACKED reiser4_large_times_stat;
54551 +
54552 +/* this structure is filled by sd_item_stat (presumably item_stat_static_sd(); confirm) */
54553 +typedef struct sd_stat {
54554 +       int dirs;
54555 +       int files;
54556 +       int others;
54557 +} sd_stat;
54558 +
54559 +/* plugin->item.common.* */
54560 +extern void print_sd(const char *prefix, coord_t * coord);
54561 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
54562 +
54563 +/* plugin->item.s.sd.* */
54564 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54565 +extern int save_len_static_sd(struct inode *inode);
54566 +extern int save_static_sd(struct inode *inode, char **area);
54567 +
54568 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54569 +#endif
54570 +
54571 +/* Make Linus happy.
54572 +   Local variables:
54573 +   c-indentation-style: "K&R"
54574 +   mode-name: "LC"
54575 +   c-basic-offset: 8
54576 +   tab-width: 8
54577 +   fill-column: 120
54578 +   End:
54579 +*/
54580 diff --git a/fs/reiser4/plugin/item/tail.c b/fs/reiser4/plugin/item/tail.c
54581 new file mode 100644
54582 index 0000000..c3498ce
54583 --- /dev/null
54584 +++ b/fs/reiser4/plugin/item/tail.c
54585 @@ -0,0 +1,805 @@
54586 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54587 +
54588 +#include "item.h"
54589 +#include "../../inode.h"
54590 +#include "../../page_cache.h"
54591 +#include "../../carry.h"
54592 +#include "../../vfs_ops.h"
54593 +
54594 +#include <linux/quotaops.h>
54595 +#include <asm/uaccess.h>
54596 +#include <linux/swap.h>
54597 +#include <linux/writeback.h>
54598 +
54599 +/* plugin->u.item.b.max_key_inside: item key with its offset replaced by that of max_key() */
54600 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54601 +{
54602 +       item_key_by_coord(coord, key);
54603 +       set_key_offset(key, get_key_offset(max_key()));
54604 +       return key;
54605 +}
54606 +
54607 +/* plugin->u.item.b.can_contain_key
54608 +   returns 1 iff @data uses this item's plugin and @key has the same
54609 +   locality and objectid as the item key, 0 otherwise */
54610 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54611 +                        const reiser4_item_data *data)
54612 +{
54613 +       reiser4_key ikey;
54614 +
54615 +       if (item_plugin_by_coord(coord) != data->iplug)
54616 +               return 0;
54617 +
54618 +       /* both keys must name the same object */
54619 +       item_key_by_coord(coord, &ikey);
54620 +       return get_key_locality(key) == get_key_locality(&ikey) &&
54621 +              get_key_objectid(key) == get_key_objectid(&ikey);
54622 +}
54623 +
54624 +/* plugin->u.item.b.mergeable
54625 +   @p1 is a tail item; 1 iff @p2 is a tail of the same object adjacent to it */
54626 +/* Audited by: green(2002.06.14) */
54627 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
54628 +{
54629 +       reiser4_key left, right;
54630 +
54631 +       assert("vs-535",
54632 +              item_type_by_coord(p1) == UNIX_FILE_METADATA_ITEM_TYPE);
54633 +       assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54634 +
54635 +       /* second item is of another type */
54636 +       if (item_id_by_coord(p2) != FORMATTING_ID)
54637 +               return 0;
54638 +
54639 +       item_key_by_coord(p1, &left);
54640 +       item_key_by_coord(p2, &right);
54641 +
54642 +       /* items of different objects */
54643 +       if (get_key_locality(&left) != get_key_locality(&right) ||
54644 +           get_key_objectid(&left) != get_key_objectid(&right) ||
54645 +           get_key_type(&left) != get_key_type(&right))
54646 +               return 0;
54647 +
54648 +       /* not adjacent unless @p2 starts at the offset right after @p1's last unit */
54649 +       if (get_key_offset(&left) + nr_units_tail(p1) != get_key_offset(&right))
54650 +               return 0;
54651 +
54652 +       return 1;
54653 +}
54654 +
54655 +/* plugin->u.item.b.print
54656 +   plugin->u.item.b.check */
54657 +
54658 +/* plugin->u.item.b.nr_units: the unit count of a tail item is its length */
54659 +pos_in_node_t nr_units_tail(const coord_t * coord)
54660 +{
54661 +       return item_length_by_coord(coord);
54662 +}
54663 +
54664 +/* plugin->u.item.b.lookup
54665 +   set @coord to the unit @key addresses, or after the last unit if outside */
54666 +lookup_result
54667 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54668 +{
54669 +       reiser4_key item_key;
54670 +       __u64 lookuped, offset;
54671 +       unsigned nr_units;
54672 +
54673 +       /* extract the item key once: the offset and the assertion both use it */
54674 +       item_key_by_coord(coord, &item_key);
54675 +       offset = get_key_offset(&item_key);
54676 +       nr_units = nr_units_tail(coord);
54677 +
54678 +       /* key we are looking for must be greater than key of item @coord */
54679 +       assert("vs-416", keygt(key, &item_key));
54680 +
54681 +       /* offset we are looking for */
54682 +       lookuped = get_key_offset(key);
54683 +       if (lookuped >= offset && lookuped < offset + nr_units) {
54684 +               /* byte we are looking for is in this item */
54685 +               coord->unit_pos = lookuped - offset;
54686 +               coord->between = AT_UNIT;
54687 +               return CBK_COORD_FOUND;
54688 +       }
54689 +
54690 +       /* set coord after last unit; only FIND_MAX_NOT_MORE_THAN counts as found */
54691 +       coord->unit_pos = nr_units - 1;
54692 +       coord->between = AFTER_UNIT;
54693 +       return bias == FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54694 +}
54695 +
54696 +/* plugin->u.item.b.paste: fills the @data->length bytes the item has already been grown by */
54697 +int
54698 +paste_tail(coord_t *coord, reiser4_item_data *data,
54699 +          carry_plugin_info *info UNUSED_ARG)
54700 +{
54701 +       unsigned old_item_length;
54702 +       char *item;
54703 +
54704 +       /* length the item had before resizing has been performed */
54705 +       old_item_length = item_length_by_coord(coord) - data->length;
54706 +
54707 +       /* tail items never get pasted in the middle */
54708 +       assert("vs-363",
54709 +              (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54710 +              (coord->unit_pos == old_item_length - 1 &&
54711 +               coord->between == AFTER_UNIT) ||
54712 +              (coord->unit_pos == 0 && old_item_length == 0
54713 +               && coord->between == AT_UNIT));
54714 +
54715 +       item = item_body_by_coord(coord);
54716 +       if (coord->unit_pos == 0)
54717 +               /* make space for pasted data when pasting at the beginning of
54718 +                  the item (regions may overlap, hence memmove) */
54719 +               memmove(item + data->length, item, old_item_length);
54720 +
54721 +       if (coord->between == AFTER_UNIT)
54722 +               coord->unit_pos++; /* new data goes right after the old last unit */
54723 +
54724 +       if (data->data) {
54725 +               assert("vs-554", data->user == 0 || data->user == 1);
54726 +               if (data->user) {
54727 +                       assert("nikita-3035", schedulable());
54728 +                       /* copy from user space; a failed copy yields -EFAULT */
54729 +                       if (__copy_from_user(item + coord->unit_pos,
54730 +                                            (const char __user *)data->data,
54731 +                                            (unsigned)data->length))
54732 +                               return RETERR(-EFAULT);
54733 +               } else
54734 +                       /* copy from kernel space */
54735 +                       memcpy(item + coord->unit_pos, data->data,
54736 +                              (unsigned)data->length);
54737 +       } else { /* no source buffer: zero-fill the pasted range */
54738 +               memset(item + coord->unit_pos, 0, (unsigned)data->length);
54739 +       }
54740 +       return 0;
54741 +}
54742 +
54743 +/* plugin->u.item.b.fast_paste */
54744 +
54745 +/* plugin->u.item.b.can_shift
54746 +   returns the number of units that can be shifted: @want, capped by
54747 +   @free_space. @size receives the same number as the byte count */
54748 +int
54749 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54750 +              znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54751 +              unsigned *size, unsigned want)
54752 +{
54753 +       /* we must not be asked to shift more than the item holds */
54754 +       assert("vs-364",
54755 +              want > 0 && want <= (unsigned)item_length_by_coord(source));
54756 +
54757 +       *size = (want < free_space) ? want : free_space;
54758 +       return *size;
54759 +}
54760 +
54761 +/* plugin->u.item.b.copy_units
54762 +   copy @count bytes of @source, starting at @from, into the already
54763 +   expanded @target: at its end for SHIFT_LEFT, at its start otherwise */
54764 +void
54765 +copy_units_tail(coord_t * target, coord_t * source,
54766 +               unsigned from, unsigned count,
54767 +               shift_direction where_is_free_space,
54768 +               unsigned free_space UNUSED_ARG)
54769 +{
54770 +       reiser4_key key;
54771 +
54772 +       /* make sure that item @target is expanded already */
54773 +       assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54774 +       assert("vs-370", free_space >= count);
54775 +
54776 +       if (where_is_free_space == SHIFT_LEFT) {
54777 +               /* append item @target with @count first bytes of @source */
54778 +               assert("vs-365", from == 0);
54779 +
54780 +               memcpy((char *)item_body_by_coord(target) +
54781 +                      item_length_by_coord(target) - count,
54782 +                      (char *)item_body_by_coord(source), count);
54783 +               return;
54784 +       }
54785 +
54786 +       /* target item is moved to right already */
54787 +       assert("vs-367", (unsigned)item_length_by_coord(source) == from + count);
54788 +
54789 +       memcpy((char *)item_body_by_coord(target),
54790 +              (char *)item_body_by_coord(source) + from, count);
54791 +
54792 +       /* new units are inserted before first unit in an item, therefore, we
54793 +          have to update item key */
54794 +       item_key_by_coord(source, &key);
54795 +       set_key_offset(&key, get_key_offset(&key) + from);
54796 +       node_plugin_by_node(target->node)->update_item_key(target, &key, NULL /*info */);
54797 +}
54798 +
54799 +/* plugin->u.item.b.create_hook */
54800 +
54801 +/* item_plugin->b.kill_hook
54802 +   called when @count units starting from the @from-th one are about to be
54803 +   removed; passes the affected offset range to fake_kill_hook_tail() */
54804 +int
54805 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54806 +              pos_in_node_t count, struct carry_kill_data *kdata)
54807 +{
54808 +       reiser4_key item_key;
54809 +       loff_t first;
54810 +
54811 +       assert("vs-1577", kdata);
54812 +       assert("vs-1579", kdata->inode);
54813 +
54814 +       /* key offset of the first unit to go: item offset plus @from */
54815 +       item_key_by_coord(coord, &item_key);
54816 +       first = get_key_offset(&item_key) + from;
54817 +       fake_kill_hook_tail(kdata->inode, first, first + count, kdata->params.truncate);
54818 +       return 0;
54819 +}
54820 +
54821 +/* plugin->u.item.b.shift_hook */
54822 +
54823 +/* helper for kill_units_tail and cut_units_tail: reports the keys affected
54824 +   by removal of units @from..@to (inclusive) and returns their number */
54825 +static int
54826 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54827 +              reiser4_key * smallest_removed, reiser4_key * new_first)
54828 +{
54829 +       const pos_in_node_t count = to - from + 1;
54830 +
54831 +       /* this method is only called to remove part of item */
54832 +       assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54833 +       /* tails items are never cut from the middle of an item */
54834 +       assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
54835 +       assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
54836 +
54837 +       if (smallest_removed != NULL) {
54838 +               /* store smallest key removed */
54839 +               item_key_by_coord(coord, smallest_removed);
54840 +               set_key_offset(smallest_removed,
54841 +                              get_key_offset(smallest_removed) + from);
54842 +       }
54843 +       if (new_first != NULL) {
54844 +               /* head of item is cut: new first key follows the cut range */
54845 +               assert("vs-1529", from == 0);
54846 +
54847 +               item_key_by_coord(coord, new_first);
54848 +               set_key_offset(new_first,
54849 +                              get_key_offset(new_first) + from + count);
54850 +       }
54851 +
54852 +       /* with REISER4_DEBUG set, the removed bytes are zeroed */
54853 +       if (REISER4_DEBUG)
54854 +               memset((char *)item_body_by_coord(coord) + from, 0, count);
54855 +       return count;
54856 +}
54857 +
54858 +/* plugin->u.item.b.cut_units: @cdata is unused, the work is done by do_cut_or_kill() */
54859 +int
54860 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54861 +              struct carry_cut_data *cdata UNUSED_ARG,
54862 +              reiser4_key * smallest_removed, reiser4_key * new_first)
54863 +{
54864 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54865 +}
54866 +
54867 +/* plugin->u.item.b.kill_units: runs kill_hook_tail() on units @from..@to, then do_cut_or_kill() */
54868 +int
54869 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54870 +               struct carry_kill_data *kdata, reiser4_key * smallest_removed,
54871 +               reiser4_key * new_first)
54872 +{
54873 +       kill_hook_tail(coord, from, to - from + 1, kdata); /* return value is ignored */
54874 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
54875 +}
54876 +
54877 +/* plugin->u.item.b.unit_key: item key with its offset advanced by @coord->unit_pos */
54878 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
54879 +{
54880 +       assert("vs-375", coord_is_existing_unit(coord));
54881 +
54882 +       item_key_by_coord(coord, key);
54883 +       set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
54884 +
54885 +       return key;
54886 +}
54887 +
54888 +/* plugin->u.item.b.estimate
54889 +   plugin->u.item.b.item_data_by_flow */
54890 +
54891 +/* tail readpage function, called from readpage_tail(): fills @page from tail items, zero-padding the rest */
54892 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
54893 +{
54894 +       tap_t tap;
54895 +       int result;
54896 +       coord_t coord;
54897 +       lock_handle lh;
54898 +       int count, mapped;
54899 +       struct inode *inode;
54900 +       char *pagedata;
54901 +
54902 +       /* work on copies of the passed coord and lock handle so that the tap does not move the caller's coord */
54903 +       init_lh(&lh);
54904 +       copy_lh(&lh, uf_coord->lh);
54905 +       inode = page->mapping->host;
54906 +       coord_dup(&coord, &uf_coord->coord);
54907 +
54908 +       tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
54909 +
54910 +       if ((result = tap_load(&tap)))
54911 +               goto out_tap_done;
54912 +
54913 +       /* lookup until page is filled up. */
54914 +       for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
54915 +               /* number of bytes to be copied to page */
54916 +               count = item_length_by_coord(&coord) - coord.unit_pos;
54917 +               if (count > PAGE_CACHE_SIZE - mapped)
54918 +                       count = PAGE_CACHE_SIZE - mapped;
54919 +
54920 +               /* map @page with kmap_atomic() and get data address */
54921 +               pagedata = kmap_atomic(page, KM_USER0);
54922 +
54923 +               /* copy tail item to page */
54924 +               memcpy(pagedata + mapped,
54925 +                      ((char *)item_body_by_coord(&coord) + coord.unit_pos),
54926 +                      count);
54927 +               mapped += count;
54928 +
54929 +               flush_dcache_page(page);
54930 +
54931 +               /* unmap the page again */
54932 +               kunmap_atomic(pagedata, KM_USER0);
54933 +
54934 +               /* Getting next tail item. */
54935 +               if (mapped < PAGE_CACHE_SIZE) {
54936 +                       /*
54937 +                        * unlock page in order to avoid keeping it locked
54938 +                        * during tree lookup, which takes long term locks
54939 +                        */
54940 +                       unlock_page(page);
54941 +
54942 +                       /* getting right neighbour. */
54943 +                       result = go_dir_el(&tap, RIGHT_SIDE, 0);
54944 +
54945 +                       /* lock page back */
54946 +                       lock_page(page);
54947 +                       if (PageUptodate(page)) {
54948 +                               /*
54949 +                                * another thread read the page, we have
54950 +                                * nothing to do
54951 +                                */
54952 +                               result = 0;
54953 +                               goto out_unlock_page;
54954 +                       }
54955 +
54956 +                       if (result) {
54957 +                               if (result == -E_NO_NEIGHBOR) {
54958 +                                       /*
54959 +                                        * right neighbor is not a formatted
54960 +                                        * node
54961 +                                        */
54962 +                                       result = 0;
54963 +                                       goto done;
54964 +                               } else {
54965 +                                       goto out_tap_relse; /* NOTE(review): skips unlock_page(); confirm the caller unlocks */
54966 +                               }
54967 +                       } else {
54968 +                               if (!inode_file_plugin(inode)->
54969 +                                   owns_item(inode, &coord)) {
54970 +                                       /* item of another file is found */
54971 +                                       result = 0;
54972 +                                       goto done;
54973 +                               }
54974 +                       }
54975 +               }
54976 +       }
54977 +
54978 + done:
54979 +       if (mapped != PAGE_CACHE_SIZE) { /* zero the part of the page no item covered */
54980 +               pagedata = kmap_atomic(page, KM_USER0);
54981 +               memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
54982 +               flush_dcache_page(page);
54983 +               kunmap_atomic(pagedata, KM_USER0);
54984 +       }
54985 +       SetPageUptodate(page);
54986 + out_unlock_page:
54987 +       unlock_page(page);
54988 + out_tap_relse:
54989 +       tap_relse(&tap);
54990 + out_tap_done:
54991 +       tap_done(&tap);
54992 +       return result;
54993 +}
54994 +
54995 +/*
54996 +   plugin->s.file.readpage: fills @page from tail items; @vp is used as a uf_coord_t pointer. Call chains:
54997 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
54998 +   or
54999 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
55000 +
55001 +   At the beginning: coord->node is read locked, zloaded, page is locked and not uptodate, coord is set to existing
55002 +   unit inside of tail item. All the work is delegated to do_readpage_tail(), whose result is returned. */
55003 +int readpage_tail(void *vp, struct page *page)
55004 +{
55005 +       uf_coord_t *uf_coord = vp;
55006 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
55007 +       ON_DEBUG(reiser4_key key);
55008 +       /* state required of @page */
55009 +       assert("umka-2515", PageLocked(page));
55010 +       assert("umka-2516", !PageUptodate(page));
55011 +       assert("umka-2517", !jprivate(page) && !PagePrivate(page));
55012 +       assert("umka-2518", page->mapping && page->mapping->host);
55013 +       /* state required of the coord; coord and key are declared only via ON_DEBUG() */
55014 +       assert("umka-2519", znode_is_loaded(coord->node));
55015 +       assert("umka-2520", item_is_tail(coord));
55016 +       assert("umka-2521", coord_is_existing_unit(coord));
55017 +       assert("umka-2522", znode_is_rlocked(coord->node));
55018 +       assert("umka-2523",
55019 +              page->mapping->host->i_ino ==
55020 +              get_key_objectid(item_key_by_coord(coord, &key)));
55021 +
55022 +       return do_readpage_tail(uf_coord, page);
55023 +}
55024 +
55025 +/**
55026 + * overwrite_tail - overwrite bytes of an existing tail item with user data
55027 + * @flow: flow to write; must carry user-space data (->user == 1, ->data set)
55028 + * @coord: existing unit in a write-locked node; ->unit_pos is the start byte
55029 + *
55030 + * Overwrites tail item or its part by user data; @flow is not advanced here.
55031 + * Returns number of bytes written or RETERR(-EFAULT) if the user copy fails.
55032 + */
55033 +static int overwrite_tail(flow_t *flow, coord_t *coord)
55034 +{
55035 +       unsigned count;
55036 +
55037 +       assert("vs-570", flow->user == 1);
55038 +       assert("vs-946", flow->data);
55039 +       assert("vs-947", coord_is_existing_unit(coord));
55040 +       assert("vs-948", znode_is_write_locked(coord->node));
55041 +       assert("nikita-3036", schedulable());
55042 +       /* write at most up to the end of the item, and at most flow->length */
55043 +       count = item_length_by_coord(coord) - coord->unit_pos;
55044 +       if (count > flow->length)
55045 +               count = flow->length;
55046 +       /* copy straight into the item body; the node is dirtied only on success */
55047 +       if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
55048 +                            (const char __user *)flow->data, count))
55049 +               return RETERR(-EFAULT);
55050 +
55051 +       znode_make_dirty(coord->node);
55052 +       return count;
55053 +}
55054 +
55055 +/**
55056 + * insert_first_tail - insert the first item of a file built of tails
55057 + * @inode: inode of the file; quota is charged to and returned from it
55058 + * @flow: data to insert; turned into a hole flow if its key offset is not 0
55059 + * @coord: position passed on to insert_flow()
55060 + * @lh: lock handle passed on to insert_flow()
55061 + *
55062 + * Returns number of bytes written or error code (0 on success for a hole).
55063 + */
55064 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
55065 +                                coord_t *coord, lock_handle *lh)
55066 +{
55067 +       int result;
55068 +       loff_t to_write;
55069 +       unix_file_info_t *uf_info;
55070 +
55071 +       if (get_key_offset(&flow->key) != 0) {
55072 +               /*
55073 +                * file is empty and we have to write not to the beginning of
55074 +                * file. Create a hole at the beginning of file. On success
55075 +                * insert_flow returns 0 as number of written bytes which is
55076 +                * what we have to return on padding a file with holes
55077 +                */
55078 +               flow->data = NULL;
55079 +               flow->length = get_key_offset(&flow->key);
55080 +               set_key_offset(&flow->key, 0);
55081 +               /*
55082 +                * holes in files built of tails are stored just like if there
55083 +                * were real data which are all zeros. Therefore we have to
55084 +                * allocate quota here as well
55085 +                */
55086 +               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55087 +                       return RETERR(-EDQUOT);
55088 +               result = insert_flow(coord, lh, flow);
55089 +               if (flow->length) /* give back quota for bytes not inserted */
55090 +                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55091 +
55092 +               uf_info = unix_file_inode_data(inode);
55093 +
55094 +               /*
55095 +                * first item insertion is only possible when writing to empty
55096 +                * file or performing tail conversion
55097 +                */
55098 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
55099 +                           (inode_get_flag(inode, REISER4_PART_MIXED) &&
55100 +                            inode_get_flag(inode, REISER4_PART_IN_CONV))));
55101 +
55102 +               /* if file was empty - update its state */
55103 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
55104 +                       uf_info->container = UF_CONTAINER_TAILS;
55105 +               return result;
55106 +       }
55107 +
55108 +       /* check quota before appending data */
55109 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55110 +               return RETERR(-EDQUOT);
55111 +       /* if anything was written return that count, else insert_flow()'s result */
55112 +       to_write = flow->length;
55113 +       result = insert_flow(coord, lh, flow);
55114 +       if (flow->length) /* give back quota for bytes not inserted */
55115 +               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55116 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
55117 +}
55118 +
55119 +/**
55120 + * append_tail - append data, or a hole followed later by data, to a tail file
55121 + * @inode: inode of the file; quota is charged to and returned from it
55122 + * @flow: data to append; turned into a hole flow if its key is past the end
55123 + * @coord: position of the last item, passed on to insert_flow()
55124 + * @lh: lock handle passed on to insert_flow()
55125 + *
55126 + * Returns number of bytes written or error code.
55127 + */
55128 +static ssize_t append_tail(struct inode *inode,
55129 +                          flow_t *flow, coord_t *coord, lock_handle *lh)
55130 +{
55131 +       int result;
55132 +       reiser4_key append_key;
55133 +       loff_t to_write;
55134 +       /* flow key differs from the item's append key: fill the gap with a hole */
55135 +       if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
55136 +               flow->data = NULL;
55137 +               flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
55138 +               set_key_offset(&flow->key, get_key_offset(&append_key));
55139 +               /*
55140 +                * holes in files built of tails are stored just like if there
55141 +                * were real data which are all zeros. Therefore we have to
55142 +                * allocate quota here as well
55143 +                */
55144 +               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55145 +                       return RETERR(-EDQUOT);
55146 +               result = insert_flow(coord, lh, flow);
55147 +               if (flow->length) /* give back quota for bytes not inserted */
55148 +                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55149 +               return result;
55150 +       }
55151 +
55152 +       /* check quota before appending data */
55153 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55154 +               return RETERR(-EDQUOT);
55155 +       /* if anything was written return that count, else insert_flow()'s result */
55156 +       to_write = flow->length;
55157 +       result = insert_flow(coord, lh, flow);
55158 +       if (flow->length) /* give back quota for bytes not inserted */
55159 +               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55160 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
55161 +}
55162 +
55163 +/**
55164 + * write_extent_reserve_space - reserve space for tail write operation
55165 + * @inode: inode of the file to be written; selects the tree to estimate for
55166 + *
55167 + * Estimates and reserves space which may be required for writing one flow to a
55168 + * file built of tails. Returns the result of reiser4_grab_space().
55169 + */
55170 +static int write_extent_reserve_space(struct inode *inode)
55171 +{
55172 +       __u64 count;
55173 +       reiser4_tree *tree;
55174 +
55175 +       /*
55176 +        * to write one flow to a file by tails we have to reserve disk space for:
55177 +        *
55178 +        * 1. find_file_item may have to insert empty node to the tree (empty
55179 +        * leaf node between two extent items). This requires 1 block and
55180 +        * number of blocks which are necessary to perform insertion of an
55181 +        * internal item into twig level.
55182 +        *
55183 +        * 2. flow insertion
55184 +        *
55185 +        * 3. stat data update
55186 +        */
55187 +       tree = tree_by_inode(inode);
55188 +       count = estimate_one_insert_item(tree) +
55189 +               estimate_insert_flow(tree->height) +
55190 +               estimate_one_insert_item(tree);
55191 +       grab_space_enable();
55192 +       return reiser4_grab_space(count, 0 /* flags */);
55193 +}
55194 +
55195 +#define PAGE_PER_FLOW 4 /* upper bound, in pages, on what one call pre-faults */
55196 +/* Pre-fault up to PAGE_PER_FLOW pages of @buf; returns the number of bytes covered (fault-in result is ignored). */
55197 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
55198 +{
55199 +       loff_t faulted;
55200 +       int to_fault;
55201 +
55202 +       if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
55203 +               count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
55204 +       faulted = 0;
55205 +       while (count > 0) { /* walk the (clamped) buffer in PAGE_CACHE_SIZE steps */
55206 +               to_fault = PAGE_CACHE_SIZE;
55207 +               if (count < to_fault)
55208 +                       to_fault = count;
55209 +               fault_in_pages_readable(buf + faulted, to_fault);
55210 +               count -= to_fault;
55211 +               faulted += to_fault;
55212 +       }
55213 +       return faulted;
55214 +}
55215 +
55216 +/**
55217 + * write_tail - write method of tail item plugin
55218 + * @file: file to write to
55219 + * @buf: address of user-space buffer
55220 + * @count: number of bytes to write
55221 + * @pos: position in file to write to; only read here, not advanced
55222 + *
55223 + * Returns number of written bytes (may be fewer than @count) or error code.
55224 + */
55225 +ssize_t write_tail(struct file *file, const char __user *buf, size_t count,
55226 +                  loff_t *pos)
55227 +{
55228 +       struct inode *inode;
55229 +       struct hint hint;
55230 +       int result;
55231 +       flow_t flow;
55232 +       coord_t *coord;
55233 +       lock_handle *lh;
55234 +       znode *loaded;
55235 +
55236 +       inode = file->f_dentry->d_inode;
55237 +       /* any reservation failure is reported as -ENOSPC */
55238 +       if (write_extent_reserve_space(inode))
55239 +               return RETERR(-ENOSPC);
55240 +
55241 +       result = load_file_hint(file, &hint);
55242 +       BUG_ON(result != 0); /* NOTE(review): load_file_hint() failure is treated as fatal */
55243 +       /* one call writes at most what faultin_user_pages() pre-faulted */
55244 +       flow.length = faultin_user_pages(buf, count);
55245 +       flow.user = 1;
55246 +       memcpy(&flow.data, &buf, sizeof(buf)); /* copies the pointer value itself into flow.data */
55247 +       flow.op = WRITE_OP;
55248 +       key_by_inode_and_offset_common(inode, *pos, &flow.key);
55249 +
55250 +       result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
55251 +       if (IS_CBKERR(result))
55252 +               return result;
55253 +
55254 +       coord = &hint.ext_coord.coord;
55255 +       lh = hint.ext_coord.lh;
55256 +
55257 +       result = zload(coord->node);
55258 +       BUG_ON(result != 0); /* NOTE(review): zload() failure is treated as fatal */
55259 +       loaded = coord->node; /* remembered so that zrelse() below pairs with this zload() */
55260 +
55261 +       if (coord->between == AFTER_UNIT) {
55262 +               /* append with data or hole */
55263 +               result = append_tail(inode, &flow, coord, lh);
55264 +       } else if (coord->between == AT_UNIT) {
55265 +               /* overwrite */
55266 +               result = overwrite_tail(&flow, coord);
55267 +       } else {
55268 +               /* no items of this file yet. insert data or hole */
55269 +               result = insert_first_tail(inode, &flow, coord, lh);
55270 +       }
55271 +       zrelse(loaded);
55272 +       if (result < 0) {
55273 +               done_lh(lh);
55274 +               return result;
55275 +       }
55276 +
55277 +       /* seal and unlock znode */
55278 +       hint.ext_coord.valid = 0; /* forces the unset_hint() branch; set_hint() below is never reached */
55279 +       if (hint.ext_coord.valid)
55280 +               set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
55281 +       else
55282 +               unset_hint(&hint);
55283 +
55284 +       save_file_hint(file, &hint);
55285 +       return result;
55286 +}
55287 +
55288 +#if REISER4_DEBUG
55289 +/* debug-only: non-zero iff @key's offset equals the item's key offset plus coord->unit_pos */
55290 +static int
55291 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
55292 +{
55293 +       reiser4_key item_key;
55294 +       /* item_key is scratch for vs-1354; the call inside vs-1355 leaves the item's key in it */
55295 +       assert("vs-1356", coord_is_existing_unit(coord));
55296 +       assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
55297 +       assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
55298 +       return get_key_offset(key) ==
55299 +           get_key_offset(&item_key) + coord->unit_pos;
55300 +
55301 +}
55302 +
55303 +#endif
55304 +
55305 +/* plugin->u.item.s.file.read: copy bytes of the tail item at @hint's coord to user buffer f->data; 0 or -EFAULT */
55306 +int read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
55307 +{
55308 +       unsigned count;
55309 +       int item_length;
55310 +       coord_t *coord;
55311 +       uf_coord_t *uf_coord;
55312 +
55313 +       uf_coord = &hint->ext_coord;
55314 +       coord = &uf_coord->coord;
55315 +
55316 +       assert("vs-571", f->user == 1);
55317 +       assert("vs-571", f->data); /* NOTE(review): label duplicates the assert above */
55318 +       assert("vs-967", coord && coord->node);
55319 +       assert("vs-1117", znode_is_rlocked(coord->node));
55320 +       assert("vs-1118", znode_is_loaded(coord->node));
55321 +
55322 +       assert("nikita-3037", schedulable());
55323 +       assert("vs-1357", coord_matches_key_tail(coord, &f->key));
55324 +
55325 +       /* calculate number of bytes to read off the item */
55326 +       item_length = item_length_by_coord(coord);
55327 +       count = item_length_by_coord(coord) - coord->unit_pos; /* bytes from unit_pos to end of item */
55328 +       if (count > f->length)
55329 +               count = f->length;
55330 +
55331 +       /* user page has to be brought in so that major page fault does not
55332 +        * occur here when longterm lock is held */
55333 +       if (__copy_to_user((char __user *)f->data,
55334 +                          ((char *)item_body_by_coord(coord) + coord->unit_pos),
55335 +                          count))
55336 +               return RETERR(-EFAULT);
55337 +
55338 +       /* probably mark_page_accessed() should only be called if
55339 +        * coord->unit_pos is zero. */
55340 +       mark_page_accessed(znode_page(coord->node));
55341 +       move_flow_forward(f, count);
55342 +       /* advance within the item; if it is exhausted, park coord after its last unit */
55343 +       coord->unit_pos += count;
55344 +       if (item_length == coord->unit_pos) {
55345 +               coord->unit_pos--;
55346 +               coord->between = AFTER_UNIT;
55347 +       }
55348 +
55349 +       return 0;
55350 +}
55351 +
55352 +/*
55353 +   plugin->u.item.s.file.append_key
55354 +   key of the first byte after the last byte addressed by this item (item key offset + item length); returns @key
55355 +*/
55356 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
55357 +{
55358 +       item_key_by_coord(coord, key);
55359 +       set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
55360 +       return key;
55361 +}
55362 +
55363 +/* plugin->u.item.s.file.init_coord_extension: only marks @uf_coord as valid; @lookuped is not used */
55364 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
55365 +{
55366 +       uf_coord->valid = 1;
55367 +}
55368 +
55369 +/*
55370 +  plugin->u.item.s.file.get_block: stores the block of coord->node in *@block; @lblock is ignored; returns 0
55371 +*/
55372 +int
55373 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
55374 +{
55375 +       assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
55376 +
55377 +       *block = *znode_get_block(coord->node);
55378 +       return 0;
55379 +}
55380 +
55381 +/*
55382 + * Local variables:
55383 + * c-indentation-style: "K&R"
55384 + * mode-name: "LC"
55385 + * c-basic-offset: 8
55386 + * tab-width: 8
55387 + * fill-column: 79
55388 + * scroll-step: 1
55389 + * End:
55390 + */
55391 diff --git a/fs/reiser4/plugin/item/tail.h b/fs/reiser4/plugin/item/tail.h
55392 new file mode 100644
55393 index 0000000..8d03f68
55394 --- /dev/null
55395 +++ b/fs/reiser4/plugin/item/tail.h
55396 @@ -0,0 +1,58 @@
55397 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55398 +
55399 +#if !defined( __REISER4_TAIL_H__ )
55400 +#define __REISER4_TAIL_H__
55401 +
55402 +typedef struct {
55403 +       int not_used; /* placeholder, as the name says: no tail-specific coord data */
55404 +} tail_coord_extension_t;
55405 +
55406 +struct cut_list;
55407 +
55408 +/* plugin->u.item.b.* */
55409 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55410 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55411 +                        const reiser4_item_data *);
55412 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
55413 +pos_in_node_t nr_units_tail(const coord_t *);
55414 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55415 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55416 +int can_shift_tail(unsigned free_space, coord_t * source,
55417 +                  znode * target, shift_direction, unsigned *size,
55418 +                  unsigned want);
55419 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55420 +                    unsigned count, shift_direction, unsigned free_space);
55421 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55422 +                  struct carry_kill_data *);
55423 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55424 +                  struct carry_cut_data *, reiser4_key * smallest_removed,
55425 +                  reiser4_key * new_first);
55426 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55427 +                   struct carry_kill_data *, reiser4_key * smallest_removed,
55428 +                   reiser4_key * new_first);
55429 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55430 +
55431 +/* plugin->u.item.s.* */
55432 +ssize_t write_tail(struct file *file, const char __user *buf, size_t count,
55433 +                  loff_t *pos);
55434 +int read_tail(struct file *, flow_t *, hint_t *);
55435 +int readpage_tail(void *vp, struct page *page);
55436 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55437 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55438 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55439 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
55440 +                            hint_t *, int back_to_dirty, int set_hint);
55441 +
55442 +/* __REISER4_TAIL_H__ */
55443 +#endif
55444 +
55445 +/* Make Linus happy.
55446 +   Local variables:
55447 +   c-indentation-style: "K&R"
55448 +   mode-name: "LC"
55449 +   c-basic-offset: 8
55450 +   tab-width: 8
55451 +   fill-column: 120
55452 +   scroll-step: 1
55453 +   End:
55454 +*/
55455 diff --git a/fs/reiser4/plugin/node/Makefile b/fs/reiser4/plugin/node/Makefile
55456 new file mode 100644
55457 index 0000000..9400627
55458 --- /dev/null
55459 +++ b/fs/reiser4/plugin/node/Makefile
55460 @@ -0,0 +1,5 @@
55461 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
55462 +
55463 +node_plugins-objs :=   \
55464 +       node.o          \
55465 +       node40.o
55466 diff --git a/fs/reiser4/plugin/node/node.c b/fs/reiser4/plugin/node/node.c
55467 new file mode 100644
55468 index 0000000..39ee37c
55469 --- /dev/null
55470 +++ b/fs/reiser4/plugin/node/node.c
55471 @@ -0,0 +1,131 @@
55472 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55473 +
55474 +/* Node plugin interface.
55475 +
55476 +   Description: The tree provides the abstraction of flows, which it
55477 +   internally fragments into items which it stores in nodes.
55478 +
55479 +   A key_atom is a piece of data bound to a single key.
55480 +
55481 +   For reasonable space efficiency to be achieved it is often
55482 +   necessary to store key_atoms in the nodes in the form of items, where
55483 +   an item is a sequence of key_atoms of the same or similar type. It is
55484 +   more space-efficient, because the item can implement (very)
55485 +   efficient compression of key_atom's bodies using internal knowledge
55486 +   about their semantics, and it can often avoid having a key for each
55487 +   key_atom. Each type of item has specific operations implemented by its
55488 +   item handler (see balance.c).
55489 +
55490 +   Rationale: the rest of the code (specifically balancing routines)
55491 +   accesses leaf level nodes through this interface. This way we can
55492 +   implement various block layouts and even combine various layouts
55493 +   within the same tree. Balancing/allocating algorithms should not
55494 +   care about peculiarities of splitting/merging specific item types,
55495 +   but rather should leave that to the item's item handler.
55496 +
55497 +   Items, including those that provide the abstraction of flows, have
55498 +   the property that if you move them in part or in whole to another
55499 +   node, the balancing code invokes their is_left_mergeable()
55500 +   item_operation to determine if they are mergeable with their new
55501 +   neighbor in the node you have moved them to.  For some items the
55502 +   is_left_mergeable() function always returns null.
55503 +
55504 +   When moving the bodies of items from one node to another:
55505 +
55506 +     if a partial item is shifted to another node the balancing code invokes
55507 +     an item handler method to handle the item splitting.
55508 +
55509 +     if the balancing code needs to merge with an item in the node it
55510 +     is shifting to, it will invoke an item handler method to handle
55511 +     the item merging.
55512 +
55513 +     if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
55514 +     adjusting the item headers after the move is done using the node handler.
55515 +*/
55516 +
55517 +#include "../../forward.h"
55518 +#include "../../debug.h"
55519 +#include "../../key.h"
55520 +#include "../../coord.h"
55521 +#include "../plugin_header.h"
55522 +#include "../item/item.h"
55523 +#include "node.h"
55524 +#include "../plugin.h"
55525 +#include "../../znode.h"
55526 +#include "../../tree.h"
55527 +#include "../../super.h"
55528 +#include "../../reiser4.h"
55529 +
55530 +/**
55531 + * leftmost_key_in_node - get the smallest key in node
55532 + * @node: node to examine; must not be NULL
55533 + * @key: store result here; must not be NULL
55534 + *
55535 + * Stores the leftmost key of @node in @key, or *max_key() if @node is empty. Returns @key.
55536 + */
55537 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55538 +{
55539 +       assert("nikita-1634", node != NULL);
55540 +       assert("nikita-1635", key != NULL);
55541 +
55542 +       if (!node_is_empty(node)) {
55543 +               coord_t first_item;
55544 +               /* the key of the item holding the first unit is the smallest one */
55545 +               coord_init_first_unit(&first_item, (znode *) node);
55546 +               item_key_by_coord(&first_item, key);
55547 +       } else
55548 +               *key = *max_key();
55549 +       return key;
55550 +}
55551 +
55552 +node_plugin node_plugins[LAST_NODE_ID] = { /* node plugin table indexed by plugin id; only NODE40_ID is filled in here */
55553 +       [NODE40_ID] = {
55554 +               .h = {
55555 +                       .type_id = REISER4_NODE_PLUGIN_TYPE,
55556 +                       .id = NODE40_ID,
55557 +                       .pops = NULL,
55558 +                       .label = "unified",
55559 +                       .desc = "unified node layout",
55560 +                       .linkage = {NULL, NULL}
55561 +               },
55562 +               .item_overhead = item_overhead_node40,
55563 +               .free_space = free_space_node40,
55564 +               .lookup = lookup_node40,
55565 +               .num_of_items = num_of_items_node40,
55566 +               .item_by_coord = item_by_coord_node40,
55567 +               .length_by_coord = length_by_coord_node40,
55568 +               .plugin_by_coord = plugin_by_coord_node40,
55569 +               .key_at = key_at_node40,
55570 +               .estimate = estimate_node40,
55571 +               .check = check_node40,
55572 +               .parse = parse_node40,
55573 +               .init = init_node40,
55574 +#ifdef GUESS_EXISTS /* ->guess is initialized only when GUESS_EXISTS is defined */
55575 +               .guess = guess_node40,
55576 +#endif
55577 +               .change_item_size = change_item_size_node40,
55578 +               .create_item = create_item_node40,
55579 +               .update_item_key = update_item_key_node40,
55580 +               .cut_and_kill = kill_node40,
55581 +               .cut = cut_node40,
55582 +               .shift = shift_node40,
55583 +               .shrink_item = shrink_item_node40,
55584 +               .fast_insert = fast_insert_node40,
55585 +               .fast_paste = fast_paste_node40,
55586 +               .fast_cut = fast_cut_node40,
55587 +               .max_item_size = max_item_size_node40,
55588 +               .prepare_removal = prepare_removal_node40,
55589 +               .set_item_plugin = set_item_plugin_node40
55590 +       }
55591 +};
55592 +
55593 +/*
55594 +   Local variables:
55595 +   c-indentation-style: "K&R"
55596 +   mode-name: "LC"
55597 +   c-basic-offset: 8
55598 +   tab-width: 8
55599 +   fill-column: 120
55600 +   scroll-step: 1
55601 +   End:
55602 +*/
55603 diff --git a/fs/reiser4/plugin/node/node.h b/fs/reiser4/plugin/node/node.h
55604 new file mode 100644
55605 index 0000000..af0c641
55606 --- /dev/null
55607 +++ b/fs/reiser4/plugin/node/node.h
55608 @@ -0,0 +1,272 @@
55609 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55610 +
55611 +/* We need a definition of the default node layout here. */
55612 +
55613 +/* Generally speaking, it is best to have free space in the middle of the
55614 +   node so that two sets of things can grow towards it, and to have the
55615 +   item bodies on the left so that the last one of them grows into free
55616 +   space.  We optimize for the case where we append new items to the end
55617 +   of the node, or grow the last item, because it hurts nothing to so
55618 +   optimize and it is a common special case to do massive insertions in
55619 +   increasing key order (and one of cases more likely to have a real user
55620 +   notice the delay time for).
55621 +
55622 +   formatted leaf default layout: (ldef1)
55623 +
55624 +   |node header:item bodies:free space:key + pluginid + item offset|
55625 +
55626 +   We grow towards the middle, optimizing layout for the case where we
55627 +   append new items to the end of the node.  The node header is fixed
55628 +   length.  Keys, and item offsets plus pluginids for the items
55629 +   corresponding to them are in increasing key order, and are fixed
55630 +   length.  Item offsets are relative to start of node (16 bits creating
55631 +   a node size limit of 64k, 12 bits might be a better choice....).  Item
55632 +   bodies are in decreasing key order.  Item bodies have a variable size.
55633 +   There is a one to one to one mapping of keys to item offsets to item
55634 +   bodies.  Item offsets consist of pointers to the zeroth byte of the
55635 +   item body.  Item length equals the start of the next item minus the
55636 +   start of this item, except the zeroth item whose length equals the end
55637 +   of the node minus the start of that item (plus a byte).  In other
55638 +   words, the item length is not recorded anywhere, and it does not need
55639 +   to be since it is computable.
55640 +
55641 +   Leaf variable length items and keys layout : (lvar)
55642 +
55643 +   |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55644 +
55645 +   We grow towards the middle, optimizing layout for the case where we
55646 +   append new items to the end of the node.  The node header is fixed
55647 +   length.  Keys and item offsets for the items corresponding to them are
55648 +   in increasing key order, and keys are variable length.  Item offsets
55649 +   are relative to start of node (16 bits).  Item bodies are in
55650 +   decreasing key order.  Item bodies have a variable size.  There is a
55651 +   one to one to one mapping of keys to item offsets to item bodies.
55652 +   Item offsets consist of pointers to the zeroth byte of the item body.
55653 +   Item length equals the start of the next item's key minus the start of
55654 +   this item, except the zeroth item whose length equals the end of the
55655 +   node minus the start of that item (plus a byte).
55656 +
55657 +   leaf compressed keys layout: (lcomp)
55658 +
55659 +   |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55660 +
55661 +   We grow towards the middle, optimizing layout for the case where we
55662 +   append new items to the end of the node.  The node header is fixed
55663 +   length.  Keys and item offsets for the items corresponding to them are
55664 +   in increasing key order, and keys are variable length.  The "key
55665 +   inherit" field indicates how much of the key prefix is identical to
55666 +   the previous key (stem compression as described in "Managing
55667 +   Gigabytes" is used).  key_inherit is a one byte integer.  The
55668 +   intra-node searches performed through this layout are linear searches,
55669 +   and this is theorized to not hurt performance much due to the high
55670 +   cost of processor stalls on modern CPUs, and the small number of keys
55671 +   in a single node.  Item offsets are relative to start of node (16
55672 +   bits).  Item bodies are in decreasing key order.  Item bodies have a
55673 +   variable size.  There is a one to one to one mapping of keys to item
55674 +   offsets to item bodies.  Item offsets consist of pointers to the
55675 +   zeroth byte of the item body.  Item length equals the start of the
55676 +   next item minus the start of this item, except the zeroth item whose
55677 +   length equals the end of the node minus the start of that item (plus a
55678 +   byte).  In other words, item length and key length is not recorded
55679 +   anywhere, and it does not need to be since it is computable.
55680 +
55681 +   internal node default layout: (idef1)
55682 +
55683 +   just like ldef1 except that item bodies are either blocknrs of
55684 +   children or extents, and moving them may require updating parent
55685 +   pointers in the nodes that they point to.
55686 +*/
55687 +
55688 +/* There is an inherent 3-way tradeoff between optimizing and
55689 +   exchanging disks between different architectures and code
55690 +   complexity.  This is optimal and simple and inexchangeable.
55691 +   Someone else can do the code for exchanging disks and make it
55692 +   complex. It would not be that hard.  Using other than the PAGE_SIZE
55693 +   might be suboptimal.
55694 +*/
55695 +
55696 +#if !defined( __REISER4_NODE_H__ )
55697 +#define __REISER4_NODE_H__
55698 +
55699 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55700 +
55701 +#include "../../dformat.h"
55702 +#include "../plugin_header.h"
55703 +
55704 +#include <linux/types.h>
55705 +
55706 +typedef enum { /* result type of the node plugin ->lookup() method */
55707 +       NS_FOUND = 0,
55708 +       NS_NOT_FOUND = -ENOENT
55709 +} node_search_result;
55710 +
55711 +/* Maximal possible space overhead for creation of new item in a node */
55712 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55713 +
55714 +typedef enum { /* bit flags, one bit each; presumably for the node plugin ->check() method -- TODO confirm */
55715 +       REISER4_NODE_DKEYS = (1 << 0),
55716 +       REISER4_NODE_TREE_STABLE = (1 << 1)
55717 +} reiser4_node_check_flag;
55718 +
55719 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on stack */
55720 +struct cut_list {
55721 +       coord_t *from;
55722 +       coord_t *to;
55723 +       const reiser4_key *from_key;
55724 +       const reiser4_key *to_key;
55725 +       reiser4_key *smallest_removed;
55726 +       carry_plugin_info *info;
55727 +       __u32 flags;
55728 +       struct inode *inode;    /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55729 +       lock_handle *left;
55730 +       lock_handle *right;
55731 +};
55732 +
55733 +struct carry_cut_data;
55734 +struct carry_kill_data;
55735 +
55736 +/* The responsibility of the node plugin is to store and give access
55737 +   to the sequence of items within the node.  */
55738 +typedef struct node_plugin {
55739 +       /* generic plugin fields */
55740 +       plugin_header h;
55741 +
55742 +       /* calculates the amount of space that will be required to store an
55743 +          item which is in addition to the space consumed by the item body.
55744 +          (the space consumed by the item body can be gotten by calling
55745 +          item->estimate) */
55746 +        size_t(*item_overhead) (const znode * node, flow_t * f);
55747 +
55748 +       /* returns free space by looking into node (i.e., without using
55749 +          znode->free_space). */
55750 +        size_t(*free_space) (znode * node);
55751 +       /* search within the node for the one item which might
55752 +          contain the key, invoking item->search_within to search within
55753 +          that item to see if it is in there */
55754 +        node_search_result(*lookup) (znode * node, const reiser4_key * key,
55755 +                                     lookup_bias bias, coord_t * coord);
55756 +       /* number of items in node */
55757 +       int (*num_of_items) (const znode * node);
55758 +
55759 +       /* store information about item in @coord in @data */
55760 +       /* break into several node ops, don't add any more uses of this before doing so */
55761 +       /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55762 +       char *(*item_by_coord) (const coord_t * coord);
55763 +       int (*length_by_coord) (const coord_t * coord);
55764 +       item_plugin *(*plugin_by_coord) (const coord_t * coord);
55765 +
55766 +       /* store item key in @key */
55767 +       reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55768 +       /* conservatively estimate whether unit of what size can fit
55769 +          into node. This estimation should be performed without
55770 +          actually looking into the node's content (free space is saved in
55771 +          znode). */
55772 +        size_t(*estimate) (znode * node);
55773 +
55774 +       /* performs every consistency check the node plugin author could
55775 +          imagine. Optional. */
55776 +       int (*check) (const znode * node, __u32 flags, const char **error);
55777 +
55778 +       /* Called when node is read into memory and node plugin is
55779 +          already detected. This should read some data into znode (like free
55780 +          space counter) and, optionally, check data consistency.
55781 +        */
55782 +       int (*parse) (znode * node);
55783 +       /* This method is called on a new node to initialise plugin specific
55784 +          data (header, etc.) */
55785 +       int (*init) (znode * node);
55786 +       /* Check whether @node content conforms to this plugin format.
55787 +          Probably only useful after support for old V3.x formats is added.
55788 +          Uncomment after 4.0 only.
55789 +        */
55790 +       /*      int ( *guess )( const znode *node ); */
55791 +#if REISER4_DEBUG
55792 +       void (*print) (const char *prefix, const znode * node, __u32 flags);
55793 +#endif
55794 +       /* change size of @item by @by bytes. @item->node has enough free
55795 +          space. When @by > 0 - free space is appended to end of item. When
55796 +          @by < 0 - item is truncated - it is assumed that last @by bytes of
55797 +          the item are freed already */
55798 +       void (*change_item_size) (coord_t * item, int by);
55799 +
55800 +       /* create new item @length bytes long in coord @target */
55801 +       int (*create_item) (coord_t * target, const reiser4_key * key,
55802 +                           reiser4_item_data * data, carry_plugin_info * info);
55803 +
55804 +       /* update key of item. */
55805 +       void (*update_item_key) (coord_t * target, const reiser4_key * key,
55806 +                                carry_plugin_info * info);
55807 +
55808 +       int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
55809 +       int (*cut) (struct carry_cut_data *, carry_plugin_info *);
55810 +
55811 +       /*
55812 +        * shrink item pointed to by @coord by @delta bytes.
55813 +        */
55814 +       int (*shrink_item) (coord_t * coord, int delta);
55815 +
55816 +       /* copy as much as possible but not more than up to @stop from
55817 +          @stop->node to @target. If (pend == append) then data from beginning of
55818 +          @stop->node are copied to the end of @target. If (pend == prepend) then
55819 +          data from the end of @stop->node are copied to the beginning of
55820 +          @target. Copied data are removed from @stop->node. Information
55821 +          about what to do on upper level is stored in @todo */
55822 +       int (*shift) (coord_t * stop, znode * target, shift_direction pend,
55823 +                     int delete_node, int including_insert_coord,
55824 +                     carry_plugin_info * info);
55825 +       /* return true if this node allows skip carry() in some situations
55826 +          (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
55827 +          emulation doesn't.
55828 +
55829 +          This will speed up insertions that don't require updates to the
55830 +          parent, by bypassing initialisation of carry() structures. It's
55831 +          believed that the majority of insertions will fit there.
55832 +
55833 +        */
55834 +       int (*fast_insert) (const coord_t * coord);
55835 +       int (*fast_paste) (const coord_t * coord);
55836 +       int (*fast_cut) (const coord_t * coord);
55837 +       /* this limits max size of item which can be inserted into a node and
55838 +          number of bytes item in a node may be appended with */
55839 +       int (*max_item_size) (void);
55840 +       int (*prepare_removal) (znode * empty, carry_plugin_info * info);
55841 +       /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
55842 +        * files */
55843 +       int (*set_item_plugin) (coord_t * coord, item_id);
55844 +} node_plugin;
55845 +
55846 +typedef enum {
55847 +       /* standard unified node layout used for both leaf and internal
55848 +          nodes */
55849 +       NODE40_ID,
55850 +       LAST_NODE_ID
55851 +} reiser4_node_id;
55852 +
55853 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
55854 +#if REISER4_DEBUG
55855 +extern void print_node_content(const char *prefix, const znode * node,
55856 +                              __u32 flags);
55857 +#endif
55858 +
55859 +extern void indent_znode(const znode * node);
55860 +
55861 +typedef struct common_node_header {
55862 +       /*
55863 +        * identifier of node plugin. Must be located at the very beginning of
55864 +        * a node.
55865 +        */
55866 +       __le16 plugin_id;
55867 +} common_node_header;
55868 +
55869 +/* __REISER4_NODE_H__ */
55870 +#endif
55871 +/*
55872 + * Local variables:
55873 + * c-indentation-style: "K&R"
55874 + * mode-name: "LC"
55875 + * c-basic-offset: 8
55876 + * tab-width: 8
55877 + * fill-column: 79
55878 + * scroll-step: 1
55879 + * End:
55880 + */
55881 diff --git a/fs/reiser4/plugin/node/node40.c b/fs/reiser4/plugin/node/node40.c
55882 new file mode 100644
55883 index 0000000..33c4a7e
55884 --- /dev/null
55885 +++ b/fs/reiser4/plugin/node/node40.c
55886 @@ -0,0 +1,2924 @@
55887 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55888 +
55889 +#include "../../debug.h"
55890 +#include "../../key.h"
55891 +#include "../../coord.h"
55892 +#include "../plugin_header.h"
55893 +#include "../item/item.h"
55894 +#include "node.h"
55895 +#include "node40.h"
55896 +#include "../plugin.h"
55897 +#include "../../jnode.h"
55898 +#include "../../znode.h"
55899 +#include "../../pool.h"
55900 +#include "../../carry.h"
55901 +#include "../../tap.h"
55902 +#include "../../tree.h"
55903 +#include "../../super.h"
55904 +#include "../../reiser4.h"
55905 +
55906 +#include <asm/uaccess.h>
55907 +#include <linux/types.h>
55908 +#include <linux/prefetch.h>
55909 +
55910 +/* leaf 40 format:
55911 +
55912 +  [node header | item 0, item 1, .., item N-1 |  free space | item_head N-1, .. item_head 1, item head 0 ]
55913 +   plugin_id (16)                                                key
55914 +   free_space (16)                                               pluginid (16)
55915 +   free_space_start (16)                                         offset (16)
55916 +   level (8)
55917 +   num_items (16)
55918 +   magic (32)
55919 +   flush_time (32)
55920 +*/
55921 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs.  Change to "ReIs". */
55922 +/* magic number that is stored in ->magic field of node header */
55923 +static const __u32 REISER4_NODE_MAGIC = 0x52344653;    /* (*(__u32 *)"R4FS"); */
55924 +
55925 +static int prepare_for_update(znode * left, znode * right,
55926 +                             carry_plugin_info * info);
55927 +
55928 +/* header of node of reiser40 format is at the beginning of node */
55929 +static inline node40_header *node40_node_header(const znode * node     /* node to
55930 +                                                                        * query */ )
55931 +{
55932 +       assert("nikita-567", node != NULL);
55933 +       assert("nikita-568", znode_page(node) != NULL);
55934 +       assert("nikita-569", zdata(node) != NULL);
55935 +       return (node40_header *) zdata(node);
55936 +}
55937 +
55938 +/* functions to get/set fields of node40_header */
55939 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
55940 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
55941 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
55942 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
55943 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
55944 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
55945 +
55946 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
55947 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
55948 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
55949 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
55950 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
55951 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
55952 +
55953 +
55954 +/* plugin field of node header should be read/set by
55955 +   plugin_by_disk_id/save_disk_plugin */
55956 +
55957 +/* array of item headers is at the end of node */
55958 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
55959 +{
55960 +       return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
55961 +}
55962 +
55963 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
55964 + */
55965 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
55966 +{
55967 +       return (item_header40 *) (zdata(coord->node) +
55968 +                                 znode_size(coord->node)) - (coord->item_pos) -
55969 +           1;
55970 +}
55971 +
55972 +/* functions to get/set fields of item_header40 */
55973 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
55974 +
55975 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
55976 +
55977 +/* plugin field of item header should be read/set by
55978 +   plugin_by_disk_id/save_disk_plugin */
55979 +
55980 +/* plugin methods */
55981 +
55982 +/* plugin->u.node.item_overhead
55983 +   look for description of this method in plugin/node/node.h */
55984 +size_t
55985 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
55986 +{
55987 +       return sizeof(item_header40);
55988 +}
55989 +
55990 +/* plugin->u.node.free_space
55991 +   look for description of this method in plugin/node/node.h */
55992 +size_t free_space_node40(znode * node)
55993 +{
55994 +       assert("nikita-577", node != NULL);
55995 +       assert("nikita-578", znode_is_loaded(node));
55996 +       assert("nikita-579", zdata(node) != NULL);
55997 +
55998 +       return nh40_get_free_space(node40_node_header(node));
55999 +}
56000 +
56001 +/* private inline version of num_of_items_node40() for use in this file. This
56002 +   is necessary, because address of num_of_items_node40() is taken and it is
56003 +   never inlined as a result. */
56004 +static inline short node40_num_of_items_internal(const znode * node)
56005 +{
56006 +       return nh40_get_num_items(node40_node_header(node));
56007 +}
56008 +
56009 +#if REISER4_DEBUG
56010 +static inline void check_num_items(const znode * node)
56011 +{
56012 +       assert("nikita-2749",
56013 +              node40_num_of_items_internal(node) == node->nr_items);
56014 +       assert("nikita-2746", znode_is_write_locked(node));
56015 +}
56016 +#else
56017 +#define check_num_items(node) noop
56018 +#endif
56019 +
56020 +/* plugin->u.node.num_of_items
56021 +   look for description of this method in plugin/node/node.h */
56022 +int num_of_items_node40(const znode * node)
56023 +{
56024 +       return node40_num_of_items_internal(node);
56025 +}
56026 +
56027 +static void
56028 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
56029 +{
56030 +       assert("nikita-2751", node != NULL);
56031 +       assert("nikita-2750", nh == node40_node_header(node));
56032 +
56033 +       check_num_items(node);
56034 +       nh40_set_num_items(nh, value);
56035 +       node->nr_items = value;
56036 +       check_num_items(node);
56037 +}
56038 +
56039 +/* plugin->u.node.item_by_coord
56040 +   look for description of this method in plugin/node/node.h */
56041 +char *item_by_coord_node40(const coord_t * coord)
56042 +{
56043 +       item_header40 *ih;
56044 +       char *p;
56045 +
56046 +       /* @coord is set to existing item */
56047 +       assert("nikita-596", coord != NULL);
56048 +       assert("vs-255", coord_is_existing_item(coord));
56049 +
56050 +       ih = node40_ih_at_coord(coord);
56051 +       p = zdata(coord->node) + ih40_get_offset(ih);
56052 +       return p;
56053 +}
56054 +
56055 +/* plugin->u.node.length_by_coord
56056 +   look for description of this method in plugin/node/node.h */
56057 +int length_by_coord_node40(const coord_t * coord)
56058 +{
56059 +       item_header40 *ih;
56060 +       int result;
56061 +
56062 +       /* @coord is set to existing item */
56063 +       assert("vs-256", coord != NULL);
56064 +       assert("vs-257", coord_is_existing_item(coord));
56065 +
56066 +       ih = node40_ih_at_coord(coord);
56067 +       if ((int)coord->item_pos ==
56068 +           node40_num_of_items_internal(coord->node) - 1)
56069 +               result =
56070 +                   nh40_get_free_space_start(node40_node_header(coord->node)) -
56071 +                   ih40_get_offset(ih);
56072 +       else
56073 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
56074 +
56075 +       return result;
56076 +}
56077 +
56078 +static pos_in_node_t
56079 +node40_item_length(const znode * node, pos_in_node_t item_pos)
56080 +{
56081 +       item_header40 *ih;
56082 +       pos_in_node_t result;
56083 +
56084 +       /* returns byte length of item @item_pos, an existing item of @node */
56085 +       assert("vs-256", node != NULL);
56086 +       assert("vs-257", node40_num_of_items_internal(node) > item_pos);
56087 +
56088 +       ih = node40_ih_at(node, item_pos);
56089 +       if (item_pos == node40_num_of_items_internal(node) - 1) /* last item */
56090 +               result =
56091 +                   nh40_get_free_space_start(node40_node_header(node)) -
56092 +                   ih40_get_offset(ih);
56093 +       else    /* (ih - 1) is the header of item @item_pos + 1 */
56094 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
56095 +
56096 +       return result;
56097 +}
56098 +
56099 +/* plugin->u.node.plugin_by_coord
56100 +   look for description of this method in plugin/node/node.h */
56101 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
56102 +{
56103 +       item_header40 *ih;      /* header of the item @coord is set to */
56104 +       item_plugin *result;
56105 +
56106 +       /* @coord is set to existing item */
56107 +       assert("vs-258", coord != NULL);
56108 +       assert("vs-259", coord_is_existing_item(coord));
56109 +
56110 +       ih = node40_ih_at_coord(coord);
56111 +       /* pass NULL instead of current tree. This is a time-critical call. */
56112 +       result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
56113 +       return result;
56114 +}
56115 +
56116 +/* plugin->u.node.key_at
56117 +   look for description of this method in plugin/node/node.h */
56118 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
56119 +{
56120 +       item_header40 *ih;
56121 +
56122 +       assert("nikita-1765", coord_is_existing_item(coord));
56123 +
56124 +       /* @coord is set to existing item */
56125 +       ih = node40_ih_at_coord(coord);
56126 +       memcpy(key, &ih->key, sizeof(reiser4_key));
56127 +       return key;
56128 +}
56129 +
56130 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
56131 +
56132 +#define NODE_INCSTAT(n, counter)                                               \
56133 +       reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
56134 +
56135 +#define NODE_ADDSTAT(n, counter, val)                                          \
56136 +       reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
56137 +
56138 +/* plugin->u.node.lookup
56139 +   look for description of this method in plugin/node/node.h */
56140 +node_search_result lookup_node40(znode * node /* node to query */ ,
56141 +                                const reiser4_key * key /* key to look for */ ,
56142 +                                lookup_bias bias /* search bias */ ,
56143 +                                coord_t * coord /* resulting coord */ )
56144 +{
56145 +       int left;
56146 +       int right;
56147 +       int found;
56148 +       int items;
56149 +
56150 +       item_header40 *lefth;
56151 +       item_header40 *righth;
56152 +
56153 +       item_plugin *iplug;
56154 +       item_header40 *bstop;
56155 +       item_header40 *ih;
56156 +       cmp_t order;
56157 +
56158 +       assert("nikita-583", node != NULL);
56159 +       assert("nikita-584", key != NULL);
56160 +       assert("nikita-585", coord != NULL);
56161 +       assert("nikita-2693", znode_is_any_locked(node));
56162 +       cassert(REISER4_SEQ_SEARCH_BREAK > 2);
56163 +
56164 +       items = node_num_items(node);
56165 +
56166 +       if (unlikely(items == 0)) {
56167 +               coord_init_first_unit(coord, node);
56168 +               return NS_NOT_FOUND;
56169 +       }
56170 +
56171 +       /* binary search for item that can contain given key */
56172 +       left = 0;
56173 +       right = items - 1;
56174 +       coord->node = node;
56175 +       coord_clear_iplug(coord);
56176 +       found = 0;
56177 +
56178 +       lefth = node40_ih_at(node, left);
56179 +       righth = node40_ih_at(node, right);
56180 +
56181 +       /* It is known that for small arrays sequential search is on average
56182 +          more efficient than binary. This is because sequential search is
56183 +          coded as tight loop that can be better optimized by compilers and
56184 +          for small array size gain from this optimization makes sequential
56185 +          search the winner. Another, maybe more important, reason for this,
56186 +          is that sequential array is more CPU cache friendly, whereas binary
56187 +          search effectively destroys CPU caching.
56188 +
56189 +          Critical here is the notion of "smallness". Reasonable value of
56190 +          REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
56191 +          fs/reiser4/ulevel/ulevel.c:test_search().
56192 +
56193 +          Don't try to further optimize sequential search by scanning from
56194 +          right to left in attempt to use more efficient loop termination
56195 +          condition (comparison with 0). This doesn't work.
56196 +
56197 +        */
56198 +
56199 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
56200 +               int median;
56201 +               item_header40 *medianh;
56202 +
56203 +               median = (left + right) / 2;
56204 +               medianh = node40_ih_at(node, median);
56205 +
56206 +               assert("nikita-1084", median >= 0);
56207 +               assert("nikita-1085", median < items);
56208 +               switch (keycmp(key, &medianh->key)) {
56209 +               case LESS_THAN:
56210 +                       right = median;
56211 +                       righth = medianh;
56212 +                       break;
56213 +               default:
56214 +                       wrong_return_value("nikita-586", "keycmp");
56215 +               case GREATER_THAN:
56216 +                       left = median;
56217 +                       lefth = medianh;
56218 +                       break;
56219 +               case EQUAL_TO:
56220 +                       do {
56221 +                               --median;
56222 +                               /* headers are ordered from right to left */
56223 +                               ++medianh;
56224 +                       } while (median >= 0 && keyeq(key, &medianh->key));
56225 +                       right = left = median + 1;
56226 +                       ih = lefth = righth = medianh - 1;
56227 +                       found = 1;
56228 +                       break;
56229 +               }
56230 +       }
56231 +       /* sequential scan. Item headers, and, therefore, keys are stored at
56232 +          the rightmost part of a node from right to left. We are trying to
56233 +          access memory from left to right, and hence, scan in _descending_
56234 +          order of item numbers.
56235 +        */
56236 +       if (!found) {
56237 +               for (left = right, ih = righth; left >= 0; ++ih, --left) {
56238 +                       cmp_t comparison;
56239 +
56240 +                       prefetchkey(&(ih + 1)->key);
56241 +                       comparison = keycmp(&ih->key, key);
56242 +                       if (comparison == GREATER_THAN)
56243 +                               continue;
56244 +                       if (comparison == EQUAL_TO) {
56245 +                               found = 1;
56246 +                               do {
56247 +                                       --left;
56248 +                                       ++ih;
56249 +                               } while (left >= 0 && keyeq(&ih->key, key));
56250 +                               ++left;
56251 +                               --ih;
56252 +                       } else {
56253 +                               assert("nikita-1256", comparison == LESS_THAN);
56254 +                       }
56255 +                       break;
56256 +               }
56257 +               if (unlikely(left < 0))
56258 +                       left = 0;
56259 +       }
56260 +
56261 +       assert("nikita-3212", right >= left);
56262 +       assert("nikita-3214",
56263 +              equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
56264 +
56265 +       coord_set_item_pos(coord, left);
56266 +       coord->unit_pos = 0;
56267 +       coord->between = AT_UNIT;
56268 +
56269 +       /* key < leftmost key in a node, or node is corrupted and keys
56270 +          are not sorted  */
56271 +       bstop = node40_ih_at(node, (unsigned)left);
56272 +       order = keycmp(&bstop->key, key);
56273 +       if (unlikely(order == GREATER_THAN)) {
56274 +               if (unlikely(left != 0)) {
56275 +                       /* screw up */
56276 +                       warning("nikita-587", "Key less than %i key in a node",
56277 +                               left);
56278 +                       print_key("key", key);
56279 +                       print_key("min", &bstop->key);
56280 +                       print_coord_content("coord", coord);
56281 +                       return RETERR(-EIO);
56282 +               } else {
56283 +                       coord->between = BEFORE_UNIT;
56284 +                       return NS_NOT_FOUND;
56285 +               }
56286 +       }
56287 +       /* left <= key, ok */
56288 +       iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
56289 +
56290 +       if (unlikely(iplug == NULL)) {
56291 +               warning("nikita-588", "Unknown plugin %i",
56292 +                       le16_to_cpu(get_unaligned(&bstop->plugin_id)));
56293 +               print_key("key", key);
56294 +               print_coord_content("coord", coord);
56295 +               return RETERR(-EIO);
56296 +       }
56297 +
56298 +       coord_set_iplug(coord, iplug);
56299 +
56300 +       /* if exact key from item header was found by binary search, no
56301 +          further checks are necessary. */
56302 +       if (found) {
56303 +               assert("nikita-1259", order == EQUAL_TO);
56304 +               return NS_FOUND;
56305 +       }
56306 +       if (iplug->b.max_key_inside != NULL) {
56307 +               reiser4_key max_item_key;
56308 +
56309 +               /* key > max_item_key --- outside of an item */
56310 +               if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
56311 +                       coord->unit_pos = 0;
56312 +                       coord->between = AFTER_ITEM;
56313 +                       /* FIXME-VS: key we are looking for does not fit into
56314 +                          found item. Return NS_NOT_FOUND then. Without that
56315 +                          the following case does not work: there is extent of
56316 +                          file 10000, 10001. File 10000, 10002 has been just
56317 +                          created. When writing to position 0 in that file -
56318 +                          traverse_tree will stop here on twig level. When we
56319 +                          want it to go down to leaf level
56320 +                        */
56321 +                       return NS_NOT_FOUND;
56322 +               }
56323 +       }
56324 +
56325 +       if (iplug->b.lookup != NULL) {
56326 +               return iplug->b.lookup(key, bias, coord);
56327 +       } else {
56328 +               assert("nikita-1260", order == LESS_THAN);
56329 +               coord->between = AFTER_UNIT;
56330 +               return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
56331 +       }
56332 +}
56333 +
56334 +#undef NODE_ADDSTAT
56335 +#undef NODE_INCSTAT
56336 +
56337 +/* plugin->u.node.estimate: free space of @node less one item header, or 0
56338 +   if the header does not fit; method is described in plugin/node/node.h */
56339 +size_t estimate_node40(znode * node)
56340 +{
56341 +       size_t result;
56342 +
56343 +       assert("nikita-597", node != NULL);
56344 +       /* size_t is unsigned: compare first, a plain subtraction wraps */
56345 +       result = free_space_node40(node);
56346 +       return (result > sizeof(item_header40)) ?
56347 +               result - sizeof(item_header40) : 0;
56348 +}
56349 +
56350 +/* plugin->u.node.check
56351 +   look for description of this method in plugin/node/node.h */
56352 +int check_node40(const znode * node /* node to check */ ,
56353 +                __u32 flags /* check flags */ ,
56354 +                const char **error /* where to store error message */ )
56355 +{
56356 +       int nr_items;
56357 +       int i;
56358 +       reiser4_key prev;
56359 +       unsigned old_offset;
56360 +       tree_level level;
56361 +       coord_t coord;
56362 +       int result;
56363 +
56364 +       assert("nikita-580", node != NULL);
56365 +       assert("nikita-581", error != NULL);
56366 +       assert("nikita-2948", znode_is_loaded(node));
56367 +
56368 +       if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
56369 +               return 0;
56370 +
56371 +       assert("nikita-582", zdata(node) != NULL);
56372 +
56373 +       nr_items = node40_num_of_items_internal(node);
56374 +       if (nr_items < 0) {
56375 +               *error = "Negative number of items";
56376 +               return -1;
56377 +       }
56378 +
56379 +       if (flags & REISER4_NODE_DKEYS)
56380 +               prev = *znode_get_ld_key((znode *) node);
56381 +       else
56382 +               prev = *min_key();
56383 +
56384 +       old_offset = 0;
56385 +       coord_init_zero(&coord);
56386 +       coord.node = (znode *) node;
56387 +       coord.unit_pos = 0;
56388 +       coord.between = AT_UNIT;
56389 +       level = znode_get_level(node);
56390 +       for (i = 0; i < nr_items; i++) {
56391 +               item_header40 *ih;
56392 +               reiser4_key unit_key;
56393 +               unsigned j;
56394 +
56395 +               ih = node40_ih_at(node, (unsigned)i);
56396 +               coord_set_item_pos(&coord, i);
56397 +               if ((ih40_get_offset(ih) >=
56398 +                    znode_size(node) - nr_items * sizeof(item_header40)) ||
56399 +                   (ih40_get_offset(ih) < sizeof(node40_header))) {
56400 +                       *error = "Offset is out of bounds";
56401 +                       return -1;
56402 +               }
56403 +               if (ih40_get_offset(ih) <= old_offset) {
56404 +                       *error = "Offsets are in wrong order";
56405 +                       return -1;
56406 +               }
56407 +               if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
56408 +                       *error = "Wrong offset of first item";
56409 +                       return -1;
56410 +               }
56411 +               old_offset = ih40_get_offset(ih);
56412 +
56413 +               if (keygt(&prev, &ih->key)) {
56414 +                       *error = "Keys are in wrong order";
56415 +                       return -1;
56416 +               }
56417 +               if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
56418 +                       *error = "Wrong key of first unit";
56419 +                       return -1;
56420 +               }
56421 +               prev = ih->key;
56422 +               for (j = 0; j < coord_num_units(&coord); ++j) {
56423 +                       coord.unit_pos = j;
56424 +                       unit_key_by_coord(&coord, &unit_key);
56425 +                       if (keygt(&prev, &unit_key)) {
56426 +                               *error = "Unit keys are in wrong order";
56427 +                               return -1;
56428 +                       }
56429 +                       prev = unit_key;
56430 +               }
56431 +               coord.unit_pos = 0;
56432 +               if (level != TWIG_LEVEL && item_is_extent(&coord)) {
56433 +                       *error = "extent on the wrong level";
56434 +                       return -1;
56435 +               }
56436 +               if (level == LEAF_LEVEL && item_is_internal(&coord)) {
56437 +                       *error = "internal item on the wrong level";
56438 +                       return -1;
56439 +               }
56440 +               if (level != LEAF_LEVEL &&
56441 +                   !item_is_internal(&coord) && !item_is_extent(&coord)) {
56442 +                       *error = "wrong item on the internal level";
56443 +                       return -1;
56444 +               }
56445 +               if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
56446 +                       *error = "non-internal item on the internal level";
56447 +                       return -1;
56448 +               }
56449 +#if REISER4_DEBUG
56450 +               if (item_plugin_by_coord(&coord)->b.check
56451 +                   && item_plugin_by_coord(&coord)->b.check(&coord, error))
56452 +                       return -1;
56453 +#endif
56454 +               if (i) {
56455 +                       coord_t prev_coord;
56456 +                       /* two neighboring items can not be mergeable */
56457 +                       coord_dup(&prev_coord, &coord);
56458 +                       coord_prev_item(&prev_coord);
56459 +                       if (are_items_mergeable(&prev_coord, &coord)) {
56460 +                               *error = "mergeable items in one node";
56461 +                               return -1;
56462 +                       }
56463 +
56464 +               }
56465 +       }
56466 +
56467 +       if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
56468 +               coord_t coord;
56469 +               item_plugin *iplug;
56470 +
56471 +               coord_init_last_unit(&coord, node);
56472 +               iplug = item_plugin_by_coord(&coord);
56473 +               if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
56474 +                   iplug->s.file.append_key != NULL) {
56475 +                       reiser4_key mkey;
56476 +
56477 +                       iplug->s.file.append_key(&coord, &mkey);
56478 +                       set_key_offset(&mkey, get_key_offset(&mkey) - 1);
56479 +                       read_lock_dk(current_tree);
56480 +                       result = keygt(&mkey, znode_get_rd_key((znode *) node));
56481 +                       read_unlock_dk(current_tree);
56482 +                       if (result) {
56483 +                               *error = "key of rightmost item is too large";
56484 +                               return -1;
56485 +                       }
56486 +               }
56487 +       }
56488 +       if (flags & REISER4_NODE_DKEYS) {
56489 +               read_lock_tree(current_tree);
56490 +               read_lock_dk(current_tree);
56491 +
56492 +               flags |= REISER4_NODE_TREE_STABLE;
56493 +
56494 +               if (keygt(&prev, znode_get_rd_key((znode *) node))) {
56495 +                       if (flags & REISER4_NODE_TREE_STABLE) {
56496 +                               *error = "Last key is greater than rdkey";
56497 +                               read_unlock_dk(current_tree);
56498 +                               read_unlock_tree(current_tree);
56499 +                               return -1;
56500 +                       }
56501 +               }
56502 +               if (keygt
56503 +                   (znode_get_ld_key((znode *) node),
56504 +                    znode_get_rd_key((znode *) node))) {
56505 +                       *error = "ldkey is greater than rdkey";
56506 +                       read_unlock_dk(current_tree);
56507 +                       read_unlock_tree(current_tree);
56508 +                       return -1;
56509 +               }
56510 +               if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
56511 +                   (node->left != NULL) &&
56512 +                   !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
56513 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
56514 +                        !keyeq(znode_get_rd_key(node->left),
56515 +                               znode_get_ld_key((znode *) node)))
56516 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56517 +                           keygt(znode_get_rd_key(node->left),
56518 +                                 znode_get_ld_key((znode *) node)))) {
56519 +                       *error = "left rdkey or ldkey is wrong";
56520 +                       read_unlock_dk(current_tree);
56521 +                       read_unlock_tree(current_tree);
56522 +                       return -1;
56523 +               }
56524 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
56525 +                   (node->right != NULL) &&
56526 +                   !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
56527 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
56528 +                        !keyeq(znode_get_rd_key((znode *) node),
56529 +                               znode_get_ld_key(node->right)))
56530 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56531 +                           keygt(znode_get_rd_key((znode *) node),
56532 +                                 znode_get_ld_key(node->right)))) {
56533 +                       *error = "rdkey or right ldkey is wrong";
56534 +                       read_unlock_dk(current_tree);
56535 +                       read_unlock_tree(current_tree);
56536 +                       return -1;
56537 +               }
56538 +
56539 +               read_unlock_dk(current_tree);
56540 +               read_unlock_tree(current_tree);
56541 +       }
56542 +
56543 +       return 0;
56544 +}
56545 +
56546 +/* plugin->u.node.parse: check level and magic stored in the node40 header of @node and, if both match, cache the item
56547 +   count in @node->nr_items. Look for description of this method in plugin/node/node.h */
56548 +int parse_node40(znode * node /* node to parse */ )
56549 +{
56550 +       node40_header *header;
56551 +       int result;
56552 +       d8 level;
56553 +
56554 +       header = node40_node_header((znode *) node);
56555 +       result = -EIO;  /* stays -EIO unless both header checks below pass */
56556 +       level = nh40_get_level(header);
56557 +       if (unlikely(((__u8) znode_get_level(node)) != level))
56558 +               warning("nikita-494", "Wrong level found in node: %i != %i",
56559 +                       znode_get_level(node), level);
56560 +       else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
56561 +               warning("nikita-495",
56562 +                       "Wrong magic in tree node: want %x, got %x",
56563 +                       REISER4_NODE_MAGIC, nh40_get_magic(header));
56564 +       else {
56565 +               node->nr_items = node40_num_of_items_internal(node);
56566 +               result = 0;
56567 +       }
56568 +       if (unlikely(result != 0))
56569 +               /* debugging hook, currently disabled (empty statement): print_znode("node", node) */ ;
56570 +       return RETERR(result);  /* 0 on success, -EIO (passed through RETERR) on level or magic mismatch */
56571 +}
56572 +
56573 +/* plugin->u.node.init: format the data of @node as an empty node40 (zeroed header, then free space, plugin id, level,
56574 +   magic and mkfs id filled in). Always returns 0. Look for description of this method in plugin/node/node.h */
56575 +int init_node40(znode * node /* node to initialise */ )
56576 +{
56577 +       node40_header *header;
56578 +
56579 +       assert("nikita-570", node != NULL);
56580 +       assert("nikita-572", zdata(node) != NULL);
56581 +
56582 +       header = node40_node_header(node);
56583 +       memset(header, 0, sizeof(node40_header));
56584 +       nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));  /* everything but the header is free */
56585 +       nh40_set_free_space_start(header, sizeof(node40_header));       /* free space begins right after the header */
56586 +       /* sane hypothesis: 0 in CPU format is 0 in disk format */
56587 +       /* items: 0 (left at zero by the memset above) */
56588 +       save_plugin_id(node_plugin_to_plugin(node->nplug),
56589 +                      &header->common_header.plugin_id);
56590 +       nh40_set_level(header, znode_get_level(node));
56591 +       nh40_set_magic(header, REISER4_NODE_MAGIC);
56592 +       node->nr_items = 0;     /* in-memory item count of the znode */
56593 +       nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
56594 +
56595 +       /* flags: 0 (left at zero by the memset above) */
56596 +       return 0;
56597 +}
56598 +
56599 +#ifdef GUESS_EXISTS     /* guess_node40() is only compiled when GUESS_EXISTS is defined */
56600 +int guess_node40(const znode * node /* node to guess plugin of */ )
56601 +{
56602 +       node40_header *nethack;
56603 +
56604 +       assert("nikita-1058", node != NULL);
56605 +       nethack = node40_node_header(node);
56606 +       return  /* 1 iff the header carries the node40 magic and its stored plugin id resolves to NODE40_ID, else 0 */
56607 +           (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
56608 +           (plugin_by_disk_id(znode_get_tree(node),
56609 +                              REISER4_NODE_PLUGIN_TYPE,
56610 +                              &nethack->common_header.plugin_id)->h.id ==
56611 +            NODE40_ID);
56612 +}
56613 +#endif /* GUESS_EXISTS */
56614 +
56615 +/* plugin->u.node.change_item_size: change the size of the item at @coord by @by bytes, shifting all following item bodies.
56616 +   look for description of this method in plugin/node/node.h */
56617 +void change_item_size_node40(coord_t * coord, int by)
56618 +{
56619 +       node40_header *nh;
56620 +       item_header40 *ih;
56621 +       char *item_data;
56622 +       int item_length;
56623 +       unsigned i;
56624 +
56625 +       /* make sure that @coord is coord of existing item */
56626 +       assert("vs-210", coord_is_existing_item(coord));
56627 +
56628 +       nh = node40_node_header(coord->node);
56629 +
56630 +       item_data = item_by_coord_node40(coord);
56631 +       item_length = length_by_coord_node40(coord);
56632 +
56633 +       /* move bodies of all items located after this one by @by bytes; the moved region ends at free space start */
56634 +       ih = node40_ih_at_coord(coord);
56635 +       memmove(item_data + item_length + by, item_data + item_length,
56636 +               nh40_get_free_space_start(node40_node_header(coord->node)) -
56637 +               (ih40_get_offset(ih) + item_length));
56638 +
56639 +       /* update offsets of moved items (every item after @coord->item_pos) */
56640 +       for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
56641 +               ih = node40_ih_at(coord->node, i);
56642 +               ih40_set_offset(ih, ih40_get_offset(ih) + by);
56643 +       }
56644 +
56645 +       /* update node header: free space changes by -@by, free space start by +@by */
56646 +       nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
56647 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
56648 +}
56649 +
56650 +static int should_notify_parent(const znode * node)     /* non-zero iff @node's block differs from the tree's root_block */
56651 +{
56652 +       /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh
56653 +          NOTE(review): as written the result is non-zero when the block is NOT the root block, i.e. the opposite
56654 +          polarity of an "is root" test - confirm against znode_is_root(). */ return !disk_addr_eq(znode_get_block(node),
56655 +                            &znode_get_tree(node)->root_block);
56656 +}
56656 +
56657 +/* plugin->u.node.create_item: insert a new item with key @key, described by @data, at the position given by @target.
56658 +   Always returns 0. Look for description of this method in plugin/node/node.h */
56659 +int
56660 +create_item_node40(coord_t *target, const reiser4_key *key,
56661 +                  reiser4_item_data *data, carry_plugin_info *info)
56662 +{
56663 +       node40_header *nh;
56664 +       item_header40 *ih;
56665 +       unsigned offset;
56666 +       unsigned i;
56667 +
56668 +       nh = node40_node_header(target->node);
56669 +
56670 +       assert("vs-212", coord_is_between_items(target));
56671 +       /* node must have enough free space for the item body and one more item header */
56672 +       assert("vs-254",
56673 +              free_space_node40(target->node) >=
56674 +              data->length + sizeof(item_header40));
56675 +       assert("vs-1410", data->length >= 0);
56676 +
56677 +       if (coord_set_to_right(target))
56678 +               /* there are no items to the right of @target, so the new item
56679 +                  will be inserted after the last one */
56680 +               coord_set_item_pos(target, nh40_get_num_items(nh));
56681 +
56682 +       if (target->item_pos < nh40_get_num_items(nh)) {
56683 +               /* there are items to be moved to prepare space for new
56684 +                  item */
56685 +               ih = node40_ih_at_coord(target);
56686 +               /* new item will start at this offset */
56687 +               offset = ih40_get_offset(ih);
56688 +
56689 +               memmove(zdata(target->node) + offset + data->length,
56690 +                       zdata(target->node) + offset,
56691 +                       nh40_get_free_space_start(nh) - offset);
56692 +               /* update headers of moved items */
56693 +               for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
56694 +                       ih = node40_ih_at(target->node, i);
56695 +                       ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
56696 +               }
56697 +
56698 +               /* @ih is set to item header of the last item; move item headers one slot down to free the new slot */
56699 +               memmove(ih - 1, ih,
56700 +                       sizeof(item_header40) * (nh40_get_num_items(nh) -
56701 +                                                target->item_pos));
56702 +       } else {
56703 +               /* new item is appended: it will start at the current free space start */
56704 +               offset = nh40_get_free_space_start(nh);
56705 +       }
56706 +
56707 +       /* make item header for the new item */
56708 +       ih = node40_ih_at_coord(target);
56709 +       memcpy(&ih->key, key, sizeof(reiser4_key));
56710 +       ih40_set_offset(ih, offset);
56711 +       save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
56712 +
56713 +       /* update node header: free space shrinks by body plus one item header, item count grows by one */
56714 +       nh40_set_free_space(nh,
56715 +                           nh40_get_free_space(nh) - data->length -
56716 +                           sizeof(item_header40));
56717 +       nh40_set_free_space_start(nh,
56718 +                                 nh40_get_free_space_start(nh) + data->length);
56719 +       node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
56720 +
56721 +       /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */
56722 +       target->unit_pos = 0;
56723 +       target->between = AT_UNIT;
56724 +       coord_clear_iplug(target);
56725 +
56726 +       /* initialize item */
56727 +       if (data->iplug->b.init != NULL) {
56728 +               data->iplug->b.init(target, NULL, data);
56729 +       }
56730 +       /* copy item body: the plugin's paste method takes precedence over a raw copy of data->data */
56731 +       if (data->iplug->b.paste != NULL) {
56732 +               data->iplug->b.paste(target, data, info);
56733 +       } else if (data->data != NULL) {
56734 +               if (data->user) {
56735 +                       /* AUDIT: Should we really not check that the pointer
56736 +                          from userspace is valid and the data bytes are
56737 +                          available? How can we return -EFAULT of some kind
56738 +                          without this check? */
56739 +                       assert("nikita-3038", schedulable());
56740 +                       /* copy data from user space. NOTE(review): the return value is discarded here */
56741 +                       __copy_from_user(zdata(target->node) + offset,
56742 +                                        (const char __user *)data->data,
56743 +                                        (unsigned)data->length);
56744 +               } else
56745 +                       /* copy from kernel space */
56746 +                       memcpy(zdata(target->node) + offset, data->data,
56747 +                              (unsigned)data->length);
56748 +       }
56749 +
56750 +       if (target->item_pos == 0) {
56751 +               /* new item became the first one: left delimiting key has to be updated */
56752 +               prepare_for_update(NULL, target->node, info);
56753 +       }
56754 +
56755 +       if (item_plugin_by_coord(target)->b.create_hook != NULL) {
56756 +               item_plugin_by_coord(target)->b.create_hook(target, data->arg);
56757 +       }
56758 +
56759 +       return 0;
56760 +}
56761 +
56762 +/* plugin->u.node.update_item_key: overwrite the key stored in the item header at @target with @key.
56763 +   look for description of this method in plugin/node/node.h */
56764 +void
56765 +update_item_key_node40(coord_t * target, const reiser4_key * key,
56766 +                      carry_plugin_info * info)
56767 +{
56768 +       item_header40 *ih;
56769 +
56770 +       ih = node40_ih_at_coord(target);
56771 +       memcpy(&ih->key, key, sizeof(reiser4_key));
56772 +
56773 +       if (target->item_pos == 0) {    /* key of the first item changed: same follow-up as in create_item_node40 */
56774 +               prepare_for_update(NULL, target->node, info);
56775 +       }
56776 +}
56777 +
56778 +/* these bits encode cut mode; they are OR-ed together into cut40_info.mode */
56779 +#define CMODE_TAIL 1
56780 +#define CMODE_WHOLE 2
56781 +#define CMODE_HEAD 4
56782 +
56783 +struct cut40_info {
56784 +       int mode;       /* combination of CMODE_* bits */
56785 +       pos_in_node_t tail_removed;     /* position of item which gets tail removed */
56786 +       pos_in_node_t first_removed;    /* position of the leftmost item among items removed completely */
56787 +       pos_in_node_t removed_count;    /* number of items removed completely */
56788 +       pos_in_node_t head_removed;     /* position of item which gets head removed */
56789 +
56790 +       pos_in_node_t freed_space_start;        /* node offset where the gap left by the removal begins */
56791 +       pos_in_node_t freed_space_end;  /* node offset where the gap ends; data from here on is moved by compact() */
56792 +       pos_in_node_t first_moved;      /* position of the first item whose header offset compact() updates */
56793 +       pos_in_node_t head_removed_location;    /* new offset compact() stores for the item at head_removed */
56794 +};
56795 +
56796 +static void init_cinfo(struct cut40_info *cinfo)        /* reset @cinfo: no mode bits, every position "not set" */
56797 +{
56798 +       cinfo->mode = 0;
56799 +       cinfo->tail_removed = MAX_POS_IN_NODE;  /* MAX_POS_IN_NODE serves as the "not set" marker for all fields below */
56800 +       cinfo->first_removed = MAX_POS_IN_NODE;
56801 +       cinfo->removed_count = MAX_POS_IN_NODE;
56802 +       cinfo->head_removed = MAX_POS_IN_NODE;
56803 +       cinfo->freed_space_start = MAX_POS_IN_NODE;
56804 +       cinfo->freed_space_end = MAX_POS_IN_NODE;
56805 +       cinfo->first_moved = MAX_POS_IN_NODE;
56806 +       cinfo->head_removed_location = MAX_POS_IN_NODE;
56807 +}
56808 +
56809 +/* complete cut_node40/kill_node40 by removing the gap [freed_space_start, freed_space_end) described by @cinfo */
56810 +static void compact(znode * node, struct cut40_info *cinfo)
56811 +{
56812 +       node40_header *nh;
56813 +       item_header40 *ih;
56814 +       pos_in_node_t freed;
56815 +       pos_in_node_t pos, nr_items;
56816 +
56817 +       assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56818 +                          cinfo->freed_space_end != MAX_POS_IN_NODE &&
56819 +                          cinfo->first_moved != MAX_POS_IN_NODE));
56820 +       assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56821 +
56822 +       nh = node40_node_header(node);
56823 +       nr_items = nh40_get_num_items(nh);
56824 +
56825 +       /* remove gap made up by removal: everything between gap end and free space start slides down to gap start */
56826 +       memmove(zdata(node) + cinfo->freed_space_start,
56827 +               zdata(node) + cinfo->freed_space_end,
56828 +               nh40_get_free_space_start(nh) - cinfo->freed_space_end);
56829 +
56830 +       /* update item headers of moved items - change their locations */
56831 +       pos = cinfo->first_moved;
56832 +       ih = node40_ih_at(node, pos);
56833 +       if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
56834 +               assert("vs-1580", pos == cinfo->head_removed);
56835 +               ih40_set_offset(ih, cinfo->head_removed_location);      /* item with head cut gets an explicit new offset */
56836 +               pos++;
56837 +               ih--;   /* header of the next item sits at the next lower address */
56838 +       }
56839 +
56840 +       freed = cinfo->freed_space_end - cinfo->freed_space_start;
56841 +       for (; pos < nr_items; pos++, ih--) {
56842 +               assert("vs-1581", ih == node40_ih_at(node, pos));
56843 +               ih40_set_offset(ih, ih40_get_offset(ih) - freed);
56844 +       }
56845 +
56846 +       /* free space start moved to left (decreased by the size of the gap) */
56847 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
56848 +
56849 +       if (cinfo->removed_count != MAX_POS_IN_NODE) {
56850 +               /* number of items changed. Remove item headers of those items */
56851 +               ih = node40_ih_at(node, nr_items - 1);
56852 +               memmove(ih + cinfo->removed_count, ih,
56853 +                       sizeof(item_header40) * (nr_items -
56854 +                                                cinfo->removed_count -
56855 +                                                cinfo->first_removed));
56856 +               freed += sizeof(item_header40) * cinfo->removed_count;  /* dropped headers become free space too */
56857 +               node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
56858 +       }
56859 +
56860 +       /* total amount of free space increased */
56861 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
56862 +}
56863 +
56864 +int shrink_item_node40(coord_t * coord, int delta)      /* release @delta bytes at the end of item at @coord; returns 0 */
56865 +{
56866 +       node40_header *nh;
56867 +       item_header40 *ih;
56868 +       pos_in_node_t pos;
56869 +       pos_in_node_t nr_items;
56870 +       char *end;
56871 +       znode *node;
56872 +       int off;
56873 +
56874 +       assert("nikita-3487", coord != NULL);
56875 +       assert("nikita-3488", delta >= 0);
56876 +
56877 +       node = coord->node;
56878 +       nh = node40_node_header(node);
56879 +       nr_items = nh40_get_num_items(nh);
56880 +
56881 +       ih = node40_ih_at_coord(coord);
56882 +       assert("nikita-3489", delta <= length_by_coord_node40(coord));
56883 +       off = ih40_get_offset(ih) + length_by_coord_node40(coord);      /* node offset just past the end of the item */
56884 +       end = zdata(node) + off;
56885 +
56886 +       /* remove gap made up by removal: bodies of the following items slide down by @delta */
56887 +       memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
56888 +
56889 +       /* update item headers of moved items - change their locations */
56890 +       pos = coord->item_pos + 1;
56891 +       ih = node40_ih_at(node, pos);
56892 +       for (; pos < nr_items; pos++, ih--) {
56893 +               assert("nikita-3490", ih == node40_ih_at(node, pos));
56894 +               ih40_set_offset(ih, ih40_get_offset(ih) - delta);
56895 +       }
56896 +
56897 +       /* free space start moved to left */
56898 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
56899 +       /* total amount of free space increased */
56900 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
56901 +       /*
56902 +        * This method does _not_ change the number of items. Hence, it cannot
56903 +        * make the node empty. Also it doesn't remove items at all, which means
56904 +        * that no keys have to be updated either.
56905 +        */
56906 +       return 0;
56907 +}
56908 +
56909 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
56910 +   of cut. First is when units are removed from the middle of an item. In this case this function returns 1. All the
56911 +   rest fits into second case: 0 or 1 items getting tail cut, 0 or more items removed completely and 0 or 1 items
56912 +   getting head cut. Function returns 0 in this case, with the result recorded in @cinfo */
56913 +static int
56914 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
56915 +{
56916 +       reiser4_key left_key, right_key;
56917 +       reiser4_key min_from_key, max_to_key;
56918 +       const reiser4_key *from_key, *to_key;
56919 +
56920 +       init_cinfo(cinfo);
56921 +
56922 +       /* calculate minimal key stored in first item of items to be cut (params->from) */
56923 +       item_key_by_coord(params->from, &min_from_key);
56924 +       /* and max key stored in last item of items to be cut (params->to) */
56925 +       max_item_key_by_coord(params->to, &max_to_key);
56926 +
56927 +       /* if cut key range is not defined in input parameters - define it using cut coord range */
56928 +       if (params->from_key == NULL) {
56929 +               assert("vs-1513", params->to_key == NULL);
56930 +               unit_key_by_coord(params->from, &left_key);
56931 +               from_key = &left_key;
56932 +               max_unit_key_by_coord(params->to, &right_key);
56933 +               to_key = &right_key;
56934 +       } else {
56935 +               from_key = params->from_key;
56936 +               to_key = params->to_key;
56937 +       }
56938 +
56939 +       if (params->from->item_pos == params->to->item_pos) {   /* the whole range lies within one item */
56940 +               if (keylt(&min_from_key, from_key)
56941 +                   && keylt(to_key, &max_to_key))
56942 +                       return 1;       /* something is left on both sides: cut from the middle of the item */
56943 +
56944 +               if (keygt(from_key, &min_from_key)) {
56945 +                       /* tail of item is to be cut */
56946 +                       cinfo->tail_removed = params->from->item_pos;
56947 +                       cinfo->mode |= CMODE_TAIL;
56948 +               } else if (keylt(to_key, &max_to_key)) {
56949 +                       /* head of item is to be cut */
56950 +                       cinfo->head_removed = params->from->item_pos;
56951 +                       cinfo->mode |= CMODE_HEAD;
56952 +               } else {
56953 +                       /* item is removed completely */
56954 +                       cinfo->first_removed = params->from->item_pos;
56955 +                       cinfo->removed_count = 1;
56956 +                       cinfo->mode |= CMODE_WHOLE;
56957 +               }
56958 +       } else {
56959 +               cinfo->first_removed = params->from->item_pos + 1;      /* start with the items strictly between from and to */
56960 +               cinfo->removed_count =
56961 +                   params->to->item_pos - params->from->item_pos - 1;
56962 +
56963 +               if (keygt(from_key, &min_from_key)) {
56964 +                       /* first item is not cut completely */
56965 +                       cinfo->tail_removed = params->from->item_pos;
56966 +                       cinfo->mode |= CMODE_TAIL;
56967 +               } else {        /* first item goes away entirely: extend the removed range to the left */
56968 +                       cinfo->first_removed--;
56969 +                       cinfo->removed_count++;
56970 +               }
56971 +               if (keylt(to_key, &max_to_key)) {
56972 +                       /* last item is not cut completely */
56973 +                       cinfo->head_removed = params->to->item_pos;
56974 +                       cinfo->mode |= CMODE_HEAD;
56975 +               } else {        /* last item goes away entirely: extend the removed range to the right */
56976 +                       cinfo->removed_count++;
56977 +               }
56978 +               if (cinfo->removed_count)
56979 +                       cinfo->mode |= CMODE_WHOLE;
56980 +       }
56981 +
56982 +       return 0;
56983 +}
56984 +
56985 +static void     /* run the item plugin's kill_hook, when it has one, for @count items of @node starting at position @from */
56986 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
56987 +               carry_kill_data * kdata)
56988 +{
56989 +       coord_t coord;
56990 +       item_plugin *iplug;
56991 +       pos_in_node_t pos;
56992 +
56993 +       coord.node = node;
56994 +       coord.unit_pos = 0;
56995 +       coord.between = AT_UNIT;
56996 +       for (pos = 0; pos < count; pos++) {
56997 +               coord_set_item_pos(&coord, from + pos);
56998 +               coord.unit_pos = 0;
56999 +               coord.between = AT_UNIT;
57000 +               iplug = item_plugin_by_coord(&coord);
57001 +               if (iplug->b.kill_hook) {
57002 +                       iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),  /* hook gets 0 and the item's unit count */
57003 +                                          kdata);
57004 +               }
57005 +       }
57006 +}
57007 +
57008 +/* this is used to kill item partially: forwards to the kill_units method of the item plugin found at @coord */
57009 +static pos_in_node_t
57010 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
57011 +          reiser4_key * smallest_removed, reiser4_key * new_first_key)
57012 +{
57013 +       struct carry_kill_data *kdata;
57014 +       item_plugin *iplug;
57015 +
57016 +       kdata = data;   /* @data is void * so that kill_* and cut_* helpers share one function pointer type */
57017 +       iplug = item_plugin_by_coord(coord);
57018 +
57019 +       assert("vs-1524", iplug->b.kill_units);
57020 +       return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
57021 +                                  new_first_key);
57022 +}
57023 +
57024 +/* call item plugin to kill tail of item: units from @coord->unit_pos up to the last unit position of @coord */
57025 +static pos_in_node_t
57026 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
57027 +{
57028 +       struct carry_kill_data *kdata;
57029 +       pos_in_node_t to;
57030 +
57031 +       kdata = data;
57032 +       to = coord_last_unit_pos(coord);
57033 +       return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
57034 +                         NULL);
57035 +}
57036 +
57037 +/* call item plugin to kill head of item: units from 0 up to @coord->unit_pos; @new_first_key is passed through */
57038 +static pos_in_node_t
57039 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
57040 +         reiser4_key * new_first_key)
57041 +{
57042 +       return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
57043 +                         new_first_key);
57044 +}
57045 +
57046 +/* this is used to cut item partially: forwards to the cut_units method of the item plugin found at @coord */
57047 +static pos_in_node_t
57048 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
57049 +         reiser4_key * smallest_removed, reiser4_key * new_first_key)
57050 +{
57051 +       carry_cut_data *cdata;
57052 +       item_plugin *iplug;
57053 +
57054 +       cdata = data;   /* @data is void * so that kill_* and cut_* helpers share one function pointer type */
57055 +       iplug = item_plugin_by_coord(coord);
57056 +       assert("vs-302", iplug->b.cut_units);
57057 +       return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
57058 +                                 new_first_key);
57059 +}
57060 +
57061 +/* call item plugin to cut tail of item: units from @coord->unit_pos up to the last unit position */
57062 +static pos_in_node_t
57063 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
57064 +{
57065 +       carry_cut_data *cdata;
57066 +       pos_in_node_t to;
57067 +
57068 +       cdata = data;
57069 +       to = coord_last_unit_pos(cdata->params.from);   /* NOTE(review): kill_tail uses @coord here; presumably the same coord - confirm */
57070 +       return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
57071 +}
57072 +
57073 +/* call item plugin to cut head of item: units from 0 up to @coord->unit_pos; @new_first_key is passed through */
57074 +static pos_in_node_t
57075 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
57076 +        reiser4_key * new_first_key)
57077 +{
57078 +       return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
57079 +                        new_first_key);
57080 +}
57081 +
57082 +/* this returns 1 of key of first item changed, 0 - if it did not */
57083 +static int
57084 +prepare_for_compact(struct cut40_info *cinfo,
57085 +                   const struct cut_kill_params *params, int is_cut,
57086 +                   void *data, carry_plugin_info * info)
57087 +{
57088 +       znode *node;
57089 +       item_header40 *ih;
57090 +       pos_in_node_t freed;
57091 +       pos_in_node_t item_pos;
57092 +       coord_t coord;
57093 +       reiser4_key new_first_key;
57094 +       pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
57095 +                                     void *, reiser4_key *, reiser4_key *);
57096 +       pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
57097 +       pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
57098 +                                    reiser4_key *);
57099 +       int retval;
57100 +
57101 +       retval = 0;
57102 +
57103 +       node = params->from->node;
57104 +
57105 +       assert("vs-184", node == params->to->node);
57106 +       assert("vs-312", !node_is_empty(node));
57107 +       assert("vs-297",
57108 +              coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
57109 +
57110 +       if (is_cut) {
57111 +               kill_units_f = cut_units;
57112 +               kill_tail_f = cut_tail;
57113 +               kill_head_f = cut_head;
57114 +       } else {
57115 +               kill_units_f = kill_units;
57116 +               kill_tail_f = kill_tail;
57117 +               kill_head_f = kill_head;
57118 +       }
57119 +
57120 +       if (parse_cut(cinfo, params) == 1) {
57121 +               /* cut from the middle of item */
57122 +               freed =
57123 +                   kill_units_f(params->from, params->from->unit_pos,
57124 +                                params->to->unit_pos, data,
57125 +                                params->smallest_removed, NULL);
57126 +
57127 +               item_pos = params->from->item_pos;
57128 +               ih = node40_ih_at(node, item_pos);
57129 +               cinfo->freed_space_start =
57130 +                   ih40_get_offset(ih) + node40_item_length(node,
57131 +                                                            item_pos) - freed;
57132 +               cinfo->freed_space_end = cinfo->freed_space_start + freed;
57133 +               cinfo->first_moved = item_pos + 1;
57134 +       } else {
57135 +               assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
57136 +                                  cinfo->first_removed != MAX_POS_IN_NODE ||
57137 +                                  cinfo->head_removed != MAX_POS_IN_NODE));
57138 +
57139 +               switch (cinfo->mode) {
57140 +               case CMODE_TAIL:
57141 +                       /* one item gets cut partially from its end */
57142 +                       assert("vs-1562",
57143 +                              cinfo->tail_removed == params->from->item_pos);
57144 +
57145 +                       freed =
57146 +                           kill_tail_f(params->from, data,
57147 +                                       params->smallest_removed);
57148 +
57149 +                       item_pos = cinfo->tail_removed;
57150 +                       ih = node40_ih_at(node, item_pos);
57151 +                       cinfo->freed_space_start =
57152 +                           ih40_get_offset(ih) + node40_item_length(node,
57153 +                                                                    item_pos) -
57154 +                           freed;
57155 +                       cinfo->freed_space_end =
57156 +                           cinfo->freed_space_start + freed;
57157 +                       cinfo->first_moved = cinfo->tail_removed + 1;
57158 +                       break;
57159 +
57160 +               case CMODE_WHOLE:
57161 +                       /* one or more items get removed completely */
57162 +                       assert("vs-1563",
57163 +                              cinfo->first_removed == params->from->item_pos);
57164 +                       assert("vs-1564", cinfo->removed_count > 0
57165 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
57166 +
57167 +                       /* call kill hook for all items removed completely */
57168 +                       if (is_cut == 0)
57169 +                               call_kill_hooks(node, cinfo->first_removed,
57170 +                                               cinfo->removed_count, data);
57171 +
57172 +                       item_pos = cinfo->first_removed;
57173 +                       ih = node40_ih_at(node, item_pos);
57174 +
57175 +                       if (params->smallest_removed)
57176 +                               memcpy(params->smallest_removed, &ih->key,
57177 +                                      sizeof(reiser4_key));
57178 +
57179 +                       cinfo->freed_space_start = ih40_get_offset(ih);
57180 +
57181 +                       item_pos += (cinfo->removed_count - 1);
57182 +                       ih -= (cinfo->removed_count - 1);
57183 +                       cinfo->freed_space_end =
57184 +                           ih40_get_offset(ih) + node40_item_length(node,
57185 +                                                                    item_pos);
57186 +                       cinfo->first_moved = item_pos + 1;
57187 +                       if (cinfo->first_removed == 0)
57188 +                               /* key of first item of the node changes */
57189 +                               retval = 1;
57190 +                       break;
57191 +
57192 +               case CMODE_HEAD:
57193 +                       /* one item gets cut partially from its head */
57194 +                       assert("vs-1565",
57195 +                              cinfo->head_removed == params->from->item_pos);
57196 +
57197 +                       freed =
57198 +                           kill_head_f(params->to, data,
57199 +                                       params->smallest_removed,
57200 +                                       &new_first_key);
57201 +
57202 +                       item_pos = cinfo->head_removed;
57203 +                       ih = node40_ih_at(node, item_pos);
57204 +                       cinfo->freed_space_start = ih40_get_offset(ih);
57205 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
57206 +                       cinfo->first_moved = cinfo->head_removed + 1;
57207 +
57208 +                       /* item head is removed, therefore, item key changed */
57209 +                       coord.node = node;
57210 +                       coord_set_item_pos(&coord, item_pos);
57211 +                       coord.unit_pos = 0;
57212 +                       coord.between = AT_UNIT;
57213 +                       update_item_key_node40(&coord, &new_first_key, NULL);
57214 +                       if (item_pos == 0)
57215 +                               /* key of first item of the node changes */
57216 +                               retval = 1;
57217 +                       break;
57218 +
57219 +               case CMODE_TAIL | CMODE_WHOLE:
57220 +                       /* one item gets cut from its end and one or more items get removed completely */
57221 +                       assert("vs-1566",
57222 +                              cinfo->tail_removed == params->from->item_pos);
57223 +                       assert("vs-1567",
57224 +                              cinfo->first_removed == cinfo->tail_removed + 1);
57225 +                       assert("vs-1564", cinfo->removed_count > 0
57226 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
57227 +
57228 +                       freed =
57229 +                           kill_tail_f(params->from, data,
57230 +                                       params->smallest_removed);
57231 +
57232 +                       item_pos = cinfo->tail_removed;
57233 +                       ih = node40_ih_at(node, item_pos);
57234 +                       cinfo->freed_space_start =
57235 +                           ih40_get_offset(ih) + node40_item_length(node,
57236 +                                                                    item_pos) -
57237 +                           freed;
57238 +
57239 +                       /* call kill hook for all items removed completely */
57240 +                       if (is_cut == 0)
57241 +                               call_kill_hooks(node, cinfo->first_removed,
57242 +                                               cinfo->removed_count, data);
57243 +
57244 +                       item_pos += cinfo->removed_count;
57245 +                       ih -= cinfo->removed_count;
57246 +                       cinfo->freed_space_end =
57247 +                           ih40_get_offset(ih) + node40_item_length(node,
57248 +                                                                    item_pos);
57249 +                       cinfo->first_moved = item_pos + 1;
57250 +                       break;
57251 +
57252 +               case CMODE_WHOLE | CMODE_HEAD:
57253 +                       /* one or more items get removed completely and one item gets cut partially from its head */
57254 +                       assert("vs-1568",
57255 +                              cinfo->first_removed == params->from->item_pos);
57256 +                       assert("vs-1564", cinfo->removed_count > 0
57257 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
57258 +                       assert("vs-1569",
57259 +                              cinfo->head_removed ==
57260 +                              cinfo->first_removed + cinfo->removed_count);
57261 +
57262 +                       /* call kill hook for all items removed completely */
57263 +                       if (is_cut == 0)
57264 +                               call_kill_hooks(node, cinfo->first_removed,
57265 +                                               cinfo->removed_count, data);
57266 +
57267 +                       item_pos = cinfo->first_removed;
57268 +                       ih = node40_ih_at(node, item_pos);
57269 +
57270 +                       if (params->smallest_removed)
57271 +                               memcpy(params->smallest_removed, &ih->key,
57272 +                                      sizeof(reiser4_key));
57273 +
57274 +                       freed =
57275 +                           kill_head_f(params->to, data, NULL, &new_first_key);
57276 +
57277 +                       cinfo->freed_space_start = ih40_get_offset(ih);
57278 +
57279 +                       ih = node40_ih_at(node, cinfo->head_removed);
57280 +                       /* this is the most complex case. Item which got head removed and items which are to be moved
57281 +                          intact change their location differently. */
57282 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
57283 +                       cinfo->first_moved = cinfo->head_removed;
57284 +                       cinfo->head_removed_location = cinfo->freed_space_start;
57285 +
57286 +                       /* item head is removed, therefore, item key changed */
57287 +                       coord.node = node;
57288 +                       coord_set_item_pos(&coord, cinfo->head_removed);
57289 +                       coord.unit_pos = 0;
57290 +                       coord.between = AT_UNIT;
57291 +                       update_item_key_node40(&coord, &new_first_key, NULL);
57292 +
57293 +                       assert("vs-1579", cinfo->first_removed == 0);
57294 +                       /* key of first item of the node changes */
57295 +                       retval = 1;
57296 +                       break;
57297 +
57298 +               case CMODE_TAIL | CMODE_HEAD:
57299 +                       /* one item gets cut from its end and its neighbor gets cut from its head */
57300 +                       impossible("vs-1576", "this can not happen currently");
57301 +                       break;
57302 +
57303 +               case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
57304 +                       impossible("vs-1577", "this can not happen currently");
57305 +                       break;
57306 +               default:
57307 +                       impossible("vs-1578", "unexpected cut mode");
57308 +                       break;
57309 +               }
57310 +       }
57311 +       return retval;
57312 +}
57313 +
57314 +/* plugin->u.node.kill: prepare the removal described by @kdata->params in kill mode (is_cut == 0) and compact the node.
57315 +   return value is number of items removed completely (0 if cinfo.removed_count is still MAX_POS_IN_NODE) */
57316 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
57317 +{
57318 +       znode *node;
57319 +       struct cut40_info cinfo;
57320 +       int first_key_changed;
57321 +
57322 +       node = kdata->params.from->node;
57323 +
57324 +       first_key_changed =
57325 +           prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
57326 +                               info);
57327 +       compact(node, &cinfo);
57328 +
57329 +       if (info) {
57330 +               /* @info != NULL: it is not called by node40_shift, so we have
57331 +                  to take care of changes on upper levels */
57332 +               if (node_is_empty(node)
57333 +                   && !(kdata->flags & DELETE_RETAIN_EMPTY))
57334 +                       /* all contents of node are deleted and DELETE_RETAIN_EMPTY is not set */
57335 +                       prepare_removal_node40(node, info);
57336 +               else if (first_key_changed) {
57337 +                       prepare_for_update(NULL, node, info);   /* prepare_for_compact reported that the key of the node's first item changed */
57338 +               }
57339 +       }
57340 +
57341 +       coord_clear_iplug(kdata->params.from);
57342 +       coord_clear_iplug(kdata->params.to);
57343 +
57344 +       znode_make_dirty(node);
57345 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57346 +}
57347 +
57348 +/* plugin->u.node.cut: prepare the removal described by @cdata->params in cut mode (is_cut == 1, kill hooks are not called) and compact the node.
57349 +   return value is number of items removed completely (0 if cinfo.removed_count is still MAX_POS_IN_NODE) */
57350 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
57351 +{
57352 +       znode *node;
57353 +       struct cut40_info cinfo;
57354 +       int first_key_changed;
57355 +
57356 +       node = cdata->params.from->node;
57357 +
57358 +       first_key_changed =
57359 +           prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
57360 +                               info);
57361 +       compact(node, &cinfo);
57362 +
57363 +       if (info) {
57364 +               /* @info != NULL: it is not called by node40_shift, so we have
57365 +                  to take care of changes on upper levels */
57366 +               if (node_is_empty(node))
57367 +                       /* all contents of node are deleted (unlike kill_node40, no DELETE_RETAIN_EMPTY check is made here) */
57368 +                       prepare_removal_node40(node, info);
57369 +               else if (first_key_changed) {
57370 +                       prepare_for_update(NULL, node, info);   /* prepare_for_compact reported that the key of the node's first item changed */
57371 +               }
57372 +       }
57373 +
57374 +       coord_clear_iplug(cdata->params.from);
57375 +       coord_clear_iplug(cdata->params.to);
57376 +
57377 +       znode_make_dirty(node);
57378 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57379 +}
57380 +
57381 +/* this structure is used by shift method of node40 plugin */
57382 +struct shift_params {
57383 +       shift_direction pend;   /* SHIFT_LEFT ("append") - we are shifting to
57384 +                                  left, SHIFT_RIGHT ("prepend") - to right */
57385 +       coord_t wish_stop;      /* when shifting to left this is last unit we
57386 +                                  want shifted, when shifting to right - this
57387 +                                  is set to unit we want to start shifting
57388 +                                  from */
57389 +       znode *target;
57390 +       int everything;         /* estimate_shift sets it to 1 if everything we
57391 +                                  have to shift is shifted, 0 - otherwise */
57392 +
57393 +       /* FIXME-VS: get rid of read_stop (presumably real_stop below - confirm) */
57394 +
57395 +       /* these are set by estimate_shift */
57396 +       coord_t real_stop;      /* this will be set to last unit which will be
57397 +                                  really shifted */
57398 +
57399 +       /* coordinate in source node before operation of unit which becomes
57400 +          first after shift to left or last after shift to right */
57401 +       union {
57402 +               coord_t future_first;
57403 +               coord_t future_last;
57404 +       } u;
57405 +
57406 +       unsigned merging_units; /* number of units of first item which have to
57407 +                                  be merged with last item of target node */
57408 +       unsigned merging_bytes; /* number of bytes in those units */
57409 +
57410 +       unsigned entire;        /* items shifted in their entirety */
57411 +       unsigned entire_bytes;  /* number of bytes in those items */
57412 +
57413 +       unsigned part_units;    /* number of units of partially copied item */
57414 +       unsigned part_bytes;    /* number of bytes in those units */
57415 +
57416 +       unsigned shift_bytes;   /* total number of bytes in items shifted (item
57417 +                                  headers not included) */
57418 +
57419 +};
57420 +
57421 +static int item_creation_overhead(coord_t *item)
57422 +{      /* returns what the ->item_overhead method of @item's node plugin reports for item->node */
57423 +       return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
57424 +}
57425 +
57426 +/* how many units are there in @source starting from source->unit_pos (first unit for SHIFT_LEFT, last unit otherwise)
57427 +   but not further than @stop_coord; if @stop_coord is in another item, all units of @source are wanted */
57428 +static int
57429 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
57430 +{
57431 +       if (pend == SHIFT_LEFT) {
57432 +               assert("vs-181", source->unit_pos == 0);
57433 +       } else {
57434 +               assert("vs-182",
57435 +                      source->unit_pos == coord_last_unit_pos(source));
57436 +       }
57437 +
57438 +       if (source->item_pos != stop_coord->item_pos) {
57439 +               /* @source and @stop_coord are different items: whole item is wanted */
57440 +               return coord_last_unit_pos(source) + 1;
57441 +       }
57442 +
57443 +       if (pend == SHIFT_LEFT) {
57444 +               return stop_coord->unit_pos + 1;        /* units 0 .. stop_coord->unit_pos */
57445 +       } else {
57446 +               return source->unit_pos - stop_coord->unit_pos + 1;     /* units stop_coord->unit_pos .. last */
57447 +       }
57448 +}
57449 +
57450 +/* this calculates what can be copied from @shift->wish_stop.node to @shift->target: sets real_stop, everything, merging_*, part_* and
57451 +   adds to entire, entire_bytes, shift_bytes (they are not zeroed here - presumably the caller does that; confirm). @ctx is not used here */
57452 +static void
57453 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
57454 +{
57455 +       unsigned target_free_space, size;
57456 +       pos_in_node_t stop_item;        /* item which estimating should not consider */
57457 +       unsigned want;          /* number of units of item we want shifted */
57458 +       coord_t source;         /* item being estimated */
57459 +       item_plugin *iplug;
57460 +
57461 +       /* shifting to left/right starts from first/last units of
57462 +          @shift->wish_stop.node */
57463 +       if (shift->pend == SHIFT_LEFT) {
57464 +               coord_init_first_unit(&source, shift->wish_stop.node);
57465 +       } else {
57466 +               coord_init_last_unit(&source, shift->wish_stop.node);
57467 +       }
57468 +       shift->real_stop = source;
57469 +
57470 +       /* free space in target node */
57471 +       target_free_space = znode_free_space(shift->target);
57472 +
57473 +       shift->everything = 0;
57474 +       if (!node_is_empty(shift->target)) {
57475 +               /* target node is not empty, check for boundary items
57476 +                  mergeability */
57477 +               coord_t to;
57478 +
57479 +               /* item we try to merge @source with */
57480 +               if (shift->pend == SHIFT_LEFT) {
57481 +                       coord_init_last_unit(&to, shift->target);
57482 +               } else {
57483 +                       coord_init_first_unit(&to, shift->target);
57484 +               }
57485 +
57486 +               if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
57487 +                                                                     &source) :
57488 +                   are_items_mergeable(&source, &to)) {
57489 +                       /* how many units of @source do we want to merge to
57490 +                          item @to */
57491 +                       want =
57492 +                           wanted_units(&source, &shift->wish_stop,
57493 +                                        shift->pend);
57494 +
57495 +                       /* how many units of @source we can merge to item
57496 +                          @to */
57497 +                       iplug = item_plugin_by_coord(&source);
57498 +                       if (iplug->b.can_shift != NULL)
57499 +                               shift->merging_units =
57500 +                                   iplug->b.can_shift(target_free_space,
57501 +                                                      &source, shift->target,
57502 +                                                      shift->pend, &size,
57503 +                                                      want);
57504 +                       else {
57505 +                               shift->merging_units = 0;
57506 +                               size = 0;
57507 +                       }
57508 +                       shift->merging_bytes = size;
57509 +                       shift->shift_bytes += size;
57510 +                       /* update stop coord to be set to last unit of @source
57511 +                          we can merge to @target */
57512 +                       if (shift->merging_units)
57513 +                               /* at least one unit can be shifted */
57514 +                               shift->real_stop.unit_pos =
57515 +                                   (shift->merging_units - source.unit_pos -
57516 +                                    1) * shift->pend;
57517 +                       else {
57518 +                               /* nothing can be shifted */
57519 +                               if (shift->pend == SHIFT_LEFT)
57520 +                                       coord_init_before_first_item(&shift->
57521 +                                                                    real_stop,
57522 +                                                                    source.
57523 +                                                                    node);
57524 +                               else
57525 +                                       coord_init_after_last_item(&shift->
57526 +                                                                  real_stop,
57527 +                                                                  source.node);
57528 +                       }
57529 +                       assert("nikita-2081", shift->real_stop.unit_pos + 1);
57530 +
57531 +                       if (shift->merging_units != want) {
57532 +                               /* we could not copy as many as we want, so,
57533 +                                  there is no reason for estimating any
57534 +                                  longer */
57535 +                               return;
57536 +                       }
57537 +
57538 +                       target_free_space -= size;
57539 +                       coord_add_item_pos(&source, shift->pend);
57540 +               }
57541 +       }
57542 +
57543 +       /* position of the first item nothing of which we want to shift */
57544 +       stop_item = shift->wish_stop.item_pos + shift->pend;
57545 +
57546 +       /* calculate how many items can be copied into given free
57547 +          space as whole */
57548 +       for (; source.item_pos != stop_item;
57549 +            coord_add_item_pos(&source, shift->pend)) {
57550 +               if (shift->pend == SHIFT_RIGHT)
57551 +                       source.unit_pos = coord_last_unit_pos(&source);
57552 +
57553 +               /* how many units of @source do we want to copy */
57554 +               want = wanted_units(&source, &shift->wish_stop, shift->pend);
57555 +
57556 +               if (want == coord_last_unit_pos(&source) + 1) {
57557 +                       /* we want this item to be copied entirely */
57558 +                       size =
57559 +                           item_length_by_coord(&source) +
57560 +                           item_creation_overhead(&source);
57561 +                       if (size <= target_free_space) {
57562 +                               /* item fits into target node as whole */
57563 +                               target_free_space -= size;
57564 +                               shift->shift_bytes +=
57565 +                                   size - item_creation_overhead(&source);
57566 +                               shift->entire_bytes +=
57567 +                                   size - item_creation_overhead(&source);
57568 +                               shift->entire++;
57569 +
57570 +                               /* update shift->real_stop coord to be set to
57571 +                                  last unit of @source we can copy to
57572 +                                  @target */
57573 +                               shift->real_stop = source;
57574 +                               if (shift->pend == SHIFT_LEFT)
57575 +                                       shift->real_stop.unit_pos =
57576 +                                           coord_last_unit_pos(&shift->
57577 +                                                               real_stop);
57578 +                               else
57579 +                                       shift->real_stop.unit_pos = 0;
57580 +                               continue;
57581 +                       }
57582 +               }
57583 +
57584 +               /* we reach here only for an item which does not fit into
57585 +                  target node in its entirety. This item may be either
57586 +                  partially shifted, or not shifted at all. We will have to
57587 +                  create new item in target node, so decrease amount of free
57588 +                  space by an item creation overhead. We can reach here also
57589 +                  if stop coord is in this item */
57590 +               if (target_free_space >=
57591 +                   (unsigned)item_creation_overhead(&source)) {
57592 +                       target_free_space -= item_creation_overhead(&source);
57593 +                       iplug = item_plugin_by_coord(&source);
57594 +                       if (iplug->b.can_shift) {
57595 +                               shift->part_units = iplug->b.can_shift(target_free_space,
57596 +                                                                      &source,
57597 +                                                                      NULL, /* target */
57598 +                                                                      shift->pend,
57599 +                                                                      &size,
57600 +                                                                      want);
57601 +                       } else {
57602 +                               target_free_space = 0;
57603 +                               shift->part_units = 0;
57604 +                               size = 0;
57605 +                       }
57606 +               } else {
57607 +                       target_free_space = 0;
57608 +                       shift->part_units = 0;
57609 +                       size = 0;
57610 +               }
57611 +               shift->part_bytes = size;
57612 +               shift->shift_bytes += size;
57613 +
57614 +               /* set @shift->real_stop to last unit of @source we can copy
57615 +                  to @shift->target */
57616 +               if (shift->part_units) {
57617 +                       shift->real_stop = source;
57618 +                       shift->real_stop.unit_pos =
57619 +                           (shift->part_units - source.unit_pos -
57620 +                            1) * shift->pend;
57621 +                       assert("nikita-2082", shift->real_stop.unit_pos + 1);
57622 +               }
57623 +
57624 +               if (want != shift->part_units)
57625 +                       /* not everything wanted were shifted */
57626 +                       return;
57627 +               break;
57628 +       }
57629 +
57630 +       shift->everything = 1;
57631 +}
57632 +
57633 +static void
57634 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
57635 +          shift_direction dir, unsigned free_space)
57636 +{      /* hand the copy over to the ->b.copy_units method of @source's item plugin; @target must have the same item plugin */
57637 +       item_plugin *iplug;
57638 +
57639 +       assert("nikita-1463", target != NULL);
57640 +       assert("nikita-1464", source != NULL);
57641 +       assert("nikita-1465", from + count <= coord_num_units(source));
57642 +
57643 +       iplug = item_plugin_by_coord(source);
57644 +       assert("nikita-1468", iplug == item_plugin_by_coord(target));
57645 +       iplug->b.copy_units(target, source, from, count, dir, free_space);
57646 +
57647 +       if (dir == SHIFT_RIGHT) {
57648 +               /* FIXME-VS: this looks not necessary. update_item_key was
57649 +                  called already by copy_units method */
57650 +               reiser4_key split_key;
57651 +
57652 +               assert("nikita-1469", target->unit_pos == 0);
57653 +
57654 +               unit_key_by_coord(target, &split_key);  /* key of unit 0 of @target, re-set below as the item key */
57655 +               node_plugin_by_coord(target)->update_item_key(target,
57656 +                                                             &split_key, NULL);
57657 +       }
57658 +}
57659 +
57660 +/* copy part of @shift->real_stop.node starting either from its beginning or
57661 +   from its end and ending at @shift->real_stop to either the end or the
57662 +   beginning of @shift->target */
57663 +static void copy(struct shift_params *shift)
57664 +{
57665 +       node40_header *nh;
57666 +       coord_t from;
57667 +       coord_t to;
57668 +       item_header40 *from_ih, *to_ih;
57669 +       int free_space_start;
57670 +       int new_items;
57671 +       unsigned old_items;
57672 +       int old_offset;
57673 +       unsigned i;
57674 +
57675 +       nh = node40_node_header(shift->target);
57676 +       free_space_start = nh40_get_free_space_start(nh);
57677 +       old_items = nh40_get_num_items(nh);
57678 +       new_items = shift->entire + (shift->part_units ? 1 : 0);
57679 +       assert("vs-185",
57680 +              shift->shift_bytes ==
57681 +              shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
57682 +
57683 +       from = shift->wish_stop;
57684 +
57685 +       coord_init_first_unit(&to, shift->target);
57686 +
57687 +       /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
57688 +          hence to.between is set to EMPTY_NODE above. Looks like we want it
57689 +          to be AT_UNIT.
57690 +
57691 +          Oh, wonders of ->betweeness...
57692 +
57693 +        */
57694 +       to.between = AT_UNIT;
57695 +
57696 +       if (shift->pend == SHIFT_LEFT) {
57697 +               /* copying to left */
57698 +
57699 +               coord_set_item_pos(&from, 0);
57700 +               from_ih = node40_ih_at(from.node, 0);
57701 +
57702 +               coord_set_item_pos(&to,
57703 +                                  node40_num_of_items_internal(to.node) - 1);
57704 +               if (shift->merging_units) {
57705 +                       /* expand last item, so that plugin methods will see
57706 +                          correct data */
57707 +                       free_space_start += shift->merging_bytes;
57708 +                       nh40_set_free_space_start(nh,
57709 +                                                 (unsigned)free_space_start);
57710 +                       nh40_set_free_space(nh,
57711 +                                           nh40_get_free_space(nh) -
57712 +                                           shift->merging_bytes);
57713 +
57714 +                       /* appending last item of @target */
57715 +                       copy_units(&to, &from, 0,       /* starting from 0-th unit */
57716 +                                  shift->merging_units, SHIFT_LEFT,
57717 +                                  shift->merging_bytes);
57718 +                       coord_inc_item_pos(&from);
57719 +                       from_ih--;
57720 +                       coord_inc_item_pos(&to);
57721 +               }
57722 +
57723 +               to_ih = node40_ih_at(shift->target, old_items);
57724 +               if (shift->entire) {
57725 +                       /* copy @entire items entirely */
57726 +
57727 +                       /* copy item headers */
57728 +                       memcpy(to_ih - shift->entire + 1,
57729 +                              from_ih - shift->entire + 1,
57730 +                              shift->entire * sizeof(item_header40));
57731 +                       /* update item header offset */
57732 +                       old_offset = ih40_get_offset(from_ih);
57733 +                       /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
57734 +                       for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
57735 +                               ih40_set_offset(to_ih,
57736 +                                               ih40_get_offset(from_ih) -
57737 +                                               old_offset + free_space_start);
57738 +
57739 +                       /* copy item bodies */
57740 +                       memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset,  /*ih40_get_offset (from_ih), */
57741 +                              shift->entire_bytes);
57742 +
57743 +                       coord_add_item_pos(&from, (int)shift->entire);
57744 +                       coord_add_item_pos(&to, (int)shift->entire);
57745 +               }
57746 +
57747 +               nh40_set_free_space_start(nh,
57748 +                                         free_space_start +
57749 +                                         shift->shift_bytes -
57750 +                                         shift->merging_bytes);
57751 +               nh40_set_free_space(nh,
57752 +                                   nh40_get_free_space(nh) -
57753 +                                   (shift->shift_bytes - shift->merging_bytes +
57754 +                                    sizeof(item_header40) * new_items));
57755 +
57756 +               /* update node header */
57757 +               node40_set_num_items(shift->target, nh, old_items + new_items);
57758 +               assert("vs-170",
57759 +                      nh40_get_free_space(nh) < znode_size(shift->target));
57760 +
57761 +               if (shift->part_units) {
57762 +                       /* copy heading part (@part units) of @source item as
57763 +                          a new item into @target->node */
57764 +
57765 +                       /* copy item header of partially copied item */
57766 +                       coord_set_item_pos(&to,
57767 +                                          node40_num_of_items_internal(to.node)
57768 +                                          - 1);
57769 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
57770 +                       ih40_set_offset(to_ih,
57771 +                                       nh40_get_free_space_start(nh) -
57772 +                                       shift->part_bytes);
57773 +                       if (item_plugin_by_coord(&to)->b.init)
57774 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
57775 +                                                                 NULL);
57776 +                       copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
57777 +                                  shift->part_bytes);
57778 +               }
57779 +
57780 +       } else {
57781 +               /* copying to right */
57782 +
57783 +               coord_set_item_pos(&from,
57784 +                                  node40_num_of_items_internal(from.node) - 1);
57785 +               from_ih = node40_ih_at_coord(&from);
57786 +
57787 +               coord_set_item_pos(&to, 0);
57788 +
57789 +               /* prepare space for new items */
57790 +               memmove(zdata(to.node) + sizeof(node40_header) +
57791 +                       shift->shift_bytes,
57792 +                       zdata(to.node) + sizeof(node40_header),
57793 +                       free_space_start - sizeof(node40_header));
57794 +               /* update item headers of moved items */
57795 +               to_ih = node40_ih_at(to.node, 0);
57796 +               /* first item gets @merging_bytes longer. free space appears
57797 +                  at its beginning */
57798 +               if (!node_is_empty(to.node))
57799 +                       ih40_set_offset(to_ih,
57800 +                                       ih40_get_offset(to_ih) +
57801 +                                       shift->shift_bytes -
57802 +                                       shift->merging_bytes);
57803 +
57804 +               for (i = 1; i < old_items; i++)
57805 +                       ih40_set_offset(to_ih - i,
57806 +                                       ih40_get_offset(to_ih - i) +
57807 +                                       shift->shift_bytes);
57808 +
57809 +               /* move item headers to make space for new items */
57810 +               memmove(to_ih - old_items + 1 - new_items,
57811 +                       to_ih - old_items + 1,
57812 +                       sizeof(item_header40) * old_items);
57813 +               to_ih -= (new_items - 1);
57814 +
57815 +               nh40_set_free_space_start(nh,
57816 +                                         free_space_start +
57817 +                                         shift->shift_bytes);
57818 +               nh40_set_free_space(nh,
57819 +                                   nh40_get_free_space(nh) -
57820 +                                   (shift->shift_bytes +
57821 +                                    sizeof(item_header40) * new_items));
57822 +
57823 +               /* update node header */
57824 +               node40_set_num_items(shift->target, nh, old_items + new_items);
57825 +               assert("vs-170",
57826 +                      nh40_get_free_space(nh) < znode_size(shift->target));
57827 +
57828 +               if (shift->merging_units) {
57829 +                       coord_add_item_pos(&to, new_items);
57830 +                       to.unit_pos = 0;
57831 +                       to.between = AT_UNIT;
57832 +                       /* prepend first item of @to */
57833 +                       copy_units(&to, &from,
57834 +                                  coord_last_unit_pos(&from) -
57835 +                                  shift->merging_units + 1,
57836 +                                  shift->merging_units, SHIFT_RIGHT,
57837 +                                  shift->merging_bytes);
57838 +                       coord_dec_item_pos(&from);
57839 +                       from_ih++;
57840 +               }
57841 +
57842 +               if (shift->entire) {
57843 +                       /* copy @entire items entirely */
57844 +
57845 +                       /* copy item headers */
57846 +                       memcpy(to_ih, from_ih,
57847 +                              shift->entire * sizeof(item_header40));
57848 +
57849 +                       /* update item header offset */
57850 +                       old_offset =
57851 +                           ih40_get_offset(from_ih + shift->entire - 1);
57852 +                       /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
57853 +                       for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
57854 +                               ih40_set_offset(to_ih,
57855 +                                               ih40_get_offset(from_ih) -
57856 +                                               old_offset +
57857 +                                               sizeof(node40_header) +
57858 +                                               shift->part_bytes);
57859 +                       /* copy item bodies */
57860 +                       coord_add_item_pos(&from, -(int)(shift->entire - 1));
57861 +                       memcpy(zdata(to.node) + sizeof(node40_header) +
57862 +                              shift->part_bytes, item_by_coord_node40(&from),
57863 +                              shift->entire_bytes);
57864 +                       coord_dec_item_pos(&from);
57865 +               }
57866 +
57867 +               if (shift->part_units) {
57868 +                       coord_set_item_pos(&to, 0);
57869 +                       to.unit_pos = 0;
57870 +                       to.between = AT_UNIT;
57871 +                       /* copy heading part (@part units) of @source item as
57872 +                          a new item into @target->node */
57873 +
57874 +                       /* copy item header of partially copied item */
57875 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
57876 +                       ih40_set_offset(to_ih, sizeof(node40_header));
57877 +                       if (item_plugin_by_coord(&to)->b.init)
57878 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
57879 +                                                                 NULL);
57880 +                       copy_units(&to, &from,
57881 +                                  coord_last_unit_pos(&from) -
57882 +                                  shift->part_units + 1, shift->part_units,
57883 +                                  SHIFT_RIGHT, shift->part_bytes);
57884 +               }
57885 +       }
57886 +}
57887 +
57888 +/* Cut out of the source node everything that was copied to the target, i.e.
57889 +   all units on one side of @shift->real_stop. Returns result of cut_node40() */
57890 +static int delete_copied(struct shift_params *shift)
57891 +{
57892 +       struct carry_cut_data cut;
57893 +       coord_t first;
57894 +       coord_t last;
57895 +
57896 +       if (shift->pend != SHIFT_LEFT) {
57897 +               /* shift was to the right: the range to cut starts at
57898 +                  @shift->real_stop and runs up to the last unit of
57899 +                  the node @shift->real_stop is set in */
57900 +               first = shift->real_stop;
57901 +               coord_init_last_unit(&last, first.node);
57902 +
57903 +               /* remember old coordinate of the unit which is going to
57904 +                  be the last one once the range is cut */
57905 +               shift->u.future_last = first;
57906 +               coord_prev_unit(&shift->u.future_last);
57907 +       } else {
57908 +               /* shift was to the left: the range to cut starts at the
57909 +                  first unit of the node @shift->real_stop is set in
57910 +                  and runs up to @shift->real_stop */
57911 +               coord_init_first_unit(&first, shift->real_stop.node);
57912 +               last = shift->real_stop;
57913 +
57914 +               /* remember old coordinate of the unit which is going to
57915 +                  be the first one once the range is cut */
57916 +               shift->u.future_first = last;
57917 +               coord_next_unit(&shift->u.future_first);
57918 +       }
57919 +
57920 +       cut.params.from = &first;
57921 +       cut.params.to = &last;
57922 +       cut.params.from_key = NULL;
57923 +       cut.params.to_key = NULL;
57924 +       cut.params.smallest_removed = NULL;
57925 +       return cut_node40(&cut, NULL);
57926 +}
57927 +
57928 +/* something was moved between @left and @right: post COP_UPDATE for @right to
57929 +   @info so that carry updates delimiting key between them. Returns 0 or error */
57930 +static int
57931 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
57932 +{
57933 +       carry_op *op;
57934 +       carry_node *cn;
57935 +
57936 +       if (info == NULL)
57937 +               /* nowhere to send operation to. */
57938 +               return 0;
57939 +
57940 +       if (!should_notify_parent(right))
57941 +               return 0;
57942 +
57943 +       op = node_post_carry(info, COP_UPDATE, right, 1);
57944 +       if (IS_ERR(op) || op == NULL)
57945 +               return op ? PTR_ERR(op) : -EIO; /* NULL op is reported as -EIO */
57946 +
57947 +       if (left != NULL) {
57948 +               carry_node *reference;  /* new carry node is added before it */
57949 +
57950 +               if (info->doing)
57951 +                       reference = insert_carry_node(info->doing,
57952 +                                                     info->todo, left);
57953 +               else
57954 +                       reference = op->node;
57955 +               assert("nikita-2992", reference != NULL);
57956 +               cn = add_carry(info->todo, POOLO_BEFORE, reference);
57957 +               if (IS_ERR(cn))
57958 +                       return PTR_ERR(cn);
57959 +               cn->parent = 1;
57960 +               cn->node = left;
57961 +               if (ZF_ISSET(left, JNODE_ORPHAN))
57962 +                       cn->left_before = 1;
57963 +               op->u.update.left = cn; /* hand carry node of @left to the op */
57964 +       } else
57965 +               op->u.update.left = NULL;
57966 +       return 0;
57967 +}
57968 +
57969 +/* plugin->u.node.prepare_removal
57970 +   to delete a pointer to @empty from the tree, post COP_DELETE carry operation
57971 +   to @info and mark @empty with JNODE_HEARD_BANSHEE. Returns 0 or error code */
57972 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
57973 +{
57974 +       carry_op *op;
57975 +       reiser4_tree *tree;
57976 +
57977 +       if (!should_notify_parent(empty))
57978 +               return 0;
57979 +       /* JNODE_HEARD_BANSHEE is set at the end of this function: do it once */
57980 +       if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
57981 +               return 0;
57982 +       op = node_post_carry(info, COP_DELETE, empty, 1);
57983 +       if (IS_ERR(op) || op == NULL)
57984 +               return RETERR(op ? PTR_ERR(op) : -EIO);
57985 +
57986 +       op->u.delete.child = NULL;
57987 +       op->u.delete.flags = 0;
57988 +
57989 +       /* @empty's ld key and left neighbor's rd key become @empty's rd key */
57990 +       tree = znode_get_tree(empty);
57991 +       read_lock_tree(tree);
57992 +       write_lock_dk(tree);
57993 +       znode_set_ld_key(empty, znode_get_rd_key(empty));
57994 +       if (znode_is_left_connected(empty) && empty->left)
57995 +               znode_set_rd_key(empty->left, znode_get_rd_key(empty));
57996 +       write_unlock_dk(tree);
57997 +       read_unlock_tree(tree);
57998 +
57999 +       ZF_SET(empty, JNODE_HEARD_BANSHEE);
58000 +       return 0;
58001 +}
58002 +
58003 +/* something was shifted from @insert_coord->node to @shift->target, update
58004 +   @insert_coord correspondingly. @removed: number of items shifted entirely */
58005 +static void
58006 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
58007 +            int including_insert_coord)
58008 +{
58009 +       /* item plugin was invalidated by shifting */
58010 +       coord_clear_iplug(insert_coord);
58011 +
58012 +       if (node_is_empty(shift->wish_stop.node)) {
58013 +               assert("vs-242", shift->everything);
58014 +               if (including_insert_coord) {
58015 +                       if (shift->pend == SHIFT_RIGHT) {
58016 +                               /* set @insert_coord before first unit of
58017 +                                  @shift->target node */
58018 +                               coord_init_before_first_item(insert_coord,
58019 +                                                            shift->target);
58020 +                       } else {
58021 +                               /* set @insert_coord after last in target node */
58022 +                               coord_init_after_last_item(insert_coord,
58023 +                                                          shift->target);
58024 +                       }
58025 +               } else {
58026 +                       /* set @insert_coord inside of empty node. There is
58027 +                          only one possible coord within an empty
58028 +                          node. init_first_unit will set that coord */
58029 +                       coord_init_first_unit(insert_coord,
58030 +                                             shift->wish_stop.node);
58031 +               }
58032 +               return;
58033 +       }
58034 +
58035 +       if (shift->pend == SHIFT_RIGHT) {
58036 +               /* there was shifting to right */
58037 +               if (shift->everything) {
58038 +                       /* everything wanted was shifted */
58039 +                       if (including_insert_coord) {
58040 +                               /* @insert_coord is set before first unit of
58041 +                                  @to node */
58042 +                               coord_init_before_first_item(insert_coord,
58043 +                                                            shift->target);
58044 +                               insert_coord->between = BEFORE_UNIT;
58045 +                       } else {
58046 +                               /* @insert_coord is set after last unit of
58047 +                                  @shift->wish_stop.node */
58048 +                               coord_init_last_unit(insert_coord,
58049 +                                                    shift->wish_stop.node);
58050 +                               insert_coord->between = AFTER_UNIT;
58051 +                       }
58052 +               }
58053 +               return;
58054 +       }
58055 +
58056 +       /* there was shifting to left */
58057 +       if (shift->everything) {
58058 +               /* everything wanted was shifted */
58059 +               if (including_insert_coord) {
58060 +                       /* @insert_coord is set after last unit in @to node */
58061 +                       coord_init_after_last_item(insert_coord, shift->target);
58062 +               } else {
58063 +                       /* @insert_coord is set before first unit in the same
58064 +                          node */
58065 +                       coord_init_before_first_item(insert_coord,
58066 +                                                    shift->wish_stop.node);
58067 +               }
58068 +               return;
58069 +       }
58070 +
58071 +       /* FIXME-VS: the code below is complicated because with between ==
58072 +          AFTER_ITEM unit_pos is set to 0 */
58073 +
58074 +       if (!removed) {
58075 +               /* no items were shifted entirely */
58076 +               assert("vs-195", shift->merging_units == 0
58077 +                      || shift->part_units == 0);
58078 +
58079 +               if (shift->real_stop.item_pos == insert_coord->item_pos) {
58080 +                       if (shift->merging_units) {
58081 +                               if (insert_coord->between == AFTER_UNIT) {
58082 +                                       assert("nikita-1441",
58083 +                                              insert_coord->unit_pos >=
58084 +                                              shift->merging_units);
58085 +                                       insert_coord->unit_pos -=
58086 +                                           shift->merging_units;
58087 +                               } else if (insert_coord->between == BEFORE_UNIT) {
58088 +                                       assert("nikita-2090",
58089 +                                              insert_coord->unit_pos >
58090 +                                              shift->merging_units);
58091 +                                       insert_coord->unit_pos -=
58092 +                                           shift->merging_units;
58093 +                               }
58094 +
58095 +                               assert("nikita-2083",
58096 +                                      insert_coord->unit_pos + 1);
58097 +                       } else {
58098 +                               if (insert_coord->between == AFTER_UNIT) {
58099 +                                       assert("nikita-1442",
58100 +                                              insert_coord->unit_pos >=
58101 +                                              shift->part_units);
58102 +                                       insert_coord->unit_pos -=
58103 +                                           shift->part_units;
58104 +                               } else if (insert_coord->between == BEFORE_UNIT) {
58105 +                                       assert("nikita-2089",
58106 +                                              insert_coord->unit_pos >
58107 +                                              shift->part_units);
58108 +                                       insert_coord->unit_pos -=
58109 +                                           shift->part_units;
58110 +                               }
58111 +
58112 +                               assert("nikita-2084",
58113 +                                      insert_coord->unit_pos + 1);
58114 +                       }
58115 +               }
58116 +               return;
58117 +       }
58118 +
58119 +       /* we shifted to left and there was not enough space for everything */
58120 +       switch (insert_coord->between) {
58121 +       case AFTER_UNIT:
58122 +       case BEFORE_UNIT:
58123 +               if (shift->real_stop.item_pos == insert_coord->item_pos)
58124 +                       insert_coord->unit_pos -= shift->part_units;
58125 +       case AFTER_ITEM:        /* the two cases above fall through to here */
58126 +               coord_add_item_pos(insert_coord, -removed);
58127 +               break;
58128 +       default:
58129 +               impossible("nikita-2087", "not ready");
58130 +       }
58131 +       assert("nikita-2085", insert_coord->unit_pos + 1);
58132 +}
58133 +/* call ->b.shift_hook of every item of @shift->target the shift touched */
58134 +static int call_shift_hooks(struct shift_params *shift)
58135 +{
58136 +       unsigned i, shifted;
58137 +       coord_t coord;
58138 +       item_plugin *iplug;
58139 +
58140 +       assert("vs-275", !node_is_empty(shift->target));
58141 +
58142 +       /* number of items shift touches: entire, merged and partial ones */
58143 +       shifted =
58144 +           shift->entire + (shift->merging_units ? 1 : 0) +
58145 +           (shift->part_units ? 1 : 0);
58146 +
58147 +       if (shift->pend == SHIFT_LEFT) {
58148 +               /* moved items are at the end: walk back from the last item */
58149 +               coord_init_last_unit(&coord, shift->target);
58150 +               coord.unit_pos = 0;
58151 +
58152 +               assert("vs-279", shift->pend == 1);
58153 +               for (i = 0; i < shifted; i++) {
58154 +                       unsigned from, count;   /* unit range given to hook */
58155 +
58156 +                       iplug = item_plugin_by_coord(&coord);
58157 +                       if (i == 0 && shift->part_units) {
58158 +                               assert("vs-277",
58159 +                                      coord_num_units(&coord) ==
58160 +                                      shift->part_units);
58161 +                               count = shift->part_units;
58162 +                               from = 0;
58163 +                       } else if (i == shifted - 1 && shift->merging_units) {
58164 +                               count = shift->merging_units;
58165 +                               from = coord_num_units(&coord) - count;
58166 +                       } else {
58167 +                               count = coord_num_units(&coord);
58168 +                               from = 0;
58169 +                       }
58170 +
58171 +                       if (iplug->b.shift_hook) {
58172 +                               iplug->b.shift_hook(&coord, from, count,
58173 +                                                   shift->wish_stop.node);
58174 +                       }
58175 +                       coord_add_item_pos(&coord, -shift->pend);
58176 +               }
58177 +       } else {
58178 +               /* moved items are at the beginning: walk on from first item */
58179 +               coord_init_first_unit(&coord, shift->target);
58180 +
58181 +               assert("vs-278", shift->pend == -1);
58182 +               for (i = 0; i < shifted; i++) {
58183 +                       unsigned from, count;   /* unit range given to hook */
58184 +
58185 +                       iplug = item_plugin_by_coord(&coord);
58186 +                       if (i == 0 && shift->part_units) {
58187 +                               assert("vs-277",
58188 +                                      coord_num_units(&coord) ==
58189 +                                      shift->part_units);
58190 +                               count = coord_num_units(&coord);
58191 +                               from = 0;
58192 +                       } else if (i == shifted - 1 && shift->merging_units) {
58193 +                               count = shift->merging_units;
58194 +                               from = 0;
58195 +                       } else {
58196 +                               count = coord_num_units(&coord);
58197 +                               from = 0;
58198 +                       }
58199 +
58200 +                       if (iplug->b.shift_hook) {
58201 +                               iplug->b.shift_hook(&coord, from, count,
58202 +                                                   shift->wish_stop.node);
58203 +                       }
58204 +                       coord_add_item_pos(&coord, -shift->pend);
58205 +               }
58206 +       }
58207 +
58208 +       return 0;       /* the only value this function returns */
58209 +}
58210 +
58211 +/* shift to left is done: 1 if unit @old is at or before @shift->real_stop */
58212 +static int
58213 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
58214 +{
58215 +       const coord_t *stop = &shift->real_stop;
58216 +
58217 +       assert("vs-944", stop->node == old->node);
58218 +
58219 +       /* different items: only items preceding the stop item have moved;
58220 +          same item: units up to and including the stop unit have moved */
58221 +       if (stop->item_pos != old->item_pos)
58222 +               return stop->item_pos > old->item_pos;
58223 +       return stop->unit_pos >= old->unit_pos;
58224 +}
58225 +
58226 +/* shift to right is done. Tell whether unit @old went to the right neighbor:
58227 +   1 if @old is at or after @shift->real_stop, 0 otherwise */
58228 +static int
58229 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
58230 +{
58231 +       const coord_t *stop = &shift->real_stop;
58232 +
58233 +       assert("vs-944", stop->node == old->node);
58234 +
58235 +       /* different items: only items following the stop item have moved;
58236 +          same item: units starting from the stop unit have moved */
58237 +       if (stop->item_pos != old->item_pos)
58238 +               return stop->item_pos < old->item_pos;
58239 +       return stop->unit_pos <= old->unit_pos;
58240 +}
58241 +
58242 +/* @old is a coord set in one of the nodes that took part in the shift
58243 +   described by @shift. Compute its position after the shift into @new */
58244 +static coord_t *adjust_coord2(const struct shift_params *shift,
58245 +                             const coord_t * old, coord_t * new)
58246 +{
58247 +       /* forget item plugin: fields of @new are filled from scratch below */
58248 +       coord_clear_iplug(new);
58249 +       new->between = old->between;
58250 +
58251 +       if (old->node == shift->target) {
58252 +               if (shift->pend == SHIFT_LEFT) {
58253 +                       /* coord which is set inside of left neighbor does not
58254 +                          change during shift to left */
58255 +                       coord_dup(new, old);
58256 +                       return new;
58257 +               }
58258 +               new->node = old->node;
58259 +               coord_set_item_pos(new,
58260 +                                  old->item_pos + shift->entire +
58261 +                                  (shift->part_units ? 1 : 0));
58262 +               new->unit_pos = old->unit_pos;
58263 +               if (old->item_pos == 0 && shift->merging_units)
58264 +                       new->unit_pos += shift->merging_units;
58265 +               return new;
58266 +       }
58267 +
58268 +       assert("vs-977", old->node == shift->wish_stop.node);
58269 +       if (shift->pend == SHIFT_LEFT) {
58270 +               if (unit_moved_left(shift, old)) {
58271 +                       /* unit @old moved to left neighbor. Calculate its
58272 +                          coordinate there */
58273 +                       new->node = shift->target;
58274 +                       coord_set_item_pos(new,
58275 +                                          node_num_items(shift->target) -
58276 +                                          shift->entire -
58277 +                                          (shift->part_units ? 1 : 0) +
58278 +                                          old->item_pos);
58279 +
58280 +                       new->unit_pos = old->unit_pos;
58281 +                       if (shift->merging_units) {
58282 +                               coord_dec_item_pos(new);
58283 +                               if (old->item_pos == 0) {
58284 +                                       /* unit_pos only changes if item got
58285 +                                          merged */
58286 +                                       new->unit_pos =
58287 +                                           coord_num_units(new) -
58288 +                                           (shift->merging_units -
58289 +                                            old->unit_pos);
58290 +                               }
58291 +                       }
58292 +               } else {
58293 +                       /* unit @old did not move to left neighbor.
58294 +
58295 +                          Use _nocheck, because @old is outside of its node.
58296 +                        */
58297 +                       coord_dup_nocheck(new, old);
58298 +                       coord_add_item_pos(new,
58299 +                                          -shift->u.future_first.item_pos);
58300 +                       if (new->item_pos == 0)
58301 +                               new->unit_pos -= shift->u.future_first.unit_pos;
58302 +               }
58303 +       } else {
58304 +               if (unit_moved_right(shift, old)) {
58305 +                       /* unit @old moved to right neighbor */
58306 +                       new->node = shift->target;
58307 +                       coord_set_item_pos(new,
58308 +                                          old->item_pos -
58309 +                                          shift->real_stop.item_pos);
58310 +                       new->unit_pos = old->unit_pos;
58311 +                       if (new->item_pos == 0) {
58312 +                               /* units preceding @real_stop stayed behind,
58313 +                                  so unit position of @old decreases */
58314 +                               new->unit_pos -= shift->real_stop.unit_pos;
58315 +                       }
58316 +               } else {
58317 +                       /* unit @old did not move to right neighbor, therefore
58318 +                          it did not change */
58319 +                       coord_dup(new, old);
58320 +               }
58321 +       }
58322 +       coord_set_iplug(new, item_plugin_by_coord(new));
58323 +       return new;
58324 +}
58325 +
58326 +/* Called once the shift is complete (data are copied to the target and cut
58327 +   from the source): re-point every tap of the current context that is set
58328 +   to either of the two nodes */
58329 +static void update_taps(const struct shift_params *shift)
58330 +{
58331 +       coord_t adjusted;
58332 +       tap_t *cur;
58333 +
58334 +       for_all_taps(cur) {
58335 +               znode *where = cur->coord->node;
58336 +               /* taps set to nodes outside of the shift are left alone */
58337 +               if (where == shift->wish_stop.node || where == shift->target)
58338 +                       tap_to_coord(cur,
58339 +                                    adjust_coord2(shift, cur->coord, &adjusted));
58340 +       }
58341 +}
58342 +
58343 +#if REISER4_DEBUG
58344 +
58345 +struct shift_check {    /* per-item record filled by shift_check_prepare() */
58346 +       reiser4_key key;        /* copy of the key from item header */
58347 +       __u16 plugin_id;        /* item plugin id read from item header */
58348 +       union {
58349 +               __u64 bytes;    /* CTAIL_ID, FORMATTING_ID, EXTENT_POINTER_ID */
58350 +               __u64 entries;  /* COMPOUND_DIR_ID: number of units */
58351 +               void *unused;   /* set to NULL for any other plugin id */
58352 +       } u;
58353 +};
58354 +
58355 +void *shift_check_prepare(const znode * left, const znode * right)
58356 +{
58357 +       pos_in_node_t i, nr_items;
58358 +       int mergeable;
58359 +       struct shift_check *data;
58360 +       item_header40 *ih;
58361 +
58362 +       if (node_is_empty(left) || node_is_empty(right))
58363 +               mergeable = 0;
58364 +       else {
58365 +               coord_t l, r;
58366 +
58367 +               coord_init_last_unit(&l, left);
58368 +               coord_init_first_unit(&r, right);
58369 +               mergeable = are_items_mergeable(&l, &r);
58370 +       }
58371 +       nr_items =
58372 +           node40_num_of_items_internal(left) +
58373 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58374 +       data =
58375 +               kmalloc(sizeof(struct shift_check) * nr_items, get_gfp_mask());
58376 +       if (data != NULL) {
58377 +               coord_t coord;
58378 +               pos_in_node_t item_pos;
58379 +
58380 +               coord_init_first_unit(&coord, left);
58381 +               i = 0;
58382 +
58383 +               for (item_pos = 0;
58384 +                    item_pos < node40_num_of_items_internal(left);
58385 +                    item_pos++) {
58386 +
58387 +                       coord_set_item_pos(&coord, item_pos);
58388 +                       ih = node40_ih_at_coord(&coord);
58389 +
58390 +                       data[i].key = ih->key;
58391 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58392 +                       switch (data[i].plugin_id) {
58393 +                       case CTAIL_ID:
58394 +                       case FORMATTING_ID:
58395 +                               data[i].u.bytes = coord_num_units(&coord);
58396 +                               break;
58397 +                       case EXTENT_POINTER_ID:
58398 +                               data[i].u.bytes =
58399 +                                   extent_size(&coord,
58400 +                                               coord_num_units(&coord));
58401 +                               break;
58402 +                       case COMPOUND_DIR_ID:
58403 +                               data[i].u.entries = coord_num_units(&coord);
58404 +                               break;
58405 +                       default:
58406 +                               data[i].u.unused = NULL;
58407 +                               break;
58408 +                       }
58409 +                       i++;
58410 +               }
58411 +
58412 +               coord_init_first_unit(&coord, right);
58413 +
58414 +               if (mergeable) {
58415 +                       assert("vs-1609", i != 0);
58416 +
58417 +                       ih = node40_ih_at_coord(&coord);
58418 +
58419 +                       assert("vs-1589",
58420 +                              data[i - 1].plugin_id ==
58421 +                              le16_to_cpu(get_unaligned(&ih->plugin_id)));
58422 +                       switch (data[i - 1].plugin_id) {
58423 +                       case CTAIL_ID:
58424 +                       case FORMATTING_ID:
58425 +                               data[i - 1].u.bytes += coord_num_units(&coord);
58426 +                               break;
58427 +                       case EXTENT_POINTER_ID:
58428 +                               data[i - 1].u.bytes +=
58429 +                                   extent_size(&coord,
58430 +                                               coord_num_units(&coord));
58431 +                               break;
58432 +                       case COMPOUND_DIR_ID:
58433 +                               data[i - 1].u.entries +=
58434 +                                   coord_num_units(&coord);
58435 +                               break;
58436 +                       default:
58437 +                               impossible("vs-1605", "wrong mergeable item");
58438 +                               break;
58439 +                       }
58440 +                       item_pos = 1;
58441 +               } else
58442 +                       item_pos = 0;
58443 +               for (; item_pos < node40_num_of_items_internal(right);
58444 +                    item_pos++) {
58445 +
58446 +                       assert("vs-1604", i < nr_items);
58447 +                       coord_set_item_pos(&coord, item_pos);
58448 +                       ih = node40_ih_at_coord(&coord);
58449 +
58450 +                       data[i].key = ih->key;
58451 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58452 +                       switch (data[i].plugin_id) {
58453 +                       case CTAIL_ID:
58454 +                       case FORMATTING_ID:
58455 +                               data[i].u.bytes = coord_num_units(&coord);
58456 +                               break;
58457 +                       case EXTENT_POINTER_ID:
58458 +                               data[i].u.bytes =
58459 +                                   extent_size(&coord,
58460 +                                               coord_num_units(&coord));
58461 +                               break;
58462 +                       case COMPOUND_DIR_ID:
58463 +                               data[i].u.entries = coord_num_units(&coord);
58464 +                               break;
58465 +                       default:
58466 +                               data[i].u.unused = NULL;
58467 +                               break;
58468 +                       }
58469 +                       i++;
58470 +               }
58471 +               assert("vs-1606", i == nr_items);
58472 +       }
58473 +       return data;
58474 +}
58475 +
58476 +void shift_check(void *vp, const znode * left, const znode * right)
58477 +{
58478 +       pos_in_node_t i, nr_items;
58479 +       coord_t coord;
58480 +       __u64 last_bytes;
58481 +       int mergeable;
58482 +       item_header40 *ih;
58483 +       pos_in_node_t item_pos;
58484 +       struct shift_check *data;
58485 +       /* Debugging check: compare every item of @left and @right with the
58486 +          snapshot @vp and free the snapshot. NULL @vp means nothing to check */
58487 +       data = (struct shift_check *)vp;
58488 +       if (data == NULL)
58489 +               return;
58490 +
58491 +       if (node_is_empty(left) || node_is_empty(right))
58492 +               mergeable = 0;
58493 +       else {
58494 +               coord_t l, r;
58495 +
58496 +               coord_init_last_unit(&l, left);
58497 +               coord_init_first_unit(&r, right);
58498 +               mergeable = are_items_mergeable(&l, &r);
58499 +       }
58500 +
58501 +       nr_items =
58502 +           node40_num_of_items_internal(left) +
58503 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58504 +
58505 +       i = 0;
58506 +       last_bytes = 0;
58507 +
58508 +       coord_init_first_unit(&coord, left);
58509 +
58510 +       for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
58511 +            item_pos++) {
58512 +               /* item @item_pos of @left is checked against data[i] */
58513 +               coord_set_item_pos(&coord, item_pos);
58514 +               ih = node40_ih_at_coord(&coord);
58515 +
58516 +               assert("vs-1611", i == item_pos);
58517 +               assert("vs-1590", keyeq(&ih->key, &data[i].key));
58518 +               assert("vs-1591",
58519 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58520 +               if ((i < (node40_num_of_items_internal(left) - 1))
58521 +                   || !mergeable) {
58522 +                       switch (data[i].plugin_id) {
58523 +                       case CTAIL_ID:
58524 +                       case FORMATTING_ID:
58525 +                               assert("vs-1592",
58526 +                                      data[i].u.bytes ==
58527 +                                      coord_num_units(&coord));
58528 +                               break;
58529 +                       case EXTENT_POINTER_ID:
58530 +                               assert("vs-1593",
58531 +                                      data[i].u.bytes == extent_size(&coord,
58532 +                                                                     coord_num_units
58533 +                                                                     (&coord)));
58534 +                               break;
58535 +                       case COMPOUND_DIR_ID:
58536 +                               assert("vs-1594",
58537 +                                      data[i].u.entries ==
58538 +                                      coord_num_units(&coord));
58539 +                               break;
58540 +                       default:
58541 +                               break;
58542 +                       }
58543 +               }
58544 +               if (item_pos == (node40_num_of_items_internal(left) - 1)
58545 +                   && mergeable) {
58546 +                       switch (data[i].plugin_id) {
58547 +                       case CTAIL_ID:
58548 +                       case FORMATTING_ID:
58549 +                               last_bytes = coord_num_units(&coord);
58550 +                               break;
58551 +                       case EXTENT_POINTER_ID:
58552 +                               last_bytes =
58553 +                                   extent_size(&coord,
58554 +                                               coord_num_units(&coord));
58555 +                               break;
58556 +                       case COMPOUND_DIR_ID:
58557 +                               last_bytes = coord_num_units(&coord);
58558 +                               break;
58559 +                       default:
58560 +                               impossible("vs-1595", "wrong mergeable item");
58561 +                               break;
58562 +                       }
58563 +               }
58564 +               i++;
58565 +       }
58566 +       /* a mergeable pair is one snapshot entry: both parts must sum up to it */
58567 +       coord_init_first_unit(&coord, right);
58568 +       if (mergeable) {
58569 +               ih = node40_ih_at_coord(&coord);
58570 +
58571 +               assert("vs-1589",
58572 +                      data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
58573 +               assert("vs-1608", last_bytes != 0);
58574 +               switch (data[i - 1].plugin_id) {
58575 +               case CTAIL_ID:
58576 +               case FORMATTING_ID:
58577 +                       assert("vs-1596",
58578 +                              data[i - 1].u.bytes ==
58579 +                              last_bytes + coord_num_units(&coord));
58580 +                       break;
58581 +
58582 +               case EXTENT_POINTER_ID:
58583 +                       assert("vs-1597",
58584 +                              data[i - 1].u.bytes ==
58585 +                              last_bytes + extent_size(&coord,
58586 +                                                       coord_num_units
58587 +                                                       (&coord)));
58588 +                       break;
58589 +               /* directory items are counted in u.entries, as in shift_check_prepare() */
58590 +               case COMPOUND_DIR_ID:
58591 +                       assert("vs-1598",
58592 +                              data[i - 1].u.entries ==
58593 +                              last_bytes + coord_num_units(&coord));
58594 +                       break;
58595 +               default:
58596 +                       impossible("vs-1599", "wrong mergeable item");
58597 +                       break;
58598 +               }
58599 +               item_pos = 1;
58600 +       } else
58601 +               item_pos = 0;
58602 +
58603 +       for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
58604 +               /* remaining items of @right continue the snapshot at data[i] */
58605 +               coord_set_item_pos(&coord, item_pos);
58606 +               ih = node40_ih_at_coord(&coord);
58607 +
58608 +               assert("vs-1612", keyeq(&ih->key, &data[i].key));
58609 +               assert("vs-1613",
58610 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58611 +               switch (data[i].plugin_id) {
58612 +               case CTAIL_ID:
58613 +               case FORMATTING_ID:
58614 +                       assert("vs-1600",
58615 +                              data[i].u.bytes == coord_num_units(&coord));
58616 +                       break;
58617 +               case EXTENT_POINTER_ID:
58618 +                       assert("vs-1601",
58619 +                              data[i].u.bytes == extent_size(&coord,
58620 +                                                             coord_num_units
58621 +                                                             (&coord)));
58622 +                       break;
58623 +               case COMPOUND_DIR_ID:
58624 +                       assert("vs-1602",
58625 +                              data[i].u.entries == coord_num_units(&coord));
58626 +                       break;
58627 +               default:
58628 +                       break;
58629 +               }
58630 +               i++;
58631 +       }
58632 +
58633 +       assert("vs-1603", i == nr_items);
58634 +       kfree(data);
58635 +}
58636 +
58637 +#endif
58638 +
58639 +/* plugin->u.node.shift: look for description of this method in plugin/node/node.h.
58640 +   Returns bytes shifted (0 if nothing was), or non-zero status of a failed prepare_*() call */
58641 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child,   /* if @from->node becomes empty - it will be
58642 +                                                                                          deleted from the tree if this is set to 1 */
58643 +                int including_stop_coord, carry_plugin_info * info)
58644 +{
58645 +       struct shift_params shift;
58646 +       int result;
58647 +       znode *left, *right;
58648 +       znode *source;
58649 +       int target_empty;
58650 +
58651 +       assert("nikita-2161", coord_check(from));
58652 +
58653 +       memset(&shift, 0, sizeof(shift));
58654 +       shift.pend = pend;
58655 +       shift.wish_stop = *from;
58656 +       shift.target = to;
58657 +
58658 +       assert("nikita-1473", znode_is_write_locked(from->node));
58659 +       assert("nikita-1474", znode_is_write_locked(to));
58660 +
58661 +       source = from->node;
58662 +
58663 +       /* set @shift.wish_stop to rightmost/leftmost unit among units we want
58664 +          shifted */
58665 +       if (pend == SHIFT_LEFT) {
58666 +               result = coord_set_to_left(&shift.wish_stop);
58667 +               left = to;
58668 +               right = from->node;
58669 +       } else {
58670 +               result = coord_set_to_right(&shift.wish_stop);
58671 +               left = from->node;
58672 +               right = to;
58673 +       }
58674 +
58675 +       if (result) {
58676 +               /* move insertion coord even if there is nothing to move */
58677 +               if (including_stop_coord) {
58678 +                       /* move insertion coord (@from) */
58679 +                       if (pend == SHIFT_LEFT) {
58680 +                               /* after last item in target node */
58681 +                               coord_init_after_last_item(from, to);
58682 +                       } else {
58683 +                               /* before first item in target node */
58684 +                               coord_init_before_first_item(from, to);
58685 +                       }
58686 +               }
58687 +
58688 +               if (delete_child && node_is_empty(shift.wish_stop.node))
58689 +                       result =
58690 +                           prepare_removal_node40(shift.wish_stop.node, info);
58691 +               else
58692 +                       result = 0;
58693 +               /* there is nothing to shift */
58694 +               assert("nikita-2078", coord_check(from));
58695 +               return result;
58696 +       }
58697 +
58698 +       target_empty = node_is_empty(to);
58699 +
58700 +       /* when first node plugin with item body compression is implemented,
58701 +          this must be changed to call node specific plugin */
58702 +
58703 +       /* shift->stop_coord is updated to last unit which really will be
58704 +          shifted */
58705 +       estimate_shift(&shift, get_current_context());
58706 +       if (!shift.shift_bytes) {
58707 +               /* we could not shift anything */
58708 +               assert("nikita-2079", coord_check(from));
58709 +               return 0;
58710 +       }
58711 +
58712 +       copy(&shift);
58713 +
58714 +       /* result value of this is important. It is used by adjust_coord below */
58715 +       result = delete_copied(&shift);
58716 +
58717 +       assert("vs-1610", result >= 0);
58718 +       assert("vs-1471",
58719 +              ((reiser4_context *) current->journal_info)->magic ==
58720 +              context_magic);
58721 +
58722 +       /* item which has been moved from one node to another might want to do
58723 +          something on that event. This can be done by item's shift_hook
58724 +          method, which will be now called for every moved items */
58725 +       call_shift_hooks(&shift);
58726 +
58727 +       assert("vs-1472",
58728 +              ((reiser4_context *) current->journal_info)->magic ==
58729 +              context_magic);
58730 +
58731 +       update_taps(&shift);
58732 +
58733 +       assert("vs-1473",
58734 +              ((reiser4_context *) current->journal_info)->magic ==
58735 +              context_magic);
58736 +
58737 +       /* adjust @from pointer in accordance with @including_stop_coord flag
58738 +          and amount of data which was really shifted */
58739 +       adjust_coord(from, &shift, result, including_stop_coord);
58740 +
58741 +       /* items were shifted into empty node: update delimiting key. The
58742 +          status is kept, so that a failure here is not overwritten by the
58743 +          second update below */
58744 +       result = target_empty ? prepare_for_update(NULL, left, info) : 0;
58745 +
58746 +       /* add update operation to @info, which is the list of operations to
58747 +          be performed on a higher level */
58748 +       if (!result)
58749 +               result = prepare_for_update(left, right, info);
58750 +       if (!result && node_is_empty(source) && delete_child) {
58751 +               /* all contents of @from->node is moved to @to and @from->node
58752 +                  has to be removed from the tree, so, on higher level we
58753 +                  will be removing the pointer to node @from->node */
58754 +               result = prepare_removal_node40(source, info);
58755 +       }
58756 +       assert("nikita-2080", coord_check(from));
58757 +       return result ? result : (int)shift.shift_bytes;
58758 +}
58759 +
58760 +/* plugin->u.node.fast_insert(): description of this method is in
58761 +   plugin/node/node.h. Here @coord is not examined and 1 is always returned */
58762 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58763 +{
58764 +       return 1;
58765 +}
58766 +
58767 +/* plugin->u.node.fast_paste(): description of this method is in
58768 +   plugin/node/node.h. Here @coord is not examined and 1 is always returned */
58769 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58770 +{
58771 +       return 1;
58772 +}
58773 +
58774 +/* plugin->u.node.fast_cut(): description of this method is in
58775 +   plugin/node/node.h. Here @coord is not examined and 1 is always returned */
58776 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58777 +{
58778 +       return 1;
58779 +}
58780 +
58781 +/* plugin->u.node.modify - not defined */
58782 +
58783 +/* plugin->u.node.max_item_size: block size less one node header and one item header */
58784 +int max_item_size_node40(void)
58785 +{
58786 +       return reiser4_get_current_sb()->s_blocksize -
58787 +           (sizeof(node40_header) + sizeof(item_header40));
58788 +}
58789 +
58790 +/* plugin->u.node.set_item_plugin: record @id in the item header and in @coord */
58791 +int set_item_plugin_node40(coord_t *coord, item_id id)
58792 +{
58793 +       item_header40 *header = node40_ih_at_coord(coord);
58794 +
58795 +       /* header field is written as little-endian 16 bits, alignment not assumed */
58796 +       put_unaligned(cpu_to_le16(id), &header->plugin_id);
58797 +       coord->iplugid = id;
58798 +       return 0;
58799 +}
58800 +
58801 +/*
58802 +   Local variables:
58803 +   c-indentation-style: "K&R"
58804 +   mode-name: "LC"
58805 +   c-basic-offset: 8
58806 +   tab-width: 8
58807 +   fill-column: 120
58808 +   scroll-step: 1
58809 +   End:
58810 +*/
58811 diff --git a/fs/reiser4/plugin/node/node40.h b/fs/reiser4/plugin/node/node40.h
58812 new file mode 100644
58813 index 0000000..8ae375b
58814 --- /dev/null
58815 +++ b/fs/reiser4/plugin/node/node40.h
58816 @@ -0,0 +1,125 @@
58817 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58818 +
58819 +#if !defined( __REISER4_NODE40_H__ )
58820 +#define __REISER4_NODE40_H__
58821 +
58822 +#include "../../forward.h"
58823 +#include "../../dformat.h"
58824 +#include "node.h"
58825 +
58826 +#include <linux/types.h>
58827 +
58828 +/* format of node header for 40 node layouts. Keep bloat out of this struct.  */
58829 +typedef struct node40_header {
58830 +       /* identifier of node plugin. Must be located at the very beginning
58831 +          of a node. */
58832 +       common_node_header common_header;       /* this is 16 bits */
58833 +       /* number of items. Should be first element in the node header,
58834 +          because we haven't yet finally decided whether it shouldn't go into
58835 +          common_header.
58836 +        */
58837 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
58838 + * node format at compile time, and it is this one, accesses do not function dereference when
58839 + * accessing these fields (and otherwise they do).  Probably 80% of users will only have one node format at a time throughout the life of reiser4.  */
58840 +       d16 nr_items;
58841 +       /* free space in node measured in bytes */
58842 +       d16 free_space;
58843 +       /* offset to start of free space in node */
58844 +       d16 free_space_start;
58845 +       /* for reiser4_fsck.  When information about what is a free
58846 +          block is corrupted, and we try to recover everything even
58847 +          if marked as freed, then old versions of data may
58848 +          duplicate newer versions, and this field allows us to
58849 +          restore the newer version.  Also useful for when users
58850 +          who don't have the new trashcan installed on their linux distro
58851 +          delete the wrong files and send us desperate emails
58852 +          offering $25 for them back.  */
58853 +       /* NOTE(review): comment above is not attached to a field; presumably it is about flush_id - confirm */
58854 +       /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
58855 +       d32 magic;
58856 +       /* flushstamp is made of mk_id and write_counter. mk_id is an
58857 +          id generated randomly at mkreiserfs time. So we can just
58858 +          skip all nodes with different mk_id. write_counter is d64
58859 +          incrementing counter of writes on disk. It is used for
58860 +          choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
58861 +       /* NOTE(review): presumably mkfs_id is the "mk_id" and flush_id the "write_counter" above - confirm */
58862 +       d32 mkfs_id;
58863 +       d64 flush_id;
58864 +       /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
58865 +          and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
58866 +       d16 flags;
58867 +
58868 +       /* 1 is leaf level, 2 is twig level, root is the numerically
58869 +          largest level */
58870 +       d8 level;
58871 +       /* NOTE(review): presumably an unused padding byte - confirm */
58872 +       d8 pad;
58873 +} PACKED node40_header;
58874 +
58875 +/* item headers are not standard across all node layouts, pass
58876 +   pos_in_node to functions instead. Leading numbers below look like byte offsets - TODO confirm */
58877 +typedef struct item_header40 {
58878 +       /* key of item */
58879 +       /*  0 */ reiser4_key key;
58880 +       /* offset from start of a node measured in 8-byte chunks */
58881 +       /* 24 */ d16 offset;
58882 +       /* 26 */ d16 flags;
58883 +       /* 28 */ d16 plugin_id;        /* item plugin id; node40.c accesses it as unaligned little-endian 16 bits */
58884 +} PACKED item_header40;
58885 +
58886 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
58887 +size_t free_space_node40(znode * node);
58888 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
58889 +                                lookup_bias bias, coord_t * coord);
58890 +int num_of_items_node40(const znode * node);
58891 +char *item_by_coord_node40(const coord_t * coord);
58892 +int length_by_coord_node40(const coord_t * coord);
58893 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
58894 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
58895 +size_t estimate_node40(znode * node);
58896 +int check_node40(const znode * node, __u32 flags, const char **error);
58897 +int parse_node40(znode * node);
58898 +int init_node40(znode * node);
58899 +#ifdef GUESS_EXISTS
58900 +int guess_node40(const znode * node);
58901 +#endif
58902 +void change_item_size_node40(coord_t * coord, int by);
58903 +int create_item_node40(coord_t * target, const reiser4_key * key,
58904 +                      reiser4_item_data * data, carry_plugin_info * info);
58905 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
58906 +                           carry_plugin_info * info);
58907 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
58908 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
58909 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
58910 +                /* if @from->node becomes
58911 +                   empty - it will be deleted from
58912 +                   the tree if this is set to 1
58913 +                 */
58914 +                int delete_child, int including_stop_coord,
58915 +                carry_plugin_info * info);
58916 +
58917 +int fast_insert_node40(const coord_t * coord);
58918 +int fast_paste_node40(const coord_t * coord);
58919 +int fast_cut_node40(const coord_t * coord);
58920 +int max_item_size_node40(void);
58921 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
58922 +int set_item_plugin_node40(coord_t * coord, item_id id);
58923 +int shrink_item_node40(coord_t * coord, int delta);
58924 +
58925 +#if REISER4_DEBUG
58926 +void *shift_check_prepare(const znode *left, const znode *right);
58927 +void shift_check(void *vp, const znode *left, const znode *right);
58928 +#endif
58929 +
58930 +/* __REISER4_NODE40_H__ */
58931 +#endif
58932 +/*
58933 +   Local variables:
58934 +   c-indentation-style: "K&R"
58935 +   mode-name: "LC"
58936 +   c-basic-offset: 8
58937 +   tab-width: 8
58938 +   fill-column: 120
58939 +   scroll-step: 1
58940 +   End:
58941 +*/
58942 diff --git a/fs/reiser4/plugin/object.c b/fs/reiser4/plugin/object.c
58943 new file mode 100644
58944 index 0000000..9f11dd6
58945 --- /dev/null
58946 +++ b/fs/reiser4/plugin/object.c
58947 @@ -0,0 +1,502 @@
58948 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58949 + * reiser4/README */
58950 +
58951 +/*
58952 + * Examples of object plugins: file, directory, symlink, special file.
58953 + *
58954 + * Plugins associated with inode:
58955 + *
58956 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
58957 + * stat-data. How we store this plugin in in-core inode is not
58958 + * important. Currently pointers are used, another variant is to store offsets
58959 + * and do array lookup on each access.
58960 + *
58961 + * Now, each inode has one selected plugin: object plugin that
58962 + * determines what type of file this object is: directory, regular etc.
58963 + *
58964 + * This main plugin can use other plugins that are thus subordinated to
58965 + * it. Directory instance of object plugin uses hash; regular file
58966 + * instance uses tail policy plugin.
58967 + *
58968 + * Object plugin is either taken from id in stat-data or guessed from
58969 + * i_mode bits. Once it is established we ask it to install its
58970 + * subordinate plugins, by looking again in stat-data or inheriting them
58971 + * from parent.
58972 + *
58973 + * How new inode is initialized during ->read_inode():
58974 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
58975 + *   i_generation, capabilities etc.
58976 + * 2 read plugin id from stat data or try to guess plugin id
58977 + *   from inode->i_mode bits if plugin id is missing.
58978 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
58979 + *
58980 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3?  What
58981 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
58982 + *
58983 + * 4 Call ->activate() method of object's plugin. Plugin is either read
58984 + *    from stat-data or guessed from mode bits
58985 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
58986 + *    plugins from parent.
58987 + *
58988 + * Easy induction proves that on last step all plugins of inode would be
58989 + * initialized.
58990 + *
58991 + * When creating new object:
58992 + * 1 obtain object plugin id (see next period)
58993 + * NIKITA-FIXME-HANS: period?
58994 + * 2 ->install() this plugin
58995 + * 3 ->inherit() the rest from the parent
58996 + *
58997 + * We need some examples of creating an object with default and non-default
58998 + * plugin ids.  Nikita, please create them.
58999 + */
59000 +
59001 +#include "../inode.h"
59002 +
59003 +static int _bugop(void)        /* stub for method slots that are not expected to be called */
59004 +{
59005 +       BUG_ON(1);              /* constant condition: evaluated on every call */
59006 +       return 0;
59007 +}
59008 +/* _bugop() as an untyped pointer, used below to fill method slots of differing signatures */
59009 +#define bugop ((void *)_bugop)
59010 +
59011 +static int _dummyop(void)      /* do-nothing stub: takes no arguments, reports 0 */
59012 +{
59013 +       return 0;
59014 +}
59015 +/* _dummyop() as an untyped pointer, used below to fill method slots of differing signatures */
59016 +#define dummyop ((void *)_dummyop)
59017 +
59018 +static int change_file(struct inode *inode, reiser4_plugin * plugin)
59019 +{
59020 +       /* cannot change object plugin of already existing object: both arguments are ignored */
59021 +       return RETERR(-EINVAL);
59022 +}
59023 +
59024 +static reiser4_plugin_ops file_plugin_ops = {  /* referenced as .pops by the file plugins below */
59025 +       .change = change_file                   /* only .change is set explicitly */
59026 +};
59027 +
59028 +/*
59029 + * Definitions of object plugins.
59030 + */
59031 +
59032 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
59033 +       [UNIX_FILE_PLUGIN_ID] = {
59034 +               .h = {
59035 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
59036 +                       .id = UNIX_FILE_PLUGIN_ID,
59037 +                       .pops = &file_plugin_ops,
59038 +                       .label = "reg",
59039 +                       .desc = "regular file",
59040 +                       .linkage = {NULL, NULL},
59041 +               },
59042 +               .inode_ops = {
59043 +                       .permission = permission_common,
59044 +                       .setattr = setattr_unix_file,
59045 +                       .getattr = getattr_common
59046 +               },
59047 +               .file_ops = {
59048 +                       .llseek = generic_file_llseek,
59049 +                       .read = read_unix_file,
59050 +                       .write = write_unix_file,
59051 +                       .ioctl = ioctl_unix_file,
59052 +                       .mmap = mmap_unix_file,
59053 +                       .open = open_unix_file,
59054 +                       .release = release_unix_file,
59055 +                       .fsync = sync_unix_file,
59056 +                       .sendfile = sendfile_unix_file
59057 +               },
59058 +               .as_ops = {
59059 +                       .writepage = reiser4_writepage,
59060 +                       .readpage = readpage_unix_file,
59061 +                       .sync_page = block_sync_page,
59062 +                       .writepages = writepages_unix_file,
59063 +                       .set_page_dirty = reiser4_set_page_dirty,
59064 +                       .readpages = reiser4_readpages,
59065 +                       .prepare_write = prepare_write_unix_file,
59066 +                       .commit_write = commit_write_unix_file,
59067 +                       .bmap = bmap_unix_file,
59068 +                       .invalidatepage = reiser4_invalidatepage,
59069 +                       .releasepage = reiser4_releasepage
59070 +               },
59071 +               .write_sd_by_inode = write_sd_by_inode_common,
59072 +               .flow_by_inode = flow_by_inode_unix_file,
59073 +               .key_by_inode = key_by_inode_and_offset_common,
59074 +               .set_plug_in_inode = set_plug_in_inode_common,
59075 +               .adjust_to_parent = adjust_to_parent_common,
59076 +               .create_object = create_object_common,  /* this is not inode_operations's create */
59077 +               .delete_object = delete_object_unix_file,
59078 +               .add_link = add_link_common,
59079 +               .rem_link = rem_link_common,
59080 +               .owns_item = owns_item_unix_file,
59081 +               .can_add_link = can_add_link_common,
59082 +               .detach = dummyop,
59083 +               .bind = dummyop,
59084 +               .safelink = safelink_common,
59085 +               .estimate = {
59086 +                       .create = estimate_create_common,
59087 +                       .update = estimate_update_common,
59088 +                       .unlink = estimate_unlink_common
59089 +               },
59090 +               .init_inode_data = init_inode_data_unix_file,
59091 +               .cut_tree_worker = cut_tree_worker_common,
59092 +               .wire = {
59093 +                       .write = wire_write_common,
59094 +                       .read = wire_read_common,
59095 +                       .get = wire_get_common,
59096 +                       .size = wire_size_common,
59097 +                       .done = wire_done_common
59098 +               }
59099 +       },
59100 +       [DIRECTORY_FILE_PLUGIN_ID] = {
59101 +               .h = {
59102 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
59103 +                       .id = DIRECTORY_FILE_PLUGIN_ID,
59104 +                       .pops = &file_plugin_ops,
59105 +                       .label = "dir",
59106 +                       .desc = "directory",
59107 +                       .linkage = {NULL, NULL}
59108 +               },
59109 +               .inode_ops = {NULL,},
59110 +               .file_ops = {NULL,},
59111 +               .as_ops = {NULL,},
59112 +
59113 +               .write_sd_by_inode = write_sd_by_inode_common,
59114 +               .flow_by_inode = bugop,
59115 +               .key_by_inode = bugop,
59116 +               .set_plug_in_inode = set_plug_in_inode_common,
59117 +               .adjust_to_parent = adjust_to_parent_common_dir,
59118 +               .create_object = create_object_common,
59119 +               .delete_object = delete_directory_common,
59120 +               .add_link = add_link_common,
59121 +               .rem_link = rem_link_common_dir,
59122 +               .owns_item = owns_item_common_dir,
59123 +               .can_add_link = can_add_link_common,
59124 +               .can_rem_link = can_rem_link_common_dir,
59125 +               .detach = detach_common_dir,
59126 +               .bind = bind_common_dir,
59127 +               .safelink = safelink_common,
59128 +               .estimate = {
59129 +                       .create = estimate_create_common_dir,
59130 +                       .update = estimate_update_common,
59131 +                       .unlink = estimate_unlink_common_dir
59132 +               },
59133 +               .wire = {
59134 +                       .write = wire_write_common,
59135 +                       .read = wire_read_common,
59136 +                       .get = wire_get_common,
59137 +                       .size = wire_size_common,
59138 +                       .done = wire_done_common
59139 +               },
59140 +               .init_inode_data = init_inode_ordering,
59141 +               .cut_tree_worker = cut_tree_worker_common,
59142 +       },
59143 +       [SYMLINK_FILE_PLUGIN_ID] = {
59144 +               .h = {
59145 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
59146 +                       .id = SYMLINK_FILE_PLUGIN_ID,
59147 +                       .pops = &file_plugin_ops,
59148 +                       .label = "symlink",
59149 +                       .desc = "symbolic link",
59150 +                       .linkage = {NULL,NULL}
59151 +               },
59152 +               .inode_ops = {
59153 +                       .readlink = generic_readlink,
59154 +                       .follow_link = follow_link_common,
59155 +                       .permission = permission_common,
59156 +                       .setattr = setattr_common,
59157 +                       .getattr = getattr_common
59158 +               },
59159 +               /* inode->i_fop of symlink is initialized by NULL in setup_inode_ops */
59160 +               .file_ops = {NULL,},
59161 +               .as_ops = {NULL,},
59162 +
59163 +               .write_sd_by_inode = write_sd_by_inode_common,
59164 +               .set_plug_in_inode = set_plug_in_inode_common,
59165 +               .adjust_to_parent = adjust_to_parent_common,
59166 +               .create_object = create_symlink,
59167 +               .delete_object = delete_object_common,
59168 +               .add_link = add_link_common,
59169 +               .rem_link = rem_link_common,
59170 +               .can_add_link = can_add_link_common,
59171 +               .detach = dummyop,
59172 +               .bind = dummyop,
59173 +               .safelink = safelink_common,
59174 +               .estimate = {
59175 +                       .create = estimate_create_common,
59176 +                       .update = estimate_update_common,
59177 +                       .unlink = estimate_unlink_common
59178 +               },
59179 +               .init_inode_data = init_inode_ordering,
59180 +               .cut_tree_worker = cut_tree_worker_common,
59181 +               .destroy_inode = destroy_inode_symlink,
59182 +               .wire = {
59183 +                       .write = wire_write_common,
59184 +                       .read = wire_read_common,
59185 +                       .get = wire_get_common,
59186 +                       .size = wire_size_common,
59187 +                       .done = wire_done_common
59188 +               }
59189 +       },
59190 +       [SPECIAL_FILE_PLUGIN_ID] = {
59191 +               .h = {
59192 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
59193 +                       .id = SPECIAL_FILE_PLUGIN_ID,
59194 +                       .pops = &file_plugin_ops,
59195 +                       .label = "special",
59196 +                       .desc =
59197 +                       "special: fifo, device or socket",
59198 +                       .linkage = {NULL, NULL}
59199 +               },
59200 +               .inode_ops = {
59201 +                       .permission = permission_common,
59202 +                       .setattr = setattr_common,
59203 +                       .getattr = getattr_common
59204 +               },
59205 +               /* file_ops of special files (sockets, block, char, fifo) are
59206 +                  initialized by init_special_inode. */
59207 +               .file_ops = {NULL,},
59208 +               .as_ops = {NULL,},
59209 +
59210 +               .write_sd_by_inode = write_sd_by_inode_common,
59211 +               .set_plug_in_inode = set_plug_in_inode_common,
59212 +               .adjust_to_parent = adjust_to_parent_common,
59213 +               .create_object = create_object_common,
59214 +               .delete_object = delete_object_common,
59215 +               .add_link = add_link_common,
59216 +               .rem_link = rem_link_common,
59217 +               .owns_item = owns_item_common,
59218 +               .can_add_link = can_add_link_common,
59219 +               .detach = dummyop,
59220 +               .bind = dummyop,
59221 +               .safelink = safelink_common,
59222 +               .estimate = {
59223 +                       .create = estimate_create_common,
59224 +                       .update = estimate_update_common,
59225 +                       .unlink = estimate_unlink_common
59226 +               },
59227 +               .init_inode_data = init_inode_ordering,
59228 +               .cut_tree_worker = cut_tree_worker_common,
59229 +               .wire = {
59230 +                       .write = wire_write_common,
59231 +                       .read = wire_read_common,
59232 +                       .get = wire_get_common,
59233 +                       .size = wire_size_common,
59234 +                       .done = wire_done_common
59235 +               }
59236 +       },
59237 +       [CRC_FILE_PLUGIN_ID] = {
59238 +               .h = {
59239 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
59240 +                       .id = CRC_FILE_PLUGIN_ID,
59241 +                       .pops = &cryptcompress_plugin_ops,
59242 +                       .label = "cryptcompress",
59243 +                       .desc = "cryptcompress file",
59244 +                       .linkage = {NULL, NULL}
59245 +               },
59246 +               .inode_ops = {
59247 +                       .permission = permission_common,
59248 +                       .setattr = setattr_cryptcompress,
59249 +                       .getattr = getattr_common
59250 +               },
59251 +               .file_ops = {
59252 +                       .llseek = generic_file_llseek,
59253 +                       .read = read_cryptcompress,
59254 +                       .write = write_cryptcompress,
59255 +                       .aio_read = generic_file_aio_read,
59256 +                       .mmap = mmap_cryptcompress,
59257 +                       .release = release_cryptcompress,
59258 +                       .fsync = sync_common,
59259 +                       .sendfile = sendfile_cryptcompress
59260 +               },
59261 +               .as_ops = {
59262 +                       .writepage = reiser4_writepage,
59263 +                       .readpage = readpage_cryptcompress,
59264 +                       .sync_page = block_sync_page,
59265 +                       .writepages = writepages_cryptcompress,
59266 +                       .set_page_dirty = reiser4_set_page_dirty,
59267 +                       .readpages = reiser4_readpages,
59268 +                       .prepare_write = prepare_write_common,
59269 +                       .invalidatepage = reiser4_invalidatepage,
59270 +                       .releasepage = reiser4_releasepage
59271 +               },
59272 +               .write_sd_by_inode = write_sd_by_inode_common,
59273 +               .flow_by_inode = flow_by_inode_cryptcompress,
59274 +               .key_by_inode = key_by_inode_cryptcompress,
59275 +               .set_plug_in_inode = set_plug_in_inode_common,
59276 +               .adjust_to_parent = adjust_to_parent_cryptcompress,
59277 +               .create_object = create_cryptcompress,
59278 +               .open_object = open_cryptcompress,
59279 +               .delete_object = delete_cryptcompress,
59280 +               .add_link = add_link_common,
59281 +               .rem_link = rem_link_common,
59282 +               .owns_item = owns_item_common,
59283 +               .can_add_link = can_add_link_common,
59284 +               .detach = dummyop,
59285 +               .bind = dummyop,
59286 +               .safelink = safelink_common,
59287 +               .estimate = {
59288 +                       .create = estimate_create_common,
59289 +                       .update = estimate_update_common,
59290 +                       .unlink = estimate_unlink_common
59291 +               },
59292 +               .init_inode_data = init_inode_data_cryptcompress,
59293 +               .cut_tree_worker = cut_tree_worker_cryptcompress,
59294 +               .destroy_inode = destroy_inode_cryptcompress,
59295 +               .wire = {
59296 +                       .write = wire_write_common,
59297 +                       .read = wire_read_common,
59298 +                       .get = wire_get_common,
59299 +                       .size = wire_size_common,
59300 +                       .done = wire_done_common
59301 +               }
59302 +       }
59303 +};
59304 +
59305 +static int change_dir(struct inode *inode, reiser4_plugin * plugin)
59306 +{
59307 +       /* cannot change dir plugin of already existing object: arguments are ignored, always fails */
59308 +       return RETERR(-EINVAL);
59309 +}
59310 +
59311 +static reiser4_plugin_ops dir_plugin_ops = {	/* ->pops of every dir plugin in dir_plugins[] */
59312 +       .change = change_dir
59313 +};
59314 +
59315 +/*
59316 + * definition of directory plugins; the array is indexed by directory plugin id
59317 + */
59318 +
59319 +dir_plugin dir_plugins[LAST_DIR_ID] = {
59320 +       /* standard hashed directory plugin */
59321 +       [HASHED_DIR_PLUGIN_ID] = {
59322 +               .h = {
59323 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
59324 +                       .id = HASHED_DIR_PLUGIN_ID,
59325 +                       .pops = &dir_plugin_ops,
59326 +                       .label = "dir",
59327 +                       .desc = "hashed directory",
59328 +                       .linkage = {NULL, NULL}
59329 +               },
59330 +               .inode_ops = {
59331 +                       .create = create_common,
59332 +                       .lookup = lookup_common,
59333 +                       .link = link_common,
59334 +                       .unlink = unlink_common,
59335 +                       .symlink = symlink_common,
59336 +                       .mkdir = mkdir_common,
59337 +                       .rmdir = unlink_common,	/* same handler as .unlink */
59338 +                       .mknod = mknod_common,
59339 +                       .rename = rename_common,
59340 +                       .permission = permission_common,
59341 +                       .setattr = setattr_common,
59342 +                       .getattr = getattr_common
59343 +               },
59344 +               .file_ops = {
59345 +                       .llseek = llseek_common_dir,
59346 +                       .read = generic_read_dir,
59347 +                       .readdir = readdir_common,
59348 +                       .release = release_dir_common,
59349 +                       .fsync = sync_common
59350 +               },
59351 +               .as_ops = {
59352 +                       .writepage = bugop,
59353 +                       .sync_page = bugop,
59354 +                       .writepages = dummyop,
59355 +                       .set_page_dirty = bugop,
59356 +                       .readpages = bugop,
59357 +                       .prepare_write = bugop,
59358 +                       .commit_write = bugop,
59359 +                       .bmap = bugop,
59360 +                       .invalidatepage = bugop,
59361 +                       .releasepage = bugop
59362 +               },
59363 +               .get_parent = get_parent_common,
59364 +               .is_name_acceptable = is_name_acceptable_common,
59365 +               .build_entry_key = build_entry_key_hashed,
59366 +               .build_readdir_key = build_readdir_key_common,
59367 +               .add_entry = add_entry_common,
59368 +               .rem_entry = rem_entry_common,
59369 +               .init = init_common,
59370 +               .done = done_common,
59371 +               .attach = attach_common,
59372 +               .detach = detach_common,
59373 +               .estimate = {
59374 +                       .add_entry = estimate_add_entry_common,
59375 +                       .rem_entry = estimate_rem_entry_common,
59376 +                       .unlink = dir_estimate_unlink_common
59377 +               }
59378 +       },
59379 +       /* hashed directory for which seekdir/telldir are guaranteed to work. Brain-damage.
59380 +        * Apart from the header (.h), differs from the entry above only in ->build_entry_key. */
59381 +       [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59382 +               .h = {
59383 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
59384 +                       .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59385 +                       .pops = &dir_plugin_ops,
59386 +                       .label = "dir32",
59387 +                       .desc = "directory hashed with 31 bit hash",
59388 +                       .linkage = {NULL, NULL}
59389 +               },
59390 +               .inode_ops = {
59391 +                       .create = create_common,
59392 +                       .lookup = lookup_common,
59393 +                       .link = link_common,
59394 +                       .unlink = unlink_common,
59395 +                       .symlink = symlink_common,
59396 +                       .mkdir = mkdir_common,
59397 +                       .rmdir = unlink_common,	/* same handler as .unlink */
59398 +                       .mknod = mknod_common,
59399 +                       .rename = rename_common,
59400 +                       .permission = permission_common,
59401 +                       .setattr = setattr_common,
59402 +                       .getattr = getattr_common
59403 +               },
59404 +               .file_ops = {
59405 +                       .llseek = llseek_common_dir,
59406 +                       .read = generic_read_dir,
59407 +                       .readdir = readdir_common,
59408 +                       .release = release_dir_common,
59409 +                       .fsync = sync_common
59410 +               },
59411 +               .as_ops = {
59412 +                       .writepage = bugop,
59413 +                       .sync_page = bugop,
59414 +                       .writepages = dummyop,
59415 +                       .set_page_dirty = bugop,
59416 +                       .readpages = bugop,
59417 +                       .prepare_write = bugop,
59418 +                       .commit_write = bugop,
59419 +                       .bmap = bugop,
59420 +                       .invalidatepage = bugop,
59421 +                       .releasepage = bugop
59422 +               },
59423 +               .get_parent = get_parent_common,
59424 +               .is_name_acceptable = is_name_acceptable_common,
59425 +               .build_entry_key = build_entry_key_seekable,
59426 +               .build_readdir_key = build_readdir_key_common,
59427 +               .add_entry = add_entry_common,
59428 +               .rem_entry = rem_entry_common,
59429 +               .init = init_common,
59430 +               .done = done_common,
59431 +               .attach = attach_common,
59432 +               .detach = detach_common,
59433 +               .estimate = {
59434 +                       .add_entry = estimate_add_entry_common,
59435 +                       .rem_entry = estimate_rem_entry_common,
59436 +                       .unlink = dir_estimate_unlink_common
59437 +               }
59438 +       }
59439 +};
59440 +
59441 +/* Make Linus happy.
59442 +   Local variables:
59443 +   c-indentation-style: "K&R"
59444 +   mode-name: "LC"
59445 +   c-basic-offset: 8
59446 +   tab-width: 8
59447 +   fill-column: 120
59448 +   End:
59449 +*/
59450 diff --git a/fs/reiser4/plugin/object.h b/fs/reiser4/plugin/object.h
59451 new file mode 100644
59452 index 0000000..12f593b
59453 --- /dev/null
59454 +++ b/fs/reiser4/plugin/object.h
59455 @@ -0,0 +1,121 @@
59456 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59457 + * reiser4/README */
59458 +
59459 +/* Declaration of object plugin functions. */
59460 +
59461 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59462 +#define __FS_REISER4_PLUGIN_OBJECT_H__
59463 +
59464 +#include "../type_safe_hash.h"
59465 +
59466 +/* common implementations of inode operations */
59467 +int create_common(struct inode *parent, struct dentry *dentry,
59468 +                 int mode, struct nameidata *);
59469 +struct dentry *lookup_common(struct inode *parent, struct dentry *dentry,
59470 +                            struct nameidata *nameidata);
59471 +int link_common(struct dentry *existing, struct inode *parent,
59472 +               struct dentry *newname);
59473 +int unlink_common(struct inode *parent, struct dentry *victim);
59474 +int mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59475 +int symlink_common(struct inode *parent, struct dentry *dentry,
59476 +                  const char *linkname);
59477 +int mknod_common(struct inode *parent, struct dentry *dentry,
59478 +                int mode, dev_t rdev);
59479 +int rename_common(struct inode *old_dir, struct dentry *old_name,
59480 +                 struct inode *new_dir, struct dentry *new_name);
59481 +void *follow_link_common(struct dentry *, struct nameidata *data);
59482 +int permission_common(struct inode *, int mask,        /* mode bits to check permissions for */
59483 +                     struct nameidata *nameidata);
59484 +int setattr_common(struct dentry *, struct iattr *);
59485 +int getattr_common(struct vfsmount *mnt, struct dentry *, struct kstat *);
59486 +
59487 +/* common implementations of file operations */
59488 +loff_t llseek_common_dir(struct file *, loff_t off, int origin);
59489 +int readdir_common(struct file *, void *dirent, filldir_t);
59490 +int release_dir_common(struct inode *, struct file *);
59491 +int sync_common(struct file *, struct dentry *, int datasync);
59492 +
59493 +/* common implementations of address space operations */
59494 +int prepare_write_common(struct file *, struct page *, unsigned from,
59495 +                        unsigned to);
59496 +
59497 +/* file plugin operations: common implementations */
59498 +int write_sd_by_inode_common(struct inode *);
59499 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59500 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59501 +                            reiser4_object_create_data *);
59502 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
59503 +                           struct inode *root);
59504 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59505 +                               struct inode *root);
59506 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59507 +                                  struct inode *root);
59508 +int create_object_common(struct inode *object, struct inode *parent,
59509 +                        reiser4_object_create_data *);
59510 +int delete_object_common(struct inode *);
59511 +int delete_directory_common(struct inode *);
59512 +int add_link_common(struct inode *object, struct inode *parent);
59513 +int rem_link_common(struct inode *object, struct inode *parent);
59514 +int rem_link_common_dir(struct inode *object, struct inode *parent);
59515 +int owns_item_common(const struct inode *, const coord_t *);
59516 +int owns_item_common_dir(const struct inode *, const coord_t *);
59517 +int can_add_link_common(const struct inode *);
59518 +int can_rem_link_common_dir(const struct inode *);
59519 +int detach_common_dir(struct inode *child, struct inode *parent);
59520 +int open_cryptcompress(struct inode * inode, struct file * file);
59521 +int bind_common_dir(struct inode *child, struct inode *parent);
59522 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59523 +reiser4_block_nr estimate_create_common(const struct inode *);
59524 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
59525 +reiser4_block_nr estimate_update_common(const struct inode *);
59526 +reiser4_block_nr estimate_unlink_common(const struct inode *,
59527 +                                       const struct inode *);
59528 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59529 +                                           const struct inode *);
59530 +char *wire_write_common(struct inode *, char *start);
59531 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
59532 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59533 +int wire_size_common(struct inode *);
59534 +void wire_done_common(reiser4_object_on_wire *);
59535 +
59536 +/* dir plugin operations: common implementations */
59537 +struct dentry *get_parent_common(struct inode *child);
59538 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
59539 +void build_entry_key_common(const struct inode *,
59540 +                           const struct qstr *qname, reiser4_key *);
59541 +int build_readdir_key_common(struct file *dir, reiser4_key *);
59542 +int add_entry_common(struct inode *object, struct dentry *where,
59543 +                    reiser4_object_create_data *, reiser4_dir_entry_desc *);
59544 +int rem_entry_common(struct inode *object, struct dentry *where,
59545 +                    reiser4_dir_entry_desc *);
59546 +int init_common(struct inode *object, struct inode *parent,
59547 +               reiser4_object_create_data *);
59548 +int done_common(struct inode *);
59549 +int attach_common(struct inode *child, struct inode *parent);
59550 +int detach_common(struct inode *object, struct inode *parent);
59551 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
59552 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59553 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59554 +                                           const struct inode *);
59555 +
59556 +/* these are essential parts of common implementations, they are to make
59557 +   customized implementations easier */
59558 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59559 +
59560 +/* merely useful functions */
59561 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59562 +             const reiser4_key *, int silent);
59563 +
59564 +
59565 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59566 +#endif
59567 +
59568 +/* Make Linus happy.
59569 +   Local variables:
59570 +   c-indentation-style: "K&R"
59571 +   mode-name: "LC"
59572 +   c-basic-offset: 8
59573 +   tab-width: 8
59574 +   fill-column: 120
59575 +   End:
59576 +*/
59577 diff --git a/fs/reiser4/plugin/plugin.c b/fs/reiser4/plugin/plugin.c
59578 new file mode 100644
59579 index 0000000..61eb800
59580 --- /dev/null
59581 +++ b/fs/reiser4/plugin/plugin.c
59582 @@ -0,0 +1,535 @@
59583 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59584 + * reiser4/README */
59585 +
59586 +/* Basic plugin infrastructure, lookup etc. */
59587 +
59588 +/* PLUGINS:
59589 +
59590 +   Plugins are internal Reiser4 "modules" or "objects" used to increase
59591 +   extensibility and allow external users to easily adapt reiser4 to
59592 +   their needs.
59593 +
59594 +   Plugins are classified into several disjoint "types". Plugins
59595 +   belonging to the particular plugin type are termed "instances" of
59596 +   this type. Currently the following types are present:
59597 +
59598 +    . object plugin
59599 +    . hash plugin
59600 +    . tail plugin
59601 +    . perm plugin
59602 +    . item plugin
59603 +    . node layout plugin
59604 +
59605 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59606 +
59607 +   Object (file) plugin determines how given file-system object serves
59608 +   standard VFS requests for read, write, seek, mmap etc. Instances of
59609 +   file plugins are: regular file, directory, symlink. Another example
59610 +   of file plugin is audit plugin, that optionally records accesses to
59611 +   underlying object and forwards requests to it.
59612 +
59613 +   Hash plugins compute hashes used by reiser4 to store and locate
59614 +   files within directories. Instances of hash plugin type are: r5,
59615 +   tea, rupasov.
59616 +
59617 +   Tail plugins (or, more precisely, tail policy plugins) determine
59618 +   when last part of the file should be stored in a formatted item.
59619 +
59620 +   Perm plugins control permissions granted for a process accessing a file.
59621 +
59622 +   Scope and lookup:
59623 +
59624 +   Each plugin has a label such that pair ( type_label, plugin_label ) is unique.  This
59625 +   pair is a globally persistent and user-visible plugin
59626 +   identifier. Internally kernel maintains plugins and plugin types in
59627 +   arrays using an index into those arrays as plugin and plugin type
59628 +   identifiers. File-system in turn, also maintains persistent
59629 +   "dictionary" which is mapping from plugin label to numerical
59630 +   identifier which is stored in file-system objects.  That is, we
59631 +   store the offset into the plugin array for that plugin type as the
59632 +   plugin id in the stat data of the filesystem object.
59633 +
59634 +   plugin_labels have meaning for the user interface that assigns
59635 +   plugins to files, and may someday have meaning for dynamic loading of
59636 +   plugins and for copying of plugins from one fs instance to
59637 +   another by utilities like cp and tar.
59638 +
59639 +   Internal kernel plugin type identifier (index in plugins[] array) is
59640 +   of type reiser4_plugin_type. Set of available plugin types is
59641 +   currently static, but dynamic loading doesn't seem to pose
59642 +   insurmountable problems.
59643 +
59644 +   Within each type plugins are addressed by the identifiers of type
59645 +   reiser4_plugin_id (indices in
59646 +   reiser4_plugin_type_data.builtin[]). Such identifiers are only
59647 +   required to be unique within one type, not globally.
59648 +
59649 +   Thus, plugin in memory is uniquely identified by the pair (type_id,
59650 +   id).
59651 +
59652 +   Usage:
59653 +
59654 +   There exists only one copy of each plugin instance, but this
59655 +   single instance can be associated with many entities (file-system
59656 +   objects, items, nodes, transactions, file-descriptors etc.). Entity
59657 +   to which plugin of given type is attached is termed (due to the lack of
59658 +   imagination) "subject" of this plugin type and, by abuse of
59659 +   terminology, subject of particular instance of this type to which
59660 +   it's attached currently. For example, inode is subject of object
59661 +   plugin type. Inode representing directory is subject of directory
59662 +   plugin, hash plugin type and some particular instance of hash plugin
59663 +   type. Inode, representing regular file is subject of "regular file"
59664 +   plugin, tail-policy plugin type etc.
59665 +
59666 +   With each subject the plugin possibly stores some state. For example,
59667 +   the state of a directory plugin (instance of object plugin type) is pointer
59668 +   to hash plugin (if directories always use hashing that is). State of
59669 +   audit plugin is file descriptor (struct file) of log file or some
59670 +   magic value to do logging through printk().
59671 +
59672 +   Interface:
59673 +
59674 +   In addition to a scalar identifier, each plugin type and plugin
59675 +   proper has a "label": short string and a "description"---longer
59676 +   descriptive string. Labels and descriptions of plugin types are
59677 +   hard-coded into plugins[] array, declared and defined in
59678 +   plugin.c. Label and description of plugin are stored in .label and
59679 +   .desc fields of reiser4_plugin_header respectively. It's possible to
59680 +   locate plugin by the pair of labels.
59681 +
59682 +   Features:
59683 +
59684 +    . user-level plugin manipulations:
59685 +      + reiser4("filename/..file_plugin<='audit'");
59686 +      + write(open("filename/..file_plugin"), "audit", 8);
59687 +
59688 +    . user level utilities lsplug and chplug to manipulate plugins.
59689 +      Utilities are not of primary priority. Possibly they will not be
59690 +      working in v4.0
59691 +
59692 +NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree?  I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59693 +
59694 +    . mount option "plug" to set-up plugins of root-directory.
59695 +      "plug=foo:bar" will set "bar" as default plugin of type "foo".
59696 +
59697 +   Limitations:
59698 +
59699 +    . each plugin type has to provide at least one builtin
59700 +      plugin. This is technical limitation and it can be lifted in the
59701 +      future.
59702 +
59703 +   TODO:
59704 +
59705 +   New plugin types/plugins:
59706 +   Things we should be able to separately choose to inherit:
59707 +
59708 +   security plugins
59709 +
59710 +   stat data
59711 +
59712 +   file bodies
59713 +
59714 +   file plugins
59715 +
59716 +   dir plugins
59717 +
59718 +    . perm:acl
59719 +
59720 +    . audi---audit plugin intercepting and possibly logging all
59721 +      accesses to object. Requires to put stub functions in file_operations
59722 +      instead of generic_file_*.
59723 +
59724 +NIKITA-FIXME-HANS: why make overflows a plugin?
59725 +    . over---handle hash overflows
59726 +
59727 +    . sqnt---handle different access patterns and instruments read-ahead
59728 +
59729 +NIKITA-FIXME-HANS: describe the line below in more detail.
59730 +
59731 +    . hier---handle inheritance of plugins along file-system hierarchy
59732 +
59733 +   Different kinds of inheritance: on creation vs. on access.
59734 +   Compatible/incompatible plugins.
59735 +   Inheritance for multi-linked files.
59736 +   Layered plugins.
59737 +   Notion of plugin context is abandoned.
59738 +
59739 +Each file is associated
59740 +   with one plugin and dependent plugins (hash, etc.) are stored as
59741 +   main plugin state. Now, if we have plugins used for regular files
59742 +   but not for directories, how such plugins would be inherited?
59743 +    . always store them with directories also
59744 +
59745 +NIKITA-FIXME-HANS: Do the line above.  It is not exclusive of doing the line below which is also useful.
59746 +
59747 +    . use inheritance hierarchy, independent of file-system namespace
59748 +
59749 +*/
59750 +
59751 +#include "../debug.h"
59752 +#include "../dformat.h"
59753 +#include "plugin_header.h"
59754 +#include "item/static_stat.h"
59755 +#include "node/node.h"
59756 +#include "security/perm.h"
59757 +#include "space/space_allocator.h"
59758 +#include "disk_format/disk_format.h"
59759 +#include "plugin.h"
59760 +#include "../reiser4.h"
59761 +#include "../jnode.h"
59762 +#include "../inode.h"
59763 +
59764 +#include <linux/fs.h>          /* for struct super_block  */
59765 +
59766 +/* public interface */
59767 +
59768 +/* initialise plugin sub-system. Just call this once on reiser4 startup. */
59769 +int init_plugins(void);
59770 +int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59771 +int locate_plugin(struct inode *inode, plugin_locator * loc);
59772 +
59773 +
59774 +/**
59775 + * init_plugins - initialize plugins
59776 + *
59777 + * Initializes plugin sub-system. It is part of reiser4 module initialization.
59778 + * For each labelled slot of each type: h.id is set to the slot index, init method
59779 + * (if any) is called and plugin is put into type's list. Returns 0 or init's error.
59780 + */
59781 +int init_plugins(void)
59782 +{
59783 +       reiser4_plugin_type type_id;
59784 +
59785 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59786 +               reiser4_plugin_type_data *ptype;
59787 +               int i;
59788 +
59789 +               ptype = &plugins[type_id];
59790 +               assert("nikita-3508", ptype->label != NULL);
59791 +               assert("nikita-3509", ptype->type_id == type_id);
59792 +
59793 +               INIT_LIST_HEAD(&ptype->plugins_list);
59794 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59795 +               for (i = 0; i < ptype->builtin_num; ++i) {
59796 +                       reiser4_plugin *plugin;
59797 +
59798 +                       plugin = plugin_at(ptype, i);
59799 +
59800 +                       if (plugin->h.label == NULL)
59801 +                               /* uninitialized slot (no label) encountered: skip it */
59802 +                               continue;
59803 +                       assert("nikita-3445", plugin->h.type_id == type_id);
59804 +                       plugin->h.id = i;
59805 +                       if (plugin->h.pops != NULL &&
59806 +                           plugin->h.pops->init != NULL) {
59807 +                               int result;
59808 +
59809 +                               result = plugin->h.pops->init(plugin);
59810 +                               if (result != 0)
59811 +                                       return result;
59812 +                       }
59813 +                       INIT_LIST_HEAD(&plugin->h.linkage);
59814 +                       list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
59815 +               }
59816 +       }
59817 +       return 0;
59818 +}
59819 +
59820 +/* true if @type_id is below REISER4_PLUGIN_TYPES, the size of plugins[] */
59821 +int is_type_id_valid(reiser4_plugin_type type_id /* plugin type id */ )
59822 +{
59823 +       /* "type_id" is unsigned, so no comparison with 0 is
59824 +          necessary */
59825 +       return (type_id < REISER4_PLUGIN_TYPES);
59826 +}
59827 +
59828 +/* true if @id is below builtin_num of @type_id. NOTE(review): no lower bound is checked - confirm @id cannot be negative */
59829 +int is_plugin_id_valid(reiser4_plugin_type type_id /* plugin type id */ ,
59830 +                      reiser4_plugin_id id /* plugin id */ )
59831 +{
59832 +       assert("nikita-1653", is_type_id_valid(type_id));
59833 +       return id < plugins[type_id].builtin_num;
59834 +}
59835 +
59836 +/* return plugin by its @type_id and @id, or NULL (after issuing a warning)
59837 +   when either of them is out of bounds.
59838 +   Both arguments are checked for validity: this is supposed to be called
59839 +   from user-level.
59840 +
59841 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
59842 +user space, and passed to the filesystem by use of method files? Your
59843 +comment really confused me on the first reading....
59844 +
59845 +*/
59846 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id        /* plugin
59847 +                                                                * type id,
59848 +                                                                * unchecked */ ,
59849 +                                   reiser4_plugin_id id        /* plugin id,
59850 +                                                                * unchecked */ )
59851 +{
59852 +       if (is_type_id_valid(type_id)) {
59853 +               if (is_plugin_id_valid(type_id, id))
59854 +                       return plugin_at(&plugins[type_id], id);
59855 +               else
59856 +                       /* id out of bounds */
59857 +                       warning("nikita-2913",
59858 +                               "Invalid plugin id: [%i:%i]", type_id, id);
59859 +       } else
59860 +               /* type_id out of bounds */
59861 +               warning("nikita-2914", "Invalid type_id: %i", type_id);
59862 +       return NULL;
59863 +}
59864 +
59865 +/**
59866 + * save_plugin_id - store plugin id in disk format
59867 + * @plugin: plugin to convert
59868 + * @area: where to store result
59869 + *
59870 + * Puts id of @plugin as a 16-bit little endian value to (possibly unaligned) address @area. Always returns 0.
59871 + */
59872 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
59873 +                  d16 *area /* where to store result */ )
59874 +{
59875 +       assert("nikita-1261", plugin != NULL);
59876 +       assert("nikita-1262", area != NULL);
59877 +
59878 +       put_unaligned(cpu_to_le16(plugin->h.id), area);
59879 +       return 0;
59880 +}
59881 +
59882 +/* return head of the list of all plugins of given type; the list is filled by init_plugins() */
59883 +struct list_head *get_plugin_list(reiser4_plugin_type type_id  /* plugin type
59884 +                                                                * id */ )
59885 +{
59886 +       assert("nikita-1056", is_type_id_valid(type_id));
59887 +       return &plugins[type_id].plugins_list;
59888 +}
59889 +
59890 +int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb)
59891 +{
59892 +       reiser4_plugin *plug;
59893 +       reiser4_inode *parent;
59894 +       /* take member @memb from @ancestor: its hset entry if non-NULL, else its pset entry */
59895 +       parent = reiser4_inode_data(ancestor);
59896 +       plug = pset_get(parent->hset, memb) ? : pset_get(parent->pset, memb);
59897 +       return grab_plugin_from(self, memb, plug);
59898 +}
59899 +
59900 +static void update_plugin_mask(reiser4_inode * info, pset_member memb)
59901 +{
59902 +       struct dentry *rootdir;
59903 +       reiser4_inode *root;
59904 +
59905 +       rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
59906 +       if (rootdir != NULL) {  /* nothing is done while the super block has no root dentry */
59907 +               root = reiser4_inode_data(rootdir->d_inode);
59908 +               /*
59909 +                * if inode's plugin differs from that of the root directory, or
59910 +                * we are changing plugin of root directory, set bit @memb in plugin_mask
59911 +                */
59912 +               if (pset_get(info->pset, memb) != pset_get(root->pset, memb) ||
59913 +                   info == root)
59914 +                       info->plugin_mask |= (1 << memb);
59915 +       }
59916 +}
59917 +
59918 +int
59919 +grab_plugin_from(struct inode *self, pset_member memb, reiser4_plugin * plug)
59920 +{
59921 +       reiser4_inode *info;
59922 +       int result = 0;
59923 +       /* install @plug only if member @memb of @self's pset is still NULL; otherwise return 0 unchanged */
59924 +       info = reiser4_inode_data(self);
59925 +       if (pset_get(info->pset, memb) == NULL) {
59926 +               result = pset_set(&info->pset, memb, plug);
59927 +               if (result == 0)
59928 +                       update_plugin_mask(info, memb);
59929 +       }
59930 +       return result;
59931 +}
59932 +
59933 +#if 0
59934 +int force_plugin(struct inode *self, pset_member memb, reiser4_plugin * plug)
59935 +{
59936 +       reiser4_inode *info;
59937 +       int result = 0;
59938 +       /* unlike grab_plugin_from() this does not check whether member @memb is already set */
59939 +       info = reiser4_inode_data(self);
59940 +       if (plug->h.pops != NULL && plug->h.pops->change != NULL)
59941 +               result = plug->h.pops->change(self, plug);
59942 +       else
59943 +               result = pset_set(&info->pset, memb, plug);
59944 +       if (result == 0)
59945 +               update_plugin_mask(info, memb);
59946 +       return result;
59947 +}
59948 +#endif  /*  0  */
59949 +
59950 +reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
59951 +       /* C99 designated initializers, one entry per plugin type id */
59952 +       [REISER4_FILE_PLUGIN_TYPE] = {
59953 +               .type_id = REISER4_FILE_PLUGIN_TYPE,
59954 +               .label = "file",
59955 +               .desc = "Object plugins",
59956 +               .builtin_num = sizeof_array(file_plugins),
59957 +               .builtin = file_plugins,
59958 +               .plugins_list = {NULL, NULL},
59959 +               .size = sizeof(file_plugin)
59960 +       },
59961 +       [REISER4_DIR_PLUGIN_TYPE] = {
59962 +               .type_id = REISER4_DIR_PLUGIN_TYPE,
59963 +               .label = "dir",
59964 +               .desc = "Directory plugins",
59965 +               .builtin_num = sizeof_array(dir_plugins),
59966 +               .builtin = dir_plugins,
59967 +               .plugins_list = {NULL, NULL},
59968 +               .size = sizeof(dir_plugin)
59969 +       },
59970 +       [REISER4_HASH_PLUGIN_TYPE] = {
59971 +               .type_id = REISER4_HASH_PLUGIN_TYPE,
59972 +               .label = "hash",
59973 +               .desc = "Directory hashes",
59974 +               .builtin_num = sizeof_array(hash_plugins),
59975 +               .builtin = hash_plugins,
59976 +               .plugins_list = {NULL, NULL},
59977 +               .size = sizeof(hash_plugin)
59978 +       },
59979 +       [REISER4_FIBRATION_PLUGIN_TYPE] = {
59980 +               .type_id =
59981 +               REISER4_FIBRATION_PLUGIN_TYPE,
59982 +               .label = "fibration",
59983 +               .desc = "Directory fibrations",
59984 +               .builtin_num = sizeof_array(fibration_plugins),
59985 +               .builtin = fibration_plugins,
59986 +               .plugins_list = {NULL, NULL},
59987 +               .size = sizeof(fibration_plugin)
59988 +       },
59989 +       [REISER4_CIPHER_PLUGIN_TYPE] = {
59990 +               .type_id = REISER4_CIPHER_PLUGIN_TYPE,
59991 +               .label = "cipher",
59992 +               .desc = "Cipher plugins",
59993 +               .builtin_num = sizeof_array(cipher_plugins),
59994 +               .builtin = cipher_plugins,
59995 +               .plugins_list = {NULL, NULL},
59996 +               .size = sizeof(cipher_plugin)
59997 +       },
59998 +       [REISER4_DIGEST_PLUGIN_TYPE] = {
59999 +               .type_id = REISER4_DIGEST_PLUGIN_TYPE,
60000 +               .label = "digest",
60001 +               .desc = "Digest plugins",
60002 +               .builtin_num = sizeof_array(digest_plugins),
60003 +               .builtin = digest_plugins,
60004 +               .plugins_list = {NULL, NULL},
60005 +               .size = sizeof(digest_plugin)
60006 +       },
60007 +       [REISER4_COMPRESSION_PLUGIN_TYPE] = {
60008 +               .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
60009 +               .label = "compression",
60010 +               .desc = "Compression plugins",
60011 +               .builtin_num = sizeof_array(compression_plugins),
60012 +               .builtin = compression_plugins,
60013 +               .plugins_list = {NULL, NULL},
60014 +               .size = sizeof(compression_plugin)
60015 +       },
60016 +       [REISER4_FORMATTING_PLUGIN_TYPE] = {
60017 +               .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60018 +               .label = "formatting",
60019 +               .desc = "Tail inlining policies",
60020 +               .builtin_num = sizeof_array(formatting_plugins),
60021 +               .builtin = formatting_plugins,
60022 +               .plugins_list = {NULL, NULL},
60023 +               .size = sizeof(formatting_plugin)
60024 +       },
60025 +       [REISER4_PERM_PLUGIN_TYPE] = {
60026 +               .type_id = REISER4_PERM_PLUGIN_TYPE,
60027 +               .label = "perm",
60028 +               .desc = "Permission checks",
60029 +               .builtin_num = sizeof_array(perm_plugins),
60030 +               .builtin = perm_plugins,
60031 +               .plugins_list = {NULL, NULL},
60032 +               .size = sizeof(perm_plugin)
60033 +       },
60034 +       [REISER4_ITEM_PLUGIN_TYPE] = {
60035 +               .type_id = REISER4_ITEM_PLUGIN_TYPE,
60036 +               .label = "item",
60037 +               .desc = "Item handlers",
60038 +               .builtin_num = sizeof_array(item_plugins),
60039 +               .builtin = item_plugins,
60040 +               .plugins_list = {NULL, NULL},
60041 +               .size = sizeof(item_plugin)
60042 +       },
60043 +       [REISER4_NODE_PLUGIN_TYPE] = {
60044 +               .type_id = REISER4_NODE_PLUGIN_TYPE,
60045 +               .label = "node",
60046 +               .desc = "node layout handlers",
60047 +               .builtin_num = sizeof_array(node_plugins),
60048 +               .builtin = node_plugins,
60049 +               .plugins_list = {NULL, NULL},
60050 +               .size = sizeof(node_plugin)
60051 +       },
60052 +       [REISER4_SD_EXT_PLUGIN_TYPE] = {
60053 +               .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
60054 +               .label = "sd_ext",
60055 +               .desc = "Parts of stat-data",
60056 +               .builtin_num = sizeof_array(sd_ext_plugins),
60057 +               .builtin = sd_ext_plugins,
60058 +               .plugins_list = {NULL, NULL},
60059 +               .size = sizeof(sd_ext_plugin)
60060 +       },
60061 +       [REISER4_FORMAT_PLUGIN_TYPE] = {
60062 +               .type_id = REISER4_FORMAT_PLUGIN_TYPE,
60063 +               .label = "disk_layout",
60064 +               .desc = "defines filesystem on disk layout",
60065 +               .builtin_num = sizeof_array(format_plugins),
60066 +               .builtin = format_plugins,
60067 +               .plugins_list = {NULL, NULL},
60068 +               .size = sizeof(disk_format_plugin)
60069 +       },
60070 +       [REISER4_JNODE_PLUGIN_TYPE] = {
60071 +               .type_id = REISER4_JNODE_PLUGIN_TYPE,
60072 +               .label = "jnode",
60073 +               .desc = "defines kind of jnode",
60074 +               .builtin_num = sizeof_array(jnode_plugins),
60075 +               .builtin = jnode_plugins,
60076 +               .plugins_list = {NULL, NULL},
60077 +               .size = sizeof(jnode_plugin)
60078 +       },
60079 +       [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
60080 +               .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60081 +               .label = "compression_mode",
60082 +               .desc = "Defines compression mode",
60083 +               .builtin_num = sizeof_array(compression_mode_plugins),
60084 +               .builtin = compression_mode_plugins,
60085 +               .plugins_list = {NULL, NULL},
60086 +               .size = sizeof(compression_mode_plugin)
60087 +       },
60088 +       [REISER4_CLUSTER_PLUGIN_TYPE] = {
60089 +               .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
60090 +               .label = "cluster",
60091 +               .desc = "Defines cluster size",
60092 +               .builtin_num = sizeof_array(cluster_plugins),
60093 +               .builtin = cluster_plugins,
60094 +               .plugins_list = {NULL, NULL},
60095 +               .size = sizeof(cluster_plugin)
60096 +       },
60097 +       [REISER4_REGULAR_PLUGIN_TYPE] = {
60098 +               .type_id = REISER4_REGULAR_PLUGIN_TYPE,
60099 +               .label = "regular",
60100 +               .desc = "Defines kind of regular file",
60101 +               .builtin_num =
60102 +               sizeof_array(regular_plugins),
60103 +               .builtin = regular_plugins,
60104 +               .plugins_list = {NULL, NULL},
60105 +               .size = sizeof(regular_plugin)
60106 +       }
60107 +};
60108 +
60109 +/*
60110 + * Local variables:
60111 + * c-indentation-style: "K&R"
60112 + * mode-name: "LC"
60113 + * c-basic-offset: 8
60114 + * tab-width: 8
60115 + * fill-column: 120
60116 + * End:
60117 + */
60118 diff --git a/fs/reiser4/plugin/plugin.h b/fs/reiser4/plugin/plugin.h
60119 new file mode 100644
60120 index 0000000..1b33280
60121 --- /dev/null
60122 +++ b/fs/reiser4/plugin/plugin.h
60123 @@ -0,0 +1,935 @@
60124 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60125 +
60126 +/* Basic plugin data-types.
60127 +   see fs/reiser4/plugin/plugin.c for details */
60128 +
60129 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
60130 +#define __FS_REISER4_PLUGIN_TYPES_H__
60131 +
60132 +#include "../forward.h"
60133 +#include "../debug.h"
60134 +#include "../dformat.h"
60135 +#include "../key.h"
60136 +#include "compress/compress.h"
60137 +#include "crypto/cipher.h"
60138 +#include "plugin_header.h"
60139 +#include "item/static_stat.h"
60140 +#include "item/internal.h"
60141 +#include "item/sde.h"
60142 +#include "item/cde.h"
60143 +#include "item/item.h"
60144 +#include "node/node.h"
60145 +#include "node/node40.h"
60146 +#include "security/perm.h"
60147 +#include "fibration.h"
60148 +
60149 +#include "space/bitmap.h"
60150 +#include "space/space_allocator.h"
60151 +
60152 +#include "disk_format/disk_format40.h"
60153 +#include "disk_format/disk_format.h"
60154 +
60155 +#include <linux/fs.h>          /* for struct super_block, address_space  */
60156 +#include <linux/mm.h>          /* for struct page */
60157 +#include <linux/buffer_head.h> /* for struct buffer_head */
60158 +#include <linux/dcache.h>      /* for struct dentry */
60159 +#include <linux/types.h>
60160 +#include <linux/crypto.h>
60161 +
60162 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
60163 +
60164 +/*
60165 + * File plugin.  Defines the set of methods that file plugins implement, some
60166 + * of which are optional.
60167 + *
60168 + * A file plugin offers to the caller an interface for IO (writing to and/or
60169 + * reading from) to what the caller sees as one sequence of bytes.  An IO to it
60170 + * may affect more than one physical sequence of bytes, or no physical sequence
60171 + * of bytes, it may affect sequences of bytes offered by other file plugins to
60172 + * the semantic layer, and the file plugin may invoke other plugins and
60173 + * delegate work to them, but its interface is structured for offering the
60174 + * caller the ability to read and/or write what the caller sees as being a
60175 + * single sequence of bytes.
60176 + *
60177 + * The file plugin must present a sequence of bytes to the caller, but it does
60178 + * not necessarily have to store a sequence of bytes, it does not necessarily
60179 + * have to support efficient tree traversal to any offset in the sequence of
60180 + * bytes (tail and extent items, whose keys contain offsets, do however provide
60181 + * efficient non-sequential lookup of any offset in the sequence of bytes).
60182 + *
60183 + * Directory plugins provide methods for selecting file plugins by resolving a
60184 + * name for them.
60185 + *
60186 + * The functionality other filesystems call an attribute, and rigidly tie
60187 + * together, we decompose into orthogonal selectable features of files.  Using
60188 + * the terminology we will define next, an attribute is a perhaps constrained,
60189 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
60190 + * which might be grandparent-major-packed, and whose parent has a deletion
60191 + * method that deletes it.
60192 + *
60193 + * File plugins can implement constraints.
60194 + *
60195 + * Files can be of variable length (e.g. regular unix files), or of static
60196 + * length (e.g. static sized attributes).
60197 + *
60198 + * An object may have many sequences of bytes, and many file plugins, but, it
60199 + * has exactly one objectid.  It is usually desirable that an object has a
60200 + * deletion method which deletes every item with that objectid.  Items cannot
60201 + * in general be found by just their objectids.  This means that an object must
60202 + * have either a method built into its deletion plugin method for knowing what
60203 + * items need to be deleted, or links stored with the object that provide the
60204 + * plugin with a method for finding those items.  Deleting a file within an
60205 + * object may or may not have the effect of deleting the entire object,
60206 + * depending on the file plugin's deletion method.
60207 + *
60208 + * LINK TAXONOMY:
60209 + *
60210 + * Many objects have a reference count, and when the reference count reaches 0
60211 + * the object's deletion method is invoked.  Some links embody a reference
60212 + * count increase ("countlinks"), and others do not ("nocountlinks").
60213 + *
60214 + * Some links are bi-directional links ("bilinks"), and some are
60215 + * uni-directional("unilinks").
60216 + *
60217 + * Some links are between parts of the same object ("intralinks"), and some are
60218 + * between different objects ("interlinks").
60219 + *
60220 + * PACKING TAXONOMY:
60221 + *
60222 + * Some items of an object are stored with a major packing locality based on
60223 + * their object's objectid (e.g. unix directory items in plan A), and these are
60224 + * called "self-major-packed".
60225 + *
60226 + * Some items of an object are stored with a major packing locality based on
60227 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
60228 + * and these are called "parent-major-packed".
60229 + *
60230 + * Some items of an object are stored with a major packing locality based on
60231 + * their semantic grandparent, and these are called "grandparent-major-packed".
60232 + * Now carefully notice that we run into trouble with key length if we have to
60233 + * store an 8 byte major+minor grandparent based packing locality, an 8 byte
60234 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
60235 + * a 24 byte key.  One of these fields must be sacrificed if an item is to be
60236 + * grandparent-major-packed, and which to sacrifice is left to the item author
60237 + * choosing to make the item grandparent-major-packed.  You cannot make tail
60238 + * items and extent items grandparent-major-packed, though you could make them
60239 + * self-major-packed (usually they are parent-major-packed).
60240 + *
60241 + * In the case of ACLs (which are composed of fixed length ACEs which consist
60242 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
60243 + * to not have an offset field in the ACE item key, and to allow duplicate keys
60244 + * for ACEs.  Thus, the set of ACEs for a given file is found by looking for a
60245 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
60246 + * a directory together), the minor packing locality of ACE, the objectid of
60247 + * the file, and 0.
60248 + *
60249 + * IO involves moving data from one location to another, which means that two
60250 + * locations must be specified, source and destination.
60251 + *
60252 + * This source and destination can be in the filesystem, or they can be a
60253 + * pointer in the user process address space plus a byte count.
60254 + *
60255 + * If both source and destination are in the filesystem, then at least one of
60256 + * them must be representable as a pure stream of bytes (which we call a flow,
60257 + * and define as a struct containing a key, a data pointer, and a length).
60258 + * This may mean converting one of them into a flow.  We provide a generic
60259 + * cast_into_flow() method, which will work for any plugin supporting
60260 + * read_flow(), though it is inefficiently implemented in that it temporarily
60261 + * stores the flow in a buffer (Question: what to do with huge flows that
60262 + * cannot fit into memory?  Answer: we must not convert them all at once. )
60263 + *
60264 + * Performing a write requires resolving the write request into a flow defining
60265 + * the source, and a method that performs the write, and a key that defines
60266 + * where in the tree the write is to go.
60267 + *
60268 + * Performing a read requires resolving the read request into a flow defining
60269 + * the target, and a method that performs the read, and a key that defines
60270 + * where in the tree the read is to come from.
60271 + *
60272 + * There will exist file plugins which have no pluginid stored on the disk for
60273 + * them, and which are only invoked by other plugins.
60274 + */
60275 +
60276 +/* ids of builtin file plugins */
60277 +typedef enum {
60278 +       /* regular file */
60279 +       UNIX_FILE_PLUGIN_ID,
60280 +       /* directory */
60281 +       DIRECTORY_FILE_PLUGIN_ID,
60282 +       /* symlink */
60283 +       SYMLINK_FILE_PLUGIN_ID,
60284 +       /* for objects completely handled by the VFS: fifos, devices,
60285 +          sockets  */
60286 +       SPECIAL_FILE_PLUGIN_ID,
60287 +       /* regular cryptcompress file */
60288 +       CRC_FILE_PLUGIN_ID,
60289 +       /* number of file plugins. Used as size of arrays to hold
60290 +          file plugins. */
60291 +       LAST_FILE_PLUGIN_ID
60292 +} reiser4_file_id;
60293 +
60294 +typedef struct file_plugin {
60295 +
60296 +       /* generic fields */
60297 +       plugin_header h;
60298 +
60299 +       struct inode_operations inode_ops;
60300 +       struct file_operations file_ops;
60301 +       struct address_space_operations as_ops;
60302 +
60303 +       /* save inode cached stat-data onto disk. It was called
60304 +          reiserfs_update_sd() in 3.x */
60305 +       int (*write_sd_by_inode) (struct inode *);
60306 +
60307 +       /*
60308 +        * private methods: These are optional.  If used they will allow you to
60309 +        * minimize the amount of code needed to implement a deviation from
60310 +        * some other method that also uses them.
60311 +        */
60312 +
60313 +       /*
60314 +        * Construct flow into @flow according to user-supplied data.
60315 +        *
60316 +        * This is used by read/write methods to construct a flow to
60317 +        * write/read. ->flow_by_inode() is plugin method, rather than single
60318 +        * global implementation, because key in a flow used by plugin may
60319 +        * depend on data in a @buf.
60320 +        *
60321 +        * NIKITA-FIXME-HANS: please create statistics on what functions are
60322 +        * dereferenced how often for the mongo benchmark.  You can supervise
60323 +        * Elena doing this for you if that helps.  Email me the list of the
60324 +        * top 10, with their counts, and an estimate of the total number of
60325 +        * CPU cycles spent dereferencing as a percentage of CPU cycles spent
60326 +        * processing (non-idle processing).  If the total percent is, say,
60327 +        * less than 1%, it will make our coding discussions much easier, and
60328 +        * keep me from questioning whether functions like the below are too
60329 +        * frequently called to be dereferenced.  If the total percent is more
60330 +        * than 1%, perhaps private methods should be listed in a "required"
60331 +        * comment at the top of each plugin (with stern language about how if
60332 +        * the comment is missing it will not be accepted by the maintainer),
60333 +        * and implemented using macros not dereferenced functions.  How about
60334 +        * replacing this whole private methods part of the struct with a
60335 +        * thorough documentation of what the standard helper functions are for
60336 +        * use in constructing plugins?  I think users have been asking for
60337 +        * that, though not in so many words.
60338 +        */
60339 +       int (*flow_by_inode) (struct inode *, const char __user *buf,
60340 +                             int user, loff_t size,
60341 +                             loff_t off, rw_op op, flow_t *);
60342 +
60343 +       /*
60344 +        * Return the key used to retrieve an offset of a file. It is used by
60345 +        * default implementation of ->flow_by_inode() method
60346 +        * (common_build_flow()) and, among other things, to get to the extent
60347 +        * from jnode of unformatted node.
60348 +        */
60349 +       int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60350 +
60351 +       /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60352 +       /*
60353 +        * set the plugin for a file.  Called during file creation in creat()
60354 +        * but not reiser4() unless an inode already exists for the file.
60355 +        */
60356 +       int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60357 +                                 reiser4_object_create_data *);
60358 +
60359 +       /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60360 +        * are you setting up the object itself also or just adjusting the
60361 +        * parent?.... */
60362 +       /* set up plugins for new @object created in @parent. @root is root
60363 +          directory. */
60364 +       int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60365 +                                struct inode *root);
60366 +       /*
60367 +        * this does whatever is necessary to do when object is created. For
60368 +        * instance, for unix files stat data is inserted. It is supposed to be
60369 +        * called by create of struct inode_operations.
60370 +        */
60371 +       int (*create_object) (struct inode *object, struct inode *parent,
60372 +                             reiser4_object_create_data *);
60373 +
60374 +       /* this does whatever is necessary to do when object is opened */
60375 +       int (*open_object) (struct inode * inode, struct file * file);
60376 +       /*
60377 +        * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60378 +        * success. Deletion of an object usually includes removal of items
60379 +        * building file body (for directories this is removal of "." and "..")
60380 +        * and removal of stat-data item.
60381 +        */
60382 +       int (*delete_object) (struct inode *);
60383 +
60384 +       /* add link from @parent to @object */
60385 +       int (*add_link) (struct inode *object, struct inode *parent);
60386 +
60387 +       /* remove link from @parent to @object */
60388 +       int (*rem_link) (struct inode *object, struct inode *parent);
60389 +
60390 +       /*
60391 +        * return true if item addressed by @coord belongs to @inode.  This is
60392 +        * used by read/write to properly slice flow into items in presence of
60393 +        * multiple key assignment policies, because items of a file are not
60394 +        * necessarily contiguous in a key space, for example, in a plan-b.
60395 +        */
60396 +       int (*owns_item) (const struct inode *, const coord_t *);
60397 +
60398 +       /* checks whether yet another hard links to this object can be
60399 +          added  */
60400 +       int (*can_add_link) (const struct inode *);
60401 +
60402 +       /* checks whether hard links to this object can be removed */
60403 +       int (*can_rem_link) (const struct inode *);
60404 +
60405 +       /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
60406 +          detach of directory plugin to remove ".." */
60407 +       int (*detach) (struct inode * child, struct inode * parent);
60408 +
60409 +       /* called when @child was just looked up in the @parent. It is not
60410 +          empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
60411 +          directory plugin */
60412 +       int (*bind) (struct inode * child, struct inode * parent);
60413 +
60414 +       /* process safe-link during mount */
60415 +       int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60416 +                        __u64 value);
60417 +
60418 +       /* The couple of estimate methods for all file operations */
60419 +       struct {
60420 +               reiser4_block_nr(*create) (const struct inode *);
60421 +               reiser4_block_nr(*update) (const struct inode *);
60422 +               reiser4_block_nr(*unlink) (const struct inode *,
60423 +                                          const struct inode *);
60424 +       } estimate;
60425 +
60426 +       /*
60427 +        * reiser4 specific part of inode has a union of structures which are
60428 +        * specific to a plugin. This method is called when inode is read
60429 +        * (read_inode) and when file is created (common_create_child) so that
60430 +        * file plugin could initialize its inode data
60431 +        */
60432 +       void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60433 +                                int);
60434 +
60435 +       /*
60436 +        * This method performs progressive deletion of items and whole nodes
60437 +        * from right to left.
60438 +        *
60439 +        * @tap: the point deletion process begins from,
60440 +        * @from_key: the beginning of the deleted key range,
60441 +        * @to_key: the end of the deleted key range,
60442 +        * @smallest_removed: the smallest removed key,
60443 +        *
60444 +        * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
60445 +        * operation was interrupted for allowing atom commit .
60446 +        */
60447 +       int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60448 +                               const reiser4_key * to_key,
60449 +                               reiser4_key * smallest_removed, struct inode *,
60450 +                               int, int *);
60451 +
60452 +       /* called from ->destroy_inode() */
60453 +       void (*destroy_inode) (struct inode *);
60454 +
60455 +       /*
60456 +        * methods to serialize object identity. This is used, for example, by
60457 +        * reiser4_{en,de}code_fh().
60458 +        */
60459 +       struct {
60460 +               /* store object's identity at @area */
60461 +               char *(*write) (struct inode * inode, char *area);
60462 +               /* parse object from wire to the @obj */
60463 +               char *(*read) (char *area, reiser4_object_on_wire * obj);
60464 +               /* given object identity in @obj, find or create its dentry */
60465 +               struct dentry *(*get) (struct super_block * s,
60466 +                                      reiser4_object_on_wire * obj);
60467 +               /* how many bytes ->wire.write() consumes */
60468 +               int (*size) (struct inode * inode);
60469 +               /* finish with object identity */
60470 +               void (*done) (reiser4_object_on_wire * obj);
60471 +       } wire;
60472 +} file_plugin;
60473 +
60474 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60475 +
60476 +struct reiser4_object_on_wire {
60477 +       file_plugin *plugin;
60478 +       union {
60479 +               struct {
60480 +                       obj_key_id key_id;
60481 +               } std;
60482 +               void *generic;
60483 +       } u;
60484 +};
60485 +
60486 +/* builtin dir-plugins */
60487 +typedef enum {
60488 +       HASHED_DIR_PLUGIN_ID,
60489 +       SEEKABLE_HASHED_DIR_PLUGIN_ID,
60490 +       LAST_DIR_ID     /* count of the ids above; sizes dir_plugins[] */
60491 +} reiser4_dir_id;
60492 +
60493 +typedef struct dir_plugin {
60494 +       /* generic fields */
60495 +       plugin_header h;
60496 +
60497 +       struct inode_operations inode_ops;
60498 +       struct file_operations file_ops;
60499 +       struct address_space_operations as_ops;
60500 +
60501 +       /*
60502 +        * private methods: These are optional.  If used they will allow you to
60503 +        * minimize the amount of code needed to implement a deviation from
60504 +        * some other method that uses them.  You could logically argue that
60505 +        * they should be a separate type of plugin.
60506 +        */
60507 +
60508 +       struct dentry *(*get_parent) (struct inode * childdir);
60509 +
60510 +       /*
60511 +        * check whether "name" is acceptable name to be inserted into this
60512 +        * object. Optionally implemented by directory-like objects.  Can check
60513 +        * for maximal length, reserved symbols etc
60514 +        */
60515 +       int (*is_name_acceptable) (const struct inode * inode, const char *name,
60516 +                                  int len);
60517 +
60518 +       void (*build_entry_key) (const struct inode * dir       /* directory where
60519 +                                                                * entry is (or will
60520 +                                                                * be) in.*/ ,
60521 +                                const struct qstr * name       /* name of file
60522 +                                                                * referenced by this
60523 +                                                                * entry */ ,
60524 +                                reiser4_key * result   /* resulting key of
60525 +                                                        * directory entry */ );
60526 +       int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60527 +       int (*add_entry) (struct inode * object, struct dentry * where,
60528 +                         reiser4_object_create_data * data,
60529 +                         reiser4_dir_entry_desc * entry);
60530 +       int (*rem_entry) (struct inode * object, struct dentry * where,
60531 +                         reiser4_dir_entry_desc * entry);
60532 +
60533 +       /*
60534 +        * initialize directory structure for newly created object. For normal
60535 +        * unix directories, insert dot and dotdot.
60536 +        */
60537 +       int (*init) (struct inode * object, struct inode * parent,
60538 +                    reiser4_object_create_data * data);
60539 +
60540 +       /* destroy directory */
60541 +       int (*done) (struct inode * child);
60542 +
60543 +       /* called when @subdir was just looked up in the @dir */
60544 +       int (*attach) (struct inode * subdir, struct inode * dir);
60545 +       int (*detach) (struct inode * subdir, struct inode * dir);
60546 +
60547 +       struct {
60548 +               reiser4_block_nr(*add_entry) (const struct inode *);
60549 +               reiser4_block_nr(*rem_entry) (const struct inode *);
60550 +               reiser4_block_nr(*unlink) (const struct inode *,
60551 +                                          const struct inode *);
60552 +       } estimate;
60553 +} dir_plugin;
60554 +
60555 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60556 +
60557 +typedef struct formatting_plugin {
60558 +       /* generic fields */
60559 +       plugin_header h;
60560 +       /* returns non-zero iff file's tail has to be stored
60561 +          in a direct item. */
60562 +       int (*have_tail) (const struct inode * inode, loff_t size);
60563 +} formatting_plugin;
60564 +
60565 +typedef struct hash_plugin {
60566 +       /* generic fields */
60567 +       plugin_header h;
60568 +       /* computes hash of the given name */
60569 +        __u64(*hash) (const unsigned char *name, int len);
60570 +} hash_plugin;
60571 +
60572 +typedef struct cipher_plugin {
60573 +       /* generic fields */
60574 +       plugin_header h;
60575 +       struct crypto_tfm * (*alloc) (void);
60576 +       void (*free) (struct crypto_tfm * tfm);
60577 +       /* Offset translator. For each offset this returns (k * offset), where
60578 +          k (k >= 1) is an expansion factor of the cipher algorithm.
60579 +          For all symmetric algorithms k == 1. For asymmetric algorithms (which
60580 +          inflate data) offset translation guarantees that all units of a disk
60581 +          cluster will have keys smaller than those of the next cluster.
60582 +        */
60583 +        loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60584 +       /* Cipher algorithms can accept data only by chunks of cipher block
60585 +          size. This method is to align any flow up to cipher block size when
60586 +          we pass it to cipher algorithm. To align means to append padding of
60587 +          special format specific to the cipher algorithm */
60588 +       int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60589 +       /* low-level key manager (check, install, etc..) */
60590 +       int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60591 +                      unsigned int keylen);
60592 +       /* main text processing procedures */
60593 +       void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60594 +       void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60595 +} cipher_plugin;
60596 +
60597 +typedef struct digest_plugin {
60598 +       /* generic fields */
60599 +       plugin_header h;
60600 +       /* fingerprint size in bytes */
60601 +       int fipsize;
60602 +       struct crypto_tfm * (*alloc) (void);
60603 +       void (*free) (struct crypto_tfm * tfm);
60604 +} digest_plugin;
60605 +
60606 +typedef struct compression_plugin {
60607 +       /* generic fields */
60608 +       plugin_header h;
60609 +       int (*init) (void);
60610 +       /* the maximum number of bytes by which the size of the "compressed"
60611 +        * data can exceed the size of the uncompressed data. */
60612 +       int (*overrun) (unsigned src_len);
60613 +        coa_t(*alloc) (tfm_action act);
60614 +       void (*free) (coa_t coa, tfm_action act);
60615 +       /* minimal size of the flow we still try to compress */
60616 +       int (*min_size_deflate) (void);
60617 +        __u32(*checksum) (char *data, __u32 length);
60618 +       /* main transform procedures */
60619 +       void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60620 +                         __u8 * dst_first, unsigned *dst_len);
60621 +       void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60622 +                           __u8 * dst_first, unsigned *dst_len);
60623 +} compression_plugin;
60624 +
60625 +typedef struct compression_mode_plugin {
60626 +       /* generic fields */
60627 +       plugin_header h;
60628 +       /* this is called when estimating compressibility
60629 +          of a logical cluster by its content */
60630 +       int (*should_deflate) (struct inode * inode, cloff_t index);
60631 +       /* this is called when results of compression should be saved */
60632 +       int (*accept_hook) (struct inode * inode, cloff_t index);
60633 +       /* this is called when results of compression should be discarded */
60634 +       int (*discard_hook) (struct inode * inode, cloff_t index);
60635 +} compression_mode_plugin;
60636 +
60637 +typedef struct regular_plugin {
60638 +       /* generic fields */
60639 +       plugin_header h;
60640 +       /* file plugin id which implements regular file */
60641 +       reiser4_file_id id;
60642 +} regular_plugin;
60643 +
60644 +typedef struct cluster_plugin {
60645 +       /* generic fields */
60646 +       plugin_header h;
60647 +       int shift;
60648 +} cluster_plugin;
60649 +
60650 +typedef struct sd_ext_plugin {
60651 +       /* generic fields */
60652 +       plugin_header h;
60653 +       int (*present) (struct inode * inode, char **area, int *len);
60654 +       int (*absent) (struct inode * inode);
60655 +       int (*save_len) (struct inode * inode);
60656 +       int (*save) (struct inode * inode, char **area);
60657 +       /* alignment requirement for this stat-data part */
60658 +       int alignment;
60659 +} sd_ext_plugin;
60660 +
60661 +/* this plugin contains methods to allocate objectid for newly created files,
60662 +   to deallocate objectid when file gets removed, to report number of used and
60663 +   free objectids */
60664 +typedef struct oid_allocator_plugin {
60665 +       /* generic fields */
60666 +       plugin_header h;
60667 +       int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60668 +                                  __u64 oids);
60669 +       /* used to report statfs->f_files */
60670 +        __u64(*oids_used) (reiser4_oid_allocator * map);
60671 +       /* get next oid to use */
60672 +        __u64(*next_oid) (reiser4_oid_allocator * map);
60673 +       /* used to report statfs->f_ffree */
60674 +        __u64(*oids_free) (reiser4_oid_allocator * map);
60675 +       /* allocate new objectid */
60676 +       int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60677 +       /* release objectid */
60678 +       int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60679 +       /* how many pages to reserve in transaction for allocation of new
60680 +          objectid */
60681 +       int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60682 +       /* how many pages to reserve in transaction for freeing of an
60683 +          objectid */
60684 +       int (*oid_reserve_release) (reiser4_oid_allocator * map);
60685 +       void (*print_info) (const char *, reiser4_oid_allocator *);
60686 +} oid_allocator_plugin;
60687 +
60688 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
60689 +   are any) locations, etc */
60690 +typedef struct disk_format_plugin {
60691 +       /* generic fields */
60692 +       plugin_header h;
60693 +       /* replay journal, initialize super_info_data, etc */
60694 +       int (*init_format) (struct super_block *, void *data);
60695 +
60696 +       /* key of root directory stat data */
60697 +       const reiser4_key *(*root_dir_key) (const struct super_block *);
60698 +
60699 +       int (*release) (struct super_block *);
60700 +       jnode *(*log_super) (struct super_block *);
60701 +       int (*check_open) (const struct inode * object);
60702 +} disk_format_plugin;
60703 +
60704 +struct jnode_plugin {
60705 +       /* generic fields */
60706 +       plugin_header h;
60707 +       int (*init) (jnode * node);
60708 +       int (*parse) (jnode * node);
60709 +       struct address_space *(*mapping) (const jnode * node);
60710 +       unsigned long (*index) (const jnode * node);
60711 +       jnode *(*clone) (jnode * node);
60712 +};
60713 +
60714 +/* plugin instance.                                                         */
60715 +/*                                                                          */
60716 +/* This is "wrapper" union for all types of plugins. Most of the code uses  */
60717 +/* plugins of particular type (file_plugin, dir_plugin, etc.)  rather than  */
60718 +/* operates with pointers to reiser4_plugin. This union is only used in     */
60719 +/* some generic code in plugin/plugin.c that operates on all                */
60720 +/* plugins. Technically speaking purpose of this union is to add type       */
60721 +/* safety to said generic code: each plugin type (file_plugin, for          */
60722 +/* example), contains plugin_header as its first member. This first member  */
60723 +/* is located at the same place in memory as .h member of                   */
60724 +/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and      */
60725 +/* looks in the .h which is header of plugin type located in union. This    */
60726 +/* allows to avoid type-casts.                                              */
60727 +union reiser4_plugin {
60728 +       /* generic fields */
60729 +       plugin_header h;
60730 +       /* file plugin */
60731 +       file_plugin file;
60732 +       /* directory plugin */
60733 +       dir_plugin dir;
60734 +       /* hash plugin, used by directory plugin */
60735 +       hash_plugin hash;
60736 +       /* fibration plugin used by directory plugin */
60737 +       fibration_plugin fibration;
60738 +       /* cipher transform plugin, used by file plugin */
60739 +       cipher_plugin cipher;
60740 +       /* digest transform plugin, used by file plugin */
60741 +       digest_plugin digest;
60742 +       /* compression transform plugin, used by file plugin */
60743 +       compression_plugin compression;
60744 +       /* tail plugin, used by file plugin */
60745 +       formatting_plugin formatting;
60746 +       /* permission plugin */
60747 +       perm_plugin perm;
60748 +       /* node plugin */
60749 +       node_plugin node;
60750 +       /* item plugin */
60751 +       item_plugin item;
60752 +       /* stat-data extension plugin */
60753 +       sd_ext_plugin sd_ext;
60754 +       /* disk layout plugin */
60755 +       disk_format_plugin format;
60756 +       /* object id allocator plugin */
60757 +       oid_allocator_plugin oid_allocator;
60758 +       /* plugin for different jnode types */
60759 +       jnode_plugin jnode;
60760 +       /* compression mode plugin, used by object plugin */
60761 +       compression_mode_plugin compression_mode;
60762 +       /* cluster plugin, used by object plugin */
60763 +       cluster_plugin clust;
60764 +       /* regular plugin, used by directory plugin */
60765 +       regular_plugin regular;
60766 +       /* place-holder for new plugin types that can be registered
60767 +          dynamically, and used by other dynamically loaded plugins.  */
60768 +       void *generic;
60769 +};
60770 +
60771 +struct reiser4_plugin_ops {
60772 +       /* called when plugin is initialized */
60773 +       int (*init) (reiser4_plugin * plugin);
60774 +       /* called when plugin is unloaded */
60775 +       int (*done) (reiser4_plugin * plugin);
60776 +       /* load given plugin from disk */
60777 +       int (*load) (struct inode * inode,
60778 +                    reiser4_plugin * plugin, char **area, int *len);
60779 +       /* how much space is required to store this plugin's state
60780 +          in stat-data */
60781 +       int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
60782 +       /* save persistent plugin-data to disk */
60783 +       int (*save) (struct inode * inode, reiser4_plugin * plugin,
60784 +                    char **area);
60785 +       /* alignment requirement for on-disk state of this plugin
60786 +          in number of bytes */
60787 +       int alignment;
60788 +       /* install itself into given inode. This can return error
60789 +          (e.g., you cannot change hash of non-empty directory). */
60790 +       int (*change) (struct inode * inode, reiser4_plugin * plugin);
60791 +       /* NOTE(review): presumably sets up @plugin in @inode based on @parent;
60792 +          this comment used to duplicate ->change's -- confirm semantics. */
60793 +       int (*inherit) (struct inode * inode, struct inode * parent,
60794 +                       reiser4_plugin * plugin);
60795 +};
60796 +
60797 +/* functions implemented in fs/reiser4/plugin/plugin.c */
60798 +
60799 +/* stores plugin reference in reiser4-specific part of inode */
60800 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
60801 +extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
60802 +extern int init_plugins(void);
60803 +
60804 +/* builtin plugins */
60805 +
60806 +/* builtin hash-plugins */
60807 +
60808 +typedef enum {
60809 +       RUPASOV_HASH_ID,
60810 +       R5_HASH_ID,
60811 +       TEA_HASH_ID,
60812 +       FNV1_HASH_ID,
60813 +       DEGENERATE_HASH_ID,
60814 +       LAST_HASH_ID
60815 +} reiser4_hash_id;
60816 +
60817 +/* builtin cipher plugins */
60818 +
60819 +typedef enum {
60820 +       NONE_CIPHER_ID,
60821 +       AES_CIPHER_ID,
60822 +       LAST_CIPHER_ID
60823 +} reiser4_cipher_id;
60824 +
60825 +/* builtin digest plugins */
60826 +
60827 +typedef enum {
60828 +       SHA256_32_DIGEST_ID,
60829 +       LAST_DIGEST_ID
60830 +} reiser4_digest_id;
60831 +
60832 +/* builtin compression mode plugins */
60833 +typedef enum {
60834 +       NONE_COMPRESSION_MODE_ID,
60835 +       COL_8_COMPRESSION_MODE_ID,
60836 +       COL_16_COMPRESSION_MODE_ID,
60837 +       COL_32_COMPRESSION_MODE_ID,
60838 +       COZ_COMPRESSION_MODE_ID,
60839 +       FORCE_COMPRESSION_MODE_ID,
60840 +       TEST_COMPRESSION_MODE_ID,
60841 +       LAST_COMPRESSION_MODE_ID
60842 +} reiser4_compression_mode_id;
60843 +
60844 +/* builtin cluster plugins */
60845 +typedef enum {
60846 +       CLUSTER_64K_ID,
60847 +       CLUSTER_32K_ID,
60848 +       CLUSTER_16K_ID,
60849 +       CLUSTER_8K_ID,
60850 +       CLUSTER_4K_ID,
60851 +       LAST_CLUSTER_ID
60852 +} reiser4_cluster_id;
60853 +
60854 +/* builtin regular plugins */
60855 +typedef enum {
60856 +       UF_REGULAR_ID,
60857 +       CRC_REGULAR_ID,
60858 +       LAST_REGULAR_ID
60859 +} reiser4_regular_id;
60860 +
60861 +/* builtin tail-plugins */
60862 +
60863 +typedef enum {
60864 +       NEVER_TAILS_FORMATTING_ID,
60865 +       ALWAYS_TAILS_FORMATTING_ID,
60866 +       SMALL_FILE_FORMATTING_ID,
60867 +       LAST_TAIL_FORMATTING_ID
60868 +} reiser4_formatting_id;
60869 +
60870 +/* compression/clustering specific data */
60871 +typedef struct compression_data {
60872 +       reiser4_compression_id coa;     /* id of the compression algorithm */
60873 +} compression_data_t;
60874 +
60875 +typedef __u8 cluster_data_t;   /* cluster info */
60876 +
60877 +/* data type used to pack parameters that we pass to vfs object creation
60878 +   function create_object() */
60879 +struct reiser4_object_create_data {
60880 +       /* plugin to control created object */
60881 +       reiser4_file_id id;
60882 +       /* mode of regular file, directory or special file */
60883 +/* what happens if some other sort of perm plugin is in use? */
60884 +       int mode;
60885 +       /* rdev of special file */
60886 +       dev_t rdev;
60887 +       /* symlink target */
60888 +       const char *name;
60889 +       /* add here something for non-standard objects you invent, like
60890 +          query for interpolation file etc. */
60891 +
60892 +       crypto_stat_t * crypto;
60893 +       compression_data_t *compression;
60894 +       cluster_data_t *cluster;
60895 +
60896 +       struct inode *parent;
60897 +       struct dentry *dentry;
60898 +};
60899 +
60900 +/* description of directory entry being created/destroyed/sought for
60901 +
60902 +   It is passed down to the directory plugin and farther to the
60903 +   directory item plugin methods. Creation of new directory is done in
60904 +   several stages: first we search for an entry with the same name, then
60905 +   create new one. reiser4_dir_entry_desc is used to store some information
60906 +   collected at some stage of this process and required later: key of
60907 +   item that we want to insert/delete and pointer to an object that will
60908 +   be bound by the new directory entry. Probably some more fields will
60909 +   be added there.
60910 +
60911 +*/
60912 +struct reiser4_dir_entry_desc {
60913 +       /* key of directory entry */
60914 +       reiser4_key key;
60915 +       /* object bound by this entry. */
60916 +       struct inode *obj;
60917 +};
60918 +
60919 +#define MAX_PLUGIN_TYPE_LABEL_LEN  32
60920 +#define MAX_PLUGIN_PLUG_LABEL_LEN  32
60921 +
60922 +/* used for interface with user-land: table-driven parsing in
60923 +    reiser4(). */
60924 +typedef struct plugin_locator {
60925 +       reiser4_plugin_type type_id;
60926 +       reiser4_plugin_id id;
60927 +       char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
60928 +       char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
60929 +} plugin_locator;
60930 +
60931 +extern int locate_plugin(struct inode *inode, plugin_locator * loc);
60932 +
60933 +/* define TYPE_by_id(), TYPE_by_disk_id(), TYPE_by_unsafe_id(), TYPE_to_plugin(), TYPE_id() */
60934 +#define PLUGIN_BY_ID(TYPE,ID,FIELD)                                    \
60935 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id )             \
60936 +{                                                                      \
60937 +       reiser4_plugin *plugin = plugin_by_id ( ID, id );               \
60938 +       return plugin ? & plugin -> FIELD : NULL;                       \
60939 +}                                                                      \
60940 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
60941 +{                                                                      \
60942 +       reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id );    \
60943 +       return plugin ? & plugin -> FIELD : NULL;                       \
60944 +}                                                                      \
60945 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id )      \
60946 +{                                                                      \
60947 +       reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id );        \
60948 +       return plugin ? & plugin -> FIELD : NULL;                       \
60949 +}                                                                      \
60950 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin )       \
60951 +{                                                                      \
60952 +       return ( reiser4_plugin * ) plugin;                             \
60953 +}                                                                      \
60954 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin )            \
60955 +{                                                                      \
60956 +       return TYPE ## _to_plugin (plugin) -> h.id;                     \
60957 +}                                                                      \
60958 +typedef struct { int foo; } TYPE ## _plugin_dummy /* absorbs the ';' written after each use */
60959 +
60960 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
60961 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
60962 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
60963 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
60964 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
60965 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
60966 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
60967 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
60968 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
60969 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
60970 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
60971 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
60972 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
60973 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
60974 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60975 +            compression_mode);
60976 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
60977 +PLUGIN_BY_ID(regular_plugin, REISER4_REGULAR_PLUGIN_TYPE, regular);
60978 +
60979 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
60980 +
60981 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
60982 +
60983 +#define for_all_plugins(ptype, plugin)                                                 \
60984 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage);     \
60985 +     get_plugin_list(ptype) != &plugin->h.linkage;                                     \
60986 +     plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
60987 +
60988 +
60989 +/* enumeration of fields within plugin_set */
60990 +typedef enum {
60991 +       PSET_FILE,
60992 +       PSET_DIR,               /* PSET_FILE and PSET_DIR should be first elements:
60993 +                                * inode.c:read_inode() depends on this. */
60994 +       PSET_PERM,
60995 +       PSET_FORMATTING,
60996 +       PSET_HASH,
60997 +       PSET_FIBRATION,
60998 +       PSET_SD,
60999 +       PSET_DIR_ITEM,
61000 +       PSET_CIPHER,
61001 +       PSET_DIGEST,
61002 +       PSET_COMPRESSION,
61003 +       PSET_COMPRESSION_MODE,
61004 +       PSET_CLUSTER,
61005 +       PSET_REGULAR_ENTRY,
61006 +       PSET_LAST
61007 +} pset_member;
61008 +
61009 +int grab_plugin(struct inode *self, struct inode *ancestor, pset_member memb);
61010 +int grab_plugin_from(struct inode *self, pset_member memb,
61011 +                    reiser4_plugin * plug);
61012 +
61013 +/* defined in fs/reiser4/plugin/object.c */
61014 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
61015 +/* defined in fs/reiser4/plugin/object.c */
61016 +extern dir_plugin dir_plugins[LAST_DIR_ID];
61017 +/* defined in fs/reiser4/plugin/item/static_stat.c */
61018 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
61019 +/* defined in fs/reiser4/plugin/hash.c */
61020 +extern hash_plugin hash_plugins[LAST_HASH_ID];
61021 +/* defined in fs/reiser4/plugin/fibration.c */
61022 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
61023 +/* defined in fs/reiser4/plugin/crypt.c */
61024 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
61025 +/* defined in fs/reiser4/plugin/digest.c */
61026 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
61027 +/* defined in fs/reiser4/plugin/compress/compress.c */
61028 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
61029 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
61030 +extern compression_mode_plugin
61031 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
61032 +/* defined in fs/reiser4/plugin/cluster.c */
61033 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
61034 +/* defined in fs/reiser4/plugin/regular.c */
61035 +extern regular_plugin regular_plugins[LAST_REGULAR_ID];
61036 +/* defined in fs/reiser4/plugin/tail.c */
61037 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
61038 +/* defined in fs/reiser4/plugin/security/security.c */
61039 +extern perm_plugin perm_plugins[LAST_PERM_ID];
61040 +/* defined in fs/reiser4/plugin/item/item.c */
61041 +extern item_plugin item_plugins[LAST_ITEM_ID];
61042 +/* defined in fs/reiser4/plugin/node/node.c */
61043 +extern node_plugin node_plugins[LAST_NODE_ID];
61044 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
61045 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
61046 +
61047 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
61048 +#endif
61049 +
61050 +/* Make Linus happy.
61051 +   Local variables:
61052 +   c-indentation-style: "K&R"
61053 +   mode-name: "LC"
61054 +   c-basic-offset: 8
61055 +   tab-width: 8
61056 +   fill-column: 120
61057 +   End:
61058 +*/
61059 diff --git a/fs/reiser4/plugin/plugin_header.h b/fs/reiser4/plugin/plugin_header.h
61060 new file mode 100644
61061 index 0000000..35840ca
61062 --- /dev/null
61063 +++ b/fs/reiser4/plugin/plugin_header.h
61064 @@ -0,0 +1,136 @@
61065 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61066 +
61067 +/* plugin header. Data structures required by all plugin types. */
61068 +
61069 +#if !defined( __PLUGIN_HEADER_H__ )
61070 +#define __PLUGIN_HEADER_H__
61071 +
61072 +/* plugin data-types and constants */
61073 +
61074 +#include "../debug.h"
61075 +#include "../dformat.h"
61076 +
61077 +typedef enum {
61078 +       REISER4_FILE_PLUGIN_TYPE,
61079 +       REISER4_DIR_PLUGIN_TYPE,
61080 +       REISER4_ITEM_PLUGIN_TYPE,
61081 +       REISER4_NODE_PLUGIN_TYPE,
61082 +       REISER4_HASH_PLUGIN_TYPE,
61083 +       REISER4_FIBRATION_PLUGIN_TYPE,
61084 +       REISER4_FORMATTING_PLUGIN_TYPE,
61085 +       REISER4_PERM_PLUGIN_TYPE,
61086 +       REISER4_SD_EXT_PLUGIN_TYPE,
61087 +       REISER4_FORMAT_PLUGIN_TYPE,
61088 +       REISER4_JNODE_PLUGIN_TYPE,
61089 +       REISER4_CIPHER_PLUGIN_TYPE,
61090 +       REISER4_DIGEST_PLUGIN_TYPE,
61091 +       REISER4_COMPRESSION_PLUGIN_TYPE,
61092 +       REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61093 +       REISER4_CLUSTER_PLUGIN_TYPE,
61094 +       REISER4_REGULAR_PLUGIN_TYPE,
61095 +       REISER4_PLUGIN_TYPES
61096 +} reiser4_plugin_type;
61097 +
61098 +struct reiser4_plugin_ops;
61099 +/* generic plugin operations, supported by each
61100 +    plugin type. */
61101 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
61102 +
61103 +/* the common part of all plugin instances. */
61104 +typedef struct plugin_header {
61105 +       /* plugin type */
61106 +       reiser4_plugin_type type_id;
61107 +       /* id of this plugin */
61108 +       reiser4_plugin_id id;
61109 +       /* plugin operations */
61110 +       reiser4_plugin_ops *pops;
61111 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
61112 +       /* short label of this plugin */
61113 +       const char *label;
61114 +       /* descriptive string.. */
61115 +       const char *desc;
61116 +       /* list linkage */
61117 +       struct list_head linkage;
61118 +} plugin_header;
61119 +
61120 +/* PRIVATE INTERFACES */
61121 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
61122 +/* plugin type representation. */
61123 +typedef struct reiser4_plugin_type_data {
61124 +       /* internal plugin type identifier. Should coincide with
61125 +          index of this item in plugins[] array. */
61126 +       reiser4_plugin_type type_id;
61127 +       /* short symbolic label of this plugin type. Should be no longer
61128 +          than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
61129 +       const char *label;
61130 +       /* plugin type description longer than .label */
61131 +       const char *desc;
61132 +
61133 +/* NIKITA-FIXME-HANS: define built-in */
61134 +       /* number of built-in plugin instances of this type */
61135 +       int builtin_num;
61136 +       /* array of built-in plugins */
61137 +       void *builtin;
61138 +       struct list_head plugins_list;  /* presumably links plugin_header.linkage -- confirm */
61139 +       size_t size;    /* bytes per element of ->builtin; stride used by plugin_at() */
61140 +} reiser4_plugin_type_data;
61141 +
61142 +extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
61143 +
61144 +int is_type_id_valid(reiser4_plugin_type type_id);
61145 +int is_plugin_id_valid(reiser4_plugin_type type_id, reiser4_plugin_id id);
61146 +
61147 +static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
61148 +{
61149 +       char *builtin;
61150 +
61151 +       builtin = ptype->builtin;
61152 +       return (reiser4_plugin *) (builtin + i * ptype->size);
61153 +}
61154 +
61155 +/* return plugin by its @type_id and @id */
61156 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type_id
61157 +                                          /* plugin type id */ ,
61158 +                                          reiser4_plugin_id id /* plugin id */
61159 +                                          )
61160 +{
61161 +       assert("nikita-1651", is_type_id_valid(type_id));
61162 +       assert("nikita-1652", is_plugin_id_valid(type_id, id));
61163 +       return plugin_at(&plugins[type_id], id);
61164 +}
61165 +
61166 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
61167 +                                          reiser4_plugin_id id);
61168 +
61169 +/**
61170 + * plugin_by_disk_id - get reiser4_plugin
61171 + * @type_id: plugin type id
61172 + * @plugin_id: plugin id in disk format
61173 + *
61174 + * Returns reiser4_plugin by plugin type id and plugin_id.
61175 + */
61176 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
61177 +                                               reiser4_plugin_type type_id,
61178 +                                               __le16 *plugin_id)
61179 +{
61180 +       /*
61181 +        * what we should do properly is to maintain within each file-system a
61182 +        * dictionary that maps on-disk plugin ids to "universal" ids. This
61183 +        * dictionary will be resolved on mount time, so that this function
61184 +        * will perform just one additional array lookup.
61185 +        */
61186 +       return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
61187 +}
61188 +
61189 +/* __PLUGIN_HEADER_H__ */
61190 +#endif
61191 +
61192 +/*
61193 + * Local variables:
61194 + * c-indentation-style: "K&R"
61195 + * mode-name: "LC"
61196 + * c-basic-offset: 8
61197 + * tab-width: 8
61198 + * fill-column: 79
61199 + * End:
61200 + */
61201 diff --git a/fs/reiser4/plugin/plugin_set.c b/fs/reiser4/plugin/plugin_set.c
61202 new file mode 100644
61203 index 0000000..14a37fa
61204 --- /dev/null
61205 +++ b/fs/reiser4/plugin/plugin_set.c
61206 @@ -0,0 +1,378 @@
61207 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61208 + * reiser4/README */
61209 +/* NIKITA-FIXME-HANS: you didn't discuss this with me before coding it did you?  Remove plugin-sets from code by March 15th, 2004 */
61210 +/* plugin-sets */
61211 +
61212 +/*
61213 + * Each inode comes with a whole set of plugins: file plugin, directory
61214 + * plugin, hash plugin, tail policy plugin, security plugin, etc.
61215 + *
61216 + * Storing them (pointers to them, that is) in each inode is a waste of
61217 + * space, especially given that on an average file system the plugins of the
61218 + * vast majority of files will belong to a few sets (e.g., one set for regular
61219 + * files, another set for standard directories, etc.)
61220 + *
61221 + * Plugin set (pset) is an object containing pointers to all plugins required
61222 + * by inode. Inode only stores a pointer to pset. psets are "interned", that
61223 + * is, different inodes with the same set of plugins point to the same
61224 + * pset. This is achieved by storing psets in global hash table. Races are
61225 + * avoided by simple (and efficient so far) solution of never recycling psets,
61226 + * even when last inode pointing to it is destroyed.
61227 + *
61228 + */
61229 +
61230 +#include "../debug.h"
61231 +#include "../super.h"
61232 +#include "plugin_set.h"
61233 +
61234 +#include <linux/slab.h>
61235 +#include <linux/stddef.h>
61236 +
61237 +/* slab for plugin sets */
61238 +static kmem_cache_t *plugin_set_slab;
61239 +
61240 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
61241 +       [0 ... 7] = SPIN_LOCK_UNLOCKED
61242 +};
61243 +
61244 +/* hash table support */
61245 +
61246 +#define PS_TABLE_SIZE (32)
61247 +
61248 +static inline plugin_set *cast_to(const unsigned long *a)
61249 +{
61250 +       return container_of(a, plugin_set, hashval);
61251 +}
61252 +
61253 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
61254 +{
61255 +       plugin_set *set1;
61256 +       plugin_set *set2;
61257 +
61258 +       /* make sure fields are not missed in the code below */
61259 +       cassert(sizeof *set1 ==
61260 +               sizeof set1->hashval +
61261 +               sizeof set1->link +
61262 +               sizeof set1->file +
61263 +               sizeof set1->dir +
61264 +               sizeof set1->perm +
61265 +               sizeof set1->formatting +
61266 +               sizeof set1->hash +
61267 +               sizeof set1->fibration +
61268 +               sizeof set1->sd +
61269 +               sizeof set1->dir_item +
61270 +               sizeof set1->cipher +
61271 +               sizeof set1->digest +
61272 +               sizeof set1->compression +
61273 +               sizeof set1->compression_mode +
61274 +               sizeof set1->cluster + sizeof set1->regular_entry);
61275 +
61276 +       set1 = cast_to(a1);
61277 +       set2 = cast_to(a2);
61278 +       return
61279 +           set1->hashval == set2->hashval &&
61280 +           set1->file == set2->file &&
61281 +           set1->dir == set2->dir &&
61282 +           set1->perm == set2->perm &&
61283 +           set1->formatting == set2->formatting &&
61284 +           set1->hash == set2->hash &&
61285 +           set1->fibration == set2->fibration &&
61286 +           set1->sd == set2->sd &&
61287 +           set1->dir_item == set2->dir_item &&
61288 +           set1->cipher == set2->cipher &&
61289 +           set1->digest == set2->digest &&
61290 +           set1->compression == set2->compression &&
61291 +           set1->compression_mode == set2->compression_mode &&
61292 +           set1->cluster == set2->cluster &&
61293 +           set1->regular_entry == set2->regular_entry;
61294 +}
61295 +
61296 +#define HASH_FIELD(hash, set, field)           \
61297 +({                                             \
61298 +        (hash) += (unsigned long)(set)->field >> 2;    \
61299 +})
61300 +
61301 +static inline unsigned long calculate_hash(const plugin_set * set)
61302 +{
61303 +       unsigned long result;
61304 +
61305 +       result = 0;
61306 +       HASH_FIELD(result, set, file);
61307 +       HASH_FIELD(result, set, dir);
61308 +       HASH_FIELD(result, set, perm);
61309 +       HASH_FIELD(result, set, formatting);
61310 +       HASH_FIELD(result, set, hash);
61311 +       HASH_FIELD(result, set, fibration);
61312 +       HASH_FIELD(result, set, sd);
61313 +       HASH_FIELD(result, set, dir_item);
61314 +       HASH_FIELD(result, set, cipher);
61315 +       HASH_FIELD(result, set, digest);
61316 +       HASH_FIELD(result, set, compression);
61317 +       HASH_FIELD(result, set, compression_mode);
61318 +       HASH_FIELD(result, set, cluster);
61319 +       HASH_FIELD(result, set, regular_entry);
61320 +       return result & (PS_TABLE_SIZE - 1);
61321 +}
61322 +
61323 +static inline unsigned long
61324 +pshash(ps_hash_table * table, const unsigned long *a)
61325 +{
61326 +       return *a;
61327 +}
61328 +
61329 +/* The hash table definition */
61330 +#define KMALLOC(size) kmalloc((size), get_gfp_mask())
61331 +#define KFREE(ptr, size) kfree(ptr)
61332 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
61333 +                     pseq);
61334 +#undef KFREE
61335 +#undef KMALLOC
61336 +
61337 +static ps_hash_table ps_table;
61338 +static plugin_set empty_set = {
61339 +       .hashval = 0,
61340 +       .file = NULL,
61341 +       .dir = NULL,
61342 +       .perm = NULL,
61343 +       .formatting = NULL,
61344 +       .hash = NULL,
61345 +       .fibration = NULL,
61346 +       .sd = NULL,
61347 +       .dir_item = NULL,
61348 +       .cipher = NULL,
61349 +       .digest = NULL,
61350 +       .compression = NULL,
61351 +       .compression_mode = NULL,
61352 +       .cluster = NULL,
61353 +       .regular_entry = NULL,
61354 +       .link = {NULL}
61355 +};
61356 +
61357 +plugin_set *plugin_set_get_empty(void)
61358 +{
61359 +       return &empty_set;
61360 +}
61361 +
61362 +void plugin_set_put(plugin_set * set)
61363 +{
61364 +}
61365 +
61366 +static inline unsigned long *pset_field(plugin_set * set, int offset)
61367 +{
61368 +       return (unsigned long *)(((char *)set) + offset);
61369 +}
61370 +
61371 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
61372 +                           const int offset)
61373 +{
61374 +       unsigned long *spot;
61375 +       spinlock_t *lock;
61376 +       plugin_set replica;
61377 +       plugin_set *twin;
61378 +       plugin_set *psal;
61379 +       plugin_set *orig;
61380 +
61381 +       assert("nikita-2902", set != NULL);
61382 +       assert("nikita-2904", *set != NULL);
61383 +
61384 +       spot = pset_field(*set, offset);
61385 +       if (unlikely(*spot == val))
61386 +               return 0;
61387 +
61388 +       replica = *(orig = *set);
61389 +       *pset_field(&replica, offset) = val;
61390 +       replica.hashval = calculate_hash(&replica);
61391 +       rcu_read_lock();
61392 +       twin = ps_hash_find(&ps_table, &replica.hashval);
61393 +       if (unlikely(twin == NULL)) {
61394 +               rcu_read_unlock();
61395 +               psal = kmem_cache_alloc(plugin_set_slab, get_gfp_mask());
61396 +               if (psal == NULL)
61397 +                       return RETERR(-ENOMEM);
61398 +               *psal = replica;
61399 +               lock = &plugin_set_lock[replica.hashval & 7];
61400 +               spin_lock(lock);
61401 +               twin = ps_hash_find(&ps_table, &replica.hashval);
61402 +               if (likely(twin == NULL)) {
61403 +                       *set = psal;
61404 +                       ps_hash_insert_rcu(&ps_table, psal);
61405 +               } else {
61406 +                       *set = twin;
61407 +                       kmem_cache_free(plugin_set_slab, psal);
61408 +               }
61409 +               spin_unlock(lock);
61410 +       } else {
61411 +               rcu_read_unlock();
61412 +               *set = twin;
61413 +       }
61414 +       return 0;
61415 +}
61416 +
61417 +static struct {
61418 +       int offset;
61419 +       reiser4_plugin_type type;
61420 +} pset_descr[PSET_LAST] = {
61421 +       [PSET_FILE] = {
61422 +               .offset = offsetof(plugin_set, file),
61423 +               .type = REISER4_FILE_PLUGIN_TYPE
61424 +       },
61425 +       [PSET_DIR] = {
61426 +               .offset = offsetof(plugin_set, dir),
61427 +               .type = REISER4_DIR_PLUGIN_TYPE
61428 +       },
61429 +       [PSET_PERM] = {
61430 +               .offset = offsetof(plugin_set, perm),
61431 +               .type = REISER4_PERM_PLUGIN_TYPE
61432 +       },
61433 +       [PSET_FORMATTING] = {
61434 +               .offset = offsetof(plugin_set, formatting),
61435 +               .type = REISER4_FORMATTING_PLUGIN_TYPE
61436 +       },
61437 +       [PSET_HASH] = {
61438 +               .offset = offsetof(plugin_set, hash),
61439 +               .type = REISER4_HASH_PLUGIN_TYPE
61440 +       },
61441 +       [PSET_FIBRATION] = {
61442 +               .offset = offsetof(plugin_set, fibration),
61443 +               .type = REISER4_FIBRATION_PLUGIN_TYPE
61444 +       },
61445 +       [PSET_SD] = {
61446 +               .offset = offsetof(plugin_set, sd),
61447 +               .type = REISER4_ITEM_PLUGIN_TYPE
61448 +       },
61449 +       [PSET_DIR_ITEM] = {
61450 +               .offset = offsetof(plugin_set, dir_item),
61451 +               .type = REISER4_ITEM_PLUGIN_TYPE
61452 +       },
61453 +       [PSET_CIPHER] = {
61454 +               .offset = offsetof(plugin_set, cipher),
61455 +               .type = REISER4_CIPHER_PLUGIN_TYPE
61456 +       },
61457 +       [PSET_DIGEST] = {
61458 +               .offset = offsetof(plugin_set, digest),
61459 +               .type = REISER4_DIGEST_PLUGIN_TYPE
61460 +       },
61461 +       [PSET_COMPRESSION] = {
61462 +               .offset = offsetof(plugin_set, compression),
61463 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE
61464 +       },
61465 +       [PSET_COMPRESSION_MODE] = {
61466 +               .offset = offsetof(plugin_set, compression_mode),
61467 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE
61468 +       },
61469 +       [PSET_CLUSTER] = {
61470 +               .offset = offsetof(plugin_set, cluster),
61471 +               .type = REISER4_CLUSTER_PLUGIN_TYPE
61472 +       },
61473 +       [PSET_REGULAR_ENTRY] = {
61474 +               .offset = offsetof(plugin_set, regular_entry),
61475 +               .type = REISER4_REGULAR_PLUGIN_TYPE
61476 +       }
61477 +};
61478 +
61479 +#if REISER4_DEBUG
61480 +static reiser4_plugin_type pset_member_to_type(pset_member memb)
61481 +{
61482 +       assert("nikita-3501", 0 <= memb && memb < PSET_LAST);
61483 +       return pset_descr[memb].type;
61484 +}
61485 +#endif
61486 +
61487 +reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb)
61488 +{
61489 +       if (0 <= memb && memb < PSET_LAST)
61490 +               return pset_descr[memb].type;
61491 +       else
61492 +               return REISER4_PLUGIN_TYPES;
61493 +}
61494 +
61495 +int pset_set(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
61496 +{
61497 +       assert("nikita-3492", set != NULL);
61498 +       assert("nikita-3493", *set != NULL);
61499 +       assert("nikita-3494", plugin != NULL);
61500 +       assert("nikita-3495", 0 <= memb && memb < PSET_LAST);
61501 +       assert("nikita-3496", plugin->h.type_id == pset_member_to_type(memb));
61502 +
61503 +       return plugin_set_field(set,
61504 +                               (unsigned long)plugin, pset_descr[memb].offset);
61505 +}
61506 +
61507 +reiser4_plugin *pset_get(plugin_set * set, pset_member memb)
61508 +{
61509 +       assert("nikita-3497", set != NULL);
61510 +       assert("nikita-3498", 0 <= memb && memb < PSET_LAST);
61511 +
61512 +       return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset);
61513 +}
61514 +
61515 +#define DEFINE_PLUGIN_SET(type, field)                                 \
61516 +int plugin_set_ ## field(plugin_set **set, type *val)  \
61517 +{                                                                      \
61518 +       cassert(sizeof val == sizeof(unsigned long));                   \
61519 +       return plugin_set_field(set, (unsigned long)val,                \
61520 +                               offsetof(plugin_set, field));           \
61521 +}
61522 +
61523 +DEFINE_PLUGIN_SET(file_plugin, file)
61524 +    DEFINE_PLUGIN_SET(dir_plugin, dir)
61525 +    DEFINE_PLUGIN_SET(formatting_plugin, formatting)
61526 +    DEFINE_PLUGIN_SET(hash_plugin, hash)
61527 +    DEFINE_PLUGIN_SET(fibration_plugin, fibration)
61528 +    DEFINE_PLUGIN_SET(item_plugin, sd)
61529 +    /* DEFINE_PLUGIN_SET(cipher_plugin, cipher) */
61530 +    /* DEFINE_PLUGIN_SET(digest_plugin, digest) */
61531 +    /* DEFINE_PLUGIN_SET(compression_plugin, compression) */
61532 +    /* DEFINE_PLUGIN_SET(compression_mode_plugin, compression_mode) */
61533 +    DEFINE_PLUGIN_SET(cluster_plugin, cluster)
61534 +    /* DEFINE_PLUGIN_SET(regular_plugin, regular_entry) */
61535 +
61536 +
61537 +/**
61538 + * init_plugin_set - create pset cache and hash table
61539 + *
61540 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
61541 + * reiser4 module initialization.
61542 + */
61543 +int init_plugin_set(void)
61544 +{
61545 +       int result;
61546 +
61547 +       result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61548 +       if (result == 0) {
61549 +               plugin_set_slab = kmem_cache_create("plugin_set",
61550 +                                                   sizeof(plugin_set), 0,
61551 +                                                   SLAB_HWCACHE_ALIGN,
61552 +                                                   NULL, NULL);
61553 +               if (plugin_set_slab == NULL)
61554 +                       result = RETERR(-ENOMEM);
61555 +       }
61556 +       return result;
61557 +}
61558 +
61559 +/**
61560 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
61561 + *
61562 + * This is called on reiser4 module unloading or system shutdown.
61563 + */
61564 +void done_plugin_set(void)
61565 +{
61566 +       plugin_set *cur, *next;
61567 +
61568 +       for_all_in_htable(&ps_table, ps, cur, next) {
61569 +               ps_hash_remove(&ps_table, cur);
61570 +               kmem_cache_free(plugin_set_slab, cur);
61571 +       }
61572 +       destroy_reiser4_cache(&plugin_set_slab);
61573 +       ps_hash_done(&ps_table);
61574 +}
61575 +
61576 +/*
61577 + * Local variables:
61578 + * c-indentation-style: "K&R"
61579 + * mode-name: "LC"
61580 + * c-basic-offset: 8
61581 + * tab-width: 8
61582 + * fill-column: 120
61583 + * End:
61584 + */
61585 diff --git a/fs/reiser4/plugin/plugin_set.h b/fs/reiser4/plugin/plugin_set.h
61586 new file mode 100644
61587 index 0000000..61857c6
61588 --- /dev/null
61589 +++ b/fs/reiser4/plugin/plugin_set.h
61590 @@ -0,0 +1,82 @@
61591 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61592 +
61593 +/* plugin-sets. see fs/reiser4/plugin/plugin_set.c for details */
61594 +
61595 +#if !defined( __PLUGIN_SET_H__ )
61596 +#define __PLUGIN_SET_H__
61597 +
61598 +#include "../type_safe_hash.h"
61599 +#include "plugin.h"
61600 +
61601 +#include <linux/rcupdate.h>
61602 +
61603 +struct plugin_set;
61604 +typedef struct plugin_set plugin_set;
61605 +
61606 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61607 +
61608 +struct plugin_set {
61609 +       unsigned long hashval;
61610 +       /* plugin of file */
61611 +       file_plugin *file;
61612 +       /* plugin of dir */
61613 +       dir_plugin *dir;
61614 +       /* perm plugin for this file */
61615 +       perm_plugin *perm;
61616 +       /* tail policy plugin. Only meaningful for regular files */
61617 +       formatting_plugin *formatting;
61618 +       /* hash plugin. Only meaningful for directories. */
61619 +       hash_plugin *hash;
61620 +       /* fibration plugin. Only meaningful for directories. */
61621 +       fibration_plugin *fibration;
61622 +       /* plugin of stat-data */
61623 +       item_plugin *sd;
61624 +       /* plugin of items a directory is built of */
61625 +       item_plugin *dir_item;
61626 +       /* cipher plugin */
61627 +       cipher_plugin *cipher;
61628 +       /* digest plugin */
61629 +       digest_plugin *digest;
61630 +       /* compression plugin */
61631 +       compression_plugin *compression;
61632 +       /* compression mode plugin */
61633 +       compression_mode_plugin *compression_mode;
61634 +       /* cluster plugin */
61635 +       cluster_plugin *cluster;
61636 +       /* regular plugin to use when a regular child is created */
61637 +       regular_plugin *regular_entry;
61638 +       ps_hash_link link;
61639 +};
61640 +
61641 +extern plugin_set *plugin_set_get_empty(void);
61642 +extern void plugin_set_put(plugin_set * set);
61643 +
61644 +extern int plugin_set_file(plugin_set ** set, file_plugin * plug);
61645 +extern int plugin_set_dir(plugin_set ** set, dir_plugin * plug);
61646 +extern int plugin_set_formatting(plugin_set ** set, formatting_plugin * plug);
61647 +extern int plugin_set_hash(plugin_set ** set, hash_plugin * plug);
61648 +extern int plugin_set_fibration(plugin_set ** set, fibration_plugin * plug);
61649 +extern int plugin_set_sd(plugin_set ** set, item_plugin * plug);
61650 +extern int plugin_set_cluster(plugin_set ** set, cluster_plugin * plug);
61651 +
61652 +extern int init_plugin_set(void);
61653 +extern void done_plugin_set(void);
61654 +
61655 +extern int pset_set(plugin_set ** set, pset_member memb,
61656 +                   reiser4_plugin * plugin);
61657 +extern reiser4_plugin *pset_get(plugin_set * set, pset_member memb);
61658 +
61659 +extern reiser4_plugin_type pset_member_to_type_unsafe(pset_member memb);
61660 +
61661 +/* __PLUGIN_SET_H__ */
61662 +#endif
61663 +
61664 +/* Make Linus happy.
61665 +   Local variables:
61666 +   c-indentation-style: "K&R"
61667 +   mode-name: "LC"
61668 +   c-basic-offset: 8
61669 +   tab-width: 8
61670 +   fill-column: 120
61671 +   End:
61672 +*/
61673 diff --git a/fs/reiser4/plugin/regular.c b/fs/reiser4/plugin/regular.c
61674 new file mode 100644
61675 index 0000000..9918e95
61676 --- /dev/null
61677 +++ b/fs/reiser4/plugin/regular.c
61678 @@ -0,0 +1,44 @@
61679 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61680 +
61681 +/* Contains Reiser4 regular plugins which:
61682 +   . specify a set of reiser4 regular object plugins,
61683 +   . are used by directory plugin to create entries powered by specified
61684 +     regular plugins */
61685 +
61686 +#include "plugin.h"
61687 +
61688 +regular_plugin regular_plugins[LAST_REGULAR_ID] = {
61689 +       [UF_REGULAR_ID] = {
61690 +               .h = {
61691 +                       .type_id = REISER4_REGULAR_PLUGIN_TYPE,
61692 +                       .id = UF_REGULAR_ID,
61693 +                       .pops = NULL,
61694 +                       .label = "unixfile",
61695 +                       .desc = "Unix file regular plugin",
61696 +                       .linkage = {NULL, NULL}
61697 +               },
61698 +               .id = UNIX_FILE_PLUGIN_ID
61699 +       },
61700 +       [CRC_REGULAR_ID] = {
61701 +               .h = {
61702 +                       .type_id = REISER4_REGULAR_PLUGIN_TYPE,
61703 +                       .id = CRC_REGULAR_ID,
61704 +                       .pops = NULL,
61705 +                       .label = "cryptcompress",
61706 +                       .desc = "Cryptcompress regular plugin",
61707 +                       .linkage = {NULL, NULL}
61708 +               },
61709 +               .id = CRC_FILE_PLUGIN_ID
61710 +       }
61711 +};
61712 +
61713 +/*
61714 +  Local variables:
61715 +  c-indentation-style: "K&R"
61716 +  mode-name: "LC"
61717 +  c-basic-offset: 8
61718 +  tab-width: 8
61719 +  fill-column: 120
61720 +  scroll-step: 1
61721 +  End:
61722 +*/
61723 diff --git a/fs/reiser4/plugin/security/Makefile b/fs/reiser4/plugin/security/Makefile
61724 new file mode 100644
61725 index 0000000..645dbb5
61726 --- /dev/null
61727 +++ b/fs/reiser4/plugin/security/Makefile
61728 @@ -0,0 +1,4 @@
61729 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
61730 +
61731 +security_plugins-objs :=       \
61732 +       perm.o
61733 diff --git a/fs/reiser4/plugin/security/perm.c b/fs/reiser4/plugin/security/perm.c
61734 new file mode 100644
61735 index 0000000..ab3b4fc
61736 --- /dev/null
61737 +++ b/fs/reiser4/plugin/security/perm.c
61738 @@ -0,0 +1,44 @@
61739 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61740 +
61741 +/*
61742 + * this file contains implementation of permission plugins. Currently, only
61743 + * the stub NULL_PERM_ID plugin is defined
61744 + */
61745 +
61746 +#include "../plugin.h"
61747 +#include "../plugin_header.h"
61748 +#include "../../debug.h"
61749 +
61750 +perm_plugin perm_plugins[LAST_PERM_ID] = {
61751 +       [NULL_PERM_ID] = {
61752 +               .h = {
61753 +                       .type_id = REISER4_PERM_PLUGIN_TYPE,
61754 +                       .id = NULL_PERM_ID,
61755 +                       .pops = NULL,
61756 +                       .label = "null",
61757 +                       .desc = "stub permission plugin",
61758 +                       .linkage = {NULL, NULL}
61759 +               },
61760 +               .read_ok = NULL,
61761 +               .write_ok = NULL,
61762 +               .lookup_ok = NULL,
61763 +               .create_ok = NULL,
61764 +               .link_ok = NULL,
61765 +               .unlink_ok = NULL,
61766 +               .delete_ok = NULL,
61767 +               .mask_ok = NULL,
61768 +               .setattr_ok = NULL,
61769 +               .getattr_ok = NULL,
61770 +               .rename_ok = NULL,
61771 +       }
61772 +};
61773 +
61774 +/*
61775 + * Local variables:
61776 + * c-indentation-style: "K&R"
61777 + * mode-name: "LC"
61778 + * c-basic-offset: 8
61779 + * tab-width: 8
61780 + * fill-column: 79
61781 + * End:
61782 + */
61783 diff --git a/fs/reiser4/plugin/security/perm.h b/fs/reiser4/plugin/security/perm.h
61784 new file mode 100644
61785 index 0000000..747e8f7
61786 --- /dev/null
61787 +++ b/fs/reiser4/plugin/security/perm.h
61788 @@ -0,0 +1,82 @@
61789 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61790 +
61791 +/* Perm (short for "permissions") plugins common stuff. */
61792 +
61793 +#if !defined( __REISER4_PERM_H__ )
61794 +#define __REISER4_PERM_H__
61795 +
61796 +#include "../../forward.h"
61797 +#include "../plugin_header.h"
61798 +
61799 +#include <linux/types.h>
61800 +#include <linux/fs.h>          /* for struct file  */
61801 +#include <linux/dcache.h>      /* for struct dentry */
61802 +
61803 +/* interface for perm plugin.
61804 +
61805 +   Perm plugin method can be implemented through:
61806 +
61807 +    1. consulting ->i_mode bits in stat data
61808 +
61809 +    2. obtaining acl from the tree and inspecting it
61810 +
61811 +    3. asking some kernel module or user-level program to authorize access.
61812 +
61813 +   This allows for integration with things like capabilities, SELinux-style
61814 +   security contexts, etc.
61815 +
61816 +*/
61817 +/* NIKITA-FIXME-HANS: define what this is targeted for.  It does not seem to be intended for use with sys_reiser4.  Explain. */
61818 +typedef struct perm_plugin {
61819 +       /* generic plugin fields */
61820 +       plugin_header h;
61821 +
61822 +       /* check permissions for read/write */
61823 +       int (*read_ok) (struct file *file, const char __user *buf,
61824 +                       size_t size, loff_t *off);
61825 +       int (*write_ok) (struct file *file, const char __user *buf,
61826 +                        size_t size, loff_t *off);
61827 +
61828 +       /* check permissions for lookup */
61829 +       int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
61830 +
61831 +       /* check permissions for create */
61832 +       int (*create_ok) (struct inode * parent, struct dentry * dentry,
61833 +                         reiser4_object_create_data * data);
61834 +
61835 +       /* check permissions for linking @where to @existing */
61836 +       int (*link_ok) (struct dentry * existing, struct inode * parent,
61837 +                       struct dentry * where);
61838 +
61839 +       /* check permissions for unlinking @victim from @parent */
61840 +       int (*unlink_ok) (struct inode * parent, struct dentry * victim);
61841 +
61842 +       /* check permissions for deletion of @victim whose last reference is
61843 +          by @parent */
61844 +       int (*delete_ok) (struct inode * parent, struct dentry * victim);
61845 +       int (*mask_ok) (struct inode * inode, int mask);
61846 +       /* check whether attribute change is acceptable */
61847 +       int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
61848 +
61849 +       /* check whether stat(2) is allowed */
61850 +       int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
61851 +                          struct dentry * dentry, struct kstat * stat);
61852 +       /* check whether rename(2) is allowed */
61853 +       int (*rename_ok) (struct inode * old_dir, struct dentry * old,
61854 +                         struct inode * new_dir, struct dentry * new);
61855 +} perm_plugin;
61856 +
61857 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
61858 +
61859 +/* __REISER4_PERM_H__ */
61860 +#endif
61861 +
61862 +/* Make Linus happy.
61863 +   Local variables:
61864 +   c-indentation-style: "K&R"
61865 +   mode-name: "LC"
61866 +   c-basic-offset: 8
61867 +   tab-width: 8
61868 +   fill-column: 120
61869 +   End:
61870 +*/
61871 diff --git a/fs/reiser4/plugin/space/Makefile b/fs/reiser4/plugin/space/Makefile
61872 new file mode 100644
61873 index 0000000..5a0c94f
61874 --- /dev/null
61875 +++ b/fs/reiser4/plugin/space/Makefile
61876 @@ -0,0 +1,4 @@
61877 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
61878 +
61879 +space_plugins-objs := \
61880 +       bitmap.o
61881 diff --git a/fs/reiser4/plugin/space/bitmap.c b/fs/reiser4/plugin/space/bitmap.c
61882 new file mode 100644
61883 index 0000000..a5ecc79
61884 --- /dev/null
61885 +++ b/fs/reiser4/plugin/space/bitmap.c
61886 @@ -0,0 +1,1592 @@
61887 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61888 +
61889 +#include "../../debug.h"
61890 +#include "../../dformat.h"
61891 +#include "../../txnmgr.h"
61892 +#include "../../jnode.h"
61893 +#include "../../block_alloc.h"
61894 +#include "../../tree.h"
61895 +#include "../../super.h"
61896 +#include "../plugin.h"
61897 +#include "space_allocator.h"
61898 +#include "bitmap.h"
61899 +
61900 +#include <linux/types.h>
61901 +#include <linux/fs.h>          /* for struct super_block  */
61902 +#include <asm/semaphore.h>
61903 +#include <linux/vmalloc.h>
61904 +#include <asm/div64.h>
61905 +
61906 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
61907 + * blocks
61908 +
61909 +   A useful optimization of reiser4 bitmap handling would be dynamic bitmap
61910 +   blocks loading/unloading which is different from v3.x where all bitmap
61911 +   blocks are loaded at mount time.
61912 +
61913 +   To implement bitmap blocks unloading we need to count bitmap block usage
61914 +   and detect currently unused blocks allowing them to be unloaded. It is not
61915 +   a simple task since we allow several threads to modify one bitmap block
61916 +   simultaneously.
61917 +
61918 +   Briefly speaking, the following schema is proposed: we keep a special
61919 +   counter associated with each bitmap block, which counts the block
61920 +   alloc/dealloc operations on that bitmap block. With the deferred block
61921 +   deallocation feature of reiser4 all those operations will be represented in
61922 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
61923 +   nodes.
61924 +
61925 +   So, we increment usage counter for each new node allocated or deleted, and
61926 +   decrement it at atom commit one time for each node from the dirty/deleted
61927 +   atom's list.  Of course, freshly allocated node deletion and node reusing
61928 +   from atom deleted (if we do so) list should decrement bitmap usage counter
61929 +   also.
61930 +
61931 +   This schema seems to be working but that reference counting is
61932 +   not easy to debug. I think we should agree with Hans and not implement
61933 +   it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
61934 +
61935 +   For simplicity all bitmap nodes (both commit and working bitmap blocks) are
61936 +   either loaded into memory at fs mount time, or each bitmap node is loaded
61937 +   at the first access to it; the "dont_load_bitmap" mount option controls whether
61938 +   bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
61939 +   nodes currently is not supported. */
61940 +
61941 +#define CHECKSUM_SIZE    4
61942 +
61943 +#define BYTES_PER_LONG   (sizeof(long))
61944 +
61945 +#if BITS_PER_LONG == 64
61946 +#  define LONG_INT_SHIFT (6)
61947 +#else
61948 +#  define LONG_INT_SHIFT (5)
61949 +#endif
61950 +
61951 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
61952 +
61953 +typedef unsigned long ulong_t;
61954 +
61955 +#define bmap_size(blocksize)       ((blocksize) - CHECKSUM_SIZE)
61956 +#define bmap_bit_count(blocksize)   (bmap_size(blocksize) << 3)
61957 +
61958 +/* Block allocation/deallocation are done through special bitmap objects which
61959 +   are allocated in an array at fs mount. */
61960 +struct bitmap_node {
61961 +       struct semaphore sema;  /* long term lock object */
61962 +
61963 +       jnode *wjnode;          /* j-nodes for WORKING ... */
61964 +       jnode *cjnode;          /* ... and COMMIT bitmap blocks */
61965 +
61966 +       bmap_off_t first_zero_bit;      /* for skip_busy option implementation */
61967 +
61968 +       atomic_t loaded;        /* a flag which shows that bnode is loaded
61969 +                                * already */
61970 +};
61971 +
61972 +static inline char *bnode_working_data(struct bitmap_node *bnode)
61973 +{
61974 +       /* what jdata() yields for the WORKING bitmap jnode */
61975 +       char *data = jdata(bnode->wjnode);
61976 +
61977 +       assert("zam-429", data != NULL);
61978 +       /* the bitmap data starts CHECKSUM_SIZE bytes into the block */
61979 +       return data + CHECKSUM_SIZE;
61980 +}
61981 +
61982 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
61983 +{
61984 +       /* what jdata() yields for the COMMIT bitmap jnode */
61985 +       char *data = jdata(bnode->cjnode);
61986 +
61987 +       assert("zam-430", data != NULL);
61988 +       /* the bitmap data starts CHECKSUM_SIZE bytes into the block */
61989 +       return data + CHECKSUM_SIZE;
61990 +}
61991 +
61992 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
61993 +{
61994 +       /* the checksum is read from the very start of the commit block */
61995 +       char *data = jdata(bnode->cjnode);
61996 +
61997 +       assert("vpf-261", data != NULL);
61998 +       /* stored little-endian; fetched with get_unaligned() */
61999 +       return le32_to_cpu(get_unaligned((d32 *)data));
62000 +}
62001 +
62002 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
62003 +{
62004 +       /* the checksum is written to the very start of the commit block */
62005 +       char *data = jdata(bnode->cjnode);
62006 +
62007 +       /* NOTE(review): label "vpf-261" is also used in bnode_commit_crc() */
62008 +       assert("vpf-261", data != NULL);
62009 +       put_unaligned(cpu_to_le32(crc), (d32 *)data);
62010 +}
62011 +
62012 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
62013 + * written the code, does this added abstraction still have */
62014 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
62015 + * reiser4_space_allocator structure) */
62016 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
62017 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
62018 + * someday?". What they about?  If there is a reason to have a union, it should
62019 + * be a union, if not, it should not be a union.  "..might be someday" means no
62020 + * reason. */
62021 +struct bitmap_allocator_data {
62022 +       /* an array for bitmap blocks direct access */
62023 +       struct bitmap_node *bitmap;
62024 +};
62025 +
62026 +#define get_barray(super) \
62027 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
62028 +/* address of bitmap node number @i; @i is parenthesized for expression args */
62029 +#define get_bnode(super, i) (get_barray(super) + (i))
62030 +
62031 +/* Get a fresh jnode from jalloc() and set it up as a JNODE_BITMAP node. */
62032 +static jnode *bnew(void)
62033 +{
62034 +       jnode *node;
62035 +
62036 +       node = jalloc();
62037 +       if (node != NULL)
62038 +               jnode_init(node, current_tree, JNODE_BITMAP);
62039 +       return node;    /* NULL is passed through from jalloc() */
62040 +}
62041 +
62042 +/* this file contains:
62043 +   - bitmap based implementation of space allocation plugin
62044 +   - all the helper functions like set bit, find_first_zero_bit, etc */
62045 +
62046 +/* Audited by: green(2002.06.12) */
62047 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
62048 +{
62049 +       ulong_t probe = 1UL << start_bit;
62050 +       int pos = start_bit;
62051 +
62052 +       /* climb from @start_bit until a clear bit or the end of the word */
62053 +       for (; (word & probe) != 0; probe <<= 1) {
62054 +               if (++pos >= BITS_PER_LONG)
62055 +                       break;
62056 +       }
62057 +       /* BITS_PER_LONG here means that no clear bit was found */
62058 +       return pos;
62059 +}
62060 +
62061 +#include <asm/bitops.h>
62062 +
62063 +#if BITS_PER_LONG == 64
62064 +
62065 +#define OFF(addr)  (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
62066 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
62067 +
62068 +static inline void reiser4_set_bit(int nr, void *addr)
62069 +{
62070 +       ext2_set_bit(nr + OFF(addr), BASE(addr));
62071 +}
62072 +
62073 +static inline void reiser4_clear_bit(int nr, void *addr)
62074 +{
62075 +       ext2_clear_bit(nr + OFF(addr), BASE(addr));
62076 +}
62077 +
62078 +static inline int reiser4_test_bit(int nr, void *addr)
62079 +{
62080 +       return ext2_test_bit(nr + OFF(addr), BASE(addr));
62081 +}
62082 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
62083 +                                            int offset)
62084 +{
62085 +       int off = OFF(addr);
62086 +
62087 +       return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
62088 +                                      offset + off) - off;
62089 +}
62090 +
62091 +#else
62092 +
62093 +#define reiser4_set_bit(nr, addr)    ext2_set_bit(nr, addr)
62094 +#define reiser4_clear_bit(nr, addr)  ext2_clear_bit(nr, addr)
62095 +#define reiser4_test_bit(nr, addr)  ext2_test_bit(nr, addr)
62096 +
62097 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
62098 +ext2_find_next_zero_bit(addr, maxoffset, offset)
62099 +#endif
62100 +
62101 +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
62102 + * are counted from @addr. Returns the offset of the first set bit found (it
62103 + * may exceed @max_offset: whole words are examined), @max_offset otherwise. */
62104 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
62105 +                                             bmap_off_t start_offset)
62106 +{
62107 +       ulong_t *base = addr;
62108 +       /* start_offset is in bits, convert it to a word index within bitmap. */
62109 +       int word_nr = start_offset >> LONG_INT_SHIFT;
62110 +       /* bit number within that word. */
62111 +       int bit_nr = start_offset & LONG_INT_MASK;
62112 +       int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
62113 +
62114 +       assert("zam-387", max_offset != 0);
62115 +
62116 +       /* Unaligned @start_offset case.  */
62117 +       if (bit_nr != 0) {
62118 +               bmap_nr_t nr;
62119 +               /* set bits of the word are the zero bits of its complement */
62120 +               nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
62121 +
62122 +               if (nr < BITS_PER_LONG)
62123 +                       return (word_nr << LONG_INT_SHIFT) + nr;
62124 +
62125 +               ++word_nr;
62126 +       }
62127 +
62128 +       /* Fast scan through aligned words. */
62129 +       while (word_nr <= max_word_nr) {
62130 +               if (base[word_nr] != 0) {
62131 +                       return (word_nr << LONG_INT_SHIFT)
62132 +                           + find_next_zero_bit_in_word(~(base[word_nr]), 0);
62133 +               }
62134 +
62135 +               ++word_nr;
62136 +       }
62137 +
62138 +       return max_offset;
62139 +}
62140 +
62141 +#if BITS_PER_LONG == 64
62142 +
62143 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
62144 +                                           bmap_off_t start_offset)
62145 +{
62146 +       bmap_off_t off = OFF(addr);
62147 +
62148 +       return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
62149 +                                          start_offset + off) - off;
62150 +}
62151 +
62152 +#else
62153 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
62154 +  __reiser4_find_next_set_bit(addr, max_offset, start_offset)
62155 +#endif
62156 +
62157 +/* Search a single word downward from @start_bit for the highest set bit. */
62158 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
62159 +{
62160 +       ulong_t bit_mask;
62161 +       int nr = start_bit;
62162 +
62163 +       assert("zam-965", start_bit < BITS_PER_LONG);
62164 +       assert("zam-966", start_bit >= 0);
62165 +
62166 +       bit_mask = (1UL << nr);
62167 +       /* walk towards bit 0; the mask becomes 0 once bit 0 was examined */
62168 +       while (bit_mask != 0) {
62169 +               if (bit_mask & word)
62170 +                       return nr;
62171 +               bit_mask >>= 1;
62172 +               nr--;
62173 +       }
62174 +       return BITS_PER_LONG;   /* no set bit at or below @start_bit */
62175 +}
62176 +
62177 +/* Search bitmap for a set bit in backward direction from the end to the
62178 + * beginning of given region
62179 + *
62180 + * @result: result offset of the last set bit
62181 + * @addr:   base memory address,
62182 + * @low_off:  low end of the search region, edge bit included into the region,
62183 + * @high_off: high end of the search region, edge bit included into the region,
62184 + * NOTE: @low_off only selects the lowest word scanned; a hit may lie below it.
62185 + * @return: 0 - set bit was found, -1 otherwise.
62186 + */
62187 +static int
62188 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
62189 +                         bmap_off_t high_off)
62190 +{
62191 +       ulong_t *base = addr;
62192 +       int last_word;
62193 +       int first_word;
62194 +       int last_bit;
62195 +       int nr;
62196 +
62197 +       assert("zam-961", high_off >= 0);
62198 +       assert("zam-962", high_off >= low_off);
62199 +
62200 +       last_word = high_off >> LONG_INT_SHIFT;
62201 +       last_bit = high_off & LONG_INT_MASK;
62202 +       first_word = low_off >> LONG_INT_SHIFT;
62203 +       /* always true: last_bit was masked with LONG_INT_MASK just above */
62204 +       if (last_bit < BITS_PER_LONG) {
62205 +               nr = find_last_set_bit_in_word(base[last_word], last_bit);
62206 +               if (nr < BITS_PER_LONG) {
62207 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
62208 +                       return 0;
62209 +               }
62210 +               --last_word;
62211 +       }
62212 +       while (last_word >= first_word) {
62213 +               if (base[last_word] != 0x0) {
62214 +                       last_bit =
62215 +                           find_last_set_bit_in_word(base[last_word],
62216 +                                                     BITS_PER_LONG - 1);
62217 +                       assert("zam-972", last_bit < BITS_PER_LONG);
62218 +                       *result = (last_word << LONG_INT_SHIFT) + last_bit;
62219 +                       return 0;
62220 +               }
62221 +               --last_word;
62222 +       }
62223 +
62224 +       return -1;              /* set bit not found */
62225 +}
62226 +
62227 +/* Search bitmap for a clear bit in backward direction from @high_off down to
62228 + * the word of @low_off; 0 and *@result on success, -1 if none was found */
62229 +static int
62230 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
62231 +                          bmap_off_t high_off)
62232 +{
62233 +       ulong_t *base = addr;
62234 +       int last_word;
62235 +       int first_word;
62236 +       int last_bit;
62237 +       int nr;
62238 +
62239 +       last_word = high_off >> LONG_INT_SHIFT;
62240 +       last_bit = high_off & LONG_INT_MASK;
62241 +       first_word = low_off >> LONG_INT_SHIFT;
62242 +       /* always true: last_bit was masked with LONG_INT_MASK just above */
62243 +       if (last_bit < BITS_PER_LONG) {
62244 +               nr = find_last_set_bit_in_word(~base[last_word], last_bit);
62245 +               if (nr < BITS_PER_LONG) {
62246 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
62247 +                       return 0;
62248 +               }
62249 +               --last_word;
62250 +       }
62251 +       while (last_word >= first_word) {
62252 +               if (base[last_word] != (ulong_t) (-1)) {
62253 +                       *result = (last_word << LONG_INT_SHIFT) +
62254 +                           find_last_set_bit_in_word(~base[last_word],
62255 +                                                     BITS_PER_LONG - 1);
62256 +                       return 0;
62257 +               }
62258 +               --last_word;
62259 +       }
62260 +
62261 +       return -1;              /* zero bit not found */
62262 +}
62263 +
62264 +/* Clear bits [@start, @end[ at @addr. Audited by: green(2002.06.12) */
62265 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
62266 +{
62267 +       int first_byte;
62268 +       int last_byte;
62269 +
62270 +       unsigned char first_byte_mask = 0xFF;
62271 +       unsigned char last_byte_mask = 0xFF;
62272 +
62273 +       assert("zam-410", start < end);
62274 +       /* bytes holding the first and the last bit of the range */
62275 +       first_byte = start >> 3;
62276 +       last_byte = (end - 1) >> 3;
62277 +       /* bytes strictly between those two are cleared wholesale */
62278 +       if (last_byte > first_byte + 1)
62279 +               memset(addr + first_byte + 1, 0,
62280 +                      (size_t) (last_byte - first_byte - 1));
62281 +       /* the masks keep the bits below @start resp. at and above @end */
62282 +       first_byte_mask >>= 8 - (start & 0x7);
62283 +       last_byte_mask <<= ((end - 1) & 0x7) + 1;
62284 +       /* a range inside one byte keeps whatever either mask keeps */
62285 +       if (first_byte == last_byte) {
62286 +               addr[first_byte] &= (first_byte_mask | last_byte_mask);
62287 +       } else {
62288 +               addr[first_byte] &= first_byte_mask;
62289 +               addr[last_byte] &= last_byte_mask;
62290 +       }
62291 +}
62292 +
62293 +/* Audited by: green(2002.06.12) */
62294 +/* Set all bits in the range [@start, @end[ of the bitmap at @addr. */
62295 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
62296 +{
62297 +       int first_byte;
62298 +       int last_byte;
62299 +
62300 +       unsigned char first_byte_mask = 0xFF;
62301 +       unsigned char last_byte_mask = 0xFF;
62302 +
62303 +       assert("zam-386", start < end);
62304 +       /* bytes holding the first and the last bit of the range */
62305 +       first_byte = start >> 3;
62306 +       last_byte = (end - 1) >> 3;
62307 +       /* bytes strictly between those two are filled wholesale */
62308 +       if (last_byte > first_byte + 1)
62309 +               memset(addr + first_byte + 1, 0xFF,
62310 +                      (size_t) (last_byte - first_byte - 1));
62311 +       /* the masks select the bits from @start up resp. up to @end - 1 */
62312 +       first_byte_mask <<= start & 0x7;
62313 +       last_byte_mask >>= 7 - ((end - 1) & 0x7);
62314 +       /* a range inside one byte sets only what both masks select */
62315 +       if (first_byte == last_byte) {
62316 +               addr[first_byte] |= (first_byte_mask & last_byte_mask);
62317 +       } else {
62318 +               addr[first_byte] |= first_byte_mask;
62319 +               addr[last_byte] |= last_byte_mask;
62320 +       }
62321 +}
62322 +
62323 +#define ADLER_BASE    65521
62324 +#define ADLER_NMAX    5552
62325 +
62326 +/* Calculates the adler32 checksum for the data pointed by `data` of the
62327 +    length `len`. This function was originally taken from zlib, version 1.1.3,
62328 +    July 9th, 1998.
62329 +
62330 +    Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
62331 +
62332 +    This software is provided 'as-is', without any express or implied
62333 +    warranty.  In no event will the authors be held liable for any damages
62334 +    arising from the use of this software.
62335 +
62336 +    Permission is granted to anyone to use this software for any purpose,
62337 +    including commercial applications, and to alter it and redistribute it
62338 +    freely, subject to the following restrictions:
62339 +
62340 +    1. The origin of this software must not be misrepresented; you must not
62341 +       claim that you wrote the original software. If you use this software
62342 +       in a product, an acknowledgment in the product documentation would be
62343 +       appreciated but is not required.
62344 +    2. Altered source versions must be plainly marked as such, and must not be
62345 +       misrepresented as being the original software.
62346 +    3. This notice may not be removed or altered from any source distribution.
62347 +
62348 +    Jean-loup Gailly        Mark Adler
62349 +    jloup@gzip.org          madler@alumni.caltech.edu
62350 +
62351 +    The above comment applies only to the reiser4_adler32 function.
62352 +*/
62353 +
62354 +__u32 reiser4_adler32(char *data, __u32 len)
62355 +{
62356 +       unsigned char *t = (unsigned char *)data;  /* sum bytes as 0..255 */
62357 +       __u32 s1 = 1;
62358 +       __u32 s2 = 0;
62359 +       int k;
62360 +
62361 +       while (len > 0) {
62362 +               k = len < ADLER_NMAX ? len : ADLER_NMAX;
62363 +               len -= k;
62364 +               /* at most ADLER_NMAX bytes are summed between reductions */
62365 +               while (k--) {
62366 +                       s1 += *t++;
62367 +                       s2 += s1;
62368 +               }
62369 +
62370 +               s1 %= ADLER_BASE;
62371 +               s2 %= ADLER_BASE;
62372 +       }
62373 +       return (s2 << 16) | s1; /* s2 in the high, s1 in the low 16 bits */
62374 +}
62375 +/* super block of @bnode, found through the tree of its working jnode */
62376 +#define sb_by_bnode(bnode) \
62377 +       ((struct super_block *)jnode_get_tree((bnode)->wjnode)->super)
62378 +
62379 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
62380 +{
62381 +       return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
62382 +}
62383 +/* Verify commit bitmap checksum; @size is the block size. 0 or -EIO. */
62384 +static int
62385 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
62386 +{
62387 +       if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
62388 +               bmap_nr_t bmap;
62389 +               /* index of @bnode in the bitmap_node array, for the message */
62390 +               bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
62391 +
62392 +               warning("vpf-263",
62393 +                       "Checksum for the bitmap block %llu is incorrect",
62394 +                       (unsigned long long)bmap);
62395 +
62396 +               return RETERR(-EIO);
62397 +       }
62398 +
62399 +       return 0;
62400 +}
62401 +
62402 +#define REISER4_CHECK_BMAP_CRC (0)
62403 +/* When the switch above is 0, bnode_check_crc() compiles to the constant 0. */
62404 +#if REISER4_CHECK_BMAP_CRC
62405 +static int bnode_check_crc(const struct bitmap_node *bnode)
62406 +{
62407 +       /* pass the raw block size: bnode_calc_crc() applies bmap_size() */
62408 +       return bnode_check_adler32(bnode, sb_by_bnode(bnode)->s_blocksize);
62409 +}
62410 +
62411 +/* REISER4_CHECK_BMAP_CRC */
62412 +#else
62413 +
62414 +#define bnode_check_crc(bnode) (0)
62415 +
62416 +/* REISER4_CHECK_BMAP_CRC */
62417 +#endif
62418 +
62419 +/* Recalculates the adler32 checksum for only 1 byte change.
62420 +    adler - previous adler checksum
62421 +    old_data, data - old, new byte values.
62422 +    tail == (chunk - offset) : length, checksum was calculated for, - offset of
62423 +    the changed byte within this chunk.
62424 +    This function can be used for checksum calculation optimisation.
62425 +*/
62426 +
62427 +static __u32
62428 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62429 +              __u32 tail)
62430 +{
62431 +       __u32 delta = data - old_data + 2 * ADLER_BASE;
62432 +       __u32 s1 = adler & 0xffff;
62433 +       __u32 s2 = (adler >> 16) & 0xffff;
62434 +
62435 +       s1 = (delta + s1) % ADLER_BASE;
62436 +       s2 = (delta * tail + s2) % ADLER_BASE;
62437 +
62438 +       return (s2 << 16) | s1;
62439 +}
62440 +
62441 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62442 +
62443 +/**
62444 + * get_nr_bmap - calculate number of bitmap blocks
62445 + * @super: super block with initialized blocksize and block count
62446 + *
62447 + * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
62448 + * maintain free disk space. It assumes that each bitmap addresses the same
62449 + * number of blocks which is calculated by the bmap_bit_count macro defined
62450 + * above. Number of blocks in the filesystem has to be initialized in reiser4
62451 + * private data of super block already so that it can be obtained via
62452 + * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
62453 + * is not power of 2 because 4 bytes are used for checksum. Therefore, we have
62454 + * to use a special function (do_div) to divide and modulo 64bits filesystem
62455 + * block counters.
62456 + *
62457 + * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap
62458 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62459 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62460 + */
62461 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
62462 +{
62463 +       u64 quotient;
62464 +
62465 +       assert("zam-393", reiser4_block_count(super) != 0);
62466 +       /* round up: (block count - 1) / bits per bitmap + 1 */
62467 +       quotient = reiser4_block_count(super) - 1;
62468 +       do_div(quotient, bmap_bit_count(super->s_blocksize));
62469 +       return quotient + 1;
62470 +}
62471 +
62472 +/**
62473 + * parse_blocknr - calculate bitmap number and offset in it by block number
62474 + * @block: pointer to block number to calculate location in bitmap of
62475 + * @bmap: pointer where to store bitmap block number
62476 + * @offset: pointer where to store offset within bitmap block
62477 + *
62478 + * Calculates location of bit which is responsible for allocation/freeing of
62479 + * block @*block. That location is represented by bitmap block number and offset
62480 + * within that bitmap block.
62481 + */
62482 +static void
62483 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62484 +             bmap_off_t *offset)
62485 +{
62486 +       struct super_block *super = get_current_context()->super;
62487 +       u64 quotient = *block;
62488 +
62489 +       *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62490 +       *bmap = quotient;
62491 +
62492 +       assert("zam-433", *bmap < get_nr_bmap(super));
62493 +       assert("", *offset < bmap_bit_count(super->s_blocksize));
62494 +}
62495 +
62496 +#if REISER4_DEBUG
62497 +/* Audited by: green(2002.06.12) */
62498 +static void
62499 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62500 +{
62501 +       struct super_block *sb = reiser4_get_current_sb();
62502 +
62503 +       assert("zam-436", sb != NULL);
62504 +
62505 +       assert("zam-455", start != NULL);
62506 +       assert("zam-437", *start != 0);
62507 +       assert("zam-541", !blocknr_is_fake(start));
62508 +       assert("zam-441", *start < reiser4_block_count(sb));
62509 +
62510 +       if (len != NULL) {
62511 +               assert("zam-438", *len != 0);
62512 +               assert("zam-442", *start + *len <= reiser4_block_count(sb));
62513 +       }
62514 +}
62515 +
62516 +static void check_bnode_loaded(const struct bitmap_node *bnode)
62517 +{
62518 +       assert("zam-485", bnode != NULL);
62519 +       assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62520 +       assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62521 +       assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62522 +       assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62523 +}
62524 +
62525 +#else
62526 +
62527 +#  define check_block_range(start, len) do { /* nothing */} while(0)
62528 +#  define check_bnode_loaded(bnode)     do { /* nothing */} while(0)
62529 +
62530 +#endif
62531 +
62532 +/* lower bnode->first_zero_bit to @offset when bits before it get freed;
62533 +   bnode should be locked (NOTE(review): presumably via bnode->sema - confirm) */
62534 +static inline void
62535 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62536 +{
62537 +       if (offset < bnode->first_zero_bit)
62538 +               bnode->first_zero_bit = offset;
62539 +}
62540 +
62541 +/* return a physical disk address for logical bitmap number @bmap */
62542 +/* FIXME-VS: this is somehow related to disk layout? */
62543 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62544 + * per block allocation so that performance is not affected.  Probably this
62545 + * whole file should be considered part of the disk layout plugin, and other
62546 + * disk layouts can use other defines and efficiency will not be significantly
62547 + * affected.  */
62548 +
62549 +#define REISER4_FIRST_BITMAP_BLOCK \
62550 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62551 +
62552 +/* Audited by: green(2002.06.12) */
62553 +static void
62554 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62555 +                  reiser4_block_nr * bnr)
62556 +{
62557 +
62558 +       assert("zam-390", bmap < get_nr_bmap(super));
62559 +
62560 +#ifdef CONFIG_REISER4_BADBLOCKS
62561 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62562 +       /* Check if the diskmap have this already, first. */
62563 +       if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62564 +               return;         /* Found it in diskmap */
62565 +#endif
62566 +       /* FIXME_ZAM: before discussing of disk layouts and disk format
62567 +          plugins I implement bitmap location scheme which is close to scheme
62568 +          used in reiser 3.6 */
62569 +       if (bmap == 0) {
62570 +               *bnr = REISER4_FIRST_BITMAP_BLOCK;
62571 +       } else {
62572 +               *bnr = bmap * bmap_bit_count(super->s_blocksize);
62573 +       }
62574 +}
62575 +
62576 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62577 +/* Audited by: green(2002.06.12) */
62578 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62579 +{
62580 +       *bnr =
62581 +           (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62582 +                               REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62583 +}
62584 +
62585 +/* Zero @bnode, then set up its semaphore (count 1) and "loaded" flag (0). */
62586 +static void
62587 +init_bnode(struct bitmap_node *bnode,
62588 +          struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62589 +{
62590 +       /* @super and @bmap are not used here */
62591 +       memset(bnode, 0, sizeof(*bnode));
62592 +       sema_init(&bnode->sema, 1);
62593 +       atomic_set(&bnode->loaded, 0);
62594 +}
62595 +/* jrelse() @node, flag it JNODE_HEARD_BANSHEE, then jput() it */
62596 +static void release(jnode * node)
62597 +{
62598 +       jrelse(node);
62599 +       JF_SET(node, JNODE_HEARD_BANSHEE);
62600 +       jput(node);
62601 +}
62602 +
62603 +/* Tear down @bnode. For internal bitmap.c use only: the jnodes are assumed to
62604 +   be under full control of the calling thread. */
62605 +static void done_bnode(struct bitmap_node *bnode)
62606 +{
62607 +       if (bnode == NULL)
62608 +               return;
62609 +       atomic_set(&bnode->loaded, 0);
62610 +       if (bnode->wjnode != NULL)
62611 +               release(bnode->wjnode);
62612 +       if (bnode->cjnode != NULL)
62613 +               release(bnode->cjnode);
62614 +       bnode->wjnode = bnode->cjnode = NULL;
62615 +}
62616 +
62617 +/* Prepare the jnode pair of @bnode.  Called only by load_and_lock_bnode() */
62618 +static int
62619 +prepare_bnode(struct bitmap_node *bnode, jnode ** cjnode_ret,
62620 +             jnode ** wjnode_ret)
62621 +{
62622 +       struct super_block *super;
62623 +       jnode *cjnode;
62624 +       jnode *wjnode;
62625 +       bmap_nr_t bmap;
62626 +       int ret;
62627 +
62628 +       super = reiser4_get_current_sb();
62629 +
62630 +       *wjnode_ret = wjnode = bnew();
62631 +       if (wjnode == NULL) {
62632 +               *cjnode_ret = NULL;
62633 +               return RETERR(-ENOMEM);
62634 +       }
62635 +       /* NOTE(review): if this fails, *wjnode_ret still holds an un-jref'd jnode */
62636 +       *cjnode_ret = cjnode = bnew();
62637 +       if (cjnode == NULL)
62638 +               return RETERR(-ENOMEM);
62639 +       /* index of @bnode within the bitmap_node array of @super */
62640 +       bmap = bnode - get_bnode(super, 0);
62641 +
62642 +       get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62643 +       get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62644 +
62645 +       jref(cjnode);
62646 +       jref(wjnode);
62647 +
62648 +       /* load commit bitmap */
62649 +       ret = jload_gfp(cjnode, GFP_NOFS, 1);
62650 +
62651 +       if (ret)
62652 +               goto error;
62653 +
62654 +       /* allocate memory for working bitmap block. Note that for
62655 +        * bitmaps jinit_new() doesn't actually modify node content,
62656 +        * so parallel calls to this are ok. */
62657 +       ret = jinit_new(wjnode, GFP_NOFS);
62658 +
62659 +       if (ret != 0) {
62660 +               jrelse(cjnode);
62661 +               goto error;
62662 +       }
62663 +
62664 +       return 0;
62665 +       /* failure after jref(): jput() both jnodes and hand back NULLs */
62666 +      error:
62667 +       jput(cjnode);
62668 +       jput(wjnode);
62669 +       *wjnode_ret = *cjnode_ret = NULL;
62670 +       return ret;
62671 +
62672 +}
62673 +
62674 +/* Check the bnode data on read: verify the adler32 checksum of the commit
62675 + * bitmap and that its very first bit is set. 0 on success, error otherwise. */
62676 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62677 +{
62678 +       void *data;
62679 +       int ret;
62680 +
62681 +       /* Check CRC */
62682 +       ret = bnode_check_adler32(bnode, blksize);
62683 +       if (ret)
62684 +               return ret;
62685 +
62686 +       /* commit bitmap data, i.e. the block past its CHECKSUM_SIZE bytes */
62687 +       data = bnode_commit_data(bnode);
62688 +
62689 +       /* Check the very first bit -- it must be busy. */
62690 +       if (!reiser4_test_bit(0, data)) {
62691 +               warning("vpf-1362", "The allocator block %llu is not marked "
62692 +                       "as used.", (unsigned long long)bnode->cjnode->blocknr);
62693 +               /* RETERR() as on the other error returns of this file */
62694 +               return RETERR(-EINVAL);
62695 +       }
62696 +
62697 +       return 0;
62698 +}
62699 +
62700 +/* Load the bitmap blocks of @bnode "on-demand" and take bnode->sema.
62701 + * Returns 0 with the semaphore held, an error code with it released. */
62702 +static int load_and_lock_bnode(struct bitmap_node *bnode)
62703 +{
62704 +       jnode *cjnode;
62705 +       jnode *wjnode;
62706 +       int ret;
62707 +
62708 +       assert("nikita-3040", schedulable());
62709 +
62710 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62711 + * need to be atomic, right? Just leave a comment that if bitmaps were
62712 + * unloadable, this would need to be atomic.  */
62713 +       if (atomic_read(&bnode->loaded)) {
62714 +               /* bitmap is already loaded, nothing to do */
62715 +               check_bnode_loaded(bnode);
62716 +               down(&bnode->sema);
62717 +               assert("nikita-2827", atomic_read(&bnode->loaded));
62718 +               return 0;
62719 +       }
62720 +
62721 +       /* both jnodes are set up before the semaphore is taken */
62722 +       ret = prepare_bnode(bnode, &cjnode, &wjnode);
62723 +       /* no release() on failure: a jnode handed back by a failed
62724 +        * prepare_bnode() was neither jref()-ed nor loaded */
62725 +       if (ret != 0)
62726 +               return ret;
62727 +
62728 +       down(&bnode->sema);
62729 +
62730 +       if (atomic_read(&bnode->loaded)) {
62731 +               /* race: someone already loaded bitmap while we were busy
62732 +                * initializing data; drop ours, keep the installed jnodes */
62733 +               check_bnode_loaded(bnode);
62734 +               release(wjnode);
62735 +               release(cjnode);
62736 +               return 0;
62737 +       }
62738 +
62739 +       assert("nikita-2822", cjnode != NULL);
62740 +       assert("nikita-2823", wjnode != NULL);
62741 +       assert("nikita-2824", jnode_is_loaded(cjnode));
62742 +       assert("nikita-2825", jnode_is_loaded(wjnode));
62743 +
62744 +       bnode->wjnode = wjnode;
62745 +       bnode->cjnode = cjnode;
62746 +
62747 +       ret = check_struct_bnode(bnode, current_blocksize);
62748 +       if (ret != 0) {
62749 +               /* uninstall the jnodes before the semaphore is dropped */
62750 +               bnode->wjnode = NULL;
62751 +               bnode->cjnode = NULL;
62752 +               up(&bnode->sema);
62753 +               release(wjnode);
62754 +               release(cjnode);
62755 +               return ret;
62756 +       }
62757 +       atomic_set(&bnode->loaded, 1);
62758 +       /* working bitmap is initialized by on-disk commit bitmap. This should
62759 +        * be performed under semaphore. */
62760 +       memcpy(bnode_working_data(bnode), bnode_commit_data(bnode),
62761 +              bmap_size(current_blocksize));
62762 +       return 0;
62763 +}
62764 +/* Counterpart of load_and_lock_bnode(): only drops bnode->sema again */
62765 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
62766 +{
62767 +       check_bnode_loaded(bnode);
62768 +       up(&bnode->sema);
62769 +}
62770 +
62771 +/* Does all block allocation work, but only for one bitmap block: scans
62772 +   [*@offset, @max_offset) forward for a run of free (zero) bits that is at
62773 +   least @min_len long, marks the run busy in the working bitmap and stores
62774 +   its first bit in *@offset. Returns the run length, 0 if no run was found,
62775 +   or the error from load_and_lock_bnode(). FIXME_ZAM: ranges cannot cross
62776 +   bitmap block responsibility zone boundaries (may matter in v4.x). */
62777 +static int
62778 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62779 +                         bmap_off_t max_offset, int min_len, int max_len)
62780 +{
62781 +       struct super_block *super = get_current_context()->super;
62782 +       struct bitmap_node *bnode = get_bnode(super, bmap);
62783 +
62784 +       char *data;
62785 +
62786 +       bmap_off_t search_end;
62787 +       bmap_off_t start;
62788 +       bmap_off_t end;
62789 +
62790 +       int set_first_zero_bit = 0;
62791 +
62792 +       int ret;
62793 +
62794 +       assert("zam-364", min_len > 0);
62795 +       assert("zam-365", max_len >= min_len);
62796 +       assert("zam-366", *offset <= max_offset);
62797 +
62798 +       ret = load_and_lock_bnode(bnode);
62799 +
62800 +       if (ret)
62801 +               return ret;
62802 +       /* from here on ret == 0, which is what "nothing found" returns */
62803 +       data = bnode_working_data(bnode);
62804 +
62805 +       start = *offset;
62806 +       /* never start below the cached first_zero_bit; refresh it below */
62807 +       if (bnode->first_zero_bit >= start) {
62808 +               start = bnode->first_zero_bit;
62809 +               set_first_zero_bit = 1;
62810 +       }
62811 +
62812 +       while (start + min_len < max_offset) {
62813 +               /* NOTE(review): '<' skips a run ending exactly at max_offset? */
62814 +               start =
62815 +                   reiser4_find_next_zero_bit((long *)data, max_offset, start);
62816 +               if (set_first_zero_bit) {
62817 +                       bnode->first_zero_bit = start;
62818 +                       set_first_zero_bit = 0;
62819 +               }
62820 +               if (start >= max_offset)
62821 +                       break;
62822 +               /* look at no more than max_len bits (assuming LIMIT() is a min) */
62823 +               search_end = LIMIT(start + max_len, max_offset);
62824 +               end =
62825 +                   reiser4_find_next_set_bit((long *)data, search_end, start);
62826 +               if (end >= start + min_len) {
62827 +                       /* we can't trust find_next_set_bit result if set bit
62828 +                          was not found, result may be bigger than
62829 +                          max_offset */
62830 +                       if (end > search_end)
62831 +                               end = search_end;
62832 +
62833 +                       ret = end - start;
62834 +                       *offset = start;
62835 +
62836 +                       reiser4_set_bits(data, start, end);
62837 +
62838 +                       /* FIXME: we may advance first_zero_bit if [start,
62839 +                          end] region overlaps the first_zero_bit point */
62840 +
62841 +                       break;
62842 +               }
62843 +               /* run too short: resume the scan just past the set bit */
62844 +               start = end + 1;
62845 +       }
62846 +
62847 +       release_and_unlock_bnode(bnode);
62848 +
62849 +       return ret;
62850 +}
62851 +
62852 +static int
62853 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
62854 +                          bmap_off_t end_offset, int min_len, int max_len)
62855 +{                               /* backward twin of search_one_bitmap_forward() */
62856 +       struct super_block *super = get_current_context()->super;
62857 +       struct bitmap_node *bnode = get_bnode(super, bmap);
62858 +       char *data;
62859 +       bmap_off_t start;
62860 +       int ret;
62861 +       /* scans from *@start_offset down to @end_offset, both inclusive */
62862 +       assert("zam-958", min_len > 0);
62863 +       assert("zam-959", max_len >= min_len);
62864 +       assert("zam-960", *start_offset >= end_offset);
62865 +
62866 +       ret = load_and_lock_bnode(bnode);
62867 +       if (ret)
62868 +               return ret;
62869 +       /* ret stays 0 ("nothing found") unless a run is allocated below */
62870 +       data = bnode_working_data(bnode);
62871 +       start = *start_offset;
62872 +
62873 +       while (1) {
62874 +               bmap_off_t end, search_end;
62875 +
62876 +               /* Find the highest zero bit at or below `start' */
62877 +               if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
62878 +                       break;
62879 +               /* Are there at least `min_len' bits from `start' down to
62880 +                * `end_offset'?  */
62881 +               if (start < end_offset + min_len - 1)
62882 +                       break;
62883 +
62884 +               /* Look at no more than `max_len' bits below `start', and
62885 +                * never below `end_offset'. */
62886 +               if (end_offset + max_len - 1 < start)
62887 +                       search_end = start - max_len + 1;
62888 +               else
62889 +                       search_end = end_offset;
62890 +               /* `end': one above the set bit found, else `search_end' */
62891 +               if (reiser4_find_last_set_bit(&end, data, search_end, start))
62892 +                       end = search_end;
62893 +               else
62894 +                       end++;
62895 +
62896 +               if (end + min_len <= start + 1) {
62897 +                       if (end < search_end)
62898 +                               end = search_end;
62899 +                       ret = start - end + 1;
62900 +                       *start_offset = end;    /* `end' is lowest offset */
62901 +                       assert("zam-987",
62902 +                              reiser4_find_next_set_bit(data, start + 1,
62903 +                                                        end) >= start + 1);
62904 +                       reiser4_set_bits(data, end, start + 1);
62905 +                       break;
62906 +               }
62907 +
62908 +               if (end <= end_offset)
62909 +                       /* left search boundary reached. */
62910 +                       break;
62911 +               start = end - 1;
62912 +       }
62913 +
62914 +       release_and_unlock_bnode(bnode);
62915 +       return ret;
62916 +}
62917 +
62918 +/* allocate a contiguous range of blocks in the bitmaps covering [*start, *end) */
62919 +static int bitmap_alloc_forward(reiser4_block_nr * start,
62920 +                               const reiser4_block_nr * end, int min_len,
62921 +                               int max_len)
62922 +{
62923 +       bmap_nr_t bmap, end_bmap;
62924 +       bmap_off_t offset, end_offset;
62925 +       int len;
62926 +
62927 +       reiser4_block_nr tmp;
62928 +
62929 +       struct super_block *super = get_current_context()->super;
62930 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62931 +
62932 +       parse_blocknr(start, &bmap, &offset);
62933 +       /* *end is exclusive: locate the last block, then step one bit past it */
62934 +       tmp = *end - 1;
62935 +       parse_blocknr(&tmp, &end_bmap, &end_offset);
62936 +       ++end_offset;
62937 +
62938 +       assert("zam-358", end_bmap >= bmap);
62939 +       assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
62940 +       /* every bitmap before the last one is searched up to its end */
62941 +       for (; bmap < end_bmap; bmap++, offset = 0) {
62942 +               len =
62943 +                   search_one_bitmap_forward(bmap, &offset, max_offset,
62944 +                                             min_len, max_len);
62945 +               if (len != 0)
62946 +                       goto out;
62947 +       }
62948 +       /* the last bitmap is searched only up to end_offset */
62949 +       len =
62950 +           search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
62951 +                                     max_len);
62952 +      out:                      /* note: *start is written even when len <= 0 */
62953 +       *start = bmap * max_offset + offset;
62954 +       return len;
62955 +}
62956 +
62957 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
62958 + * backward direction) */
62959 +static int bitmap_alloc_backward(reiser4_block_nr * start,
62960 +                                const reiser4_block_nr * end, int min_len,
62961 +                                int max_len)
62962 +{
62963 +       bmap_nr_t bmap, end_bmap;
62964 +       bmap_off_t offset, end_offset;
62965 +       int len;
62966 +       struct super_block *super = get_current_context()->super;
62967 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
62968 +       /* unlike the forward variant, *end is used as is (an inclusive bound) */
62969 +       parse_blocknr(start, &bmap, &offset);
62970 +       parse_blocknr(end, &end_bmap, &end_offset);
62971 +
62972 +       assert("zam-961", end_bmap <= bmap);
62973 +       assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
62974 +       /* bitmaps above the lowest one are searched all the way down to bit 0 */
62975 +       for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
62976 +               len =
62977 +                   search_one_bitmap_backward(bmap, &offset, 0, min_len,
62978 +                                              max_len);
62979 +               if (len != 0)
62980 +                       goto out;
62981 +       }
62982 +       /* the lowest bitmap is searched only down to end_offset */
62983 +       len =
62984 +           search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
62985 +                                      max_len);
62986 +      out:                      /* note: *start is written even when len <= 0 */
62987 +       *start = bmap * max_offset + offset;
62988 +       return len;
62989 +}
62990 +
62991 +/* forward half of alloc_blocks_bitmap(): search upwards from hint->blk */
62992 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
62993 +                               reiser4_block_nr *start, reiser4_block_nr *len)
62994 +{
62995 +       struct super_block *super = get_current_context()->super;
62996 +       int actual_len;
62997 +       /* on success *start and *len describe a range of 1..@needed blocks */
62998 +       reiser4_block_nr search_start;
62999 +       reiser4_block_nr search_end;
63000 +
63001 +       assert("zam-398", super != NULL);
63002 +       assert("zam-412", hint != NULL);
63003 +       assert("zam-397", hint->blk <= reiser4_block_count(super));
63004 +
63005 +       if (hint->max_dist == 0)
63006 +               search_end = reiser4_block_count(super);
63007 +       else
63008 +               search_end =
63009 +                   LIMIT(hint->blk + hint->max_dist,
63010 +                         reiser4_block_count(super));
63011 +
63012 +       /* We use @hint -> blk as a search start and search from it to the end
63013 +          of the disk or in given region if @hint -> max_dist is not zero */
63014 +       search_start = hint->blk;
63015 +
63016 +       actual_len =
63017 +           bitmap_alloc_forward(&search_start, &search_end, 1, needed);
63018 +       /* NOTE(review): search_start now holds what the first pass left in it */
63019 +       /* There is only one bitmap search if max_dist was specified or first
63020 +          pass was from the beginning of the bitmap. We also do one pass for
63021 +          scanning bitmap in backward direction. */
63022 +       if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
63023 +               /* next step is a scanning from 0 to search_start */
63024 +               search_end = search_start;
63025 +               search_start = 0;
63026 +               actual_len =
63027 +                   bitmap_alloc_forward(&search_start, &search_end, 1, needed);
63028 +       }
63029 +       if (actual_len == 0)
63030 +               return RETERR(-ENOSPC);
63031 +       if (actual_len < 0)
63032 +               return RETERR(actual_len);
63033 +       *len = actual_len;
63034 +       *start = search_start;
63035 +       return 0;
63036 +}
63037 +
63038 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
63039 +                                reiser4_block_nr * start,
63040 +                                reiser4_block_nr * len)
63041 +{                               /* backward half of alloc_blocks_bitmap() */
63042 +       reiser4_block_nr search_start;
63043 +       reiser4_block_nr search_end;
63044 +       int actual_len;
63045 +       /* super is only referenced by the assertions below */
63046 +       ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
63047 +
63048 +       assert("zam-969", super != NULL);
63049 +       assert("zam-970", hint != NULL);
63050 +       assert("zam-971", hint->blk <= reiser4_block_count(super));
63051 +       /* search down to block 0, or at most max_dist blocks below the hint */
63052 +       search_start = hint->blk;
63053 +       if (hint->max_dist == 0 || search_start <= hint->max_dist)
63054 +               search_end = 0;
63055 +       else
63056 +               search_end = search_start - hint->max_dist;
63057 +       /* single pass only: there is no wrap-around as in the forward case */
63058 +       actual_len =
63059 +           bitmap_alloc_backward(&search_start, &search_end, 1, needed);
63060 +       if (actual_len == 0)
63061 +               return RETERR(-ENOSPC);
63062 +       if (actual_len < 0)
63063 +               return RETERR(actual_len);
63064 +       *len = actual_len;
63065 +       *start = search_start;
63066 +       return 0;
63067 +}
63068 +
63069 +/* plugin->u.space_allocator.alloc_blocks(): 0 on success, else an error code */
63070 +int
63071 +alloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
63072 +                   reiser4_blocknr_hint * hint, int needed,
63073 +                   reiser4_block_nr * start, reiser4_block_nr * len)
63074 +{                               /* search direction is chosen by hint->backward */
63075 +       if (hint->backward)
63076 +               return alloc_blocks_backward(hint, needed, start, len);
63077 +       return alloc_blocks_forward(hint, needed, start, len);
63078 +}
63079 +
63080 +/* plugin->u.space_allocator.dealloc_blocks(). */
63081 +/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
63082 +   nodes deletion is deferred until transaction commit.  However, deallocation
63083 +   of temporary objects like wandered blocks and transaction commit records
63084 +   requires immediate node deletion from WORKING BITMAP.*/
63085 +void
63086 +dealloc_blocks_bitmap(reiser4_space_allocator * allocator UNUSED_ARG,
63087 +                     reiser4_block_nr start, reiser4_block_nr len)
63088 +{
63089 +       struct super_block *super = reiser4_get_current_sb();
63090 +
63091 +       bmap_nr_t bmap;
63092 +       bmap_off_t offset;
63093 +
63094 +       struct bitmap_node *bnode;
63095 +       int ret;
63096 +
63097 +       assert("zam-468", len != 0);
63098 +       check_block_range(&start, &len);
63099 +
63100 +       parse_blocknr(&start, &bmap, &offset);
63101 +       /* the whole range must lie inside one bitmap block */
63102 +       assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
63103 +
63104 +       bnode = get_bnode(super, bmap);
63105 +
63106 +       assert("zam-470", bnode != NULL);
63107 +
63108 +       ret = load_and_lock_bnode(bnode);
63109 +       assert("zam-481", ret == 0);
63110 +       if (ret != 0)
63111 +               /* bitmap is neither loaded nor locked: do not touch it */
63112 +               return;
63113 +       reiser4_clear_bits(bnode_working_data(bnode), offset,
63114 +                          (bmap_off_t) (offset + len));
63115 +       adjust_first_zero_bit(bnode, offset);
63116 +       release_and_unlock_bnode(bnode);
63117 +}
63118 +
63119 +/* plugin->u.space_allocator.check_blocks(). Empty unless REISER4_DEBUG. */
63120 +void
63121 +check_blocks_bitmap(const reiser4_block_nr * start,
63122 +                   const reiser4_block_nr * len, int desired)
63123 +{
63124 +#if REISER4_DEBUG
63125 +       struct super_block *super = reiser4_get_current_sb();
63126 +
63127 +       bmap_nr_t bmap;
63128 +       bmap_off_t start_offset;
63129 +       bmap_off_t end_offset;
63130 +
63131 +       struct bitmap_node *bnode;
63132 +       int ret;
63133 +       /* @desired != 0: all bits must be set; @desired == 0: all must be clear */
63134 +       assert("zam-622", len != NULL);
63135 +       check_block_range(start, len);
63136 +       parse_blocknr(start, &bmap, &start_offset);
63137 +       /* the checked range must lie inside one bitmap block */
63138 +       end_offset = start_offset + *len;
63139 +       assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
63140 +
63141 +       bnode = get_bnode(super, bmap);
63142 +
63143 +       assert("nikita-2215", bnode != NULL);
63144 +
63145 +       ret = load_and_lock_bnode(bnode);
63146 +       assert("zam-626", ret == 0);
63147 +
63148 +       assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
63149 +       /* the range is verified against the WORKING bitmap */
63150 +       if (desired) {
63151 +               assert("zam-623",
63152 +                      reiser4_find_next_zero_bit(bnode_working_data(bnode),
63153 +                                                 end_offset, start_offset)
63154 +                      >= end_offset);
63155 +       } else {
63156 +               assert("zam-624",
63157 +                      reiser4_find_next_set_bit(bnode_working_data(bnode),
63158 +                                                end_offset, start_offset)
63159 +                      >= end_offset);
63160 +       }
63161 +
63162 +       release_and_unlock_bnode(bnode);
63163 +#endif
63164 +}
63165 +
63166 +/* conditional insertion of @node into atom's overwrite set  if it was not there */
63167 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
63168 +{
63169 +       assert("zam-546", atom != NULL);
63170 +       assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
63171 +       assert("zam-548", node != NULL);
63172 +       /* locks are taken atom first, then jnode, and dropped in reverse */
63173 +       spin_lock_atom(atom);
63174 +       spin_lock_jnode(node);
63175 +       /* a node that already has an atom is only checked to belong to @atom */
63176 +       if (node->atom == NULL) {
63177 +               JF_SET(node, JNODE_OVRWR);
63178 +               insert_into_atom_ovrwr_list(atom, node);
63179 +       } else {
63180 +               assert("zam-549", node->atom == atom);
63181 +       }
63182 +
63183 +       spin_unlock_jnode(node);
63184 +       spin_unlock_atom(atom);
63185 +}
63186 +
63187 +/* blocknr_set_iterator() actor: clears one deleted block range in the COMMIT
63188 +   bitmap and puts the bitmap's commit jnode into @atom's overwrite set */
63189 +static int
63190 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
63191 +                         const reiser4_block_nr * len, void *data)
63192 +{
63193 +       /* @data points to a long long counter of freed blocks */
63194 +       bmap_nr_t bmap;
63195 +       bmap_off_t offset;
63196 +       int ret;
63197 +
63198 +       long long *blocks_freed_p = data;
63199 +
63200 +       struct bitmap_node *bnode;
63201 +
63202 +       struct super_block *sb = reiser4_get_current_sb();
63203 +
63204 +       check_block_range(start, len);
63205 +
63206 +       parse_blocknr(start, &bmap, &offset);
63207 +
63208 +       /* FIXME-ZAM: we assume that all block ranges are allocated by this
63209 +          bitmap-based allocator and each block range can't go over a zone of
63210 +          responsibility of one bitmap block; same assumption is used in
63211 +          other journal hooks in bitmap code. */
63212 +       bnode = get_bnode(sb, bmap);
63213 +       assert("zam-448", bnode != NULL);
63214 +
63215 +       /* it is safe to unlock atom with is in ASTAGE_PRE_COMMIT */
63216 +       assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
63217 +       ret = load_and_lock_bnode(bnode);
63218 +       if (ret)
63219 +               return ret;
63220 +
63221 +       /* put bnode into atom's overwrite set */
63222 +       cond_add_to_overwrite_set(atom, bnode->cjnode);
63223 +
63224 +       data = bnode_commit_data(bnode);
63225 +
63226 +       ret = bnode_check_crc(bnode);
63227 +       if (ret != 0)
63228 +               goto out;
63229 +       /* ret == 0 from here on; a NULL @len means a single block */
63230 +       if (len != NULL) {
63231 +               /* FIXME-ZAM: a check that all bits are set should be there */
63232 +               assert("zam-443",
63233 +                      offset + *len <= bmap_bit_count(sb->s_blocksize));
63234 +               reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
63235 +
63236 +               (*blocks_freed_p) += *len;
63237 +       } else {
63238 +               reiser4_clear_bit(offset, data);
63239 +               (*blocks_freed_p)++;
63240 +       }
63241 +
63242 +       bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
63243 +
63244 +      out:                      /* the bnode lock is dropped on every path */
63245 +       release_and_unlock_bnode(bnode);
63246 +       return ret;
63247 +}
63248 +
63249 +/* plugin->u.space_allocator.pre_commit_hook(). */
63250 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
63251 +   rest is done by transaction manager (allocate wandered locations for COMMIT
63252 +   BITMAP blocks, copy COMMIT BITMAP blocks data). */
63253 +/* Only one instance of this function can be running at one given time, because
63254 +   only one transaction can be committed at a time, therefore it is safe to
63255 +   access some global variables without any locking */
63256 +
63257 +int pre_commit_hook_bitmap(void)
63258 +{
63259 +       struct super_block *super = reiser4_get_current_sb();
63260 +       txn_atom *atom;
63261 +
63262 +       long long blocks_freed = 0;
63263 +
63264 +       atom = get_current_atom_locked();
63265 +       assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
63266 +       spin_unlock_atom(atom);
63267 +
63268 +       {                       /* scan atom's captured list and find all freshly allocated nodes,
63269 +                                * mark corresponded bits in COMMIT BITMAP as used */
63270 +               struct list_head *head = ATOM_CLEAN_LIST(atom);
63271 +               jnode *node = list_entry(head->next, jnode, capture_link);
63272 +
63273 +               while (head != &node->capture_link) {
63274 +                       /* we detect freshly allocated jnodes */
63275 +                       if (JF_ISSET(node, JNODE_RELOC)) {
63276 +                               int ret;
63277 +                               bmap_nr_t bmap;
63278 +
63279 +                               bmap_off_t offset;
63280 +                               bmap_off_t index;
63281 +                               struct bitmap_node *bn;
63282 +                               __u32 size = bmap_size(super->s_blocksize);
63283 +                               __u32 crc;
63284 +                               char byte;
63285 +
63286 +                               assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
63287 +                               assert("zam-460",
63288 +                                      !blocknr_is_fake(&node->blocknr));
63289 +
63290 +                               parse_blocknr(&node->blocknr, &bmap, &offset);
63291 +                               bn = get_bnode(super, bmap);
63292 +
63293 +                               index = offset >> 3;
63294 +                               assert("vpf-276", index < size);
63295 +                               /* load and lock bn before checking its crc */
63296 +                               check_bnode_loaded(bn);
63297 +                               ret = load_and_lock_bnode(bn);
63298 +                               if (ret != 0)
63299 +                                       return ret;
63300 +                               ret = bnode_check_crc(bn);
63301 +                               if (ret != 0) {
63302 +                                       release_and_unlock_bnode(bn);
63303 +                                       return ret;
63304 +                               }
63305 +                               byte = *(bnode_commit_data(bn) + index);
63306 +                               reiser4_set_bit(offset, bnode_commit_data(bn));
63307 +                               crc = adler32_recalc(bnode_commit_crc(bn), byte,
63308 +                                                    *(bnode_commit_data(bn) +
63309 +                                                      index),
63310 +                                                    size - index);
63311 +                               bnode_set_commit_crc(bn, crc);
63312 +                               release_and_unlock_bnode(bn);
63313 +
63314 +                               ret = bnode_check_crc(bn);
63315 +                               if (ret != 0)
63316 +                                       return ret;
63317 +
63318 +                               /* working of this depends on how it inserts
63319 +                                  new j-node into clean list, because we are
63320 +                                  scanning the same list now. It is OK, if
63321 +                                  insertion is done to the list front */
63322 +                               cond_add_to_overwrite_set(atom, bn->cjnode);
63323 +                       }
63324 +
63325 +                       node = list_entry(node->capture_link.next, jnode, capture_link);
63326 +               }
63327 +       }
63328 +       /* now clear the bits of every block range in the atom's delete set */
63329 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
63330 +                            &blocks_freed, 0);
63331 +       /* NOTE(review): errors reported by the actor above are not checked */
63332 +       blocks_freed -= atom->nr_blocks_allocated;
63333 +
63334 +       {
63335 +               reiser4_super_info_data *sbinfo;
63336 +
63337 +               sbinfo = get_super_private(super);
63338 +
63339 +               spin_lock_reiser4_super(sbinfo);
63340 +               sbinfo->blocks_free_committed += blocks_freed;
63341 +               spin_unlock_reiser4_super(sbinfo);
63342 +       }
63343 +
63344 +       return 0;
63345 +}
63346 +
63347 +/* plugin->u.space_allocator.init_allocator
63348 +    constructor of reiser4_space_allocator object. It is called on fs mount */
63349 +int
63350 +init_allocator_bitmap(reiser4_space_allocator * allocator,
63351 +                     struct super_block *super, void *arg UNUSED_ARG)
63352 +{
63353 +       struct bitmap_allocator_data *data = NULL;
63354 +       bmap_nr_t bitmap_blocks_nr;
63355 +       bmap_nr_t i;
63356 +
63357 +       assert("nikita-3039", schedulable());
63358 +
63359 +       /* getting memory for bitmap allocator private data holder */
63360 +       data =
63361 +               kmalloc(sizeof(struct bitmap_allocator_data), GFP_KERNEL);
63362 +
63363 +       if (data == NULL)
63364 +               return RETERR(-ENOMEM);
63365 +
63366 +       /* allocation and initialization for the array of bnodes */
63367 +       bitmap_blocks_nr = get_nr_bmap(super);
63368 +
63369 +       /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
63370 +          which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
63371 +          may I never meet someone who still uses the ia32 architecture when
63372 +          storage devices of that size enter the market, and wants to use ia32
63373 +          with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
63374 +          probably, another dynamic data structure should replace a static
63375 +          array of bnodes. */
63376 +       /* NOTE(review): the size product below is not checked for overflow */
63377 +       data->bitmap = vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
63378 +       if (data->bitmap == NULL) {
63379 +               kfree(data);
63380 +               return RETERR(-ENOMEM);
63381 +       }
63382 +
63383 +       for (i = 0; i < bitmap_blocks_nr; i++)
63384 +               init_bnode(data->bitmap + i, super, i);
63385 +       /* published before loading: the failure path below destroys through it */
63386 +       allocator->u.generic = data;
63387 +
63388 +#if REISER4_DEBUG
63389 +       get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
63390 +#endif
63391 +
63392 +       /* Load all bitmap blocks at mount time. */
63393 +       if (!test_bit
63394 +           (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
63395 +               __u64 start_time, elapsed_time;
63396 +               struct bitmap_node *bnode;
63397 +               int ret;
63398 +
63399 +               if (REISER4_DEBUG)
63400 +                       printk(KERN_INFO "loading reiser4 bitmap...");
63401 +               start_time = jiffies;
63402 +
63403 +               for (i = 0; i < bitmap_blocks_nr; i++) {
63404 +                       bnode = data->bitmap + i;
63405 +                       ret = load_and_lock_bnode(bnode);
63406 +                       if (ret) {
63407 +                               destroy_allocator_bitmap(allocator, super);     /* frees data too */
63408 +                               return ret;
63409 +                       }
63410 +                       release_and_unlock_bnode(bnode);
63411 +               }
63412 +
63413 +               elapsed_time = jiffies - start_time;
63414 +               if (REISER4_DEBUG)
63415 +                       printk("...done (%llu jiffies)\n",
63416 +                              (unsigned long long)elapsed_time);
63417 +       }
63418 +
63419 +       return 0;
63420 +}
63421 +
63422 +/* plugin->u.space_allocator.destroy_allocator
63423 +   destructor. It is called on fs unmount */
63424 +int
63425 +destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63426 +                        struct super_block *super)
63427 +{
63428 +       bmap_nr_t bitmap_blocks_nr;
63429 +       bmap_nr_t i;
63430 +
63431 +       struct bitmap_allocator_data *data = allocator->u.generic;
63432 +
63433 +       assert("zam-414", data != NULL);
63434 +       assert("zam-376", data->bitmap != NULL);
63435 +
63436 +       bitmap_blocks_nr = get_nr_bmap(super);
63437 +       /* also reached from init_allocator_bitmap() when a bitmap fails to load */
63438 +       for (i = 0; i < bitmap_blocks_nr; i++) {
63439 +               struct bitmap_node *bnode = data->bitmap + i;
63440 +
63441 +               down(&bnode->sema);
63442 +
63443 +#if REISER4_DEBUG
63444 +               if (atomic_read(&bnode->loaded)) {
63445 +                       jnode *wj = bnode->wjnode;
63446 +                       jnode *cj = bnode->cjnode;
63447 +
63448 +                       assert("zam-480", jnode_page(cj) != NULL);
63449 +                       assert("zam-633", jnode_page(wj) != NULL);
63450 +                       /* NOTE(review): compares wj with itself; cj probably meant */
63451 +                       assert("zam-634",
63452 +                              memcmp(jdata(wj), jdata(wj),
63453 +                                     bmap_size(super->s_blocksize)) == 0);
63454 +
63455 +               }
63456 +#endif
63457 +               done_bnode(bnode);
63458 +               up(&bnode->sema);
63459 +       }
63460 +       /* the bnode array came from vmalloc(), its holder from kmalloc() */
63461 +       vfree(data->bitmap);
63462 +       kfree(data);
63463 +
63464 +       allocator->u.generic = NULL;
63465 +
63466 +       return 0;
63467 +}
63468 +
63469 +/*
63470 +   Local variables:
63471 +   c-indentation-style: "K&R"
63472 +   mode-name: "LC"
63473 +   c-basic-offset: 8
63474 +   tab-width: 8
63475 +   fill-column: 80
63476 +   scroll-step: 1
63477 +   End:
63478 +*/
63479 diff --git a/fs/reiser4/plugin/space/bitmap.h b/fs/reiser4/plugin/space/bitmap.h
63480 new file mode 100644
63481 index 0000000..7047a55
63482 --- /dev/null
63483 +++ b/fs/reiser4/plugin/space/bitmap.h
63484 @@ -0,0 +1,47 @@
63485 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63486 +
63487 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63488 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63489 +
63490 +#include "../../dformat.h"
63491 +#include "../../block_alloc.h"
63492 +
63493 +#include <linux/types.h>       /* for __u??  */
63494 +#include <linux/fs.h>          /* for struct super_block  */
63495 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63496 +/* declarations of functions implementing methods of space allocator plugin for
63497 +   bitmap based allocator. The functions themselves are in bitmap.c */
63498 +extern int init_allocator_bitmap(reiser4_space_allocator *,
63499 +                                struct super_block *, void *);
63500 +extern int destroy_allocator_bitmap(reiser4_space_allocator *,
63501 +                                   struct super_block *);
63502 +extern int alloc_blocks_bitmap(reiser4_space_allocator *,
63503 +                              reiser4_blocknr_hint *, int needed,
63504 +                              reiser4_block_nr * start,
63505 +                              reiser4_block_nr * len);
63506 +extern void check_blocks_bitmap(const reiser4_block_nr *,
63507 +                               const reiser4_block_nr *, int);
63508 +
63509 +extern void dealloc_blocks_bitmap(reiser4_space_allocator *, reiser4_block_nr,
63510 +                                 reiser4_block_nr);
63511 +extern int pre_commit_hook_bitmap(void);
63512 +/* the remaining allocator methods expand to empty statements here */
63513 +#define post_commit_hook_bitmap() do{}while(0)
63514 +#define post_write_back_hook_bitmap() do{}while(0)
63515 +#define print_info_bitmap(pref, al) do{}while(0)
63516 +
63517 +typedef __u64 bmap_nr_t;       /* index of a bitmap block */
63518 +typedef __u32 bmap_off_t;      /* bit offset inside one bitmap block */
63519 +
63520 +#endif                         /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63521 +
63522 +/* Make Linus happy.
63523 +   Local variables:
63524 +   c-indentation-style: "K&R"
63525 +   mode-name: "LC"
63526 +   c-basic-offset: 8
63527 +   tab-width: 8
63528 +   fill-column: 120
63529 +   scroll-step: 1
63530 +   End:
63531 +*/
63532 diff --git a/fs/reiser4/plugin/space/space_allocator.h b/fs/reiser4/plugin/space/space_allocator.h
63533 new file mode 100644
63534 index 0000000..e796de5
63535 --- /dev/null
63536 +++ b/fs/reiser4/plugin/space/space_allocator.h
63537 @@ -0,0 +1,80 @@
63538 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63539 +
63540 +#ifndef __SPACE_ALLOCATOR_H__
63541 +#define __SPACE_ALLOCATOR_H__
63542 +
63543 +#include "../../forward.h"
63544 +#include "bitmap.h"
63545 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63546 + * but... */
63547 +#define DEF_SPACE_ALLOCATOR(allocator)                                                                                 \
63548 +                                                                                                                       \
63549 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque)               \
63550 +{                                                                                                                      \
63551 +       return init_allocator_##allocator (al, s, opaque);                                                              \
63552 +}                                                                                                                      \
63553 +                                                                                                                       \
63554 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s)                           \
63555 +{                                                                                                                      \
63556 +       destroy_allocator_##allocator (al, s);                                                                          \
63557 +}                                                                                                                      \
63558 +                                                                                                                       \
63559 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint,                           \
63560 +                                  int needed, reiser4_block_nr * start, reiser4_block_nr * len)                        \
63561 +{                                                                                                                      \
63562 +       return alloc_blocks_##allocator (al, hint, needed, start, len);                                                 \
63563 +}                                                                                                                      \
63564 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len)      \
63565 +{                                                                                                                      \
63566 +       dealloc_blocks_##allocator (al, start, len);                                                                    \
63567 +}                                                                                                                      \
63568 +                                                                                                                       \
63569 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired)                 \
63570 +{                                                                                                                      \
63571 +       check_blocks_##allocator (start, end, desired);                                                                 \
63572 +}                                                                                                                      \
63573 +                                                                                                                       \
63574 +static inline void sa_pre_commit_hook (void)                                                                           \
63575 +{                                                                                                                      \
63576 +       pre_commit_hook_##allocator ();                                                                                 \
63577 +}                                                                                                                      \
63578 +                                                                                                                       \
63579 +static inline void sa_post_commit_hook (void)                                                                          \
63580 +{                                                                                                                      \
63581 +       post_commit_hook_##allocator ();                                                                                \
63582 +}                                                                                                                      \
63583 +                                                                                                                       \
63584 +static inline void sa_post_write_back_hook (void)                                                                      \
63585 +{                                                                                                                      \
63586 +       post_write_back_hook_##allocator();                                                                             \
63587 +}                                                                                                                      \
63588 +                                                                                                                       \
63589 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al)                                    \
63590 +{                                                                                                                      \
63591 +       print_info_##allocator (prefix, al);                                                                            \
63592 +}
63593 +
63594 +DEF_SPACE_ALLOCATOR(bitmap)
63595 +
63596 +/* this object is part of reiser4 private in-core super block */
63597 +struct reiser4_space_allocator {
63598 +       union {
63599 +               /* space allocators might use this pointer to reference their
63600 +                * data. */
63601 +               void *generic;
63602 +       } u;
63603 +};
63604 +
63605 +/* __SPACE_ALLOCATOR_H__ */
63606 +#endif
63607 +
63608 +/* Make Linus happy.
63609 +   Local variables:
63610 +   c-indentation-style: "K&R"
63611 +   mode-name: "LC"
63612 +   c-basic-offset: 8
63613 +   tab-width: 8
63614 +   fill-column: 120
63615 +   scroll-step: 1
63616 +   End:
63617 +*/
63618 diff --git a/fs/reiser4/plugin/tail_policy.c b/fs/reiser4/plugin/tail_policy.c
63619 new file mode 100644
63620 index 0000000..43f4ae7
63621 --- /dev/null
63622 +++ b/fs/reiser4/plugin/tail_policy.c
63623 @@ -0,0 +1,113 @@
63624 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63625 + * reiser4/README */
63626 +
63627 +/* Formatting policy plugins */
63628 +
63629 +/*
63630 + * Formatting policy plugin is used by object plugin (of regular file) to
63631 + * convert file between two representations.
63632 + *
63633 + * Currently following policies are implemented:
63634 + *  never store file in formatted nodes
63635 + *  always store file in formatted nodes
63636 + *  store file in formatted nodes if file is smaller than 4 blocks (default)
63637 + */
63638 +
63639 +#include "../tree.h"
63640 +#include "../inode.h"
63641 +#include "../super.h"
63642 +#include "object.h"
63643 +#include "plugin.h"
63644 +#include "node/node.h"
63645 +#include "plugin_header.h"
63646 +
63647 +#include <linux/pagemap.h>
63648 +#include <linux/fs.h>          /* For struct inode */
63649 +
63650 +/**
63651 + * have_formatting_never - "never" formatting policy
63652 + * @inode: inode to operate on (ignored)
63653 + * @size: new object size (ignored)
63654 + *
63655 + * Always returns 0; installed below as .have_tail of the "never" plugin.
63656 + */
63657 +/* Never store file's tail as direct item */
63658 +/* Audited by: green(2002.06.12) */
63659 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
63660 +                     /* inode to operate on */ ,
63661 +                     loff_t size UNUSED_ARG /* new object size */ )
63662 +{
63663 +       return 0;
63664 +}
63665 +
63666 +/* Always store file's tail as direct item */
63667 +/* Audited by: green(2002.06.12) */
63668 +static int
63669 +have_formatting_always(const struct inode *inode UNUSED_ARG
63670 +                      /* inode to operate on */ ,
63671 +                      loff_t size UNUSED_ARG /* new object size */ )
63672 +{
63673 +       return 1;
63674 +}
63675 +
63676 +/* Test whether the file denoted by @inode should be stored as tails only (returns
63677 +   1: @size is at most 4 blocks of its super block) or as extents only (returns 0). */
63678 +static int
63679 +have_formatting_default(const struct inode *inode
63680 +                       /* inode to operate on; dereferenced, so not UNUSED_ARG */ ,
63681 +                       loff_t size /* new object size */ )
63682 +{
63683 +       assert("umka-1253", inode != NULL);
63684 +
63685 +       if (size > inode->i_sb->s_blocksize * 4)
63686 +               return 0;
63687 +
63688 +       return 1;
63689 +}
63690 +
63691 +/* tail plugins */
63692 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63693 +       [NEVER_TAILS_FORMATTING_ID] = {
63694 +               .h = {
63695 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63696 +                       .id = NEVER_TAILS_FORMATTING_ID,
63697 +                       .pops = NULL,
63698 +                       .label = "never",
63699 +                       .desc = "Never store file's tail",
63700 +                       .linkage = {NULL, NULL}
63701 +               },
63702 +               .have_tail = have_formatting_never
63703 +       },
63704 +       [ALWAYS_TAILS_FORMATTING_ID] = {
63705 +               .h = {
63706 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63707 +                       .id = ALWAYS_TAILS_FORMATTING_ID,
63708 +                       .pops = NULL,
63709 +                       .label = "always",
63710 +                       .desc = "Always store file's tail",
63711 +                       .linkage = {NULL, NULL}
63712 +               },
63713 +               .have_tail = have_formatting_always
63714 +       },
63715 +       [SMALL_FILE_FORMATTING_ID] = {
63716 +               .h = {
63717 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63718 +                       .id = SMALL_FILE_FORMATTING_ID,
63719 +                       .pops = NULL,
63720 +                       .label = "4blocks",
63721 +                       .desc = "store files shorter than 4 blocks in tail items",
63722 +                       .linkage = {NULL, NULL}
63723 +               },
63724 +               .have_tail = have_formatting_default
63725 +       }
63726 +};
63727 +
63728 +/*
63729 + * Local variables:
63730 + * c-indentation-style: "K&R"
63731 + * mode-name: "LC"
63732 + * c-basic-offset: 8
63733 + * tab-width: 8
63734 + * fill-column: 79
63735 + * End:
63736 + */
63737 diff --git a/fs/reiser4/pool.c b/fs/reiser4/pool.c
63738 new file mode 100644
63739 index 0000000..d1686c9
63740 --- /dev/null
63741 +++ b/fs/reiser4/pool.c
63742 @@ -0,0 +1,236 @@
63743 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63744 + * reiser4/README */
63745 +
63746 +/* Fast pool allocation.
63747 +
63748 +   There are situations when some sub-system normally asks memory allocator
63749 +   for only few objects, but under some circumstances could require much
63750 +   more. Typical and actually motivating example is tree balancing. It needs
63751 +   to keep track of nodes that were involved in it, and it is well-known
63752 +   that in a reasonably packed balanced tree most (92.938121%) of all
63753 +   balancings end up after working with only a few nodes (3.141592 on
63754 +   average). But in rare cases balancing can involve many more nodes
63755 +   (3*tree_height+1 in the extreme situation).
63756 +
63757 +   On the one hand, we don't want to resort to dynamic allocation (slab,
63758 +    malloc(), etc.) to allocate data structures required to keep track of
63759 +   nodes during balancing. On the other hand, we cannot statically allocate
63760 +   required amount of space on the stack, because first: it is useless wastage
63761 +   of precious resource, and second: this amount is unknown in advance (tree
63762 +   height can change).
63763 +
63764 +   Pools, implemented in this file are solution for this problem:
63765 +
63766 +    - some configurable amount of objects is statically preallocated on the
63767 +    stack
63768 +
63769 +    - if this preallocated pool is exhausted and more objects are requested,
63770 +    they are allocated dynamically.
63771 +
63772 +   Pools encapsulate distinction between statically and dynamically allocated
63773 +   objects. Both allocation and recycling look exactly the same.
63774 +
63775 +   To keep track of dynamically allocated objects, pool adds its own linkage
63776 +   to each object.
63777 +
63778 +   NOTE-NIKITA This linkage also contains some balancing-specific data. This
63779 +   is not perfect. On the other hand, balancing is currently the only client
63780 +   of pool code.
63781 +
63782 +   NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63783 +   functions in the style of tslist/tshash, i.e., make them unreadable, but
63784 +   type-safe.
63785 +
63786 +
63787 +*/
63788 +
63789 +#include "debug.h"
63790 +#include "pool.h"
63791 +#include "super.h"
63792 +
63793 +#include <linux/types.h>
63794 +#include <linux/err.h>
63795 +
63796 +/* initialize new pool object */
63797 +static void reiser4_init_pool_obj(reiser4_pool_header * h      /* pool object to
63798 +                                                                * initialize */ )
63799 +{
63800 +       INIT_LIST_HEAD(&h->usage_linkage);
63801 +       INIT_LIST_HEAD(&h->level_linkage);
63802 +       INIT_LIST_HEAD(&h->extra_linkage);
63803 +}
63804 +
63805 +/* initialize new pool */
63806 +void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63807 +                      size_t obj_size /* size of objects in @pool */ ,
63808 +                      int num_of_objs /* number of preallocated objects */ ,
63809 +                      char *data /* area for preallocated objects */ )
63810 +{
63811 +       reiser4_pool_header *h;
63812 +       int i;
63813 +
63814 +       assert("nikita-955", pool != NULL);
63815 +       assert("nikita-1044", obj_size > 0);
63816 +       assert("nikita-956", num_of_objs >= 0);
63817 +       assert("nikita-957", data != NULL);
63818 +
63819 +       memset(pool, 0, sizeof *pool);
63820 +       pool->obj_size = obj_size;
63821 +       pool->data = data;
63822 +       INIT_LIST_HEAD(&pool->free);
63823 +       INIT_LIST_HEAD(&pool->used);
63824 +       INIT_LIST_HEAD(&pool->extra);
63825 +       memset(data, 0, obj_size * num_of_objs);
63826 +       for (i = 0; i < num_of_objs; ++i) {
63827 +               h = (reiser4_pool_header *) (data + i * obj_size);
63828 +               reiser4_init_pool_obj(h);
63829 +               /* add pool header to the end of pool's free list */
63830 +               list_add_tail(&h->usage_linkage, &pool->free);
63831 +       }
63832 +}
63833 +
63834 +/* release pool resources
63835 +
63836 +   Currently a no-op: dynamically allocated ("extra") objects are kfree()d one
63837 +   by one in reiser4_pool_free(), so nothing is released here. NOTE(review):
63838 +   objects still linked on pool->extra at this point are not freed -- confirm
63839 +   that callers return every object before destroying the pool. */
63840 +void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
63841 +{
63842 +}
63843 +
63844 +/* allocate carry object from pool
63845 +
63846 +   First, try to get a preallocated object. If none is free, resort to dynamic
63847 +   allocation. Returns the object with everything after its pool header zeroed,
63848 +   or ERR_PTR(RETERR(-ENOMEM)) if the dynamic allocation fails.
63849 +*/
63850 +static void *reiser4_pool_alloc(reiser4_pool * pool    /* pool to allocate object
63851 +                                                        * from */ )
63852 +{
63853 +       reiser4_pool_header *result;
63854 +
63855 +       assert("nikita-959", pool != NULL);
63856 +
63857 +       if (!list_empty(&pool->free)) {
63858 +               struct list_head *linkage;
63859 +
63860 +               linkage = pool->free.next;
63861 +               list_del(linkage);
63862 +               INIT_LIST_HEAD(linkage);
63863 +               result = list_entry(linkage, reiser4_pool_header, usage_linkage);
63864 +               BUG_ON(!list_empty(&result->level_linkage) ||
63865 +                      !list_empty(&result->extra_linkage));
63866 +       } else {
63867 +               /* pool is empty. Extra allocations don't deserve dedicated
63868 +                  slab to be served from, as they are expected to be rare. */
63869 +               result = kmalloc(pool->obj_size, get_gfp_mask());
63870 +               if (result == NULL)
63871 +                       return ERR_PTR(RETERR(-ENOMEM));
63872 +               reiser4_init_pool_obj(result);
63873 +               /* being linked on pool->extra is what makes reiser4_pool_free() kfree() it */
63874 +               list_add(&result->extra_linkage, &pool->extra);
63875 +               BUG_ON(!list_empty(&result->usage_linkage) ||
63876 +                      !list_empty(&result->level_linkage));
63877 +       }
63878 +       ++pool->objs;
63879 +       list_add(&result->usage_linkage, &pool->used);
63880 +       memset(result + 1, 0, pool->obj_size - sizeof *result);
63881 +       return result;
63882 +}
63883 +
63884 +/* return object @h back to @pool: preallocated objects go onto the free list, extra ones are kfree()d */
63885 +void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h    /* object to return back
63886 +                                                                        * into @pool */ )
63887 +{
63888 +       assert("nikita-961", h != NULL);
63889 +       assert("nikita-962", pool != NULL);
63890 +
63891 +       --pool->objs;
63892 +       assert("nikita-963", pool->objs >= 0);
63893 +
63894 +       list_del_init(&h->usage_linkage);
63895 +       list_del_init(&h->level_linkage);
63896 +
63897 +       if (list_empty(&h->extra_linkage))
63898 +               /*
63899 +                * pool header is not an extra one. Push it onto free list
63900 +                * using usage_linkage
63901 +                */
63902 +               list_add(&h->usage_linkage, &pool->free);
63903 +       else {
63904 +               /* remove pool header from pool's extra list and kfree it */
63905 +               list_del(&h->extra_linkage);
63906 +               kfree(h);
63907 +       }
63908 +}
63909 +
63910 +/* add new object to the carry level list
63911 +
63912 +   Carry level is FIFO most of the time, but not always. Complications arise
63913 +   when make_space() function tries to go to the left neighbor and thus adds
63914 +   carry node before existing nodes, and also, when updating delimiting keys
63915 +   after moving data between two nodes, we want left node to be locked before
63916 +   right node.
63917 +
63918 +   Latter case is confusing at the first glance. Problem is that COP_UPDATE
63919 +   operation that updates delimiting keys is sometimes called with two nodes
63920 +   (when data are moved between two nodes) and sometimes with only one node
63921 +   (when leftmost item is deleted in a node). In any case operation is
63922 +   supplied with at least node whose left delimiting key is to be updated
63923 +   (that is "right" node).
63924 +
63925 +*/
63926 +reiser4_pool_header *add_obj(reiser4_pool * pool       /* pool from which to
63927 +                                                        * allocate new object */ ,
63928 +                            struct list_head *list,    /* list where to add
63929 +                                                        * object */
63930 +                            pool_ordering order /* where to add */ ,
63931 +                            reiser4_pool_header * reference    /* after (or
63932 +                                                                * before) which
63933 +                                                                * existing
63934 +                                                                * object to
63935 +                                                                * add */ )
63936 +{
63937 +       reiser4_pool_header *result;
63938 +
63939 +       assert("nikita-972", pool != NULL);
63940 +
63941 +       result = reiser4_pool_alloc(pool);
63942 +       if (IS_ERR(result))
63943 +               return result;
63944 +
63945 +       assert("nikita-973", result != NULL);
63946 +
63947 +       switch (order) {
63948 +       case POOLO_BEFORE:
63949 +               __list_add(&result->level_linkage,
63950 +                          reference->level_linkage.prev,
63951 +                          &reference->level_linkage);
63952 +               break;
63953 +       case POOLO_AFTER:
63954 +               __list_add(&result->level_linkage,
63955 +                          &reference->level_linkage,
63956 +                          reference->level_linkage.next);
63957 +               break;
63958 +       case POOLO_LAST:
63959 +               list_add_tail(&result->level_linkage, list);
63960 +               break;
63961 +       case POOLO_FIRST:
63962 +               list_add(&result->level_linkage, list);
63963 +               break;
63964 +       default:
63965 +               wrong_return_value("nikita-927", "order");
63966 +       }
63967 +       return result;
63968 +}
63969 +
63970 +/* Make Linus happy.
63971 +   Local variables:
63972 +   c-indentation-style: "K&R"
63973 +   mode-name: "LC"
63974 +   c-basic-offset: 8
63975 +   tab-width: 8
63976 +   fill-column: 120
63977 +   End:
63978 +*/
63979 diff --git a/fs/reiser4/pool.h b/fs/reiser4/pool.h
63980 new file mode 100644
63981 index 0000000..0be92df
63982 --- /dev/null
63983 +++ b/fs/reiser4/pool.h
63984 @@ -0,0 +1,54 @@
63985 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63986 +
63987 +/* Fast pool allocation */
63988 +
63989 +#ifndef __REISER4_POOL_H__
63990 +#define __REISER4_POOL_H__
63991 +
63992 +#include <linux/types.h>
63993 +
63994 +typedef struct reiser4_pool {
63995 +       size_t obj_size;
63996 +       int objs;
63997 +       char *data;
63998 +       struct list_head free;
63999 +       struct list_head used;
64000 +       struct list_head extra;
64001 +} reiser4_pool;
64002 +
64003 +typedef struct reiser4_pool_header {
64004 +       /* object is either on free or "used" lists */
64005 +       struct list_head usage_linkage;
64006 +       struct list_head level_linkage;
64007 +       struct list_head extra_linkage;
64008 +} reiser4_pool_header;
64009 +
64010 +typedef enum {
64011 +       POOLO_BEFORE,
64012 +       POOLO_AFTER,
64013 +       POOLO_LAST,
64014 +       POOLO_FIRST
64015 +} pool_ordering;
64016 +
64017 +/* pool manipulation functions */
64018 +
64019 +extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
64020 +                             int num_of_objs, char *data);
64021 +extern void reiser4_done_pool(reiser4_pool * pool);
64022 +extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
64023 +reiser4_pool_header *add_obj(reiser4_pool * pool, struct list_head * list,
64024 +                            pool_ordering order,
64025 +                            reiser4_pool_header * reference);
64026 +
64027 +/* __REISER4_POOL_H__ */
64028 +#endif
64029 +
64030 +/* Make Linus happy.
64031 +   Local variables:
64032 +   c-indentation-style: "K&R"
64033 +   mode-name: "LC"
64034 +   c-basic-offset: 8
64035 +   tab-width: 8
64036 +   fill-column: 120
64037 +   End:
64038 +*/
64039 diff --git a/fs/reiser4/readahead.c b/fs/reiser4/readahead.c
64040 new file mode 100644
64041 index 0000000..9c73264
64042 --- /dev/null
64043 +++ b/fs/reiser4/readahead.c
64044 @@ -0,0 +1,138 @@
64045 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64046 + * reiser4/README */
64047 +
64048 +#include "forward.h"
64049 +#include "tree.h"
64050 +#include "tree_walk.h"
64051 +#include "super.h"
64052 +#include "inode.h"
64053 +#include "key.h"
64054 +#include "znode.h"
64055 +
64056 +#include <linux/swap.h>                /* for totalram_pages */
64057 +
64058 +void init_ra_info(ra_info_t * rai)
64059 +{
64060 +       rai->key_to_stop = *min_key();
64061 +}
64062 +
64063 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
64064 +static inline int ra_adjacent_only(int flags)
64065 +{
64066 +       return flags & RA_ADJACENT_ONLY;
64067 +}
64068 +
64069 +/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns 1
64070 +   if right neighbor's first key is less or equal to readahead's stop key */
64071 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
64072 +{
64073 +       int result;
64074 +
64075 +       read_lock_dk(znode_get_tree(node));
64076 +       result = keyle(znode_get_rd_key(node), &info->key_to_stop);
64077 +       read_unlock_dk(znode_get_tree(node));
64078 +       return result;
64079 +}
64080 +
64081 +#define LOW_MEM_PERCENTAGE (5)
64082 +
64083 +static int low_on_memory(void)
64084 +{
64085 +       unsigned int freepages;
64086 +
64087 +       freepages = nr_free_pages();
64088 +       return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
64089 +}
64090 +
64091 +/* start read for @node and, if it is a leaf and memory is not low, for up to ra_params->max of its right neighbors */
64092 +void formatted_readahead(znode * node, ra_info_t * info)
64093 +{
64094 +       ra_params_t *ra_params;
64095 +       znode *cur;
64096 +       unsigned long i;        /* neighbors started so far; same type as ra_params->max */
64097 +       int grn_flags;
64098 +       lock_handle next_lh;
64099 +
64100 +       /* do nothing if node block number has not been assigned to node (which means it is still in cache). */
64101 +       if (blocknr_is_fake(znode_get_block(node)))
64102 +               return;
64103 +
64104 +       ra_params = get_current_super_ra_params();
64105 +
64106 +       if (znode_page(node) == NULL)
64107 +               jstartio(ZJNODE(node));
64108 +
64109 +       if (znode_get_level(node) != LEAF_LEVEL)
64110 +               return;
64111 +
64112 +       /* don't waste memory for read-ahead when low on memory */
64113 +       if (low_on_memory())
64114 +               return;
64115 +
64116 +       /* We can have locked nodes on upper tree levels, in this situation lock
64117 +          priorities do not help to resolve deadlocks, we have to use TRY_LOCK
64118 +          here. */
64119 +       grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
64120 +
64121 +       i = 0;
64122 +       cur = zref(node);
64123 +       init_lh(&next_lh);
64124 +       while (i < ra_params->max) {
64125 +               const reiser4_block_nr *nextblk;
64126 +
64127 +               if (!should_readahead_neighbor(cur, info))
64128 +                       break;
64129 +
64130 +               if (reiser4_get_right_neighbor
64131 +                   (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
64132 +                       break;
64133 +
64134 +               nextblk = znode_get_block(next_lh.node);
64135 +               if (blocknr_is_fake(nextblk) ||
64136 +                   (ra_adjacent_only(ra_params->flags)
64137 +                    && *nextblk != *znode_get_block(cur) + 1)) {
64138 +                       break;
64139 +               }
64140 +
64141 +               zput(cur);
64142 +               cur = zref(next_lh.node);
64143 +               done_lh(&next_lh);
64144 +               if (znode_page(cur) == NULL)
64145 +                       jstartio(ZJNODE(cur));
64146 +               else
64147 +                       /* Do not scan read-ahead window if pages already
64148 +                        * allocated (and i/o already started). */
64149 +                       break;
64150 +
64151 +               i++;
64152 +       }
64153 +       zput(cur);
64154 +       done_lh(&next_lh);
64155 +}
64156 +
64157 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
64158 +{
64159 +       reiser4_key *stop_key;
64160 +
64161 +       assert("nikita-3542", dir != NULL);
64162 +       assert("nikita-3543", tap != NULL);
64163 +
64164 +       stop_key = &tap->ra_info.key_to_stop;
64165 +       /* initialize readdir readahead information: include into readahead
64166 +        * stat data of all files of the directory */
64167 +       set_key_locality(stop_key, get_inode_oid(dir));
64168 +       set_key_type(stop_key, KEY_SD_MINOR);
64169 +       set_key_ordering(stop_key, get_key_ordering(max_key()));
64170 +       set_key_objectid(stop_key, get_key_objectid(max_key()));
64171 +       set_key_offset(stop_key, get_key_offset(max_key()));
64172 +}
64173 +
64174 +/*
64175 +   Local variables:
64176 +   c-indentation-style: "K&R"
64177 +   mode-name: "LC"
64178 +   c-basic-offset: 8
64179 +   tab-width: 8
64180 +   fill-column: 80
64181 +   End:
64182 +*/
64183 diff --git a/fs/reiser4/readahead.h b/fs/reiser4/readahead.h
64184 new file mode 100644
64185 index 0000000..3682eb6
64186 --- /dev/null
64187 +++ b/fs/reiser4/readahead.h
64188 @@ -0,0 +1,48 @@
64189 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64190 +
64191 +#ifndef __READAHEAD_H__
64192 +#define __READAHEAD_H__
64193 +
64194 +#include "key.h"
64195 +
64196 +typedef enum {
64197 +       RA_ADJACENT_ONLY = 1,   /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
64198 +} ra_global_flags;
64199 +
64200 +/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
64201 +typedef struct formatted_read_ahead_params {
64202 +       unsigned long max;      /* request not more than this amount of nodes. Default is totalram_pages / 4 */
64203 +       int flags;
64204 +} ra_params_t;
64205 +
64206 +typedef struct {
64207 +       reiser4_key key_to_stop;
64208 +} ra_info_t;
64209 +
64210 +void formatted_readahead(znode *, ra_info_t *);
64211 +void init_ra_info(ra_info_t * rai);
64212 +
64213 +struct reiser4_file_ra_state {
64214 +       loff_t start;           /* Current window */
64215 +       loff_t size;
64216 +       loff_t next_size;       /* Next window size */
64217 +       loff_t ahead_start;     /* Ahead window */
64218 +       loff_t ahead_size;
64219 +       loff_t max_window_size; /* Maximum readahead window */
64220 +       loff_t slow_start;      /* enlarging r/a size algorithm. */
64221 +};
64222 +
64223 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
64224 +
64225 +/* __READAHEAD_H__ */
64226 +#endif
64227 +
64228 +/*
64229 +   Local variables:
64230 +   c-indentation-style: "K&R"
64231 +   mode-name: "LC"
64232 +   c-basic-offset: 8
64233 +   tab-width: 8
64234 +   fill-column: 120
64235 +   End:
64236 +*/
64237 diff --git a/fs/reiser4/reiser4.h b/fs/reiser4/reiser4.h
64238 new file mode 100644
64239 index 0000000..cdae341
64240 --- /dev/null
64241 +++ b/fs/reiser4/reiser4.h
64242 @@ -0,0 +1,275 @@
64243 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64244 + * reiser4/README */
64245 +
64246 +/* definitions of common constants used by reiser4 */
64247 +
64248 +#if !defined( __REISER4_H__ )
64249 +#define __REISER4_H__
64250 +
64251 +#include <asm/param.h>         /* for HZ */
64252 +#include <linux/errno.h>
64253 +#include <linux/types.h>
64254 +#include <linux/fs.h>
64255 +#include <linux/hardirq.h>
64256 +#include <linux/sched.h>
64257 +
64258 +/*
64259 + * reiser4 compilation options.
64260 + */
64261 +
64262 +#if defined(CONFIG_REISER4_DEBUG)
64263 +/* turn on assertion checks */
64264 +#define REISER4_DEBUG (1)
64265 +#else
64266 +#define REISER4_DEBUG (0)
64267 +#endif
64268 +
64269 +#if defined(CONFIG_ZLIB_INFLATE)
64270 +/* turn on zlib */
64271 +#define REISER4_ZLIB (1)
64272 +#else
64273 +#define REISER4_ZLIB (0)
64274 +#endif
64275 +
64276 +#if defined(CONFIG_CRYPTO_SHA256)
64277 +#define REISER4_SHA256 (1)
64278 +#else
64279 +#define REISER4_SHA256 (0)
64280 +#endif
64281 +
64282 +#if defined(CONFIG_CRYPTO_AES_586)
64283 +#define REISER4_AES (1)
64284 +#else
64285 +#define REISER4_AES (0)
64286 +#endif
64287 +
64288 +/*
64289 + * Turn on large keys mode. In this mode (which is default), reiser4 key has 4
64290 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
64291 + * components. Additional component, referred to as "ordering" is used to
64292 + * order items from which given object is composed of. As such, ordering is
64293 + * placed between locality and objectid. For directory item ordering contains
64294 + * initial prefix of the file name this item is for. This sorts all directory
64295 + * items within given directory lexicographically (but see
64296 + * fibration.[ch]). For file body and stat-data, ordering contains initial
64297 + * prefix of the name file was initially created with. In the common case
64298 + * (files with single name) this allows to order file bodies and stat-datas in
64299 + * the same order as their respective directory entries, thus speeding up
64300 + * readdir.
64301 + *
64302 + * Note, that kernel can only mount file system with the same key size as one
64303 + * it is compiled for, so flipping this option may render your data
64304 + * inaccessible.
64305 + */
64306 +#define REISER4_LARGE_KEY (1)
64307 +/*#define REISER4_LARGE_KEY (0)*/
64308 +
64309 +/*#define GUESS_EXISTS 1*/
64310 +
64311 +/*
64312 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
64313 + * option
64314 + */
64315 +
64316 +extern const char *REISER4_SUPER_MAGIC_STRING;
64317 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
64318 +                                        * beginning of device */
64319 +
64320 +/* here go tunable parameters that are not worth special entry in kernel
64321 +   configuration */
64322 +
64323 +/* default number of slots in coord-by-key caches */
64324 +#define CBK_CACHE_SLOTS    (16)
64325 +/* how many elementary tree operation to carry on the next level */
64326 +#define CARRIES_POOL_SIZE        (5)
64327 +/* size of pool of preallocated nodes for carry process. */
64328 +#define NODES_LOCKED_POOL_SIZE   (5)
64329 +
64330 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64331 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64332 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
64333 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
64334 +
64335 +/* we are supporting reservation of disk space on uid basis */
64336 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
64337 +/* we are supporting reservation of disk space for groups */
64338 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
64339 +/* we are supporting reservation of disk space for root */
64340 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
64341 +/* we use rapid flush mode, see flush.c for comments.  */
64342 +#define REISER4_USE_RAPID_FLUSH (1)
64343 +
64344 +/*
64345 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
64346 + */
64347 +#define REISER4_USE_ENTD (1)
64348 +
64349 +/* key allocation is Plan-A */
64350 +#define REISER4_PLANA_KEY_ALLOCATION (1)
64351 +/* key allocation follows good old 3.x scheme */
64352 +#define REISER4_3_5_KEY_ALLOCATION (0)
64353 +
64354 +/* size of hash-table for znodes */
64355 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64356 +
64357 +/* number of buckets in lnode hash-table */
64358 +#define LNODE_HTABLE_BUCKETS (1024)
64359 +
64360 +/* some ridiculously high maximal limit on height of znode tree. This
64361 +    is used in declaration of various per level arrays and
64362 +    to allocate statistics gathering array for per-level stats. */
64363 +#define REISER4_MAX_ZTREE_HEIGHT     (8)
64364 +
64365 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64366 +
64367 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
64368 +   sequential search is on average faster than binary. This is because
64369 +   of better optimization and because sequential search is more CPU
64370 +   cache friendly. This number (25) was found by experiments on dual AMD
64371 +   Athlon(tm), 1400MHz.
64372 +
64373 +   NOTE: testing in kernel has shown that binary search is more effective than
64374 +   implied by results of the user level benchmarking. Probably because in the
64375 +   node keys are separated by other data. So value was adjusted after few
64376 +   tests. More thorough tuning is needed.
64377 +*/
64378 +#define REISER4_SEQ_SEARCH_BREAK      (3)
64379 +
64380 +/* don't allow tree to be lower than this */
64381 +#define REISER4_MIN_TREE_HEIGHT       (TWIG_LEVEL)
64382 +
64383 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64384 + * available memory. */
64385 +/* Default value of maximal atom size. Can be overwritten by
64386 +   tmgr.atom_max_size mount option. By default infinity. */
64387 +#define REISER4_ATOM_MAX_SIZE         ((unsigned)(~0))
64388 +
64389 +/* Default value of maximal atom age (in jiffies). After reaching this age
64390 +   atom will be forced to commit, either synchronously or asynchronously. Can
64391 +   be overwritten by tmgr.atom_max_age mount option. */
64392 +#define REISER4_ATOM_MAX_AGE          (600 * HZ)
64393 +
64394 +/* sleeping period for ktxnmrgd */
64395 +#define REISER4_TXNMGR_TIMEOUT  (5 * HZ)
64396 +
64397 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds (0 jiffies when HZ < 334). */
64398 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64399 +
64400 +/* start complaining after that many restarts in coord_by_key().
64401 +
64402 +   This either means incredibly heavy contention for this part of a tree, or
64403 +   some corruption or bug.
64404 +*/
64405 +#define REISER4_CBK_ITERATIONS_LIMIT  (100)
64406 +
64407 +/* return -EIO after that many iterations in coord_by_key().
64408 +
64409 +   I have witnessed more than 800 iterations (in 30 thread test) before cbk
64410 +   finished. --nikita
64411 +*/
64412 +#define REISER4_MAX_CBK_ITERATIONS    500000
64413 +
64414 +/* put a per-inode limit on maximal number of directory entries with identical
64415 +   keys in hashed directory.
64416 +
64417 +   Disable this until inheritance interfaces stabilize: we need some way to
64418 +   set per directory limit.
64419 +*/
64420 +#define REISER4_USE_COLLISION_LIMIT    (0)
64421 +
64422 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64423 +   will force them to be relocated. */
64424 +#define FLUSH_RELOCATE_THRESHOLD 64
64425 +/* If flush can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
64426 +   from the preceder it will relocate to that position. */
64427 +#define FLUSH_RELOCATE_DISTANCE  64
64428 +
64429 +/* If we have written this much or more blocks before encountering busy jnode
64430 +   in flush list - abort flushing hoping that next time we get called
64431 +   this jnode will be clean already, and we will save some seeks. */
64432 +#define FLUSH_WRITTEN_THRESHOLD 50
64433 +
64434 +/* The maximum number of nodes to scan left on a level during flush. */
64435 +#define FLUSH_SCAN_MAXNODES 10000
64436 +
64437 +/* per-atom limit of flushers */
64438 +#define ATOM_MAX_FLUSHERS (1)
64439 +
64440 +/* default tracing buffer size */
64441 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
64442 +
64443 +/* what size units of IO we would like cp, etc., to use, in writing to
64444 +   reiser4. In bytes.
64445 +
64446 +   Can be overwritten by optimal_io_size mount option.
64447 +*/
64448 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64449 +
64450 +/* see comments in inode.c:oid_to_uino() */
64451 +#define REISER4_UINO_SHIFT (1 << 30)
64452 +
64453 +/* Mark function argument as unused to avoid compiler warnings. */
64454 +#define UNUSED_ARG __attribute__((unused))
64455 +
64456 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64457 +#define NONNULL __attribute__((nonnull))
64458 +#else
64459 +#define NONNULL
64460 +#endif
64461 +
64462 +/* master super block offset in bytes.*/
64463 +#define REISER4_MASTER_OFFSET 65536
64464 +
64465 +/* size of VFS block */
64466 +#define VFS_BLKSIZE 512
64467 +/* number of bits in size of VFS block (512==2^9) */
64468 +#define VFS_BLKSIZE_BITS 9
64469 +
64470 +#define REISER4_I reiser4_inode_data
64471 +
64472 +/* implication */
64473 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64474 +/* logical equivalence */
64475 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
64476 +
64477 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64478 +
64479 +#define NOT_YET                       (0)
64480 +
64481 +/** Reiser4 specific error codes **/
64482 +
64483 +#define REISER4_ERROR_CODE_BASE 500
64484 +
64485 +/* Neighbor is not available (side neighbor or parent) */
64486 +#define E_NO_NEIGHBOR  (REISER4_ERROR_CODE_BASE)
64487 +
64488 +/* Node was not found in cache */
64489 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64490 +
64491 +/* node has no free space enough for completion of balancing operation */
64492 +#define E_NODE_FULL    (REISER4_ERROR_CODE_BASE + 2)
64493 +
64494 +/* repeat operation */
64495 +#define E_REPEAT       (REISER4_ERROR_CODE_BASE + 3)
64496 +
64497 +/* deadlock happens */
64498 +#define E_DEADLOCK     (REISER4_ERROR_CODE_BASE + 4)
64499 +
64500 +/* operation cannot be performed, because it would block and non-blocking mode
64501 + * was requested. */
64502 +#define E_BLOCK        (REISER4_ERROR_CODE_BASE + 5)
64503 +
64504 +/* wait some event (depends on context), then repeat */
64505 +#define E_WAIT         (REISER4_ERROR_CODE_BASE + 6)
64506 +
64507 +#endif                         /* __REISER4_H__ */
64508 +
64509 +/* Make Linus happy.
64510 +   Local variables:
64511 +   c-indentation-style: "K&R"
64512 +   mode-name: "LC"
64513 +   c-basic-offset: 8
64514 +   tab-width: 8
64515 +   fill-column: 120
64516 +   End:
64517 +*/
64518 diff --git a/fs/reiser4/safe_link.c b/fs/reiser4/safe_link.c
64519 new file mode 100644
64520 index 0000000..cc65a81
64521 --- /dev/null
64522 +++ b/fs/reiser4/safe_link.c
64523 @@ -0,0 +1,351 @@
64524 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64525 + * reiser4/README */
64526 +
64527 +/* Safe-links. */
64528 +
64529 +/*
64530 + * Safe-links are used to maintain file system consistency during operations
64531 + * that span multiple transactions. For example:
64532 + *
64533 + *     1. Unlink. UNIX supports "open-but-unlinked" files, that is files
64534 + *     without user-visible names in the file system, but still opened by some
64535 + *     active process. What happens here is that unlink proper (i.e., removal
64536 + *     of the last file name) and file deletion (truncate of file body to zero
64537 + *     and deletion of stat-data, that happens when last file descriptor is
64538 + *     closed), may belong to different transactions T1 and T2. If a crash
64539 + *     happens after T1 commit, but before T2 commit, on-disk file system has
64540 + *     a file without name, that is, disk space leak.
64541 + *
64542 + *     2. Truncate. Truncate of large file may span multiple transactions. If
64543 + *     system crashes while truncate was in-progress, file is left partially
64544 + *     truncated, which violates "atomicity guarantees" of reiser4, viz. that
64545 + *     every system call is atomic.
64546 + *
64547 + * Safe-links address both above cases. Basically, safe-link is a way to post
64548 + * some operation to be executed during commit of some other transaction than
64549 + * current one. (Another way to look at the safe-link is to interpret it as a
64550 + * logical logging.)
64551 + *
64552 + * Specifically, at the beginning of unlink safe-link is inserted in the
64553 + * tree. This safe-link is normally removed by file deletion code (during
64554 + * transaction T2 in the above terms). Truncate also inserts safe-link that is
64555 + * normally removed when truncate operation is finished.
64556 + *
64557 + * This means, that in the case of "clean umount" there are no safe-links in
64558 + * the tree. If safe-links are observed during mount, it means that (a) system
64559 + * was terminated abnormally, and (b) safe-links correspond to the "pending"
64560 + * (i.e., not finished) operations that were in-progress during system
64561 + * termination. Each safe-link records enough information to complete
64562 + * corresponding operation, and mount simply "replays" them (hence, the
64563 + * analogy with the logical logging).
64564 + *
64565 + * Safe-links are implemented as blackbox items (see
64566 + * plugin/item/blackbox.[ch]).
64567 + *
64568 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
64569 + * list" there.
64570 + */
64571 +
64572 +#include "safe_link.h"
64573 +#include "debug.h"
64574 +#include "inode.h"
64575 +
64576 +#include "plugin/item/blackbox.h"
64577 +
64578 +#include <linux/fs.h>
64579 +
64580 +/*
64581 + * On-disk format of safe-link.
64582 + */
64583 +typedef struct safelink {
64584 +       reiser4_key sdkey;      /* key of stat-data for the file safe-link is
64585 +                                * for */
64586 +       d64 size;               /* size to which file should be truncated */
64587 +} safelink_t;
64588 +
64589 +/*
64590 + * locality where safe-link items are stored. Next to the objectid of root
64591 + * directory.
64592 + */
64593 +static oid_t safe_link_locality(reiser4_tree * tree)
64594 +{
64595 +       return get_key_objectid(get_super_private(tree->super)->df_plug->
64596 +                               root_dir_key(tree->super)) + 1;
64597 +}
64598 +
64599 +/*
64600 +  Construct a key for the safe-link. Key has the following format:
64601 +
64602 +|        60     | 4 |        64        | 4 |      60       |         64       |
64603 ++---------------+---+------------------+---+---------------+------------------+
64604 +|   locality    | 0 |        0         | 0 |   objectid    |     link type    |
64605 ++---------------+---+------------------+---+---------------+------------------+
64606 +|                   |                  |                   |                  |
64607 +|     8 bytes       |     8 bytes      |      8 bytes      |      8 bytes     |
64608 +
64609 +   This is in large keys format. In small keys format second 8 byte chunk is
64610 +   out. Locality is a constant returned by safe_link_locality(). objectid is
64611 +   an oid of a file on which operation protected by this safe-link is
64612 +   performed. link-type is used to distinguish safe-links for different
64613 +   operations.
64614 +
64615 + */
64616 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64617 +                                  reiser4_safe_link_t link, reiser4_key * key)
64618 +{
64619 +       reiser4_key_init(key);
64620 +       set_key_locality(key, safe_link_locality(tree));
64621 +       set_key_objectid(key, oid);
64622 +       set_key_offset(key, link);
64623 +       return key;
64624 +}
64625 +
64626 +/*
64627 + * how much disk space is necessary to insert and remove (in the
64628 + * error-handling path) safe-link.
64629 + */
64630 +static __u64 safe_link_tograb(reiser4_tree * tree)
64631 +{
64632 +       return
64633 +           /* insert safe link */
64634 +           estimate_one_insert_item(tree) +
64635 +           /* remove safe link */
64636 +           estimate_one_item_removal(tree) +
64637 +           /* drill to the leaf level during insertion */
64638 +           1 + estimate_one_insert_item(tree) +
64639 +           /*
64640 +            * possible update of existing safe-link. Actually, if
64641 +            * safe-link existed already (we failed to remove it), then no
64642 +            * insertion is necessary, so this term is already "covered",
64643 +            * but for simplicity let's leave it.
64644 +            */
64645 +           1;
64646 +}
64647 +
64648 +/*
64649 + * grab enough disk space to insert and remove (in the error-handling path)
64650 + * safe-link.
64651 + */
64652 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64653 +{
64654 +       int result;
64655 +
64656 +       grab_space_enable();
64657 +       /* The sbinfo->delete semaphore can be taken here.
64658 +        * safe_link_release() should be called before leaving reiser4
64659 +        * context. */
64660 +       result =
64661 +           reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64662 +       grab_space_enable();
64663 +       return result;
64664 +}
64665 +
64666 +/*
64667 + * release unused disk space reserved by safe_link_grab().
64668 + */
64669 +void safe_link_release(reiser4_tree * tree)
64670 +{
64671 +       reiser4_release_reserved(tree->super);
64672 +}
64673 +
64674 +/*
64675 + * insert into tree safe-link for operation @link on inode @inode.
64676 + */
64677 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64678 +{
64679 +       reiser4_key key;
64680 +       safelink_t sl;
64681 +       int length;
64682 +       int result;
64683 +       reiser4_tree *tree;
64684 +
64685 +       build_sd_key(inode, &sl.sdkey);
64686 +       length = sizeof sl.sdkey;
64687 +
64688 +       if (link == SAFE_TRUNCATE) {
64689 +               /*
64690 +                * for truncate we have to store final file length also,
64691 +                * expand item.
64692 +                */
64693 +               length += sizeof(sl.size);
64694 +               put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64695 +       }
64696 +       tree = tree_by_inode(inode);
64697 +       build_link_key(tree, get_inode_oid(inode), link, &key);
64698 +
64699 +       result = store_black_box(tree, &key, &sl, length);
64700 +       if (result == -EEXIST)
64701 +               result = update_black_box(tree, &key, &sl, length);
64702 +       return result;
64703 +}
64704 +
64705 +/*
64706 + * remove safe-link corresponding to the operation @link on the object with
64707 + * oid @oid from @tree. Returns the result of kill_black_box().
64708 + */
64709 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64710 +{
64711 +       reiser4_key key;
64712 +
64713 +       return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64714 +}
64715 +
64716 +/*
64717 + * in-memory structure to keep information extracted from safe-link. This is
64718 + * used to iterate over all safe-links.
64719 + */
64720 +typedef struct {
64721 +       reiser4_tree *tree;     /* internal tree */
64722 +       reiser4_key key;        /* safe-link key */
64723 +       reiser4_key sdkey;      /* key of object stat-data */
64724 +       reiser4_safe_link_t link;       /* safe-link type */
64725 +       oid_t oid;              /* object oid */
64726 +       __u64 size;             /* final size for truncate */
64727 +} safe_link_context;
64728 +
64729 +/*
64730 + * start iterating over all safe-links.
64731 + */
64732 +static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64733 +{
64734 +       ctx->tree = tree;
64735 +       reiser4_key_init(&ctx->key);
64736 +       set_key_locality(&ctx->key, safe_link_locality(tree));
64737 +       set_key_objectid(&ctx->key, get_key_objectid(max_key()));
64738 +       set_key_offset(&ctx->key, get_key_offset(max_key()));
64739 +}
64740 +
64741 +/*
64742 + * return next safe-link.
64743 + */
64744 +static int safe_link_iter_next(safe_link_context * ctx)
64745 +{
64746 +       int result;
64747 +       safelink_t sl;
64748 +
64749 +       result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64750 +       if (result == 0) {
64751 +               ctx->oid = get_key_objectid(&ctx->key);
64752 +               ctx->link = get_key_offset(&ctx->key);
64753 +               ctx->sdkey = sl.sdkey;
64754 +               if (ctx->link == SAFE_TRUNCATE)
64755 +                       ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64756 +       }
64757 +       return result;
64758 +}
64759 +
64760 +/*
64761 + * return true once @ctx->key is outside of the safe-link locality, i.e., iteration is over.
64762 + */
64763 +static int safe_link_iter_finished(safe_link_context * ctx)
64764 +{
64765 +       return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64766 +}
64767 +
64768 +/*
64769 + * finish safe-link iteration.
64770 + */
64771 +static void safe_link_iter_end(safe_link_context * ctx)
64772 +{
64773 +       /* nothing special */
64774 +}
64775 +
64776 +/*
64777 + * process single safe-link; returns 0 on success, error code otherwise.
64778 + */
64779 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64780 +                           reiser4_key * sdkey, oid_t oid, __u64 size)
64781 +{
64782 +       struct inode *inode;
64783 +       int result;
64784 +
64785 +       /*
64786 +        * obtain object inode by reiser4_iget(), then call object plugin
64787 +        * ->safelink() method to do actual work, then delete safe-link on
64788 +        * success.
64789 +        */
64790 +       inode = reiser4_iget(super, sdkey, 1);
64791 +       if (!IS_ERR(inode)) {
64792 +               file_plugin *fplug;
64793 +
64794 +               fplug = inode_file_plugin(inode);
64795 +               assert("nikita-3428", fplug != NULL);
64796 +               assert("", oid == get_inode_oid(inode));
64797 +               if (fplug->safelink != NULL) {
64798 +                       /* txn_restart_current is not necessary because
64799 +                        * mounting is single-threaded. However, without it
64800 +                        * deadlock detection code will complain (see
64801 +                        * nikita-3361). */
64802 +                       txn_restart_current();
64803 +                       result = fplug->safelink(inode, link, size);
64804 +               } else {
64805 +                       warning("nikita-3430",
64806 +                               "Cannot handle safelink for %lli",
64807 +                               (unsigned long long)oid);
64808 +                       print_key("key", sdkey);
64809 +                       result = 0;
64810 +               }
64811 +               if (result != 0) {
64812 +                       warning("nikita-3431",
64813 +                               "Error processing safelink for %lli: %i",
64814 +                               (unsigned long long)oid, result);
64815 +               }
64816 +               reiser4_iget_complete(inode);
64817 +               iput(inode);
64818 +               if (result == 0) {
64819 +                       result = safe_link_grab(get_tree(super), BA_CAN_COMMIT);
64820 +                       if (result == 0)
64821 +                               result =
64822 +                                   safe_link_del(get_tree(super), oid, link);
64823 +                       safe_link_release(get_tree(super));
64824 +                       /*
64825 +                        * restart transaction: if there was large number of
64826 +                        * safe-links, their processing may fail to fit into
64827 +                        * single transaction.
64828 +                        */
64829 +                       if (result == 0)
64830 +                               txn_restart_current();
64831 +               }
64832 +       } else
64833 +               result = PTR_ERR(inode);
64834 +       return result;
64835 +}
64836 +
64837 +/*
64838 + * iterate over all safe-links in the file-system processing them one by one.
64839 + */
64840 +int process_safelinks(struct super_block *super)
64841 +{
64842 +       safe_link_context ctx;
64843 +       int result;
64844 +
64845 +       if (rofs_super(super))
64846 +               /* do nothing on the read-only file system */
64847 +               return 0;
64848 +       safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
64849 +       result = 0;
64850 +       do {
64851 +               result = safe_link_iter_next(&ctx);
64852 +               if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
64853 +                       result = 0;
64854 +                       break;
64855 +               }
64856 +               if (result == 0)
64857 +                       result = process_safelink(super, ctx.link,
64858 +                                                 &ctx.sdkey, ctx.oid,
64859 +                                                 ctx.size);
64860 +       } while (result == 0);
64861 +       safe_link_iter_end(&ctx);
64862 +       return result;
64863 +}
64864 +
64865 +/* Make Linus happy.
64866 +   Local variables:
64867 +   c-indentation-style: "K&R"
64868 +   mode-name: "LC"
64869 +   c-basic-offset: 8
64870 +   tab-width: 8
64871 +   fill-column: 120
64872 +   scroll-step: 1
64873 +   End:
64874 +*/
64875 diff --git a/fs/reiser4/safe_link.h b/fs/reiser4/safe_link.h
64876 new file mode 100644
64877 index 0000000..7ae4458
64878 --- /dev/null
64879 +++ b/fs/reiser4/safe_link.h
64880 @@ -0,0 +1,29 @@
64881 +/* Copyright 2003 by Hans Reiser, licensing governed by
64882 + * reiser4/README */
64883 +
64884 +/* Safe-links. See safe_link.c for details. */
64885 +
64886 +#if !defined( __FS_SAFE_LINK_H__ )
64887 +#define __FS_SAFE_LINK_H__
64888 +
64889 +#include "tree.h"
64890 +
64891 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
64892 +void safe_link_release(reiser4_tree * tree);
64893 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
64894 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
64895 +
64896 +int process_safelinks(struct super_block *super);
64897 +
64898 +/* __FS_SAFE_LINK_H__ */
64899 +#endif
64900 +
64901 +/* Make Linus happy.
64902 +   Local variables:
64903 +   c-indentation-style: "K&R"
64904 +   mode-name: "LC"
64905 +   c-basic-offset: 8
64906 +   tab-width: 8
64907 +   fill-column: 120
64908 +   End:
64909 +*/
64910 diff --git a/fs/reiser4/seal.c b/fs/reiser4/seal.c
64911 new file mode 100644
64912 index 0000000..7466de8
64913 --- /dev/null
64914 +++ b/fs/reiser4/seal.c
64915 @@ -0,0 +1,217 @@
64916 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64917 +/* Seals implementation. */
64918 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
64919 +   allowing to bypass tree traversal. But normal usage of coords implies that
64920 +   node pointed to by coord is locked, whereas seals don't keep a lock (or
64921 +   even a reference) to znode. Instead, each znode contains a version number,
64922 +   increased on each znode modification. This version number is copied into a
64923 +   seal when seal is created. Later, one can "validate" seal by calling
64924 +   seal_validate(). If znode is in cache and its version number is still the
64925 +   same, seal is "pristine" and coord associated with it can be re-used
64926 +   immediately.
64927 +
64928 +   If, on the other hand, znode is out of cache, or it is obviously different
64929 +   one from the znode seal was initially attached to (for example, it is on
64930 +   the different level, or is being removed from the tree), seal is
64931 +   irreparably invalid ("burned") and tree traversal has to be repeated.
64932 +
64933 +   Otherwise, there is some hope, that while znode was modified (and seal was
64934 +   "broken" as a result), key attached to the seal is still in the node. This
64935 +   is checked by first comparing this key with delimiting keys of node and, if
64936 +   key is ok, doing intra-node lookup.
64937 +
64938 +   Znode version is maintained in the following way:
64939 +
64940 +   there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
64941 +   znode_epoch is incremented and its new value is stored in ->version field
64942 +   of new znode. Whenever znode is dirtied (which means it was probably
64943 +   modified), znode_epoch is also incremented and its new value is stored in
64944 +   znode->version. This is done so, because just incrementing znode->version
64945 +   on each update is not enough: it may so happen, that znode gets deleted, new
64946 +   znode is allocated for the same disk block and gets the same version
64947 +   counter, tricking seal code into false positive.
64948 +*/
64949 +
64950 +#include "forward.h"
64951 +#include "debug.h"
64952 +#include "key.h"
64953 +#include "coord.h"
64954 +#include "seal.h"
64955 +#include "plugin/item/item.h"
64956 +#include "plugin/node/node.h"
64957 +#include "jnode.h"
64958 +#include "znode.h"
64959 +#include "super.h"
64960 +
64961 +static znode *seal_node(const seal_t * seal);
64962 +static int seal_matches(const seal_t * seal, znode * node);
64963 +
64964 +/* initialise seal. This can be called several times on the same seal. @coord
64965 +   and @key can be NULL.  */
64966 +void seal_init(seal_t * seal /* seal to initialise */ ,
64967 +              const coord_t * coord /* coord @seal will be attached to */ ,
64968 +              const reiser4_key * key UNUSED_ARG       /* key @seal will be
64969 +                                                        * attached to */ )
64970 +{
64971 +       assert("nikita-1886", seal != NULL);
64972 +       memset(seal, 0, sizeof *seal);
64973 +       if (coord != NULL) {
64974 +               znode *node;
64975 +
64976 +               node = coord->node;
64977 +               assert("nikita-1987", node != NULL);
64978 +               spin_lock_znode(node);
64979 +               seal->version = node->version;
64980 +               assert("nikita-1988", seal->version != 0);
64981 +               seal->block = *znode_get_block(node);
64982 +#if REISER4_DEBUG
64983 +               seal->coord1 = *coord;
64984 +               if (key != NULL)
64985 +                       seal->key = *key;
64986 +#endif
64987 +               spin_unlock_znode(node);
64988 +       }
64989 +}
64990 +
64991 +/* finish with seal */
64992 +void seal_done(seal_t * seal /* seal to clear */ )
64993 +{
64994 +       assert("nikita-1887", seal != NULL);
64995 +       seal->version = 0;
64996 +}
64997 +
64998 +/* true if seal was initialised */
64999 +int seal_is_set(const seal_t * seal /* seal to query */ )
65000 +{
65001 +       assert("nikita-1890", seal != NULL);
65002 +       return seal->version != 0;
65003 +}
65004 +
65005 +#if REISER4_DEBUG
65006 +/* helper function for seal_validate(). It checks that item at @coord has
65007 + * expected key. This is to detect cases where node was modified but wasn't
65008 + * marked dirty. */
65009 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
65010 +                                  const reiser4_key * k /* expected key */ )
65011 +{
65012 +       reiser4_key ukey;
65013 +
65014 +       return (coord->between != AT_UNIT) ||
65015 +           /* FIXME-VS: we only can compare keys for items whose units
65016 +              represent exactly one key */
65017 +           ((coord_is_existing_unit(coord))
65018 +            && (item_is_extent(coord)
65019 +                || keyeq(k, unit_key_by_coord(coord, &ukey))))
65020 +           || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
65021 +               && keyge(k, unit_key_by_coord(coord, &ukey)));
65022 +}
65023 +#endif
65024 +
65025 +/* this is used by seal_validate. It accepts return value of
65026 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
65027 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
65028 + * seal_validate returns -E_REPEAT and caller will call tree search. We cannot
65029 + * do this in longterm_lock_znode(), because sometimes we want to distinguish
65030 + * between -EINVAL and -E_REPEAT. */
65031 +static int should_repeat(int return_code)
65032 +{
65033 +       return return_code == -EINVAL;
65034 +}
65035 +
65036 +/* (re-)validate seal.
65037 +
65038 +   Checks whether seal is pristine, and tries to revalidate it if possible.
65039 +
65040 +   If seal was burned, or broken irreparably, return -E_REPEAT.
65041 +
65042 +   NOTE-NIKITA currently seal_validate() returns -E_REPEAT if key we are
65043 +   looking for is in range of keys covered by the sealed node, but item wasn't
65044 +   found by node ->lookup() method. Alternative is to return -ENOENT in this
65045 +   case, but this would complicate callers logic.
65046 +
65047 +*/
65048 +int seal_validate(seal_t * seal /* seal to validate */ ,
65049 +                 coord_t * coord /* coord to validate against */ ,
65050 +                 const reiser4_key * key /* key to validate against */ ,
65051 +                 lock_handle * lh /* resulting lock handle */ ,
65052 +                 znode_lock_mode mode /* mode to lock node in */ ,
65053 +                 znode_lock_request request /* locking priority */ )
65054 +{
65055 +       znode *node;
65056 +       int result;
65057 +
65058 +       assert("nikita-1889", seal != NULL);
65059 +       assert("nikita-1881", seal_is_set(seal));
65060 +       assert("nikita-1882", key != NULL);
65061 +       assert("nikita-1883", coord != NULL);
65062 +       assert("nikita-1884", lh != NULL);
65063 +       assert("nikita-1885", keyeq(&seal->key, key));
65064 +       assert("nikita-1989", coords_equal(&seal->coord1, coord));
65065 +
65066 +       /* obtain znode by block number */
65067 +       node = seal_node(seal);
65068 +       if (node != NULL) {
65069 +               /* znode was in cache, lock it */
65070 +               result = longterm_lock_znode(lh, node, mode, request);
65071 +               zput(node);     /* drop the reference obtained by seal_node() */
65072 +               if (result == 0) {
65073 +                       if (seal_matches(seal, node)) {
65074 +                               /* if seal version and znode version
65075 +                                  coincide */
65076 +                               ON_DEBUG(coord_update_v(coord));
65077 +                               assert("nikita-1990",
65078 +                                      node == seal->coord1.node);
65079 +                               assert("nikita-1898",
65080 +                                      WITH_DATA_RET(coord->node, 1,
65081 +                                                    check_seal_match(coord,
65082 +                                                                     key)));
65083 +                       } else
65084 +                               result = RETERR(-E_REPEAT);     /* versions differ */
65085 +               }
65086 +               if (result != 0) {
65087 +                       /* -EINVAL from the lock attempt is reported as -E_REPEAT */
65088 +                       if (should_repeat(result))
65089 +                               result = RETERR(-E_REPEAT);
65090 +                       /* unlock node on failure */
65091 +                       done_lh(lh);
65092 +               }
65093 +       } else {
65094 +               /* znode wasn't in cache */
65095 +               result = RETERR(-E_REPEAT);
65096 +       }
65097 +       return result;
65098 +}
65098 +
65099 +/* helper functions */
65100 +
65101 +/* obtain reference to znode seal points to, if in cache: result of zlook() for @seal->block in current_tree */
65102 +static znode *seal_node(const seal_t * seal /* seal to query */ )
65103 +{
65104 +       assert("nikita-1891", seal != NULL);
65105 +       return zlook(current_tree, &seal->block);
65106 +}
65107 +
65108 +/* true if @seal version and @node version coincide */
65109 +static int seal_matches(const seal_t * seal /* seal to check */ ,
65110 +                       znode * node /* node to check */ )
65111 +{
65112 +       int result;
65113 +
65114 +       assert("nikita-1991", seal != NULL);
65115 +       assert("nikita-1993", node != NULL);
65116 +
65117 +       spin_lock_znode(node);  /* node->version is read under the znode spin lock */
65118 +       result = (seal->version == node->version);
65119 +       spin_unlock_znode(node);
65120 +       return result;
65121 +}
65122 +
65123 +/* Make Linus happy.
65124 +   Local variables:
65125 +   c-indentation-style: "K&R"
65126 +   mode-name: "LC"
65127 +   c-basic-offset: 8
65128 +   tab-width: 8
65129 +   fill-column: 120
65130 +   scroll-step: 1
65131 +   End:
65132 +*/
65133 diff --git a/fs/reiser4/seal.h b/fs/reiser4/seal.h
65134 new file mode 100644
65135 index 0000000..17e5e75
65136 --- /dev/null
65137 +++ b/fs/reiser4/seal.h
65138 @@ -0,0 +1,49 @@
65139 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
65140 +
65141 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
65142 +
65143 +#ifndef __SEAL_H__
65144 +#define __SEAL_H__
65145 +
65146 +#include "forward.h"
65147 +#include "debug.h"
65148 +#include "dformat.h"
65149 +#include "key.h"
65150 +#include "coord.h"
65151 +
65152 +/* for __u?? types */
65153 +/*#include <linux/types.h>*/
65154 +
65155 +/* seal. See comment at the top of seal.c */
65156 +typedef struct seal_s {
65157 +       /* version of znode recorded at the time of seal creation */
65158 +       __u64 version;
65159 +       /* block number of znode attached to this seal */
65160 +       reiser4_block_nr block;
65161 +#if REISER4_DEBUG
65162 +       /* coord this seal is attached to. For debugging. */
65163 +       coord_t coord1;
65164 +       /* key this seal is attached to. For debugging. */
65165 +       reiser4_key key;
65166 +#endif
65167 +} seal_t;
65168 +
65169 +extern void seal_init(seal_t *, const coord_t *, const reiser4_key *);
65170 +extern void seal_done(seal_t *);
65171 +extern int seal_is_set(const seal_t *);
65172 +extern int seal_validate(seal_t *, coord_t *,
65173 +                        const reiser4_key *, lock_handle *,
65174 +                        znode_lock_mode mode, znode_lock_request request);
65175 +
65176 +/* __SEAL_H__ */
65177 +#endif
65178 +
65179 +/* Make Linus happy.
65180 +   Local variables:
65181 +   c-indentation-style: "K&R"
65182 +   mode-name: "LC"
65183 +   c-basic-offset: 8
65184 +   tab-width: 8
65185 +   fill-column: 120
65186 +   End:
65187 +*/
65188 diff --git a/fs/reiser4/search.c b/fs/reiser4/search.c
65189 new file mode 100644
65190 index 0000000..fac161a
65191 --- /dev/null
65192 +++ b/fs/reiser4/search.c
65193 @@ -0,0 +1,1611 @@
65194 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65195 + * reiser4/README */
65196 +
65197 +#include "forward.h"
65198 +#include "debug.h"
65199 +#include "dformat.h"
65200 +#include "key.h"
65201 +#include "coord.h"
65202 +#include "seal.h"
65203 +#include "plugin/item/item.h"
65204 +#include "plugin/node/node.h"
65205 +#include "plugin/plugin.h"
65206 +#include "jnode.h"
65207 +#include "znode.h"
65208 +#include "block_alloc.h"
65209 +#include "tree_walk.h"
65210 +#include "tree.h"
65211 +#include "reiser4.h"
65212 +#include "super.h"
65213 +#include "inode.h"
65214 +
65215 +#include <linux/slab.h>
65216 +
65217 +static const char *bias_name(lookup_bias bias);
65218 +
65219 +/* tree searching algorithm, intranode searching algorithms are in
65220 +   plugin/node/ */
65221 +
65222 +/* tree lookup cache
65223 + *
65224 + * The coord by key cache consists of small list of recently accessed nodes
65225 + * maintained according to the LRU discipline. Before doing real top-to-down
65226 + * tree traversal this cache is scanned for nodes that can contain key
65227 + * requested.
65228 + *
65229 + * The efficiency of coord cache depends heavily on locality of reference for
65230 + * tree accesses. Our user level simulations show reasonably good hit ratios
65231 + * for coord cache under most loads so far.
65232 + */
65233 +
65234 +/* Initialise coord cache slot: empty LRU linkage, no node cached */
65235 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
65236 +{
65237 +       assert("nikita-345", slot != NULL);
65238 +
65239 +       INIT_LIST_HEAD(&slot->lru);
65240 +       slot->node = NULL;
65241 +}
65242 +
65243 +/* Initialize coord cache. Returns 0, or RETERR(-ENOMEM) if the slot array cannot be allocated */
65244 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
65245 +{
65246 +       int i;
65247 +
65248 +       assert("nikita-346", cache != NULL);
65249 +       /* ->nr_slots is read here, so the caller has to set it beforehand */
65250 +       cache->slot =
65251 +           kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots, GFP_KERNEL);
65252 +       if (cache->slot == NULL)
65253 +               return RETERR(-ENOMEM);
65254 +
65255 +       INIT_LIST_HEAD(&cache->lru);
65256 +       for (i = 0; i < cache->nr_slots; ++i) {
65257 +               /* every slot starts out unused and linked into the LRU list */
65258 +               cbk_cache_init_slot(cache->slot + i);
65259 +               list_add_tail(&((cache->slot + i)->lru), &cache->lru);
65260 +       }
65261 +       rwlock_init(&cache->guard);
65262 +       return 0;
65263 +}
65263 +
65264 +/* free cbk cache data: release the slot array and clear the pointer, so a repeated call does nothing */
65265 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
65266 +{
65267 +       assert("nikita-2493", cache != NULL);
65268 +       if (cache->slot != NULL) {
65269 +               kfree(cache->slot);
65270 +               cache->slot = NULL;
65271 +       }
65272 +}
65273 +
65274 +/* macro to iterate @slot over all cbk cache slots of @cache, starting from the head of the LRU list */
65275 +#define for_all_slots(cache, slot)                                             \
65276 +       for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru);       \
65277 +            &(cache)->lru != &(slot)->lru;                                     \
65278 +            (slot) = list_entry((slot)->lru.next, cbk_cache_slot, lru))
65279 +
65280 +
65281 +#if REISER4_DEBUG
65282 +/* this function assures that [cbk-cache-invariant] invariant holds: returns 1 if it does, 0 otherwise */
65283 +static int cbk_cache_invariant(const cbk_cache *cache)
65284 +{
65285 +       cbk_cache_slot *slot;
65286 +       int result;
65287 +       int unused;
65288 +
65289 +       /* validate @cache before it is dereferenced */
65290 +       assert("nikita-2469", cache != NULL);
65291 +       if (cache->nr_slots == 0)
65292 +               return 1;
65293 +       unused = 0;
65294 +       result = 1;
65295 +       read_lock(&((cbk_cache *)cache)->guard);
65296 +       for_all_slots(cache, slot) {
65297 +               /* in LRU first go all `used' slots followed by `unused' */
65298 +               if (unused && (slot->node != NULL))
65299 +                       result = 0;
65300 +               if (slot->node == NULL)
65301 +                       unused = 1;
65302 +               else {
65303 +                       cbk_cache_slot *scan;
65304 +
65305 +                       /* all cached nodes are different */
65306 +                       scan = slot;
65307 +                       while (result) {
65308 +                               scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
65309 +                               if (&cache->lru == &scan->lru)
65310 +                                       break;
65311 +                               if (slot->node == scan->node)
65312 +                                       result = 0;
65313 +                       }
65314 +               }
65315 +               if (!result)
65316 +                       break;
65317 +       }
65318 +       read_unlock(&((cbk_cache *)cache)->guard);
65319 +       return result;
65320 +}
65321 +
65322 +#endif
65323 +
65324 +/* Remove references, if any, to @node from coord cache */
65325 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
65326 +                         reiser4_tree * tree /* tree to remove node from */ )
65327 +{
65328 +       cbk_cache_slot *slot;
65329 +       cbk_cache *cache;
65330 +       int i;
65331 +
65332 +       assert("nikita-350", node != NULL);
65333 +       assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
65334 +
65335 +       cache = &tree->cbk_cache;
65336 +       assert("nikita-2470", cbk_cache_invariant(cache));
65337 +
65338 +       write_lock(&(cache->guard));
65339 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65340 +               if (slot->node == node) {
65341 +                       /* emptied slot goes to the LRU tail, after all used slots */
65342 +                       list_move_tail(&slot->lru, &cache->lru);
65343 +                       slot->node = NULL;
65344 +                       break;  /* stop at the first matching slot */
65345 +               }
65346 +       }
65347 +       write_unlock(&(cache->guard));
65348 +       assert("nikita-2471", cbk_cache_invariant(cache));
65349 +}
65349 +
65350 +/* add information about @node to the cbk-cache of its tree. This
65351 +    can actually be an update of an existing slot in the cache. */
65352 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65353 +{
65354 +       cbk_cache *cache;
65355 +       cbk_cache_slot *slot;
65356 +       int i;
65357 +
65358 +       assert("nikita-352", node != NULL);
65359 +
65360 +       cache = &znode_get_tree(node)->cbk_cache;
65361 +       assert("nikita-2472", cbk_cache_invariant(cache));
65362 +
65363 +       if (cache->nr_slots == 0)
65364 +               return;
65365 +
65366 +       write_lock(&(cache->guard));
65367 +       /* find slot to update/add */
65368 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65369 +               /* oops, this node is already in a cache */
65370 +               if (slot->node == node)
65371 +                       break;
65372 +       }
65373 +       /* @node is not cached yet: take over the slot at the tail of the LRU list */
65374 +       if (i == cache->nr_slots) {
65375 +               slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65376 +               slot->node = (znode *) node;
65377 +       }
65378 +       list_move(&slot->lru, &cache->lru);     /* slot moves to the head of the LRU list */
65379 +       write_unlock(&(cache->guard));
65380 +       assert("nikita-2473", cbk_cache_invariant(cache));
65381 +}
65382 +
65383 +static int setup_delimiting_keys(cbk_handle * h);
65384 +static lookup_result coord_by_handle(cbk_handle * handle);
65385 +static lookup_result traverse_tree(cbk_handle * h);
65386 +static int cbk_cache_search(cbk_handle * h);
65387 +
65388 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
65389 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
65390 +
65391 +/* helper functions */
65392 +
65393 +static void update_stale_dk(reiser4_tree * tree, znode * node);
65394 +
65395 +/* release parent node during traversal */
65396 +static void put_parent(cbk_handle * h);
65397 +/* check consistency of fields */
65398 +static int sanity_check(cbk_handle * h);
65399 +/* release resources in handle */
65400 +static void hput(cbk_handle * h);
65401 +
65402 +static level_lookup_result search_to_left(cbk_handle * h);
65403 +
65404 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
65405 + * cbk_handle. @handle is zeroed first, and is also the return value. */
65406 +static cbk_handle *cbk_pack(cbk_handle * handle,
65407 +                           reiser4_tree * tree,
65408 +                           const reiser4_key * key,
65409 +                           coord_t * coord,
65410 +                           lock_handle * active_lh,
65411 +                           lock_handle * parent_lh,
65412 +                           znode_lock_mode lock_mode,
65413 +                           lookup_bias bias,
65414 +                           tree_level lock_level,
65415 +                           tree_level stop_level,
65416 +                           __u32 flags, ra_info_t * info)
65417 +{
65418 +       memset(handle, 0, sizeof *handle);
65419 +
65420 +       handle->tree = tree;
65421 +       handle->key = key;
65422 +       handle->lock_mode = lock_mode;
65423 +       handle->bias = bias;
65424 +       handle->lock_level = lock_level;
65425 +       handle->stop_level = stop_level;
65426 +       handle->coord = coord;
65427 +       /* set flags; CBK_TRUST_DK and CBK_USE_CRABLOCK are always added. See comment in tree.h:cbk_flags */
65428 +       handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65429 +
65430 +       handle->active_lh = active_lh;
65431 +       handle->parent_lh = parent_lh;
65432 +       handle->ra_info = info;
65433 +       return handle;
65434 +}
65435 +
65436 +/* main tree lookup procedure
65437 +
65438 +   Check coord cache. If key we are looking for is not found there,
65439 +   traverse_tree() is called (via coord_by_handle()) to do real tree traversal.
65440 +
65441 +   As we have extents on the twig level, @lock_level and @stop_level can
65442 +   be different from LEAF_LEVEL and each other.
65443 +
65444 +   Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
65445 +   long term locks) while calling this.
65446 +*/
65447 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65448 +                                                * in. Usually this tree is
65449 +                                                * part of file-system
65450 +                                                * super-block */ ,
65451 +                          const reiser4_key * key /* key to look for */ ,
65452 +                          coord_t * coord      /* where to store found
65453 +                                                * position in a tree. Fields
65454 +                                                * in "coord" are only valid if
65455 +                                                * coord_by_key() returned
65456 +                                                * "CBK_COORD_FOUND" */ ,
65457 +                          lock_handle * lh,    /* resulting lock handle */
65458 +                          znode_lock_mode lock_mode    /* type of lock we
65459 +                                                        * want on node. Pass
65460 +                                                        * ZNODE_READ_LOCK here
65461 +                                                        * if you only want to
65462 +                                                        * read item found and
65463 +                                                        * ZNODE_WRITE_LOCK if
65464 +                                                        * you want to modify
65465 +                                                        * it */ ,
65466 +                          lookup_bias bias     /* what to return if coord
65467 +                                                * with exactly the @key is
65468 +                                                * not in the tree */ ,
65469 +                          tree_level lock_level        /* tree level where to start
65470 +                                                        * taking @lock type of
65471 +                                                        * locks */ ,
65472 +                          tree_level stop_level        /* tree level to stop. Pass
65473 +                                                        * LEAF_LEVEL or TWIG_LEVEL
65474 +                                                        * here. Item being looked
65475 +                                                        * for has to be between
65476 +                                                        * @lock_level and
65477 +                                                        * @stop_level, inclusive */ ,
65478 +                          __u32 flags /* search flags */ ,
65479 +                          ra_info_t *
65480 +                          info
65481 +                          /* information about desired tree traversal readahead */
65482 +                          )
65483 +{
65484 +       cbk_handle handle;
65485 +       lock_handle parent_lh;
65486 +       lookup_result result;
65487 +
65488 +       init_lh(lh);
65489 +       init_lh(&parent_lh);
65490 +
65491 +       assert("nikita-3023", schedulable());
65492 +
65493 +       assert("nikita-353", tree != NULL);
65494 +       assert("nikita-354", key != NULL);
65495 +       assert("nikita-355", coord != NULL);
65496 +       assert("nikita-356", (bias == FIND_EXACT)
65497 +              || (bias == FIND_MAX_NOT_MORE_THAN));
65498 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
65499 +       /* no locks can be held during tree traversal */
65500 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65501 +
65502 +       cbk_pack(&handle,
65503 +                tree,
65504 +                key,
65505 +                coord,
65506 +                lh,
65507 +                &parent_lh,
65508 +                lock_mode, bias, lock_level, stop_level, flags, info);
65509 +
65510 +       result = coord_by_handle(&handle);
65511 +       assert("nikita-3247",
65512 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
65513 +       return result;
65514 +}
65515 +
65516 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
65517 + * from tree root. When @object is NULL, current_tree is searched. */
65518 +lookup_result
65519 +object_lookup(struct inode * object,
65520 +             const reiser4_key * key,
65521 +             coord_t * coord,
65522 +             lock_handle * lh,
65523 +             znode_lock_mode lock_mode,
65524 +             lookup_bias bias,
65525 +             tree_level lock_level,
65526 +             tree_level stop_level, __u32 flags, ra_info_t * info)
65527 +{
65528 +       cbk_handle handle;
65529 +       lock_handle parent_lh;
65530 +       lookup_result result;
65531 +
65532 +       init_lh(lh);
65533 +       init_lh(&parent_lh);
65534 +
65535 +       assert("nikita-3023", schedulable());
65536 +
65537 +       assert("nikita-354", key != NULL);
65538 +       assert("nikita-355", coord != NULL);
65539 +       assert("nikita-356", (bias == FIND_EXACT)
65540 +              || (bias == FIND_MAX_NOT_MORE_THAN));
65541 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
65542 +       /* no locks can be held during tree search by key */
65543 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65544 +
65545 +       cbk_pack(&handle,
65546 +                object != NULL ? tree_by_inode(object) : current_tree,
65547 +                key,
65548 +                coord,
65549 +                lh,
65550 +                &parent_lh,
65551 +                lock_mode, bias, lock_level, stop_level, flags, info);
65552 +       handle.object = object; /* cbk_pack() zeroes the handle, so this is set afterwards */
65553 +
65554 +       result = coord_by_handle(&handle);
65555 +       assert("nikita-3247",
65556 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
65557 +       return result;
65558 +}
65559 +
65560 +/* lookup by cbk_handle. Common part of coord_by_key() and object_lookup(). */
65561 +static lookup_result coord_by_handle(cbk_handle * handle)
65562 +{
65563 +       /*
65564 +        * first check cbk_cache (which is look-aside cache for our tree) and
65565 +        * if this fails, start traversal.
65566 +        */
65567 +       /* first check whether "key" is in cache of recent lookups. */
65568 +       if (cbk_cache_search(handle) == 0)
65569 +               return handle->result;
65570 +       else
65571 +               return traverse_tree(handle);
65572 +}
65573 +
65574 +/* Execute actor for each item (or unit, depending on @through_units_p),
65575 +   starting from @coord, right-ward, until either:
65576 +
65577 +   - end of the tree is reached
65578 +   - unformatted node is met
65579 +   - error occurred
65580 +   - @actor returns 0 or less
65581 +
65582 +   Error code, or last actor return value is returned. -ENOENT if @coord is not at an existing unit on entry.
65583 +
65584 +   This is used by plugin/dir/hashe_dir.c:find_entry() to move through
65585 +   sequence of entries with identical keys and alikes.
65586 +*/
65587 +int iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65588 +                coord_t * coord /* coord to start from */ ,
65589 +                lock_handle * lh       /* lock handle to start with and to
65590 +                                        * update along the way */ ,
65591 +                tree_iterate_actor_t actor     /* function to call on each
65592 +                                                * item/unit */ ,
65593 +                void *arg /* argument to pass to @actor */ ,
65594 +                znode_lock_mode mode /* lock mode on scanned nodes */ ,
65595 +                int through_units_p    /* call @actor on each item or on each
65596 +                                        * unit */ )
65597 +{
65598 +       int result;
65599 +
65600 +       assert("nikita-1143", tree != NULL);
65601 +       assert("nikita-1145", coord != NULL);
65602 +       assert("nikita-1146", lh != NULL);
65603 +       assert("nikita-1147", actor != NULL);
65604 +
65605 +       result = zload(coord->node);
65606 +       coord_clear_iplug(coord);
65607 +       if (result != 0)
65608 +               return result;
65609 +       if (!coord_is_existing_unit(coord)) {
65610 +               zrelse(coord->node);
65611 +               return -ENOENT;
65612 +       }
65613 +       while ((result = actor(tree, coord, lh, arg)) > 0) {
65614 +               /* move further; non-zero from coord_next_*() sends us to the right neighbor */
65615 +               if ((through_units_p && coord_next_unit(coord)) ||
65616 +                   (!through_units_p && coord_next_item(coord))) {
65617 +                       do {
65618 +                               lock_handle couple;
65619 +
65620 +                               /* move to the next node  */
65621 +                               init_lh(&couple);
65622 +                               result =
65623 +                                   reiser4_get_right_neighbor(&couple,
65624 +                                                              coord->node,
65625 +                                                              (int)mode,
65626 +                                                              GN_CAN_USE_UPPER_LEVELS);
65627 +                               zrelse(coord->node);    /* released whether or not the neighbor was obtained */
65628 +                               if (result == 0) {
65629 +
65630 +                                       result = zload(couple.node);
65631 +                                       if (result != 0) {
65632 +                                               done_lh(&couple);
65633 +                                               return result;
65634 +                                       }
65635 +
65636 +                                       coord_init_first_unit(coord,
65637 +                                                             couple.node);
65638 +                                       done_lh(lh);
65639 +                                       move_lh(lh, &couple);
65640 +                               } else
65641 +                                       return result;
65642 +                       } while (node_is_empty(coord->node));   /* skip empty nodes */
65643 +               }
65644 +
65645 +               assert("nikita-1149", coord_is_existing_unit(coord));
65646 +       }
65647 +       zrelse(coord->node);
65648 +       return result;
65649 +}
65650 +
65651 +/* return locked uber znode for @tree: locks tree->uber through @lh and returns the result of longterm_lock_znode() */
65652 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65653 +                  znode_lock_request pri, lock_handle * lh)
65654 +{
65655 +       int result;
65656 +
65657 +       result = longterm_lock_znode(lh, tree->uber, mode, pri);
65658 +       return result;
65659 +}
65660 +
65661 +/* true if @key is strictly within @node
65662 +
65663 +   we are looking for possibly non-unique key and its item is at the edge of
65664 +   @node. May be it is in the neighbor.
65665 +*/
65666 +static int znode_contains_key_strict(znode * node      /* node to check key
65667 +                                                        * against */ ,
65668 +                                    const reiser4_key *
65669 +                                    key /* key to check */ ,
65670 +                                    int isunique)
65671 +{
65672 +       int answer;
65673 +
65674 +       assert("nikita-1760", node != NULL);
65675 +       assert("nikita-1722", key != NULL);
65676 +
65677 +       if (keyge(key, &node->rd_key))
65678 +               return 0;       /* @key >= node->rd_key */
65679 +
65680 +       answer = keycmp(&node->ld_key, key);
65681 +
65682 +       if (isunique)
65683 +               return answer != GREATER_THAN;  /* unique key: ld_key <= @key is enough */
65684 +       else
65685 +               return answer == LESS_THAN;     /* otherwise require ld_key < @key */
65686 +}
65687 +
65688 +/*
65689 + * Virtual Root (vroot) code.
65690 + *
65691 + *     For given file system object (e.g., regular file or directory) let's
65692 + *     define its "virtual root" as lowest in the tree (that is, furthest
65693 + *     from the tree root) node such that all body items of said object are
65694 + *     located in a tree rooted at this node.
65695 + *
65696 + *     Once vroot of object is found all tree lookups for items within body of
65697 + *     this object ("object lookups") can be started from its vroot rather
65698 + *     than from real root. This has following advantages:
65699 + *
65700 + *         1. amount of nodes traversed during lookup (and, hence, amount of
65701 + *         key comparisons made) decreases, and
65702 + *
65703 + *         2. contention on tree root is decreased. This latter was actually
65704 + *         motivating reason behind vroot, because spin lock of root node,
65705 + *         which is taken when acquiring long-term lock on root node is the
65706 + *         hottest lock in the reiser4.
65707 + *
65708 + * How to find vroot.
65709 + *
65710 + *     When vroot of object F is not yet determined, all object lookups start
65711 + *     from the root of the tree. At each tree level during traversal we have
65712 + *     a node N such that a key we are looking for (which is the key inside
65713 + *     object's body) is located within N. In function handle_vroot() called
65714 + *     from cbk_level_lookup() we check whether N is possible vroot for
65715 + *     F. Check is trivial---if neither leftmost nor rightmost item of N
65716 + *     belongs to F (and we already have helpful ->owns_item() method of
65717 + *     object plugin for this), then N is possible vroot of F. This, of
65718 + *     course, relies on the assumption that each object occupies contiguous
65719 + *     range of keys in the tree.
65720 + *
65721 + *     Thus, traversing tree downward and checking each node as we go, we can
65722 + *     find lowest such node, which, by definition, is vroot.
65723 + *
65724 + * How to track vroot.
65725 + *
65726 + *     Nohow. If actual vroot changes, next object lookup will just restart
65727 + *     from the actual tree root, refreshing object's vroot along the way.
65728 + *
65729 + */
65730 +
65731 +/*
65732 + * Check whether @node is possible vroot of @object, and if so record it with inode_set_vroot().
65733 + */
65734 +static void handle_vroot(struct inode *object, znode * node)
65735 +{
65736 +       file_plugin *fplug;
65737 +       coord_t coord;
65738 +
65739 +       fplug = inode_file_plugin(object);
65740 +       assert("nikita-3353", fplug != NULL);
65741 +       assert("nikita-3354", fplug->owns_item != NULL);
65742 +
65743 +       if (unlikely(node_is_empty(node)))
65744 +               return; /* no items to examine in an empty node */
65745 +
65746 +       coord_init_first_unit(&coord, node);
65747 +       /*
65748 +        * if leftmost item of @node belongs to @object, we cannot be sure
65749 +        * that @node is vroot of @object, because, some items of @object are
65750 +        * probably in the sub-tree rooted at the left neighbor of @node.
65751 +        */
65752 +       if (fplug->owns_item(object, &coord))
65753 +               return;
65754 +       coord_init_last_unit(&coord, node);
65755 +       /* mutatis mutandis for the rightmost item */
65756 +       if (fplug->owns_item(object, &coord))
65757 +               return;
65758 +       /* otherwise, @node is possible vroot of @object */
65759 +       inode_set_vroot(object, node);
65760 +}
65761 +
65762 +/*
65763 + * helper function used by traverse_tree() to start tree traversal not from the
65764 + * tree root, but from @h->object's vroot, if possible.
65765 + */
65766 +static int prepare_object_lookup(cbk_handle * h)
65767 +{
65768 +       znode *vroot;
65769 +       int result;
65770 +
65771 +       vroot = inode_get_vroot(h->object);
65772 +       if (vroot == NULL) {
65773 +               /*
65774 +                * object doesn't have known vroot, start from real tree root.
65775 +                */
65776 +               return LOOKUP_CONT;
65777 +       }
65778 +
65779 +       h->level = znode_get_level(vroot);
65780 +       /* take a long-term lock on vroot */
65781 +       h->result = longterm_lock_znode(h->active_lh, vroot,
65782 +                                       cbk_lock_mode(h->level, h),
65783 +                                       ZNODE_LOCK_LOPRI);
65784 +       result = LOOKUP_REST;   /* stays LOOKUP_REST unless the lookup in vroot replaces it */
65785 +       if (h->result == 0) {
65786 +               int isunique;
65787 +               int inside;
65788 +
65789 +               isunique = h->flags & CBK_UNIQUE;
65790 +               /* check that key is inside vroot */
65791 +               read_lock_dk(h->tree);
65792 +               inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65793 +                         !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65794 +               read_unlock_dk(h->tree);
65795 +               if (inside) {
65796 +                       h->result = zload(vroot);
65797 +                       if (h->result == 0) {
65798 +                               /* search for key in vroot. */
65799 +                               result = cbk_node_lookup(h);
65800 +                               zrelse(vroot);  /*h->active_lh->node); */
65801 +                               if (h->active_lh->node != vroot) {
65802 +                                       result = LOOKUP_REST;
65803 +                               } else if (result == LOOKUP_CONT) {
65804 +                                       /* lock on vroot is handed over to @h->parent_lh */
65805 +                                       move_lh(h->parent_lh, h->active_lh);
65806 +                                       h->flags &= ~CBK_DKSET;
65807 +                               }
65808 +                       }
65809 +               }
65810 +       } else
65811 +               /* long-term locking failed. Restart. */
65812 +               ;
65813 +
65814 +       zput(vroot);    /* NOTE(review): presumably drops a reference taken by inode_get_vroot() - confirm */
65815 +
65816 +       if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65817 +               hput(h);
65818 +       return result;
65819 +}
65819 +
65820 +/* main function that handles common parts of tree traversal: starting
65821 +    (fake znode handling), restarts, error handling, completion */
65822 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65823 +{
65824 +       int done;
65825 +       int iterations;
65826 +       int vroot_used;
65827 +
65828 +       assert("nikita-365", h != NULL);
65829 +       assert("nikita-366", h->tree != NULL);
65830 +       assert("nikita-367", h->key != NULL);
65831 +       assert("nikita-368", h->coord != NULL);
65832 +       assert("nikita-369", (h->bias == FIND_EXACT)
65833 +              || (h->bias == FIND_MAX_NOT_MORE_THAN));
65834 +       assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65835 +       assert("nikita-2949", !(h->flags & CBK_DKSET));
65836 +       assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65837 +
65838 +       done = 0;
65839 +       iterations = 0;
65840 +       vroot_used = 0;
65841 +
65842 +       /* loop for restarts */
65843 +      restart:
65844 +
65845 +       assert("nikita-3024", schedulable());
65846 +
65847 +       h->result = CBK_COORD_FOUND;
65848 +       /* connect_znode() needs it */
65849 +       h->ld_key = *min_key();
65850 +       h->rd_key = *max_key();
65851 +       h->flags |= CBK_DKSET;
65852 +       h->error = NULL;
65853 +
65854 +       if (!vroot_used && h->object != NULL) {
65855 +               vroot_used = 1;
65856 +               done = prepare_object_lookup(h);
65857 +               if (done == LOOKUP_REST) {
65858 +                       goto restart;
65859 +               } else if (done == LOOKUP_DONE)
65860 +                       return h->result;
65861 +       }
65862 +       if (h->parent_lh->node == NULL) {
65863 +               done =
65864 +                   get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
65865 +                                  h->parent_lh);
65866 +
65867 +               assert("nikita-1637", done != -E_DEADLOCK);
65868 +
65869 +               h->block = h->tree->root_block;
65870 +               h->level = h->tree->height;
65871 +               h->coord->node = h->parent_lh->node;
65872 +
65873 +               if (done != 0)
65874 +                       return done;
65875 +       }
65876 +
65877 +       /* loop descending a tree */
65878 +       while (!done) {
65879 +
65880 +               if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
65881 +                            IS_POW(iterations))) {
65882 +                       warning("nikita-1481", "Too many iterations: %i",
65883 +                               iterations);
65884 +                       print_key("key", h->key);
65885 +                       ++iterations;
65886 +               } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
65887 +                       h->error =
65888 +                           "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
65889 +                       h->result = RETERR(-EIO);
65890 +                       break;
65891 +               }
65892 +               switch (cbk_level_lookup(h)) {
65893 +               case LOOKUP_CONT:
65894 +                       move_lh(h->parent_lh, h->active_lh);
65895 +                       continue;
65896 +               default:
65897 +                       wrong_return_value("nikita-372", "cbk_level");
65898 +               case LOOKUP_DONE:
65899 +                       done = 1;
65900 +                       break;
65901 +               case LOOKUP_REST:
65902 +                       hput(h);
65903 +                       /* deadlock avoidance is normal case. */
65904 +                       if (h->result != -E_DEADLOCK)
65905 +                               ++iterations;
65906 +                       preempt_point();
65907 +                       goto restart;
65908 +               }
65909 +       }
65910 +       /* that's all. The rest is error handling */
65911 +       if (unlikely(h->error != NULL)) {
65912 +               warning("nikita-373", "%s: level: %i, "
65913 +                       "lock_level: %i, stop_level: %i "
65914 +                       "lock_mode: %s, bias: %s",
65915 +                       h->error, h->level, h->lock_level, h->stop_level,
65916 +                       lock_mode_name(h->lock_mode), bias_name(h->bias));
65917 +               reiser4_print_address("block", &h->block);
65918 +               print_key("key", h->key);
65919 +               print_coord_content("coord", h->coord);
65920 +       }
65921 +       /* `unlikely' error case */
65922 +       if (unlikely(IS_CBKERR(h->result))) {
65923 +               /* failure. do cleanup */
65924 +               hput(h);
65925 +       } else {
65926 +               assert("nikita-1605", WITH_DATA_RET
65927 +                      (h->coord->node, 1,
65928 +                       ergo((h->result == CBK_COORD_FOUND) &&
65929 +                            (h->bias == FIND_EXACT) &&
65930 +                            (!node_is_empty(h->coord->node)),
65931 +                            coord_is_existing_item(h->coord))));
65932 +       }
65933 +       return h->result;
65934 +}
65935 +
65936 +/* find delimiting keys of child
65937 +
65938 +   Determine left and right delimiting keys for child pointed to by
65939 +   @parent_coord.
65940 +
65941 +*/
65942 +static void find_child_delimiting_keys(znode * parent  /* parent znode, passed
65943 +                                                        * locked */ ,
65944 +                                      const coord_t * parent_coord     /* coord where
65945 +                                                                        * pointer to
65946 +                                                                        * child is
65947 +                                                                        * stored */ ,
65948 +                                      reiser4_key * ld /* where to store left
65949 +                                                        * delimiting key */ ,
65950 +                                      reiser4_key * rd /* where to store right
65951 +                                                        * delimiting key */ )
65952 +{
65953 +       coord_t neighbor;
65954 +
65955 +       assert("nikita-1484", parent != NULL);
65956 +       assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
65957 +
65958 +       coord_dup(&neighbor, parent_coord);
65959 +
65960 +       if (neighbor.between == AT_UNIT)
65961 +               /* imitate item ->lookup() behavior. */
65962 +               neighbor.between = AFTER_UNIT;
65963 +
65964 +       if (coord_set_to_left(&neighbor) == 0)
65965 +               unit_key_by_coord(&neighbor, ld);
65966 +       else {
65967 +               assert("nikita-14851", 0);
65968 +               *ld = *znode_get_ld_key(parent);
65969 +       }
65970 +
65971 +       coord_dup(&neighbor, parent_coord);
65972 +       if (neighbor.between == AT_UNIT)
65973 +               neighbor.between = AFTER_UNIT;
65974 +       if (coord_set_to_right(&neighbor) == 0)
65975 +               unit_key_by_coord(&neighbor, rd);
65976 +       else
65977 +               *rd = *znode_get_rd_key(parent);
65978 +}
65979 +
65980 +/*
65981 + * setup delimiting keys for a child, unless they are already set
65982 + *
65983 + * @parent parent node
65984 + * @coord location in @parent where pointer to @child is
65985 + * @child child node
65986 + *
65987 + * Returns 1 if JNODE_DKSET was clear at the lock-free check, 0 otherwise.
65988 + */
65989 +int
65990 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
65991 +{
65992 +       reiser4_tree *tree;
65993 +
65994 +       assert("nikita-2952",
65995 +              znode_get_level(parent) == znode_get_level(coord->node));
65996 +
65997 +       /* fast check without taking dk lock. This is safe, because
65998 +        * JNODE_DKSET is never cleared once set. */
65999 +       if (!ZF_ISSET(child, JNODE_DKSET)) {
66000 +               tree = znode_get_tree(parent);
66001 +               write_lock_dk(tree);
66002 +               if (likely(!ZF_ISSET(child, JNODE_DKSET))) { /* re-check under dk lock */
66003 +                       find_child_delimiting_keys(parent, coord,
66004 +                                                  &child->ld_key,
66005 +                                                  &child->rd_key);
66006 +                       ON_DEBUG(child->ld_key_version =
66007 +                                atomic_inc_return(&delim_key_version);
66008 +                                child->rd_key_version =
66009 +                                atomic_inc_return(&delim_key_version););
66010 +                       ZF_SET(child, JNODE_DKSET);
66011 +               }
66012 +               write_unlock_dk(tree);
66013 +               return 1;
66014 +       }
66015 +       return 0;
66016 +}
66017 +
66018 +/* Perform tree lookup at one level. This is called from traverse_tree(),
66019 +   which drives lookup through the tree, and itself calls cbk_node_lookup()
66020 +   to perform lookup within one node.
66021 +
66022 +   See comments in the code.
66023 +*/
66024 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
66025 +{
66026 +       int ret;
66027 +       int setdk;
66028 +       int ldkeyset = 0;
66029 +       reiser4_key ldkey;
66030 +       reiser4_key key;
66031 +       znode *active;
66032 +
66033 +       assert("nikita-3025", schedulable());
66034 +
66035 +       /* acquire reference to @active node */
66036 +       active =
66037 +           zget(h->tree, &h->block, h->parent_lh->node, h->level, get_gfp_mask());
66038 +
66039 +       if (IS_ERR(active)) {
66040 +               h->result = PTR_ERR(active);
66041 +               return LOOKUP_DONE;
66042 +       }
66043 +
66044 +       /* lock @active */
66045 +       h->result = longterm_lock_znode(h->active_lh,
66046 +                                       active,
66047 +                                       cbk_lock_mode(h->level, h),
66048 +                                       ZNODE_LOCK_LOPRI);
66049 +       /* longterm_lock_znode() acquires additional reference to znode (which
66050 +          will be later released by longterm_unlock_znode()). Release
66051 +          reference acquired by zget().
66052 +        */
66053 +       zput(active);
66054 +       if (unlikely(h->result != 0))
66055 +               goto fail_or_restart;
66056 +
66057 +       setdk = 0;
66058 +       /* if @active is accessed for the first time, setup delimiting keys on
66059 +          it. Delimiting keys are taken from the parent node. See
66060 +          setup_delimiting_keys() for details.
66061 +        */
66062 +       if (h->flags & CBK_DKSET) {
66063 +               setdk = setup_delimiting_keys(h);
66064 +               h->flags &= ~CBK_DKSET;
66065 +       } else {
66066 +               znode *parent;
66067 +
66068 +               parent = h->parent_lh->node;
66069 +               h->result = zload(parent);
66070 +               if (unlikely(h->result != 0))
66071 +                       goto fail_or_restart;
66072 +
66073 +               if (!ZF_ISSET(active, JNODE_DKSET))
66074 +                       setdk = set_child_delimiting_keys(parent,
66075 +                                                         h->coord, active);
66076 +               else {
66077 +                       read_lock_dk(h->tree);
66078 +                       find_child_delimiting_keys(parent, h->coord, &ldkey,
66079 +                                                  &key);
66080 +                       read_unlock_dk(h->tree);
66081 +                       ldkeyset = 1;
66082 +               }
66083 +               zrelse(parent);
66084 +       }
66085 +
66086 +       /* this is ugly kludge. Reminder: this is necessary, because
66087 +          ->lookup() method returns coord with ->between field probably set
66088 +          to something different from AT_UNIT.
66089 +        */
66090 +       h->coord->between = AT_UNIT;
66091 +
66092 +       if (znode_just_created(active) && (h->coord->node != NULL)) {
66093 +               write_lock_tree(h->tree);
66094 +               /* if we are going to load znode right now, setup
66095 +                  ->in_parent: coord where pointer to this node is stored in
66096 +                  parent.
66097 +                */
66098 +               coord_to_parent_coord(h->coord, &active->in_parent);
66099 +               write_unlock_tree(h->tree);
66100 +       }
66101 +
66102 +       /* check connectedness without holding tree lock---false negatives
66103 +        * will be re-checked by connect_znode(), and false positives are
66104 +        * impossible---@active cannot suddenly turn into unconnected
66105 +        * state. */
66106 +       if (!znode_is_connected(active)) {
66107 +               h->result = connect_znode(h->coord, active);
66108 +               if (unlikely(h->result != 0)) {
66109 +                       put_parent(h);
66110 +                       goto fail_or_restart;
66111 +               }
66112 +       }
66113 +
66114 +       jload_prefetch(ZJNODE(active));
66115 +
66116 +       if (setdk)
66117 +               update_stale_dk(h->tree, active);
66118 +
66119 +       /* put_parent() cannot be called earlier, because connect_znode()
66120 +          assumes parent node is referenced; */
66121 +       put_parent(h);
66122 +
66123 +       if ((!znode_contains_key_lock(active, h->key) &&
66124 +            (h->flags & CBK_TRUST_DK))
66125 +           || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
66126 +               /* 1. key was moved out of this node while this thread was
66127 +                  waiting for the lock. Restart. More elaborate solution is
66128 +                  to determine where key moved (to the left, or to the right)
66129 +                  and try to follow it through sibling pointers.
66130 +
66131 +                  2. or, node itself is going to be removed from the
66132 +                  tree. Release lock and restart.
66133 +                */
66134 +               h->result = -E_REPEAT;
66135 +       }
66136 +       if (h->result == -E_REPEAT)
66137 +               return LOOKUP_REST;
66138 +
66139 +       h->result = zload_ra(active, h->ra_info);
66140 +       if (h->result) {
66141 +               return LOOKUP_DONE;
66142 +       }
66143 +
66144 +       /* sanity checks */
66145 +       if (sanity_check(h)) {
66146 +               zrelse(active);
66147 +               return LOOKUP_DONE;
66148 +       }
66149 +
66150 +       /* check that key of leftmost item in the @active is the same as in
66151 +        * its parent */
66152 +       if (ldkeyset && !node_is_empty(active) &&
66153 +           !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
66154 +               warning("vs-3533", "Keys are inconsistent. Fsck?");
66155 +               print_key("inparent", &ldkey);
66156 +               print_key("inchild", &key);
66157 +               h->result = RETERR(-EIO);
66158 +               zrelse(active);
66159 +               return LOOKUP_DONE;
66160 +       }
66161 +
66162 +       if (h->object != NULL)
66163 +               handle_vroot(h->object, active);
66164 +
66165 +       ret = cbk_node_lookup(h);
66166 +
66167 +       /* h->active_lh->node might change, but active is yet to be zrelsed */
66168 +       zrelse(active);
66169 +
66170 +       return ret;
66171 +
66172 +      fail_or_restart:
66173 +       if (h->result == -E_DEADLOCK)
66174 +               return LOOKUP_REST;
66175 +       return LOOKUP_DONE;
66176 +}
66177 +
66178 +#if REISER4_DEBUG
66179 +/* debugging check of left and right delimiting keys of a (locked) znode */
66180 +void check_dkeys(znode * node)
66181 +{
66182 +       znode *left;
66183 +       znode *right;
66184 +
66185 +       read_lock_tree(current_tree);
66186 +       read_lock_dk(current_tree);
66187 +
66188 +       assert("vs-1710", znode_is_any_locked(node));
66189 +       assert("vs-1197",
66190 +              !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
66191 +
66192 +       left = node->left;
66193 +       right = node->right;
66194 +
66195 +       if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
66196 +           && left != NULL && ZF_ISSET(left, JNODE_DKSET))
66197 +               /* check left neighbor. Left neighbor is not locked, so a key
66198 +                  mismatch is tolerated if it has JNODE_HEARD_BANSHEE set */
66199 +               assert("vs-1198",
66200 +                      (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
66201 +                       || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
66202 +
66203 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
66204 +           && right != NULL && ZF_ISSET(right, JNODE_DKSET))
66205 +               /* check right neighbor. Right neighbor is not locked, so a key
66206 +                  mismatch is tolerated if it has JNODE_HEARD_BANSHEE set */
66207 +               assert("vs-1199",
66208 +                      (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
66209 +                       || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
66210 +
66211 +       read_unlock_dk(current_tree);
66212 +       read_unlock_tree(current_tree);
66213 +}
66214 +#endif
66215 +
66216 +/* true if @key is left delimiting key of @node (compared under dk lock) */
66217 +static int key_is_ld(znode * node, const reiser4_key * key)
66218 +{
66219 +       int ld;
66220 +
66221 +       assert("nikita-1716", node != NULL);
66222 +       assert("nikita-1758", key != NULL);
66223 +
66224 +       read_lock_dk(znode_get_tree(node));
66225 +       assert("nikita-1759", znode_contains_key(node, key));
66226 +       ld = keyeq(znode_get_ld_key(node), key);
66227 +       read_unlock_dk(znode_get_tree(node));
66228 +       return ld;
66229 +}
66230 +
66231 +/* Process one node during tree traversal. This is called by
66232 +   cbk_level_lookup(), prepare_object_lookup() and
66233 +   cbk_cache_scan_slots(). */
66234 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
66235 +{
66236 +       /* node plugin of @active */
66237 +       node_plugin *nplug;
66238 +       /* item plugin of item that was found */
66239 +       item_plugin *iplug;
66240 +       /* search bias */
66241 +       lookup_bias node_bias;
66242 +       /* node we are operating upon */
66243 +       znode *active;
66244 +       /* tree we are searching in */
66245 +       reiser4_tree *tree;
66246 +       /* result */
66247 +       int result;
66248 +
66249 +       assert("nikita-379", h != NULL);
66250 +
66251 +       active = h->active_lh->node;
66252 +       tree = h->tree;
66253 +
66254 +       nplug = active->nplug;
66255 +       assert("nikita-380", nplug != NULL);
66256 +
66257 +       ON_DEBUG(check_dkeys(active));
66258 +
66259 +       /* return item from "active" node with maximal key not greater than
66260 +          "key"  */
66261 +       node_bias = h->bias;
66262 +       result = nplug->lookup(active, h->key, node_bias, h->coord);
66263 +       if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
66264 +               /* error occurred */
66265 +               h->result = result;
66266 +               return LOOKUP_DONE;
66267 +       }
66268 +       if (h->level == h->stop_level) {
66269 +               /* welcome to the stop level */
66270 +               assert("nikita-381", h->coord->node == active);
66271 +               if (result == NS_FOUND) {
66272 +                       /* success of tree lookup */
66273 +                       if (!(h->flags & CBK_UNIQUE)
66274 +                           && key_is_ld(active, h->key)) {
66275 +                               return search_to_left(h);
66276 +                       } else
66277 +                               h->result = CBK_COORD_FOUND;
66278 +               } else {
66279 +                       h->result = CBK_COORD_NOTFOUND;
66280 +               }
66281 +               if (!(h->flags & CBK_IN_CACHE))
66282 +                       cbk_cache_add(active);
66283 +               return LOOKUP_DONE;
66284 +       }
66285 +
66286 +       if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
66287 +               h->error = "not found on internal node";
66288 +               h->result = result;
66289 +               return LOOKUP_DONE;
66290 +       }
66291 +
66292 +       assert("vs-361", h->level > h->stop_level);
66293 +
66294 +       if (handle_eottl(h, &result)) {
66295 +               assert("vs-1674", (result == LOOKUP_DONE ||
66296 +                                  result == LOOKUP_REST));
66297 +               return result;
66298 +       }
66299 +
66300 +       /* go down to next level */
66301 +       check_me("vs-12", zload(h->coord->node) == 0);
66302 +       assert("nikita-2116", item_is_internal(h->coord));
66303 +       iplug = item_plugin_by_coord(h->coord);
66304 +       iplug->s.internal.down_link(h->coord, h->key, &h->block);
66305 +       zrelse(h->coord->node);
66306 +       --h->level;
66307 +       return LOOKUP_CONT;     /* continue */
66308 +}
66309 +
66310 +/* scan cbk_cache slots looking for a match for @h */
66311 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
66312 +{
66313 +       level_lookup_result llr;
66314 +       znode *node;
66315 +       reiser4_tree *tree;
66316 +       cbk_cache_slot *slot;
66317 +       cbk_cache *cache;
66318 +       tree_level level;
66319 +       int isunique;
66320 +       const reiser4_key *key;
66321 +       int result;
66322 +
66323 +       assert("nikita-1317", h != NULL);
66324 +       assert("nikita-1315", h->tree != NULL);
66325 +       assert("nikita-1316", h->key != NULL);
66326 +
66327 +       tree = h->tree;
66328 +       cache = &tree->cbk_cache;
66329 +       if (cache->nr_slots == 0)
66330 +               /* size of cbk cache was set to 0 by mount time option. */
66331 +               return RETERR(-ENOENT);
66332 +
66333 +       assert("nikita-2474", cbk_cache_invariant(cache));
66334 +       node = NULL;            /* to keep gcc happy */
66335 +       level = h->level;
66336 +       key = h->key;
66337 +       isunique = h->flags & CBK_UNIQUE;
66338 +       result = RETERR(-ENOENT);
66339 +
66340 +       /*
66341 +        * this is time-critical function and dragons had, hence, been settled
66342 +        * here.
66343 +        *
66344 +        * Loop below scans cbk cache slots trying to find matching node with
66345 +        * suitable range of delimiting keys and located at the h->level.
66346 +        *
66347 +        * Scan is done under cbk cache spin lock that protects slot->node
66348 +        * pointers. If suitable node is found we want to pin it in
66349 +        * memory. But slot->node can point to the node with x_count 0
66350 +        * (unreferenced). Such node can be recycled at any moment, or can
66351 +        * already be in the process of being recycled (within jput()).
66352 +        *
66353 +        * As we found node in the cbk cache, it means that jput() hasn't yet
66354 +        * called cbk_cache_invalidate().
66355 +        *
66356 +        * We acquire reference to the node without holding tree lock, and
66357 +        * later, check node's RIP bit. This avoids races with jput().
66358 +        */
66359 +
66360 +       rcu_read_lock();
66361 +       read_lock(&((cbk_cache *)cache)->guard);
66362 +
66363 +       slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66364 +       slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66365 +       BUG_ON(&slot->lru != &cache->lru);/*????*/
66366 +       while (1) {
66367 +
66368 +               slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66369 +
66370 +               if (&cache->lru != &slot->lru)
66371 +                       node = slot->node;
66372 +               else
66373 +                       node = NULL;
66374 +
66375 +               if (unlikely(node == NULL))
66376 +                       break;
66377 +
66378 +               /*
66379 +                * this is (hopefully) the only place in the code where we are
66380 +                * working with delimiting keys without holding dk lock. This
66381 +                * is fine here, because this is only "guess" anyway---keys
66382 +                * are rechecked under dk lock below.
66383 +                */
66384 +               if (znode_get_level(node) == level &&
66385 +                   /* min_key < key < max_key */
66386 +                   znode_contains_key_strict(node, key, isunique)) {
66387 +                       zref(node);
66388 +                       result = 0;
66389 +                       spin_lock_prefetch(&tree->tree_lock);
66390 +                       break;
66391 +               }
66392 +       }
66393 +       read_unlock(&((cbk_cache *)cache)->guard);
66394 +
66395 +       assert("nikita-2475", cbk_cache_invariant(cache));
66396 +
66397 +       if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66398 +               result = -ENOENT;
66399 +
66400 +       rcu_read_unlock();
66401 +
66402 +       if (result != 0) {
66403 +               h->result = CBK_COORD_NOTFOUND;
66404 +               return RETERR(-ENOENT);
66405 +       }
66406 +
66407 +       result =
66408 +           longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66409 +                               ZNODE_LOCK_LOPRI);
66410 +       zput(node);
66411 +       if (result != 0)
66412 +               return result;
66413 +       result = zload(node);
66414 +       if (result != 0)
66415 +               return result;
66416 +
66417 +       /* recheck keys */
66418 +       read_lock_dk(tree);
66419 +       result = (znode_contains_key_strict(node, key, isunique) &&
66420 +               !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66421 +       read_unlock_dk(tree);
66422 +       if (result) {
66423 +               /* do lookup inside node */
66424 +               llr = cbk_node_lookup(h);
66425 +               /* if cbk_node_lookup() wandered to another node (due to eottl
66426 +                  or non-unique keys), adjust @node */
66427 +               /*node = h->active_lh->node; */
66428 +
66429 +               if (llr != LOOKUP_DONE) {
66430 +                       /* restart or continue on the next level */
66431 +                       result = RETERR(-ENOENT);
66432 +               } else if (IS_CBKERR(h->result))
66433 +                       /* io or oom */
66434 +                       result = RETERR(-ENOENT);
66435 +               else {
66436 +                       /* good. Either item found or definitely not found. */
66437 +                       result = 0;
66438 +
66439 +                       write_lock(&(cache->guard));
66440 +                       if (slot->node == h->active_lh->node /*node */ ) {
66441 +                               /* if this node is still in cbk cache---move
66442 +                                  its slot to the head of the LRU list. */
66443 +                               list_move(&slot->lru, &cache->lru);
66444 +                       }
66445 +                       write_unlock(&(cache->guard));
66446 +               }
66447 +       } else {
66448 +               /* race. While this thread was waiting for the lock, node was
66449 +                  rebalanced and item we are looking for, shifted out of it
66450 +                  (if it ever was here).
66451 +
66452 +                  Continuing scanning is almost hopeless: node key range was
66453 +                  moved to, is almost certainly at the beginning of the LRU
66454 +                  list at this time, because it's hot, but restarting
66455 +                  scanning from the very beginning is complex. Just return,
66456 +                  so that cbk() will be performed. This is not that
66457 +                  important, because such races should be rare. Are they?
66458 +                */
66459 +               result = RETERR(-ENOENT);       /* -ERAUGHT */
66460 +       }
66461 +       zrelse(node);
66462 +       assert("nikita-2476", cbk_cache_invariant(cache));
66463 +       return result;
66464 +}
66465 +
66466 +/* look for item with given key in the coord cache
66467 +
66468 +   This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
66469 +   which is a small LRU list of znodes accessed lately. For each znode in
66470 +   this list, it checks whether key we are looking for fits into key
66471 +   range covered by this node. If so, and in addition, node lies at allowed
66472 +   level (this is to handle extents on a twig level), node is locked, and
66473 +   lookup inside it is performed.
66474 +
66475 +   we need a measurement of the cost of this cache search compared to the cost
66476 +   of coord_by_key.
66477 +   Returns 0 if lookup was completed from the cache, non-zero otherwise.
66478 +*/
66479 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66480 +{
66481 +       int result = 0;
66482 +       tree_level level;
66483 +
66484 +       /* add CBK_IN_CACHE to the handle flags. This means that
66485 +        * cbk_node_lookup() assumes that cbk_cache is scanned and would not
66486 +        * add found node to the cache. */
66487 +       h->flags |= CBK_IN_CACHE;
66488 +       for (level = h->stop_level; level <= h->lock_level; ++level) {
66489 +               h->level = level;
66490 +               result = cbk_cache_scan_slots(h);
66491 +               if (result != 0) {
66492 +                       done_lh(h->active_lh);
66493 +                       done_lh(h->parent_lh);
66494 +               } else {
66495 +                       assert("nikita-1319", !IS_CBKERR(h->result));
66496 +                       break;
66497 +               }
66498 +       }
66499 +       h->flags &= ~CBK_IN_CACHE;
66500 +       return result;
66501 +}
66502 +
66503 +/* lock type to take during tree traversal: lock user asked for (h->lock_mode)
66504 +    on levels up to and including h->lock_level, read lock on upper levels. */
66505 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66506 +{
66507 +       assert("nikita-382", h != NULL);
66508 +
66509 +       return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66510 +}
66511 +
66512 +/* update outdated delimiting keys: sync rd key of @node with right neighbor */
66513 +static void stale_dk(reiser4_tree * tree, znode * node)
66514 +{
66515 +       znode *right;
66516 +
66517 +       read_lock_tree(tree);
66518 +       write_lock_dk(tree);
66519 +       right = node->right;
66520 +
66521 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66522 +           right && ZF_ISSET(right, JNODE_DKSET) &&
66523 +           !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66524 +               znode_set_rd_key(node, znode_get_ld_key(right));
66525 +
66526 +       write_unlock_dk(tree);
66527 +       read_unlock_tree(tree);
66528 +}
66529 +
66530 +/* check (under read locks) for possibly outdated right delimiting key of
66531 + * @node, and have stale_dk() update it under dk write lock if necessary. */
66532 +static void update_stale_dk(reiser4_tree * tree, znode * node)
66533 +{
66534 +       znode *right;
66535 +       reiser4_key rd;
66536 +
66537 +       read_lock_tree(tree);
66538 +       read_lock_dk(tree);
66539 +       rd = *znode_get_rd_key(node);
66540 +       right = node->right;
66541 +       if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66542 +                    right && ZF_ISSET(right, JNODE_DKSET) &&
66543 +                    !keyeq(&rd, znode_get_ld_key(right)))) {
66544 +               assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66545 +               read_unlock_dk(tree);
66546 +               read_unlock_tree(tree);
66547 +               stale_dk(tree, node); /* repeats the check under dk write lock */
66548 +               return;
66549 +       }
66550 +       read_unlock_dk(tree);
66551 +       read_unlock_tree(tree);
66552 +}
66553 +
66554 +/*
66555 + * handle searches for a non-unique key.
66556 + *
66557 + * Suppose that we are looking for an item with possibly non-unique key 100.
66558 + *
66559 + * Root node contains two pointers: one to a node with left delimiting key 0,
66560 + * and another to a node with left delimiting key 100. The item we are looking
66561 + * for may well be in the sub-tree rooted at the first pointer.
66562 + *
66563 + * To handle this search_to_left() is called when search reaches stop level.
66564 + * This function checks whether it is _possible_ that the item we are looking
66565 + * for is in the left neighbor (this can be done by comparing delimiting keys)
66566 + * and if so, tries to lock left neighbor (this is low priority lock, so it can
66567 + * deadlock, tree traversal is just restarted if it did) and then checks
66568 + * whether left neighbor actually contains items with our key.
66569 + *
66570 + * Note that this is done on the stop level only. It is possible to try such
66571 + * left-check on each level, but as duplicate keys are supposed to be rare
66572 + * (very unlikely that more than one node is completely filled with items with
66573 + * duplicate keys), it is cheaper to scan to the left on the stop level once.
66574 + * Returns LOOKUP_REST on deadlock, LOOKUP_CONT after pointing h->block at the
66575 + * left neighbor (key found there), LOOKUP_DONE otherwise (see h->result). */
66576 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66577 +{
66578 +       level_lookup_result result;
66579 +       coord_t *coord;
66580 +       znode *node;
66581 +       znode *neighbor;
66582 +
66583 +       lock_handle lh;
66584 +
66585 +       assert("nikita-1761", h != NULL);
66586 +       assert("nikita-1762", h->level == h->stop_level);
66587 +
66588 +       init_lh(&lh);
66589 +       coord = h->coord;
66590 +       node = h->active_lh->node;
66591 +       assert("nikita-1763", coord_is_leftmost_unit(coord));
66592 +
66593 +       h->result =
66594 +           reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66595 +                                     GN_CAN_USE_UPPER_LEVELS);
66596 +       neighbor = NULL;
66597 +       switch (h->result) {
66598 +       case -E_DEADLOCK:
66599 +               result = LOOKUP_REST;
66600 +               break;
66601 +       case 0:{
66602 +                       node_plugin *nplug;
66603 +                       coord_t crd;
66604 +                       lookup_bias bias;
66605 +
66606 +                       neighbor = lh.node;
66607 +                       h->result = zload(neighbor);
66608 +                       if (h->result != 0) {
66609 +                               result = LOOKUP_DONE;
66610 +                               break;
66611 +                       }
66612 +
66613 +                       nplug = neighbor->nplug;
66614 +                       /* exact-match lookup in the neighbor; h->bias is restored after */
66615 +                       coord_init_zero(&crd);
66616 +                       bias = h->bias;
66617 +                       h->bias = FIND_EXACT;
66618 +                       h->result =
66619 +                           nplug->lookup(neighbor, h->key, h->bias, &crd);
66620 +                       h->bias = bias;
66621 +                       /* the case labels below jump straight into the body of this if */
66622 +                       if (h->result == NS_NOT_FOUND) {
66623 +       case -E_NO_NEIGHBOR:    /* no left neighbor: treated like "not found" */
66624 +                               h->result = CBK_COORD_FOUND;
66625 +                               if (!(h->flags & CBK_IN_CACHE))
66626 +                                       cbk_cache_add(node);
66627 +       default:                /* some other error (code above falls through to here) */
66628 +                               result = LOOKUP_DONE;
66629 +                       } else if (h->result == NS_FOUND) {
66630 +                               read_lock_dk(znode_get_tree(neighbor));
66631 +                               h->rd_key = *znode_get_ld_key(node);
66632 +                               leftmost_key_in_node(neighbor, &h->ld_key);
66633 +                               read_unlock_dk(znode_get_tree(neighbor));
66634 +                               h->flags |= CBK_DKSET;
66635 +
66636 +                               h->block = *znode_get_block(neighbor);
66637 +                               /* clear coord -> node so that cbk_level_lookup()
66638 +                                  wouldn't overwrite parent hint in neighbor.
66639 +
66640 +                                  Parent hint was set up by
66641 +                                  reiser4_get_left_neighbor()
66642 +                                */
66643 +                               /* FIXME: why do we have to spinlock here? */
66644 +                               write_lock_tree(znode_get_tree(neighbor));
66645 +                               h->coord->node = NULL;
66646 +                               write_unlock_tree(znode_get_tree(neighbor));
66647 +                               result = LOOKUP_CONT;
66648 +                       } else {
66649 +                               result = LOOKUP_DONE;
66650 +                       }
66651 +                       if (neighbor != NULL)
66652 +                               zrelse(neighbor);
66653 +               }
66654 +       }
66655 +       done_lh(&lh);
66656 +       return result;
66657 +}
66658 +
66659 +/* debugging aid: return symbolic name of search @bias.
66660 +   An unrecognized value is formatted into a static buffer, which the next
66661 +   such call overwrites. */
66662 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66663 +{
66664 +       static char buf[30];
66665 +
66666 +       if (bias == FIND_EXACT)
66667 +               return "exact";
66668 +       if (bias == FIND_MAX_NOT_MORE_THAN)
66669 +               return "left-slant";
66670 +
66671 +       /* neither of the known biases */
66672 +       sprintf(buf, "unknown: %i", bias);
66673 +       return buf;
66674 +}
66675 +
66676 +#if REISER4_DEBUG
66677 +/* debugging aid: print info about @p (NULL @p and NULL p->node are tolerated) */
66678 +void print_coord_content(const char *prefix /* prefix to print */ ,
66679 +                        coord_t * p /* coord to print */ )
66680 +{
66681 +       reiser4_key key;
66682 +
66683 +       if (p == NULL) {
66684 +               printk("%s: null\n", prefix);
66685 +               return;
66686 +       }
66687 +       if ((p->node != NULL) && znode_is_loaded(p->node)
66688 +           && coord_is_existing_item(p))
66689 +               printk("%s: data: %p, length: %i\n", prefix,
66690 +                      item_body_by_coord(p), item_length_by_coord(p));
66691 +       if ((p->node != NULL) && znode_is_loaded(p->node)) { /* same guard as above */
66692 +               item_key_by_coord(p, &key);
66693 +               print_key(prefix, &key);
66694 +       }
66695 +}
66696 +
66697 +/* debugging aid: printk() @prefix followed by @block as formatted by sprint_address() */
66698 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
66699 +                  const reiser4_block_nr * block /* block number to print */ )
66700 +{
66701 +       printk("%s: %s\n", prefix, sprint_address(block));
66702 +}
66703 +#endif
66704 +
66705 +/* return human readable @block: "null", hex if fake, decimal otherwise */
66706 +char *sprint_address(const reiser4_block_nr *
66707 +                    block /* block number to print */ )
66708 +{
66709 +       static char address[30];        /* shared: overwritten by every call */
66710 +
66711 +       if (block == NULL)
66712 +               sprintf(address, "null");
66713 +       else if (blocknr_is_fake(block))
66714 +               sprintf(address, "%llx", (unsigned long long)(*block));
66715 +       else
66716 +               sprintf(address, "%llu", (unsigned long long)(*block));
66717 +       return address;
66718 +}
66719 +
66720 +/* during traversal: unlock h->parent_lh if that handle has a node attached */
66721 +static void put_parent(cbk_handle * h /* search handle */ )
66722 +{
66723 +       assert("nikita-383", h != NULL);
66724 +
66725 +       if (h->parent_lh->node != NULL)
66726 +               longterm_unlock_znode(h->parent_lh);
66727 +}
66728 +
66729 +/* helper function used by coord_by_key(): release (done_lh()) both the parent
66730 +   and the active lock handles stored in the search handle. */
66731 +static void hput(cbk_handle * h /* search handle */ )
66732 +{
66733 +       assert("nikita-385", h != NULL);
66734 +       done_lh(h->parent_lh);
66735 +       done_lh(h->active_lh);
66736 +}
66737 +
66738 +/* Helper function used by cbk(): update delimiting keys of child node (stored
66739 +   in h->active_lh->node) using h->ld_key and h->rd_key taken on the parent level. */
66740 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66741 +{
66742 +       znode *active;
66743 +       reiser4_tree *tree;
66744 +
66745 +       assert("nikita-1088", h != NULL);
66746 +
66747 +       active = h->active_lh->node;
66748 +
66749 +       /* fast check without taking dk lock. This is safe, because
66750 +        * JNODE_DKSET is never cleared once set. */
66751 +       if (!ZF_ISSET(active, JNODE_DKSET)) {
66752 +               tree = znode_get_tree(active);
66753 +               write_lock_dk(tree);
66754 +               if (!ZF_ISSET(active, JNODE_DKSET)) {   /* re-check under the lock */
66755 +                       znode_set_ld_key(active, &h->ld_key);
66756 +                       znode_set_rd_key(active, &h->rd_key);
66757 +                       ZF_SET(active, JNODE_DKSET);
66758 +               }
66759 +               write_unlock_dk(tree);
66760 +               return 1;       /* JNODE_DKSET was clear on the fast check */
66761 +       }
66762 +       return 0;               /* keys were already set */
66763 +}
66764 +
66765 +/* true if @block makes sense for the file system @tree belongs to (checked
66766 + * against tree->super). Used to detect corrupted node pointers */
66767 +static int
66768 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66769 +                   reiser4_tree * tree /* tree to check against */ )
66770 +{
66771 +       assert("nikita-757", block != NULL);
66772 +       assert("nikita-758", tree != NULL);
66773 +
66774 +       /* range check (presumably against the device size) is done per super block */
66775 +       return reiser4_blocknr_is_sane_for(tree->super, block);
66776 +}
66777 +
66778 +/* check consistency of fields: fails the search with -EIO if the current
66779 +   level is below the stop level or h->block is not sane for the tree.
66780 +   Returns 0 if everything is fine, LOOKUP_DONE otherwise. */
66781 +static int sanity_check(cbk_handle * h /* search handle */ )
66782 +{
66783 +       assert("nikita-384", h != NULL);
66784 +
66785 +       if (h->level < h->stop_level)
66786 +               h->error = "Buried under leaves";
66787 +       else if (!block_nr_is_correct(&h->block, h->tree))
66788 +               h->error = "bad block number";
66789 +       else
66790 +               return 0;
66791 +       h->result = RETERR(-EIO);
66792 +       return LOOKUP_DONE;
66793 +}
66794 +
66795 +/* Make Linus happy.
66796 +   Local variables:
66797 +   c-indentation-style: "K&R"
66798 +   mode-name: "LC"
66799 +   c-basic-offset: 8
66800 +   tab-width: 8
66801 +   fill-column: 120
66802 +   scroll-step: 1
66803 +   End:
66804 +*/
66805 diff --git a/fs/reiser4/status_flags.c b/fs/reiser4/status_flags.c
66806 new file mode 100644
66807 index 0000000..8c8c9d8
66808 --- /dev/null
66809 +++ b/fs/reiser4/status_flags.c
66810 @@ -0,0 +1,176 @@
66811 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66812 + * reiser4/README */
66813 +
66814 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
66815 +
66816 +#include <linux/bio.h>
66817 +#include <linux/highmem.h>
66818 +#include <linux/fs.h>
66819 +#include <linux/blkdev.h>
66820 +#include "debug.h"
66821 +#include "dformat.h"
66822 +#include "status_flags.h"
66823 +#include "super.h"
66824 +
66825 +/* This is our end I/O handler that marks page uptodate if IO was successful (judged
66826 +   by BIO_UPTODATE; @bytes_done and @err are unused) and flags an error otherwise. It
66827 +   then unlocks the page, so we can see that io was done. The bio is not freed for reuse. */
66828 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66829 +                               int err)
66830 +{
66831 +       if (bio->bi_size)
66832 +               return 1;       /* bi_size still non-zero: page stays locked */
66833 +
66834 +       if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66835 +               SetPageUptodate(bio->bi_io_vec->bv_page);
66836 +       } else {
66837 +               ClearPageUptodate(bio->bi_io_vec->bv_page);
66838 +               SetPageError(bio->bi_io_vec->bv_page);
66839 +       }
66840 +       unlock_page(bio->bi_io_vec->bv_page);
66841 +       return 0;
66842 +}
66843 +
66844 +/* Initialise status code: read the status block at @block and keep its page
66845 +   and bio in the super block private data. Expected to be called from the disk
66846 +   format code. Returns 0, -ENOMEM, -EIO (read failed) or -EINVAL (bad magic). */
66847 +int reiser4_status_init(reiser4_block_nr block)
66848 +{
66849 +       struct super_block *sb = reiser4_get_current_sb();
66850 +       struct reiser4_status *statuspage;
66851 +       struct bio *bio;
66852 +       struct page *page;
66853 +
66854 +       get_super_private(sb)->status_page = NULL;
66855 +       get_super_private(sb)->status_bio = NULL;
66856 +
66857 +       page = alloc_pages(GFP_KERNEL, 0);
66858 +       if (!page)
66859 +               return -ENOMEM;
66860 +
66861 +       bio = bio_alloc(GFP_KERNEL, 1);
66862 +       if (bio == NULL) {
66863 +               __free_pages(page, 0);
66864 +               return -ENOMEM;
66865 +       }
66866 +       bio->bi_sector = block * (sb->s_blocksize >> 9);
66867 +       bio->bi_bdev = sb->s_bdev;
66868 +       bio->bi_io_vec[0].bv_page = page;
66869 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66870 +       bio->bi_io_vec[0].bv_offset = 0;
66871 +       bio->bi_vcnt = 1;
66872 +       bio->bi_size = sb->s_blocksize;
66873 +       bio->bi_end_io = reiser4_status_endio;
66874 +       /* the end I/O handler unlocks the page: waiting on the lock waits for the read */
66875 +       lock_page(page);
66876 +       submit_bio(READ, bio);
66877 +       blk_run_address_space(get_super_fake(sb)->i_mapping);
66878 +       wait_on_page_locked(page);
66879 +       if (!PageUptodate(page)) {
66880 +               warning("green-2007",
66881 +                       "I/O error while tried to read status page\n");
66882 +               /* nothing is kept on failure: do not leak the page and the bio */
66883 +               __free_pages(page, 0);
66884 +               bio_put(bio);
66885 +               return -EIO;
66886 +       }
66887 +       statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
66888 +       if (memcmp(statuspage->magic, REISER4_STATUS_MAGIC,
66889 +                  sizeof(REISER4_STATUS_MAGIC))) {
66890 +               /* Magic does not match. */
66891 +               kunmap_atomic((char *)statuspage, KM_USER0);
66892 +               warning("green-2008", "Wrong magic in status block\n");
66893 +               __free_pages(page, 0);
66894 +               bio_put(bio);
66895 +               return -EINVAL;
66896 +       }
66897 +       kunmap_atomic((char *)statuspage, KM_USER0);
66898 +       get_super_private(sb)->status_page = page;
66899 +       get_super_private(sb)->status_bio = bio;
66900 +       return 0;
66901 +}
66902 +
66903 +/* Query the status of fs. Returns one of REISER4_STATUS_MOUNT_* telling if the
66904 +   FS can be safely mounted. If "status" and/or "extended" are given, the
66905 +   corresponding words of the cached status page are stored there. */
66906 +int reiser4_status_query(u64 * status, u64 * extended)
66907 +{
66908 +       struct super_block *sb = reiser4_get_current_sb();
66909 +       struct reiser4_status *statuspage;
66910 +       int retval;
66911 +
66912 +       if (!get_super_private(sb)->status_page) {      // No status page?
66913 +               return REISER4_STATUS_MOUNT_UNKNOWN;
66914 +       }
66915 +       statuspage = (struct reiser4_status *)
66916 +           kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66917 +       switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) {        // FIXME: this cast is a hack for 32 bit arches to work.
66918 +       case REISER4_STATUS_OK:
66919 +               retval = REISER4_STATUS_MOUNT_OK;
66920 +               break;
66921 +       case REISER4_STATUS_CORRUPTED:
66922 +               retval = REISER4_STATUS_MOUNT_WARN;
66923 +               break;
66924 +       case REISER4_STATUS_DAMAGED:
66925 +       case REISER4_STATUS_DESTROYED:
66926 +       case REISER4_STATUS_IOERROR:
66927 +               retval = REISER4_STATUS_MOUNT_RO;
66928 +               break;
66929 +       default:        /* any other value, e.g. several status flags combined */
66930 +               retval = REISER4_STATUS_MOUNT_UNKNOWN;
66931 +               break;
66932 +       }
66933 +       /* hand the raw status words back to the caller, if asked for */
66934 +       if (status)
66935 +               *status = le64_to_cpu(get_unaligned(&statuspage->status));
66936 +       if (extended)
66937 +               *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
66938 +
66939 +       kunmap_atomic((char *)statuspage, KM_USER0);
66940 +       return retval;
66941 +}
66942 +
66943 +/* To be called when something bad happens (e.g. from reiser4_panic). Fills the status
66944 +   structure and submits it to disk. Returns -1 if there is no status page, else 0. */
66945 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
66946 +{
66947 +       struct super_block *sb = reiser4_get_current_sb();
66948 +       struct reiser4_status *statuspage;
66949 +       struct bio *bio = get_super_private(sb)->status_bio;
66950 +
66951 +       if (!get_super_private(sb)->status_page) {      // No status page?
66952 +               return -1;
66953 +       }
66954 +       statuspage = (struct reiser4_status *)
66955 +           kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
66956 +       /* NOTE(review): strncpy() below leaves texterror unterminated if @message fills it */
66957 +       put_unaligned(cpu_to_le64(status), &statuspage->status);
66958 +       put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
66959 +       strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
66960 +       /* NOTE(review): bi_sector is not set again before the bio is resubmitted - confirm */
66961 +       kunmap_atomic((char *)statuspage, KM_USER0);
66962 +       bio->bi_bdev = sb->s_bdev;
66963 +       bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
66964 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
66965 +       bio->bi_io_vec[0].bv_offset = 0;
66966 +       bio->bi_vcnt = 1;
66967 +       bio->bi_size = sb->s_blocksize;
66968 +       bio->bi_end_io = reiser4_status_endio;
66969 +       lock_page(get_super_private(sb)->status_page);  // Safe as nobody should touch our page.
66970 +       /* We can block now, but we have no other choice anyway */
66971 +       submit_bio(WRITE, bio);
66972 +       blk_run_address_space(get_super_fake(sb)->i_mapping);
66973 +       return 0;               // We do not wait for io to finish.
66974 +}
66975 +
66976 +/* Frees the status page and puts the status bio, then clears both pointers. Should be called by disk format at umount time. Always returns 0. */
66977 +int reiser4_status_finish(void)
66978 +{
66979 +       struct super_block *sb = reiser4_get_current_sb();
66980 +       /* NOTE(review): no NULL checks here, yet reiser4_status_init() leaves both NULL on failure - confirm callers */
66981 +       __free_pages(get_super_private(sb)->status_page, 0);
66982 +       get_super_private(sb)->status_page = NULL;
66983 +       bio_put(get_super_private(sb)->status_bio);
66984 +       get_super_private(sb)->status_bio = NULL;
66985 +       return 0;
66986 +}
66987 diff --git a/fs/reiser4/status_flags.h b/fs/reiser4/status_flags.h
66988 new file mode 100644
66989 index 0000000..6cfa5ad
66990 --- /dev/null
66991 +++ b/fs/reiser4/status_flags.h
66992 @@ -0,0 +1,43 @@
66993 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66994 + * reiser4/README */
66995 +
66996 +/* Here we declare structures and flags that store reiser4 status on disk.
66997 +   The status that helps us to find out if the filesystem is valid or if it
66998 +   contains some critical, or not so critical errors */
66999 +
67000 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
67001 +#define __REISER4_STATUS_FLAGS_H__
67002 +
67003 +#include "dformat.h"
67004 +/* These are major status flags */
67005 +#define REISER4_STATUS_OK 0
67006 +#define REISER4_STATUS_CORRUPTED 0x1
67007 +#define REISER4_STATUS_DAMAGED 0x2
67008 +#define REISER4_STATUS_DESTROYED 0x4
67009 +#define REISER4_STATUS_IOERROR 0x8
67010 +
67011 +/* Return values for reiser4_status_query() */
67012 +#define REISER4_STATUS_MOUNT_OK 0
67013 +#define REISER4_STATUS_MOUNT_WARN 1
67014 +#define REISER4_STATUS_MOUNT_RO 2
67015 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
67016 +
67017 +#define REISER4_TEXTERROR_LEN 256
67018 +
67019 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
67020 +/* On-disk status block. We probably need to keep its size under sector size which is 512 bytes */
67021 +struct reiser4_status {
67022 +       char magic[16];         /* REISER4_STATUS_MAGIC, terminating NUL included */
67023 +       d64 status;             /* Current FS state (REISER4_STATUS_*) */
67024 +       d64 extended_status;    /* Any additional info that might make sense in addition to "status". E.g.
67025 +                                  last sector where io error happened if status is "io error encountered" */
67026 +       d64 stacktrace[10];     /* Last ten function calls made (addresses) */
67027 +       char texterror[REISER4_TEXTERROR_LEN];  /* Any error message if appropriate, otherwise filled with zeroes */
67028 +};
67029 +
67030 +int reiser4_status_init(reiser4_block_nr block);
67031 +int reiser4_status_query(u64 * status, u64 * extended);
67032 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
67033 +int reiser4_status_finish(void);
67034 +
67035 +#endif
67036 diff --git a/fs/reiser4/super.c b/fs/reiser4/super.c
67037 new file mode 100644
67038 index 0000000..bf18f2a
67039 --- /dev/null
67040 +++ b/fs/reiser4/super.c
67041 @@ -0,0 +1,313 @@
67042 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67043 + * reiser4/README */
67044 +
67045 +/* Super-block manipulations. */
67046 +
67047 +#include "debug.h"
67048 +#include "dformat.h"
67049 +#include "key.h"
67050 +#include "plugin/security/perm.h"
67051 +#include "plugin/space/space_allocator.h"
67052 +#include "plugin/plugin.h"
67053 +#include "tree.h"
67054 +#include "vfs_ops.h"
67055 +#include "super.h"
67056 +#include "reiser4.h"
67057 +
67058 +#include <linux/types.h>       /* for __u??  */
67059 +#include <linux/fs.h>          /* for struct super_block  */
67060 +
67061 +
67062 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
67063 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
67064 +static __u64 reserved_for_root(const struct super_block *super);
67065 +
67066 +/* Return reiser4-specific part of super block: super->s_fs_info, with no checks on @super */
67067 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super     /* super block
67068 +                                                                                        * queried */ )
67069 +{
67070 +       return (reiser4_super_info_data *) super->s_fs_info;
67071 +}
67072 +
67073 +/* Return reiser4 fstype (REISER4_SUPER_MAGIC): value that is returned in ->f_type field by statfs() */
67074 +long statfs_type(const struct super_block *super UNUSED_ARG    /* super block
67075 +                                                                * queried */ )
67076 +{
67077 +       assert("nikita-448", super != NULL);
67078 +       assert("nikita-449", is_reiser4_super(super));
67079 +       return (long)REISER4_SUPER_MAGIC;       /* @super is only used by the assertions */
67080 +}
67081 +
67082 +/* functions to read/modify fields of reiser4_super_info_data */
67083 +
67084 +/* get number of blocks in file system (block_count field of super private data) */
67085 +__u64 reiser4_block_count(const struct super_block *super      /* super block
67086 +                                                                  queried */ )
67087 +{
67088 +       assert("vs-494", super != NULL);
67089 +       assert("vs-495", is_reiser4_super(super));
67090 +       return get_super_private(super)->block_count;
67091 +}
67092 +
67093 +/*
67094 + * number of blocks in the current file system: block_count field of the
67095 + * private data returned by get_current_super_private() */
67096 +__u64 reiser4_current_block_count(void)
67097 +{
67098 +       return get_current_super_private()->block_count;
67099 +}
67100 +
67101 +/* set number of blocks in filesystem to @nr and derive blocks_reserved from it */
67102 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
67103 +{
67104 +       assert("vs-501", super != NULL);
67105 +       assert("vs-502", is_reiser4_super(super));
67106 +       get_super_private(super)->block_count = nr;
67107 +       /*
67108 +        * For a proper calculation of the reserved space counter (5% of device
67109 +        * block counter) we need a 64 bit division which is missing in Linux
67110 +        * on i386 platform. Because we do not need a precise calculation here
67111 +        * we can replace a div64 operation by this combination of
67112 +        * multiplication and shift: 51. / (2^10) == .0498 .
67113 +        * FIXME: this is a bug. It comes up only for very small filesystems
67114 +        * which probably are never used. Nevertheless, it is a bug. Number of
67115 +        * reserved blocks must be not less than maximal number of blocks which
67116 +        * get grabbed with BA_RESERVED.
67117 +        */
67118 +       get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
67119 +}
67120 +
67121 +/* number of blocks used (allocated for data) in file system: the blocks_used field */
67122 +__u64 reiser4_data_blocks(const struct super_block *super      /* super block
67123 +                                                                  queried */ )
67124 +{
67125 +       assert("nikita-452", super != NULL);
67126 +       assert("nikita-453", is_reiser4_super(super));
67127 +       return get_super_private(super)->blocks_used;
67128 +}
67129 +
67130 +/* set number of blocks used in filesystem (blocks_used field) to @nr */
67131 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
67132 +{
67133 +       assert("vs-503", super != NULL);
67134 +       assert("vs-504", is_reiser4_super(super));
67135 +       get_super_private(super)->blocks_used = nr;
67136 +}
67137 +
67138 +/* number of free blocks in file system: the blocks_free field */
67139 +__u64 reiser4_free_blocks(const struct super_block *super      /* super block
67140 +                                                                  queried */ )
67141 +{
67142 +       assert("nikita-454", super != NULL);
67143 +       assert("nikita-455", is_reiser4_super(super));
67144 +       return get_super_private(super)->blocks_free;
67145 +}
67146 +
67147 +/* set number of free blocks in filesystem (blocks_free field) to @nr */
67148 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
67149 +{
67150 +       assert("vs-505", super != NULL);
67151 +       assert("vs-506", is_reiser4_super(super));
67152 +       get_super_private(super)->blocks_free = nr;
67153 +}
67154 +
67155 +/* get mkfs unique identifier (mkfs_id field of super private data) */
67156 +__u32 reiser4_mkfs_id(const struct super_block *super  /* super block
67157 +                                                          queried */ )
67158 +{
67159 +       assert("vpf-221", super != NULL);
67160 +       assert("vpf-222", is_reiser4_super(super));
67161 +       return get_super_private(super)->mkfs_id;
67162 +}
67163 +
67164 +/* value of the blocks_free_committed counter of the file system */
67165 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
67166 +{
67167 +       assert("vs-497", super != NULL);
67168 +       assert("vs-498", is_reiser4_super(super));
67169 +       return get_super_private(super)->blocks_free_committed;
67170 +}
67171 +
67172 +/* number of blocks in the file system reserved for @uid and @gid */
67173 +long reiser4_reserved_blocks(const struct super_block *super   /* super block
67174 +                                                                  queried */ ,
67175 +                            uid_t uid /* user id */ ,
67176 +                            gid_t gid /* group id */ )
67177 +{
67178 +       long reserved;  /* NOTE(review): the helpers below return __u64 */
67179 +
67180 +       assert("nikita-456", super != NULL);
67181 +       assert("nikita-457", is_reiser4_super(super));
67182 +       /* sum up each kind of reservation that is enabled; root's only for uid 0 */
67183 +       reserved = 0;
67184 +       if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
67185 +               reserved += reserved_for_gid(super, gid);
67186 +       if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
67187 +               reserved += reserved_for_uid(super, uid);
67188 +       if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
67189 +               reserved += reserved_for_root(super);
67190 +       return reserved;
67191 +}
67192 +
67193 +/* get value of grabbed blocks counter (blocks_grabbed field) */
67194 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
67195 +{
67196 +       assert("zam-512", super != NULL);
67197 +       assert("zam-513", is_reiser4_super(super));
67198 +
67199 +       return get_super_private(super)->blocks_grabbed;
67200 +}
67201 +
67202 +__u64 flush_reserved(const struct super_block * super)
67203 +{
67204 +       assert("vpf-285", super != NULL);
67205 +       assert("vpf-286", is_reiser4_super(super));
67206 +       /* get value of the blocks_flush_reserved counter */
67207 +       return get_super_private(super)->blocks_flush_reserved;
67208 +}
67209 +
67210 +/* get value of counter of fake allocated formatted blocks */
67211 +__u64 reiser4_fake_allocated(const struct super_block * super)
67212 +{
67213 +       assert("zam-516", super != NULL);
67214 +       assert("zam-517", is_reiser4_super(super));
67215 +
67216 +       return get_super_private(super)->blocks_fake_allocated;
67217 +}
67218 +
67219 +/* get value of counter of fake allocated unformatted blocks */
67220 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
67221 +{
67222 +       assert("zam-516", super != NULL);       /* NOTE(review): labels repeat those of */
67223 +       assert("zam-517", is_reiser4_super(super));     /* reiser4_fake_allocated() */
67224 +
67225 +       return get_super_private(super)->blocks_fake_allocated_unformatted;
67226 +}
67227 +
67228 +/* get value of counter of clustered blocks (blocks_clustered field) */
67229 +__u64 reiser4_clustered_blocks(const struct super_block * super)
67230 +{
67231 +       assert("edward-601", super != NULL);
67232 +       assert("edward-602", is_reiser4_super(super));
67233 +
67234 +       return get_super_private(super)->blocks_clustered;
67235 +}
67236 +
67237 +/* space allocator used by this file system: address of the one embedded in super private data */
67238 +reiser4_space_allocator *get_space_allocator(const struct super_block * super)
67239 +{
67240 +       assert("nikita-1965", super != NULL);
67241 +       assert("nikita-1966", is_reiser4_super(super));
67242 +       return &get_super_private(super)->space_allocator;
67243 +}
67244 +
67245 +/* return fake inode used to bind formatted nodes in the page cache (->fake field) */
67246 +struct inode *get_super_fake(const struct super_block *super   /* super block
67247 +                                                                  queried */ )
67248 +{
67249 +       assert("nikita-1757", super != NULL);
67250 +       return get_super_private(super)->fake;
67251 +}
67252 +
67253 +/* return fake inode used to bind copied on capture nodes in the page cache (->cc field) */
67254 +struct inode *get_cc_fake(const struct super_block *super      /* super block
67255 +                                                                  queried */ )
67256 +{
67257 +       assert("nikita-1757", super != NULL);   /* NOTE(review): label also used in get_super_fake() */
67258 +       return get_super_private(super)->cc;
67259 +}
67260 +
67261 +/* return fake inode used to bind bitmaps and journal heads (->bitmap field) */
67262 +struct inode *get_bitmap_fake(const struct super_block *super)
67263 +{
67264 +       assert("nikita-17571", super != NULL);
67265 +       return get_super_private(super)->bitmap;
67266 +}
67267 +
67268 +/* tree used by this file system: address of the tree embedded in super private data */
67269 +reiser4_tree *get_tree(const struct super_block * super        /* super block
67270 +                                                        * queried */ )
67271 +{
67272 +       assert("nikita-460", super != NULL);
67273 +       assert("nikita-461", is_reiser4_super(super));
67274 +       return &get_super_private(super)->tree;
67275 +}
67276 +
67277 +/* Check that @super is (looks like) reiser4 super block: non-NULL, has private
67278 +   data, and ->s_op points into that data. This is mainly for use in assertions. */
67279 +int is_reiser4_super(const struct super_block *super   /* super block
67280 +                                                        * queried */ )
67281 +{
67282 +       return
67283 +           super != NULL &&
67284 +           get_super_private(super) != NULL &&
67285 +           super->s_op == &(get_super_private(super)->ops.super);
67286 +}
67287 +
67288 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
67289 +{ /* result of test_bit() for bit number @f in ->fs_flags of @super */
67290 +       return test_bit((int)f, &get_super_private(super)->fs_flags);
67291 +}
67292 +
67293 +/* amount of blocks reserved for given group in file system (stub: always 0) */
67294 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG       /* super
67295 +                                                                                * block
67296 +                                                                                * queried */ ,
67297 +                             gid_t gid UNUSED_ARG /* group id */ )
67298 +{
67299 +       return 0;
67300 +}
67301 +
67302 +/* amount of blocks reserved for given user in file system (stub: always 0) */
67303 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG       /* super
67304 +                                                                                  block
67305 +                                                                                  queried */ ,
67306 +                             uid_t uid UNUSED_ARG /* user id */ )
67307 +{
67308 +       return 0;
67309 +}
67310 +
67311 +/* amount of blocks reserved for super user in file system (stub: always 0) */
67312 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG      /* super
67313 +                                                                                  block
67314 +                                                                                  queried */ )
67315 +{
67316 +       return 0;
67317 +}
67318 +
67319 +/*
67320 + * true if block number @blk is fake (blocknr_is_fake()) or less than ->block_count of @super.
67321 + */
67322 +int
67323 +reiser4_blocknr_is_sane_for(const struct super_block *super,
67324 +                           const reiser4_block_nr * blk)
67325 +{
67326 +       reiser4_super_info_data *sbinfo;
67327 +
67328 +       assert("nikita-2957", super != NULL);
67329 +       assert("nikita-2958", blk != NULL);
67330 +
67331 +       if (blocknr_is_fake(blk))
67332 +               return 1;
67333 +
67334 +       sbinfo = get_super_private(super);
67335 +       return *blk < sbinfo->block_count;
67336 +}
67337 +
67338 +/*
67339 + * true, if block number @blk makes sense for the current file system (reiser4_get_current_sb())
67340 + */
67341 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
67342 +{
67343 +       return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
67344 +}
67345 +
67346 +/* Make Linus happy.
67347 +   Local variables:
67348 +   c-indentation-style: "K&R"
67349 +   mode-name: "LC"
67350 +   c-basic-offset: 8
67351 +   tab-width: 8
67352 +   fill-column: 120
67353 +   End:
67354 +*/
67355 diff --git a/fs/reiser4/super.h b/fs/reiser4/super.h
67356 new file mode 100644
67357 index 0000000..7693b92
67358 --- /dev/null
67359 +++ b/fs/reiser4/super.h
67360 @@ -0,0 +1,467 @@
67361 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67362 + * reiser4/README */
67363 +
67364 +/* Super-block functions. See super.c for details. */
67365 +
67366 +#if !defined( __REISER4_SUPER_H__ )
67367 +#define __REISER4_SUPER_H__
67368 +
67369 +#include "tree.h"
67370 +#include "entd.h"
67371 +#include "wander.h"
67372 +#include "fsdata.h"
67373 +#include "plugin/object.h"
67374 +#include "plugin/space/space_allocator.h"
67375 +
67376 +/*
67377 + * Flush algorithm parameters (kept in ->flush of reiser4_super_info_data).
67378 + */
67379 +typedef struct {
67380 +       unsigned relocate_threshold;
67381 +       unsigned relocate_distance;
67382 +       unsigned written_threshold;
67383 +       unsigned scan_maxnodes;
67384 +} flush_params;
67385 +
67386 +typedef enum { /* bit numbers in ->fs_flags, tested with reiser4_is_set() */
67387 +       /*
67388 +        * True if this file system doesn't support hard-links (multiple names)
67389 +        * for directories: this is default UNIX behavior.
67390 +        *
67391 +        * If hard-links on directories are not allowed, file system is Acyclic
67392 +        * Directed Graph (modulo dot, and dotdot, of course).
67393 +        *
67394 +        * This is used by reiser4_link().
67395 +        */
67396 +       REISER4_ADG = 0,
67397 +       /*
67398 +        * set if all nodes in internal tree have the same node layout plugin.
67399 +        * If so, znode_guess_plugin() will return tree->node_plugin instead
67400 +        * of guessing plugin by plugin id stored in the node.
67401 +        */
67402 +       REISER4_ONE_NODE_PLUGIN = 1,
67403 +       /* if set, bsd gid assignment is supported. */
67404 +       REISER4_BSD_GID = 2,
67405 +       /* [mac]_time are 32 bit in inode */
67406 +       REISER4_32_BIT_TIMES = 3,
67407 +       /* allow concurrent flushes */
67408 +       REISER4_MTFLUSH = 4,
67409 +       /* load all bitmap blocks at mount time. NOTE(review): name says "DONT" - confirm meaning */
67410 +       REISER4_DONT_LOAD_BITMAP = 5,
67411 +       /* enforce atomicity during write(2) */
67412 +       REISER4_ATOMIC_WRITE = 6,
67413 +       /* don't use write barriers in the log writer code. */
67414 +       REISER4_NO_WRITE_BARRIER = 7
67415 +
67416 +} reiser4_fs_flag;
67417 +
67418 +/*
67419 + * VFS related operation vectors (one set per super block, see ->ops of reiser4_super_info_data).
67420 + */
67421 +typedef struct object_ops {
67422 +       struct super_operations super;
67423 +       struct dentry_operations dentry;
67424 +       struct export_operations export;
67425 +} object_ops;
67426 +
67427 +/* reiser4-specific part of super block (reached through ->s_fs_info)
67428 +
67429 +   Locking
67430 +
67431 +   Fields immutable after mount:
67432 +
67433 +    ->oid*
67434 +    ->space*
67435 +    ->default_[ug]id
67436 +    ->mkfs_id
67437 +    ->trace_flags (NOTE(review): not in the struct below)
67438 +    ->debug_flags (NOTE(review): not in the struct below)
67439 +    ->fs_flags
67440 +    ->df_plug
67441 +    ->optimal_io_size
67442 +    ->plug (NOTE(review): not in the struct below)
67443 +    ->flush
67444 +    ->u (bad name)
67445 +    ->txnmgr (NOTE(review): the field below is named ->tmgr)
67446 +    ->ra_params
67447 +    ->fsuid
67448 +    ->journal_header
67449 +    ->journal_footer
67450 +
67451 +   Fields protected by ->lnode_guard (NOTE(review): not in the struct below)
67452 +
67453 +    ->lnode_htable (NOTE(review): not in the struct below)
67454 +
67455 +   Fields protected by per-super block spin lock
67456 +
67457 +    ->block_count
67458 +    ->blocks_used
67459 +    ->blocks_free
67460 +    ->blocks_free_committed
67461 +    ->blocks_grabbed
67462 +    ->blocks_fake_allocated_unformatted
67463 +    ->blocks_fake_allocated
67464 +    ->blocks_flush_reserved
67465 +    ->eflushed (NOTE(review): not in the struct below)
67466 +    ->blocknr_hint_default
67467 +
67468 +   After journal replaying during mount,
67469 +
67470 +    ->last_committed_tx
67471 +
67472 +   is protected by ->tmgr.commit_semaphore
67473 +
67474 +   Invariants involving this data-type:
67475 +
67476 +      [sb-block-counts]
67477 +      [sb-grabbed]
67478 +      [sb-fake-allocated]
67479 +*/
67480 +struct reiser4_super_info_data {
67481 +       /*
67482 +        * guard spinlock which protects reiser4 super block fields (currently
67483 +        * blocks_free, blocks_free_committed); see spin_lock_reiser4_super()
67484 +        */
67485 +       spinlock_t guard;
67486 +
67487 +       /* next oid that will be returned by oid_allocate() */
67488 +       oid_t next_to_use;
67489 +       /* total number of used oids */
67490 +       oid_t oids_in_use;
67491 +
67492 +       /* space manager plugin */
67493 +       reiser4_space_allocator space_allocator;
67494 +
67495 +       /* reiser4 internal tree */
67496 +       reiser4_tree tree;
67497 +
67498 +       /*
67499 +        * default user id used for light-weight files without their own
67500 +        * stat-data.
67501 +        */
67502 +       uid_t default_uid;
67503 +
67504 +       /*
67505 +        * default group id used for light-weight files without their own
67506 +        * stat-data.
67507 +        */
67508 +       gid_t default_gid;
67509 +
67510 +       /* mkfs identifier generated at mkfs time. */
67511 +       __u32 mkfs_id;
67512 +       /* number of blocks in the file system */
67513 +       __u64 block_count;
67514 +
67515 +       /* inviolable reserve (subtracted from the totals reported by reiser4_statfs()) */
67516 +       __u64 blocks_reserved;
67517 +
67518 +       /* amount of blocks used by file system data and meta-data. */
67519 +       __u64 blocks_used;
67520 +
67521 +       /*
67522 +        * amount of free blocks. This is "working" free blocks counter. It is
67523 +        * like "working" bitmap, please see block_alloc.c for description.
67524 +        */
67525 +       __u64 blocks_free;
67526 +
67527 +       /*
67528 +        * free block count for fs committed state. This is "commit" version of
67529 +        * free block counter.
67530 +        */
67531 +       __u64 blocks_free_committed;
67532 +
67533 +       /*
67534 +        * number of blocks reserved for further allocation, for all
67535 +        * threads.
67536 +        */
67537 +       __u64 blocks_grabbed;
67538 +
67539 +       /* number of fake allocated unformatted blocks in tree. */
67540 +       __u64 blocks_fake_allocated_unformatted;
67541 +
67542 +       /* number of fake allocated formatted blocks in tree. */
67543 +       __u64 blocks_fake_allocated;
67544 +
67545 +       /* number of blocks reserved for flush operations. */
67546 +       __u64 blocks_flush_reserved;
67547 +
67548 +       /* number of blocks reserved for cluster operations. */
67549 +       __u64 blocks_clustered;
67550 +
67551 +       /* unique file-system identifier */
67552 +       __u32 fsuid;
67553 +
67554 +       /* file-system wide flags: bit numbers are the values of reiser4_fs_flag enum */
67555 +       unsigned long fs_flags;
67556 +
67557 +       /* transaction manager */
67558 +       txn_mgr tmgr;
67559 +
67560 +       /* ent thread context */
67561 +       entd_context entd;
67562 +
67563 +       /* fake inode used to bind formatted nodes */
67564 +       struct inode *fake;
67565 +       /* inode used to bind bitmaps (and journal heads) */
67566 +       struct inode *bitmap;
67567 +       /* inode used to bind copied on capture nodes */
67568 +       struct inode *cc;
67569 +
67570 +       /* disk layout plugin */
67571 +       disk_format_plugin *df_plug;
67572 +
67573 +       /* disk layout specific part of reiser4 super info data */
67574 +       union {
67575 +               format40_super_info format40;
67576 +       } u;
67577 +
67578 +       /* value we return in st_blksize on stat(2) */
67579 +       unsigned long optimal_io_size;
67580 +
67581 +       /* parameters for the flush algorithm */
67582 +       flush_params flush;
67583 +
67584 +       /* pointers to jnodes for journal header and footer */
67585 +       jnode *journal_header;
67586 +       jnode *journal_footer;
67587 +
67588 +       journal_location jloc;
67589 +
67590 +       /* head block number of last committed transaction */
67591 +       __u64 last_committed_tx;
67592 +
67593 +       /*
67594 +        * we remember last written location for using as a hint for new block
67595 +        * allocation
67596 +        */
67597 +       __u64 blocknr_hint_default;
67598 +
67599 +       /* committed number of files (oid allocator state variable ) */
67600 +       __u64 nr_files_committed;
67601 +
67602 +       ra_params_t ra_params;
67603 +
67604 +       /*
67605 +        * A semaphore for serializing cut tree operation if out-of-free-space:
67606 +        * only one cut_tree thread is allowed to grab space from the reserved
67607 +        * area (it is 5% of disk space)
67608 +        */
67609 +       struct semaphore delete_sema;
67610 +       /* task owning ->delete_sema */
67611 +       struct task_struct *delete_sema_owner;
67612 +
67613 +       /* serialize semaphore (NOTE(review): presumably serializes flush - confirm) */
67614 +       struct semaphore flush_sema;
67615 +
67616 +       /* block number of the diskmap */
67617 +       __u64 diskmap_block;
67618 +
67619 +       /* What to do in case of error */
67620 +       int onerror;
67621 +
67622 +       /* operations for objects on this file system */
67623 +       object_ops ops;
67624 +
67625 +       /*
67626 +        * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67627 +        * more details
67628 +        */
67629 +       d_cursor_info d_info;
67630 +
67631 +#ifdef CONFIG_REISER4_BADBLOCKS
67632 +       /* Alternative master superblock offset (in bytes) */
67633 +       unsigned long altsuper;
67634 +#endif
67635 +       struct repacker *repacker;
67636 +       struct page *status_page;
67637 +       struct bio *status_bio;
67638 +
67639 +#if REISER4_DEBUG
67640 +       /*
67641 +        * minimum used blocks value (includes super blocks, bitmap blocks and
67642 +        * other fs reserved areas), depends on fs format and fs size.
67643 +        */
67644 +       __u64 min_blocks_used;
67645 +
67646 +       /*
67647 +        * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67648 +        * are kept on a list anchored at sbinfo->all_jnodes. This list is
67649 +        * protected by sbinfo->all_guard spin lock. This lock should be taken
67650 +        * with _irq modifier, because it is also modified from interrupt
67651 +        * contexts (by RCU).
67652 +        */
67653 +       spinlock_t all_guard;
67654 +       /* list of all jnodes */
67655 +       struct list_head all_jnodes;
67656 +#endif
67657 +       struct dentry *debugfs_root; /* passed to debugfs_remove() in reiser4_put_super() */
67658 +};
67659 +
67660 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
67661 +                                                         super_block *super);
67662 +
67663 +
67664 +/* Return reiser4-specific part of super block (the ->s_fs_info pointer) */
67665 +static inline reiser4_super_info_data *get_super_private(const struct
67666 +                                                        super_block *super)
67667 +{
67668 +       assert("nikita-447", super != NULL);
67669 +
67670 +       return (reiser4_super_info_data *) super->s_fs_info;
67671 +}
67672 +
67673 +/* get ent context embedded in the reiser4-specific part of @super */
67674 +static inline entd_context *get_entd_context(struct super_block *super)
67675 +{
67676 +       return &get_super_private(super)->entd;
67677 +}
67678 +
67679 +
67680 +/* "Current" super-block: main super block used during current system
67681 +   call. It is read from ->super of the current reiser4_context. */
67682 +static inline struct super_block *reiser4_get_current_sb(void)
67683 +{
67684 +       return get_current_context()->super;
67685 +}
67686 +
67687 +/* Reiser4-specific part of "current" super-block, i.e. get_super_private()
67688 +   applied to reiser4_get_current_sb(): the main super block used during
67689 +   current system call. */
67690 +static inline reiser4_super_info_data *get_current_super_private(void)
67691 +{
67692 +       return get_super_private(reiser4_get_current_sb());
67693 +}
67694 +
67695 +static inline ra_params_t *get_current_super_ra_params(void)
67696 +{ /* address of ->ra_params in the private data of the "current" super block */
67697 +       return &(get_current_super_private()->ra_params);
67698 +}
67699 +
67700 +/*
67701 + * true (the MS_RDONLY bit of ->s_flags, not normalized to 1), if file system on @super is read-only
67702 + */
67703 +static inline int rofs_super(struct super_block *super)
67704 +{
67705 +       return super->s_flags & MS_RDONLY;
67706 +}
67707 +
67708 +/*
67709 + * true, if @tree represents read-only file system (rofs_super() of @tree->super)
67710 + */
67711 +static inline int rofs_tree(reiser4_tree * tree)
67712 +{
67713 +       return rofs_super(tree->super);
67714 +}
67715 +
67716 +/*
67717 + * true, if file system where @inode lives on, is read-only (rofs_super() of @inode->i_sb)
67718 + */
67719 +static inline int rofs_inode(struct inode *inode)
67720 +{
67721 +       return rofs_super(inode->i_sb);
67722 +}
67723 +
67724 +/*
67725 + * true, if file system where @node lives on, is read-only (rofs_tree() of jnode_get_tree(@node))
67726 + */
67727 +static inline int rofs_jnode(jnode * node)
67728 +{
67729 +       return rofs_tree(jnode_get_tree(node));
67730 +}
67731 +
67732 +extern __u64 reiser4_current_block_count(void);
67733 +
67734 +extern void build_object_ops(struct super_block *super, object_ops * ops);
67735 +
67736 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67737 +
67738 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67739 +{ /* takes the ->guard spinlock of @sbinfo */
67740 +       spin_lock(&(sbinfo->guard));
67741 +}
67742 +
67743 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67744 +{ /* releases the ->guard spinlock of @sbinfo after assert_spin_locked() on it */
67745 +       assert_spin_locked(&(sbinfo->guard));
67746 +       spin_unlock(&(sbinfo->guard));
67747 +}
67748 +
67749 +extern __u64 flush_reserved(const struct super_block *);
67750 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67751 +extern long statfs_type(const struct super_block *super);
67752 +extern __u64 reiser4_block_count(const struct super_block *super);
67753 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67754 +extern __u64 reiser4_data_blocks(const struct super_block *super);
67755 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67756 +extern __u64 reiser4_free_blocks(const struct super_block *super);
67757 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67758 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
67759 +
67760 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67761 +
67762 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67763 +extern __u64 reiser4_fake_allocated(const struct super_block *);
67764 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67765 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
67766 +
67767 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67768 +                                   gid_t gid);
67769 +
67770 +extern reiser4_space_allocator *get_space_allocator(const struct super_block
67771 +                                                   *super);
67772 +extern reiser4_oid_allocator *get_oid_allocator(const struct super_block
67773 +                                               *super);
67774 +extern struct inode *get_super_fake(const struct super_block *super);
67775 +extern struct inode *get_cc_fake(const struct super_block *super);
67776 +extern struct inode *get_bitmap_fake(const struct super_block *super);
67777 +extern reiser4_tree *get_tree(const struct super_block *super);
67778 +extern int is_reiser4_super(const struct super_block *super);
67779 +
67780 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67781 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67782 +                                      const reiser4_block_nr * blk);
67783 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67784 +extern int reiser4_done_super(struct super_block *s);
67785 +
67786 +/* step of fill super */
67787 +extern int init_fs_info(struct super_block *);
67788 +extern void done_fs_info(struct super_block *);
67789 +extern int init_super_data(struct super_block *, char *opt_string);
67790 +extern int init_read_super(struct super_block *, int silent);
67791 +extern int init_root_inode(struct super_block *);
67792 +
67793 +
67794 +/* Maximal possible object id. */
67795 +#define  ABSOLUTE_MAX_OID ((oid_t)~0)
67796 +
67797 +#define OIDS_RESERVED  ( 1 << 16 )
67798 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67799 +oid_t oid_allocate(struct super_block *);
67800 +int oid_release(struct super_block *, oid_t);
67801 +oid_t oid_next(const struct super_block *);
67802 +void oid_count_allocated(void);
67803 +void oid_count_released(void);
67804 +long oids_used(const struct super_block *);
67805 +
67806 +#if REISER4_DEBUG
67807 +void print_fs_info(const char *prefix, const struct super_block *);
67808 +#endif
67809 +
67810 +extern void destroy_reiser4_cache(kmem_cache_t **);
67811 +
67812 +extern struct super_operations reiser4_super_operations;
67813 +extern struct export_operations reiser4_export_operations;
67814 +extern struct dentry_operations reiser4_dentry_operations;
67815 +
67816 +/* __REISER4_SUPER_H__ */
67817 +#endif
67818 +
67819 +/*
67820 + * Local variables:
67821 + * c-indentation-style: "K&R"
67822 + * mode-name: "LC"
67823 + * c-basic-offset: 8
67824 + * tab-width: 8
67825 + * fill-column: 120
67826 + * End:
67827 + */
67828 diff --git a/fs/reiser4/super_ops.c b/fs/reiser4/super_ops.c
67829 new file mode 100644
67830 index 0000000..04fa72d
67831 --- /dev/null
67832 +++ b/fs/reiser4/super_ops.c
67833 @@ -0,0 +1,720 @@
67834 +/* Copyright 2005 by Hans Reiser, licensing governed by
67835 + * reiser4/README */
67836 +
67837 +#include "inode.h"
67838 +#include "page_cache.h"
67839 +#include "ktxnmgrd.h"
67840 +#include "flush.h"
67841 +#include "safe_link.h"
67842 +
67843 +#include <linux/vfs.h>
67844 +#include <linux/writeback.h>
67845 +#include <linux/mount.h>
67846 +#include <linux/seq_file.h>
67847 +#include <linux/debugfs.h>
67848 +
67849 +/* slab cache for inodes (objects are reiser4_inode_object, see init_inodes()) */
67850 +static kmem_cache_t *inode_cache;
67851 +
67852 +static struct dentry *reiser4_debugfs_root = NULL;
67853 +
67854 +/**
67855 + * init_once - constructor for reiser4 inodes
67856 + * @obj: object (reiser4_inode_object) to be initialized
67857 + * @cache: cache @obj belongs to
67858 + * @flags: SLAB flags; work is done only if CONSTRUCTOR is set and VERIFY is clear
67859 + *
67860 + * Initialization function to be called when new page is allocated by reiser4
67861 + * inode cache. It is set on inode cache creation (see init_inodes()).
67862 + */
67863 +static void init_once(void *obj, kmem_cache_t *cache, unsigned long flags)
67864 +{
67865 +       reiser4_inode_object *info;
67866 +
67867 +       info = obj;
67868 +
67869 +       if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
67870 +           SLAB_CTOR_CONSTRUCTOR) {
67871 +               /* initialize vfs inode */
67872 +               inode_init_once(&info->vfs_inode);
67873 +
67874 +               /*
67875 +                * initialize reiser4 specific part of inode.
67876 +                * NOTE-NIKITA add here initializations for locks, list heads,
67877 +                * etc. that will be added to our private inode part.
67878 +                */
67879 +               INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
67880 +               /* init semaphore which is used during inode loading */
67881 +               loading_init_once(&info->p);
67882 +               INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
67883 +                               GFP_ATOMIC);
67884 +#if REISER4_DEBUG
67885 +               info->p.nr_jnodes = 0;
67886 +#endif
67887 +       }
67888 +}
67889 +
67890 +/**
67891 + * init_inodes - create inode cache
67892 + *
67893 + * Initializes slab cache of inodes (part of reiser4 module initialization). Returns 0 or RETERR(-ENOMEM).
67894 + */
67895 +static int init_inodes(void)
67896 +{
67897 +       inode_cache = kmem_cache_create("reiser4_inode",
67898 +                                       sizeof(reiser4_inode_object),
67899 +                                       0,
67900 +                                       SLAB_HWCACHE_ALIGN |
67901 +                                       SLAB_RECLAIM_ACCOUNT, init_once, NULL);
67902 +       if (inode_cache == NULL)
67903 +               return RETERR(-ENOMEM);
67904 +       return 0;
67905 +}
67906 +
67907 +/**
67908 + * done_inodes - delete inode cache (through destroy_reiser4_cache())
67909 + *
67910 + * This is called on reiser4 module unloading or system shutdown.
67911 + */
67912 +static void done_inodes(void)
67913 +{
67914 +       destroy_reiser4_cache(&inode_cache);
67915 +}
67916 +
67917 +/**
67918 + * reiser4_alloc_inode - alloc_inode of super operations
67919 + * @super: super block new inode is allocated for
67920 + *
67921 + * Allocates new inode from inode_cache, initializes reiser4 specific part of it. Returns NULL on failure.
67922 + */
67923 +static struct inode *reiser4_alloc_inode(struct super_block *super)
67924 +{
67925 +       reiser4_inode_object *obj;
67926 +
67927 +       assert("nikita-1696", super != NULL);
67928 +       obj = kmem_cache_alloc(inode_cache, SLAB_KERNEL);
67929 +       if (obj != NULL) {
67930 +               reiser4_inode *info;
67931 +
67932 +               info = &obj->p;
67933 +
67934 +               info->hset = info->pset = plugin_set_get_empty();
67935 +               info->extmask = 0;
67936 +               info->locality_id = 0ull;
67937 +               info->plugin_mask = 0;
67938 +#if !REISER4_INO_IS_OID
67939 +               info->oid_hi = 0;
67940 +#endif
67941 +               seal_init(&info->sd_seal, NULL, NULL);
67942 +               coord_init_invalid(&info->sd_coord, NULL);
67943 +               info->flags = 0;
67944 +               spin_lock_init(&info->guard);
67945 +               /* this deals with info's loading semaphore */
67946 +               loading_alloc(info);
67947 +               info->vroot = UBER_TREE_ADDR;
67948 +               return &obj->vfs_inode; /* VFS inode embedded in the new object */
67949 +       } else
67950 +               return NULL;
67951 +}
67952 +
67953 +/**
67954 + * reiser4_destroy_inode - destroy_inode of super operations
67955 + * @inode: inode being destroyed
67956 + *
67957 + * Puts reiser4 specific portion of inode, frees memory occupied by inode (back to inode_cache).
67958 + */
67959 +static void reiser4_destroy_inode(struct inode *inode)
67960 +{
67961 +       reiser4_inode *info;
67962 +
67963 +       info = reiser4_inode_data(inode);
67964 +
67965 +       assert("vs-1220", inode_has_no_jnodes(info));
67966 +
67967 +       if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
67968 +               file_plugin *fplug = inode_file_plugin(inode);
67969 +               if (fplug->destroy_inode != NULL)
67970 +                       fplug->destroy_inode(inode);
67971 +       }
67972 +       dispose_cursors(inode);
67973 +       if (info->pset) /* NOTE(review): ->hset, set together with ->pset on alloc, is not put - confirm */
67974 +               plugin_set_put(info->pset);
67975 +
67976 +       /*
67977 +        * cannot add similar assertion about ->i_list as prune_icache returns
67978 +        * inode into slab with dangling ->list.{next,prev}. This is safe,
67979 +        * because they are re-initialized in the new_inode().
67980 +        */
67981 +       assert("nikita-2895", list_empty(&inode->i_dentry));
67982 +       assert("nikita-2896", hlist_unhashed(&inode->i_hash));
67983 +       assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
67984 +
67985 +       /* this deals with info's loading semaphore */
67986 +       loading_destroy(info);
67987 +
67988 +       kmem_cache_free(inode_cache,
67989 +                       container_of(info, reiser4_inode_object, p));
67990 +}
67991 +
67992 +/**
67993 + * reiser4_dirty_inode - dirty_inode of super operations
67994 + * @inode: inode being dirtied
67995 + *
67996 + * Updates stat data. Does nothing outside of a reiser4 context; a failed update is only logged.
67997 + */
67998 +static void reiser4_dirty_inode(struct inode *inode)
67999 +{
68000 +       int result;
68001 +
68002 +       if (!is_in_reiser4_context())
68003 +               return;
68004 +       assert("", !IS_RDONLY(inode)); /* NOTE(review): assertion labels here are empty */
68005 +       assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
68006 +                   get_current_context()->grabbed_blocks));
68007 +
68008 +       result = reiser4_update_sd(inode);
68009 +       if (result)
68010 +               warning("", "failed to dirty inode for %llu: %d",
68011 +                       get_inode_oid(inode), result);
68012 +}
68013 +
68014 +/**
68015 + * reiser4_delete_inode - delete_inode of super operations
68016 + * @inode: inode to delete
68017 + *
68018 + * Calls file plugin's delete_object method (loaded inodes only) to delete object
68019 + * items from filesystem tree, truncates inode pages and calls clear_inode.
68020 + */
68021 +static void reiser4_delete_inode(struct inode *inode)
68022 +{
68023 +       reiser4_context *ctx;
68024 +       file_plugin *fplug;
68025 +
68026 +       ctx = init_context(inode->i_sb);
68027 +       if (IS_ERR(ctx)) {
68028 +               warning("vs-15", "failed to init context");
68029 +               return; /* NOTE: clear_inode() is not called on this path */
68030 +       }
68031 +
68032 +       if (is_inode_loaded(inode)) {
68033 +               fplug = inode_file_plugin(inode);
68034 +               if (fplug != NULL && fplug->delete_object != NULL)
68035 +                       fplug->delete_object(inode);
68036 +       }
68037 +
68038 +       truncate_inode_pages(&inode->i_data, 0);
68039 +       inode->i_blocks = 0;
68040 +       clear_inode(inode);
68041 +       reiser4_exit_context(ctx);
68042 +}
68043 +
68044 +/**
68045 + * reiser4_put_super - put_super of super operations
68046 + * @super: super block to free
68047 + *
68048 + * Removes debugfs entries, stops daemons, releases resources: umounts in short.
68049 + */
68050 +static void reiser4_put_super(struct super_block *super)
68051 +{
68052 +       reiser4_super_info_data *sbinfo;
68053 +       reiser4_context *ctx;
68054 +
68055 +       sbinfo = get_super_private(super);
68056 +       assert("vs-1699", sbinfo);
68057 +
68058 +       debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
68059 +       debugfs_remove(sbinfo->tmgr.debugfs_id_count);
68060 +       debugfs_remove(sbinfo->debugfs_root);
68061 +
68062 +       ctx = init_context(super);
68063 +       if (IS_ERR(ctx)) {
68064 +               warning("vs-17", "failed to init context");
68065 +               return; /* NOTE: none of the release steps below run on this path */
68066 +       }
68067 +
68068 +       /* have disk format plugin to free its resources */
68069 +       if (get_super_private(super)->df_plug->release)
68070 +               get_super_private(super)->df_plug->release(super);
68071 +
68072 +       done_formatted_fake(super);
68073 +
68074 +       /* stop daemons: ktxnmgr and entd */
68075 +       done_entd(super);
68076 +       done_ktxnmgrd(super);
68077 +       done_txnmgr(&sbinfo->tmgr);
68078 +
68079 +       done_fs_info(super);
68080 +       reiser4_exit_context(ctx);
68081 +}
68082 +
68083 +/**
68084 + * reiser4_write_super - write_super of super operations
68085 + * @super: super block to write
68086 + *
68087 + * Captures znode associated with super block, commits all transactions. Failures are only logged.
68088 + */
68089 +static void reiser4_write_super(struct super_block *super)
68090 +{
68091 +       int ret;
68092 +       reiser4_context *ctx;
68093 +
68094 +       assert("vs-1700", !rofs_super(super));
68095 +
68096 +       ctx = init_context(super);
68097 +       if (IS_ERR(ctx)) {
68098 +               warning("vs-16", "failed to init context");
68099 +               return;
68100 +       }
68101 +
68102 +       ret = capture_super_block(super);
68103 +       if (ret != 0)
68104 +               warning("vs-1701",
68105 +                       "capture_super_block failed in write_super: %d", ret);
68106 +       ret = txnmgr_force_commit_all(super, 0);
68107 +       if (ret != 0)
68108 +               warning("jmacd-77113",
68109 +                       "txn_force failed in write_super: %d", ret);
68110 +
68111 +       super->s_dirt = 0; /* cleared even if the calls above failed */
68112 +
68113 +       reiser4_exit_context(ctx);
68114 +}
68115 +
68116 +/**
68117 + * reiser4_statfs - statfs of super operations
68118 + * @dentry: dentry whose super block (->d_sb) is the file system queried
68119 + * @statfs: buffer to fill with statistics
68120 + *
68121 + * Returns information about filesystem in @statfs; result is 0 or PTR_ERR() of a failed init_context().
68122 + */
68123 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
68124 +{
68125 +       sector_t total; /* NOTE(review): __u64 block counters are stored in sector_t here - confirm width */
68126 +       sector_t reserved;
68127 +       sector_t free;
68128 +       sector_t forroot;
68129 +       sector_t deleted;
68130 +       reiser4_context *ctx;
68131 +       struct super_block *super = dentry->d_sb;
68132 +
68133 +       assert("nikita-408", super != NULL);
68134 +       assert("nikita-409", statfs != NULL);
68135 +
68136 +       ctx = init_context(super);
68137 +       if (IS_ERR(ctx))
68138 +               return PTR_ERR(ctx);
68139 +
68140 +       statfs->f_type = statfs_type(super);
68141 +       statfs->f_bsize = super->s_blocksize;
68142 +
68143 +       /*
68144 +        * 5% of total block space is reserved. This is needed for flush and
68145 +        * for truncates (so that we are able to perform truncate/unlink even
68146 +        * on the otherwise completely full file system). If this reservation
68147 +        * is hidden from statfs(2), users will mistakenly guess that they
68148 +        * have enough free space to complete some operation, which is
68149 +        * frustrating.
68150 +        *
68151 +        * Another possible solution is to subtract ->blocks_reserved from
68152 +        * ->f_bfree, but changing available space seems less intrusive than
68153 +        * letting the user see 5% of disk space as used directly after
68154 +        * mkfs.
68155 +        */
68156 +       total = reiser4_block_count(super);
68157 +       reserved = get_super_private(super)->blocks_reserved;
68158 +       deleted = txnmgr_count_deleted_blocks();
68159 +       free = reiser4_free_blocks(super) + deleted;
68160 +       forroot = reiser4_reserved_blocks(super, 0, 0);
68161 +
68162 +       /*
68163 +        * These counters may be in inconsistent state because we take the
68164 +        * values without keeping any global spinlock.  Here we do a sanity
68165 +        * check that free block counter does not exceed the number of all
68166 +        * blocks.
68167 +        */
68168 +       if (free > total)
68169 +               free = total;
68170 +       statfs->f_blocks = total - reserved;
68171 +       /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
68172 +       if (free > reserved)
68173 +               free -= reserved;
68174 +       else
68175 +               free = 0;
68176 +       statfs->f_bfree = free;
68177 +
68178 +       if (free > forroot)
68179 +               free -= forroot;
68180 +       else
68181 +               free = 0;
68182 +       statfs->f_bavail = free;
68183 +
68184 +       statfs->f_files = 0; /* file counts are reported as 0 */
68185 +       statfs->f_ffree = 0;
68186 +
68187 +       /* maximal acceptable name length depends on directory plugin. */
68188 +       assert("nikita-3351", super->s_root->d_inode != NULL);
68189 +       statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
68190 +       reiser4_exit_context(ctx);
68191 +       return 0;
68192 +}
68193 +
68194 +/**
68195 + * reiser4_clear_inode - clear_inode of super operation
68196 + * @inode: inode about to be destroyed
68197 + *
68198 + * With REISER4_DEBUG, warns if @inode still has jnodes; otherwise a no-op.
68199 + */
68200 +static void reiser4_clear_inode(struct inode *inode)
68201 +{
68202 +#if REISER4_DEBUG
68203 +       reiser4_inode *r4_inode;
68204 +
68205 +       r4_inode = reiser4_inode_data(inode);
68206 +       if (!inode_has_no_jnodes(r4_inode))
68207 +               warning("vs-1732", "reiser4 inode has %ld jnodes\n",
68208 +                       r4_inode->nr_jnodes);
68209 +#endif
68210 +}
68211 +
68212 +/**
68213 + * reiser4_sync_inodes - sync_inodes of super operations
68214 + * @super: super block passed to generic_sync_sb_inodes() and writeout()
68215 + * @wbc: writeback control; ->nr_to_write is restored before writeout()
68216 + *
68217 + * This method is called by background and non-background writeback. Reiser4's
68218 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
68219 + * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
68220 + * mapping - dirty pages get into atoms. Writeout is called to flush some
68221 + * atoms.
68222 + */
68223 +static void reiser4_sync_inodes(struct super_block *super,
68224 +                               struct writeback_control *wbc)
68225 +{
68226 +       reiser4_context *ctx;
68227 +       long to_write;
68228 +
68229 +       if (wbc->for_kupdate)
68230 +               /* reiser4 has its own means of periodical write-out */
68231 +               return;
68232 +
68233 +       to_write = wbc->nr_to_write;
68234 +       assert("vs-49", wbc->older_than_this == NULL);
68235 +
68236 +       ctx = init_context(super);
68237 +       if (IS_ERR(ctx)) {
68238 +               warning("vs-13", "failed to init context");
68239 +               return;
68240 +       }
68241 +
68242 +       /*
68243 +        * call reiser4_writepages for each of dirty inodes to turn dirty pages
68244 +        * into transactions if they were not yet.
68245 +        */
68246 +       generic_sync_sb_inodes(super, wbc);
68247 +
68248 +       /* flush goes here; it gets the budget saved on entry in to_write */
68249 +       wbc->nr_to_write = to_write;
68250 +       writeout(super, wbc);
68251 +
68252 +       /* avoid recursive calls to ->sync_inodes */
68253 +       context_set_commit_async(ctx);
68254 +       reiser4_exit_context(ctx);
68255 +}
68256 +
68257 +/**
68258 + * reiser4_show_options - show_options of super operations
68259 + * @m: seq_file to print the options into
68260 + * @mnt: mount structure; options of its ->mnt_sb super block are printed
68261 + *
68262 + * Makes reiser4 mount options visible in /proc/mounts. Always returns 0.
68263 + */
68264 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
68265 +{
68266 +       struct super_block *super;
68267 +       reiser4_super_info_data *sbinfo;
68268 +
68269 +       super = mnt->mnt_sb;
68270 +       sbinfo = get_super_private(super);
68271 +
68272 +       seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
68273 +       seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
68274 +       seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
68275 +       seq_printf(m, ",atom_max_flushers=0x%x",
68276 +                  sbinfo->tmgr.atom_max_flushers);
68277 +       seq_printf(m, ",cbk_cache_slots=0x%x",
68278 +                  sbinfo->tree.cbk_cache.nr_slots);
68279 +
68280 +       return 0;
68281 +}
68282 +
68283 +struct super_operations reiser4_super_operations = { /* reiser4 super block methods */
68284 +       .alloc_inode = reiser4_alloc_inode,
68285 +       .destroy_inode = reiser4_destroy_inode,
68286 +       .dirty_inode = reiser4_dirty_inode,
68287 +       .delete_inode = reiser4_delete_inode,
68288 +       .put_super = reiser4_put_super,
68289 +       .write_super = reiser4_write_super,
68290 +       .statfs = reiser4_statfs,
68291 +       .clear_inode = reiser4_clear_inode,
68292 +       .sync_inodes = reiser4_sync_inodes,
68293 +       .show_options = reiser4_show_options
68294 +};
68295 +
68296 +/**
68297 + * fill_super - initialize super block on mount
68298 + * @super: super block to fill
68299 + * @data: reiser4 specific mount options
68300 + * @silent: passed on to init_read_super()
68301 + *
68302 + * Passed to get_sb_bdev() by reiser4_get_sb. Returns 0 or first init error.
68303 + */
68304 +static int fill_super(struct super_block *super, void *data, int silent)
68305 +{
68306 +       reiser4_context ctx;
68307 +       int result;
68308 +       reiser4_super_info_data *sbinfo;
68309 +
68310 +       assert("zam-989", super != NULL);
68311 +
68312 +       super->s_op = NULL;
68313 +       init_stack_context(&ctx, super);
68314 +
68315 +       /* allocate reiser4 specific super block */
68316 +       if ((result = init_fs_info(super)) != 0)
68317 +               goto failed_init_sinfo;
68318 +
68319 +       sbinfo = get_super_private(super);
68320 +       /* initialize various reiser4 parameters, parse mount options */
68321 +       if ((result = init_super_data(super, data)) != 0)
68322 +               goto failed_init_super_data;
68323 +
68324 +       /* read reiser4 master super block, initialize disk format plugin */
68325 +       if ((result = init_read_super(super, silent)) != 0)
68326 +               goto failed_init_read_super;
68327 +
68328 +       /* initialize transaction manager */
68329 +       init_txnmgr(&sbinfo->tmgr);
68330 +
68331 +       /* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
68332 +       if ((result = init_ktxnmgrd(super)) != 0)
68333 +               goto failed_init_ktxnmgrd;
68334 +
68335 +       /* initialize entd context and start kernel thread entd */
68336 +       if ((result = init_entd(super)) != 0)
68337 +               goto failed_init_entd;
68338 +
68339 +       /* initialize address spaces for formatted nodes and bitmaps */
68340 +       if ((result = init_formatted_fake(super)) != 0)
68341 +               goto failed_init_formatted_fake;
68342 +
68343 +       /* call ->init_format() of the disk format plugin */
68344 +       if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
68345 +               goto failed_init_disk_format;
68346 +
68347 +       /*
68348 +        * There are some 'committed' versions of reiser4 super block counters,
68349 +        * which correspond to reiser4 on-disk state. These counters are
68350 +        * initialized here
68351 +        */
68352 +       sbinfo->blocks_free_committed = sbinfo->blocks_free;
68353 +       sbinfo->nr_files_committed = oids_used(super);
68354 +
68355 +       /* get inode of root directory */
68356 +       if ((result = init_root_inode(super)) != 0)
68357 +               goto failed_init_root_inode;
68358 +
68359 +       process_safelinks(super);
68360 +       reiser4_exit_context(&ctx);
68361 +
68362 +       sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68363 +                                                 reiser4_debugfs_root);
68364 +       if (sbinfo->debugfs_root) {
68365 +               sbinfo->tmgr.debugfs_atom_count =
68366 +                       debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68367 +                                          sbinfo->debugfs_root,
68368 +                                          &sbinfo->tmgr.atom_count);
68369 +               sbinfo->tmgr.debugfs_id_count =
68370 +                       debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68371 +                                          sbinfo->debugfs_root,
68372 +                                          &sbinfo->tmgr.id_count);
68373 +       }
68374 +       return 0;
68375 +
68376 + failed_init_root_inode:
68377 +       if (sbinfo->df_plug->release)
68378 +               sbinfo->df_plug->release(super);
68379 + failed_init_disk_format:
68380 +       done_formatted_fake(super);
68381 + failed_init_formatted_fake:
68382 +       done_entd(super);
68383 + failed_init_entd:
68384 +       done_ktxnmgrd(super);
68385 + failed_init_ktxnmgrd:
68386 +       done_txnmgr(&sbinfo->tmgr);
68387 + failed_init_read_super: /* shares its cleanup with the label below */
68388 + failed_init_super_data:
68389 +       done_fs_info(super);
68390 + failed_init_sinfo:
68391 +       reiser4_exit_context(&ctx);
68392 +       return result;
68393 +}
68394 +
68395 +/**
68396 + * reiser4_get_sb - get_sb of file_system_type operations
68397 + * @fs_type: file system type, passed on to get_sb_bdev()
68398 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68399 + * @dev_name: block device file name
68400 + * @data: specific mount options
68401 + * @mnt: mount structure, passed on to get_sb_bdev()
68402 + * Reiser4 mount entry: returns get_sb_bdev() called with fill_super.
68403 + */
68404 +static int reiser4_get_sb(struct file_system_type *fs_type,
68405 +                         int flags,
68406 +                         const char *dev_name,
68407 +                         void *data,
68408 +                         struct vfsmount *mnt)
68409 +{
68410 +       return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
68411 +}
68412 +
68413 +/* reiser4 file system type; (un)registered by init_reiser4()/done_reiser4() */
68414 +static struct file_system_type reiser4_fs_type = {
68415 +       .owner = THIS_MODULE,
68416 +       .name = "reiser4",
68417 +       .fs_flags = FS_REQUIRES_DEV,
68418 +       .get_sb = reiser4_get_sb,
68419 +       .kill_sb = kill_block_super,
68420 +       .next = NULL
68421 +};
68422 +
68423 +void destroy_reiser4_cache(kmem_cache_t **cachep) /* destroy the slab cache at @cachep */
68424 +{
68425 +       kmem_cache_destroy(*cachep);
68426 +       *cachep = NULL; /* the caller's pointer is cleared once the cache is gone */
68427 +}
68428 +
68429 +/**
68430 + * init_reiser4 - reiser4 initialization entry point
68431 + *
68432 + * Module init: sets up slabs and plugins, registers reiser4 filesystem type.
68433 + * Returns 0, or the first error after undoing earlier steps in reverse order.
68434 + */
68435 +static int __init init_reiser4(void)
68436 +{
68437 +       int result;
68438 +
68439 +       printk(KERN_INFO
68440 +              "Loading Reiser4. "
68441 +              "See www.namesys.com for a description of Reiser4.\n");
68442 +
68443 +       /* initialize slab cache of inodes */
68444 +       if ((result = init_inodes()) != 0)
68445 +               goto failed_inode_cache;
68446 +
68447 +       /* initialize cache of znodes */
68448 +       if ((result = init_znodes()) != 0)
68449 +               goto failed_init_znodes;
68450 +
68451 +       /* initialize all plugins */
68452 +       if ((result = init_plugins()) != 0)
68453 +               goto failed_init_plugins;
68454 +
68455 +       /* initialize cache of plugin_set-s and plugin_set's hash table */
68456 +       if ((result = init_plugin_set()) != 0)
68457 +               goto failed_init_plugin_set;
68458 +
68459 +       /* initialize caches of txn_atom-s and txn_handle-s */
68460 +       if ((result = init_txnmgr_static()) != 0)
68461 +               goto failed_init_txnmgr_static;
68462 +
68463 +       /* initialize cache of jnodes */
68464 +       if ((result = init_jnodes()) != 0)
68465 +               goto failed_init_jnodes;
68466 +
68467 +       /* initialize cache of flush queues */
68468 +       if ((result = init_fqs()) != 0)
68469 +               goto failed_init_fqs;
68470 +
68471 +       /* initialize cache of structures attached to dentry->d_fsdata */
68472 +       if ((result = init_dentry_fsdata()) != 0)
68473 +               goto failed_init_dentry_fsdata;
68474 +
68475 +       /* initialize cache of structures attached to file->private_data */
68476 +       if ((result = init_file_fsdata()) != 0)
68477 +               goto failed_init_file_fsdata;
68478 +
68479 +       /*
68480 +        * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68481 +        * more details
68482 +        */
68483 +       if ((result = init_d_cursor()) != 0)
68484 +               goto failed_init_d_cursor;
68485 +
68486 +       if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68487 +               reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68488 +               return 0;
68489 +       }
68490 +
68491 +       done_d_cursor();
68492 + failed_init_d_cursor:
68493 +       done_file_fsdata();
68494 + failed_init_file_fsdata:
68495 +       done_dentry_fsdata();
68496 + failed_init_dentry_fsdata:
68497 +       done_fqs();
68498 + failed_init_fqs:
68499 +       done_jnodes();
68500 + failed_init_jnodes:
68501 +       done_txnmgr_static();
68502 + failed_init_txnmgr_static:
68503 +       done_plugin_set();
68504 + failed_init_plugin_set: /* no undo call for init_plugins() on this path */
68505 + failed_init_plugins:
68506 +       done_znodes();
68507 + failed_init_znodes:
68508 +       done_inodes();
68509 + failed_inode_cache:
68510 +       return result;
68511 +}
68512 +
68513 +/**
68514 + * done_reiser4 - reiser4 exit entry point
68515 + *
68516 + * Module exit: removes the debugfs root, unregisters reiser4 filesystem type
68517 + * (BUG on failure), then tears caches down in reverse order of init_reiser4().
68518 + */
68519 +static void __exit done_reiser4(void)
68520 +{
68521 +       int result;
68522 +
68523 +       debugfs_remove(reiser4_debugfs_root);
68524 +       result = unregister_filesystem(&reiser4_fs_type);
68525 +       BUG_ON(result != 0);
68526 +       done_d_cursor();
68527 +       done_file_fsdata();
68528 +       done_dentry_fsdata();
68529 +       done_fqs();
68530 +       done_jnodes();
68531 +       done_txnmgr_static();
68532 +       done_plugin_set();
68533 +       done_znodes();
68534 +       destroy_reiser4_cache(&inode_cache); /* NOTE(review): init_reiser4() unwinds via done_inodes() - confirm equivalent */
68535 +}
68536 +
68537 +module_init(init_reiser4);
68538 +module_exit(done_reiser4);
68539 +
68540 +MODULE_DESCRIPTION("Reiser4 filesystem");
68541 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68542 +
68543 +MODULE_LICENSE("GPL");
68544 +
68545 +/*
68546 + * Local variables:
68547 + * c-indentation-style: "K&R"
68548 + * mode-name: "LC"
68549 + * c-basic-offset: 8
68550 + * tab-width: 8
68551 + * fill-column: 79
68552 + * End:
68553 + */
68554 diff --git a/fs/reiser4/tap.c b/fs/reiser4/tap.c
68555 new file mode 100644
68556 index 0000000..0c423dd
68557 --- /dev/null
68558 +++ b/fs/reiser4/tap.c
68559 @@ -0,0 +1,377 @@
68560 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68561 + * reiser4/README */
68562 +
68563 +/*
68564 +   Tree Access Pointer (tap).
68565 +
68566 +   A tap is a data structure combining coord and lock handle (mostly). It is
68567 +   useful when one has to scan tree nodes (for example, in readdir, or flush),
68568 +   because tap functions allow moving a tap in either direction, transparently
68569 +   crossing unit/item/node borders.
68570 +
68571 +   Tap doesn't provide automatic synchronization of its fields as it is
68572 +   supposed to be per-thread object.
68573 +*/
68574 +
68575 +#include "forward.h"
68576 +#include "debug.h"
68577 +#include "coord.h"
68578 +#include "tree.h"
68579 +#include "context.h"
68580 +#include "tap.h"
68581 +#include "znode.h"
68582 +#include "tree_walk.h"
68583 +
68584 +#if REISER4_DEBUG
68585 +static int tap_invariant(const tap_t * tap);
68586 +static void tap_check(const tap_t * tap);
68587 +#else
68588 +#define tap_check(tap) noop /* tap_check() expands to noop without REISER4_DEBUG */
68589 +#endif
68590 +
68591 +/** load tap's node on first use and bump ->loaded; returns 0 or zload_ra() error */
68592 +int tap_load(tap_t * tap)
68593 +{
68594 +       tap_check(tap);
68595 +       if (tap->loaded == 0) {
68596 +               int result;
68597 +
68598 +               result = zload_ra(tap->coord->node, &tap->ra_info);
68599 +               if (result != 0)
68600 +                       return result;
68601 +               coord_clear_iplug(tap->coord);
68602 +       }
68603 +       ++tap->loaded;
68604 +       tap_check(tap);
68605 +       return 0;
68606 +}
68607 +
68608 +/** drop one tap_load() reference; zrelse() the node when the count reaches 0 */
68609 +void tap_relse(tap_t * tap)
68610 +{
68611 +       tap_check(tap);
68612 +       /*
68613 +        * an unloaded tap is left alone; otherwise only the release that
68614 +        * brings ->loaded down to zero actually lets go of the node
68615 +        */
68616 +       if (tap->loaded > 0 && --tap->loaded == 0)
68617 +               zrelse(tap->coord->node);
68618 +       tap_check(tap);
68619 +}
68620 +
68621 +/**
68622 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68623 + * @mode. Tap starts unloaded, with empty ->linkage and init_ra_info() hint.
68624 + */
68625 +void
68626 +tap_init(tap_t * tap, coord_t * coord, lock_handle * lh, znode_lock_mode mode)
68627 +{
68628 +       tap->coord = coord;
68629 +       tap->lh = lh;
68630 +       tap->mode = mode;
68631 +       tap->loaded = 0;
68632 +       INIT_LIST_HEAD(&tap->linkage);
68633 +       init_ra_info(&tap->ra_info);
68634 +}
68635 +
68636 +/** add @tap at the head of the current context's list of taps (taps_list()) */
68637 +void tap_monitor(tap_t * tap)
68638 +{
68639 +       assert("nikita-2623", tap != NULL);
68640 +       tap_check(tap);
68641 +       list_add(&tap->linkage, taps_list());
68642 +       tap_check(tap);
68643 +}
68644 +
68645 +/* duplicate @src into @dst. Lock handle is copied only if @src's handle has
68646 + * a node. @dst starts unloaded and with an empty ->linkage. */
68647 +void tap_copy(tap_t * dst, tap_t * src)
68648 +{
68649 +       assert("nikita-3193", src != NULL);
68650 +       assert("nikita-3194", dst != NULL);
68651 +
68652 +       *dst->coord = *src->coord;
68653 +       if (src->lh->node)
68654 +               copy_lh(dst->lh, src->lh);
68655 +       dst->mode = src->mode;
68656 +       dst->loaded = 0;
68657 +       INIT_LIST_HEAD(&dst->linkage);
68658 +       dst->ra_info = src->ra_info;
68659 +}
68660 +
68661 +/** finish with @tap: release node if loaded, drop lock, unlink from tap list */
68662 +void tap_done(tap_t * tap)
68663 +{
68664 +       assert("nikita-2565", tap != NULL);
68665 +       tap_check(tap);
68666 +       if (tap->loaded > 0)
68667 +               zrelse(tap->coord->node); /* tap_load() zload-s only on 0 -> 1 */
68668 +       done_lh(tap->lh);
68669 +       tap->loaded = 0;
68670 +       list_del_init(&tap->linkage);
68671 +       tap->coord->node = NULL;
68672 +}
68673 +
68674 +/**
68675 + * move @tap to the node locked by @target, loading it if @tap was loaded.
68676 + * Returns 0, or zload_ra() error in which case @tap is left where it was.
68677 + */
68678 +int tap_move(tap_t * tap, lock_handle * target)
68679 +{
68680 +       int result = 0;
68681 +
68682 +       assert("nikita-2567", tap != NULL);
68683 +       assert("nikita-2568", target != NULL);
68684 +       assert("nikita-2570", target->node != NULL);
68685 +       assert("nikita-2569", tap->coord->node == tap->lh->node);
68686 +
68687 +       tap_check(tap);
68688 +       if (tap->loaded > 0)
68689 +               result = zload_ra(target->node, &tap->ra_info);
68690 +
68691 +       if (result == 0) {
68692 +               if (tap->loaded > 0)
68693 +                       zrelse(tap->coord->node);
68694 +               done_lh(tap->lh);
68695 +               copy_lh(tap->lh, target);
68696 +               tap->coord->node = target->node;
68697 +               coord_clear_iplug(tap->coord);
68698 +       }
68699 +       tap_check(tap);
68700 +       return result;
68701 +}
68702 +
68703 +/**
68704 + * move @tap to @target. If @tap is not already there, lock @target in
68705 + * tap->mode and tap_move() to it. Returns 0 or the lock/move error.
68706 + */
68707 +static int tap_to(tap_t * tap, znode * target)
68708 +{
68709 +       int result;
68710 +
68711 +       assert("nikita-2624", tap != NULL);
68712 +       assert("nikita-2625", target != NULL);
68713 +
68714 +       tap_check(tap);
68715 +       result = 0;
68716 +       if (tap->coord->node != target) {
68717 +               lock_handle here;
68718 +
68719 +               init_lh(&here);
68720 +               result = longterm_lock_znode(&here, target,
68721 +                                            tap->mode, ZNODE_LOCK_HIPRI);
68722 +               if (result == 0) {
68723 +                       result = tap_move(tap, &here);
68724 +                       done_lh(&here);
68725 +               }
68726 +       }
68727 +       tap_check(tap);
68728 +       return result;
68729 +}
68730 +
68731 +/**
68732 + * move @tap to given @target, loading and locking @target->node if
68733 + * necessary. On success tap->coord is made a duplicate of @target.
68734 + */
68735 +int tap_to_coord(tap_t * tap, coord_t * target)
68736 +{
68737 +       int result;
68738 +
68739 +       tap_check(tap);
68740 +       result = tap_to(tap, target->node);
68741 +       if (result == 0)
68742 +               coord_dup(tap->coord, target);
68743 +       tap_check(tap);
68744 +       return result;
68745 +}
68746 +
68747 +/** return head of the list of taps kept in the current reiser4 context */
68748 +struct list_head *taps_list(void)
68749 +{
68750 +       return &get_current_context()->taps;
68751 +}
68752 +
68753 +/** helper function for go_{next,prev}_{item,unit,node}() */
68754 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
68755 +{
68756 +       coord_t dup;
68757 +       coord_t *coord;
68758 +       int result;
68759 +
68760 +       int (*coord_dir) (coord_t *);
68761 +       int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68762 +       void (*coord_init) (coord_t *, const znode *);
68763 +       ON_DEBUG(int (*coord_check) (const coord_t *));
68764 +
68765 +       assert("nikita-2556", tap != NULL);
68766 +       assert("nikita-2557", tap->coord != NULL);
68767 +       assert("nikita-2558", tap->lh != NULL);
68768 +       assert("nikita-2559", tap->coord->node != NULL);
68769 +
68770 +       tap_check(tap);
68771 +       if (dir == LEFT_SIDE) {
68772 +               coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68773 +               get_dir_neighbor = reiser4_get_left_neighbor;
68774 +               coord_init = coord_init_last_unit;
68775 +       } else {
68776 +               coord_dir = units_p ? coord_next_unit : coord_next_item;
68777 +               get_dir_neighbor = reiser4_get_right_neighbor;
68778 +               coord_init = coord_init_first_unit;
68779 +       }
68780 +       ON_DEBUG(coord_check =
68781 +                units_p ? coord_is_existing_unit : coord_is_existing_item);
68782 +       assert("nikita-2560", coord_check(tap->coord));
68783 +
68784 +       coord = tap->coord;
68785 +       coord_dup(&dup, coord);
68786 +       if (coord_dir(&dup) != 0) {
68787 +               do {
68788 +                       /* move to the left neighboring node */
68789 +                       lock_handle dup;
68790 +
68791 +                       init_lh(&dup);
68792 +                       result =
68793 +                           get_dir_neighbor(&dup, coord->node, (int)tap->mode,
68794 +                                            GN_CAN_USE_UPPER_LEVELS);
68795 +                       if (result == 0) {
68796 +                               result = tap_move(tap, &dup);
68797 +                               if (result == 0)
68798 +                                       coord_init(tap->coord, dup.node);
68799 +                               done_lh(&dup);
68800 +                       }
68801 +                       /* skip empty nodes */
68802 +               } while ((result == 0) && node_is_empty(coord->node));
68803 +       } else {
68804 +               result = 0;
68805 +               coord_dup(coord, &dup);
68806 +       }
68807 +       assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68808 +       tap_check(tap);
68809 +       return result;
68810 +}
68811 +
68812 +/**
68813 + * move @tap to the next unit, transparently crossing item and node
68814 + * boundaries. Returns the result of go_dir_el(@tap, RIGHT_SIDE, 1).
68815 + */
68816 +int go_next_unit(tap_t * tap)
68817 +{
68818 +       return go_dir_el(tap, RIGHT_SIDE, 1);
68819 +}
68820 +
68821 +/**
68822 + * move @tap to the previous unit, transparently crossing item and node
68823 + * boundaries. Returns the result of go_dir_el(@tap, LEFT_SIDE, 1).
68824 + */
68825 +int go_prev_unit(tap_t * tap)
68826 +{
68827 +       return go_dir_el(tap, LEFT_SIDE, 1);
68828 +}
68829 +
68830 +/**
68831 + * apply @actor to @tap @shift times, keeping @tap loaded meanwhile. Returns
68832 + * tap_load() error, or the first non-zero result of @actor, or 0.
68833 + */
68834 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
68835 +{
68836 +       int result;
68837 +
68838 +       assert("nikita-2555", shift >= 0);
68839 +       assert("nikita-2562", tap->coord->node == tap->lh->node);
68840 +
68841 +       tap_check(tap);
68842 +       result = tap_load(tap);
68843 +       if (result != 0)
68844 +               return result;
68845 +
68846 +       for (; shift > 0; --shift) {
68847 +               result = actor(tap);
68848 +               assert("nikita-2563", tap->coord->node == tap->lh->node);
68849 +               if (result != 0)
68850 +                       break;
68851 +       }
68852 +       tap_relse(tap);
68853 +       tap_check(tap);
68854 +       return result;
68855 +}
68856 +
68857 +/** move @tap @shift units rightward; returns rewind_to() result (0 or error) */
68858 +int rewind_right(tap_t * tap, int shift)
68859 +{
68860 +       return rewind_to(tap, go_next_unit, shift);
68861 +}
68862 +
68863 +/** move @tap @shift units leftward; returns rewind_to() result (0 or error) */
68864 +int rewind_left(tap_t * tap, int shift)
68865 +{
68866 +       return rewind_to(tap, go_prev_unit, shift);
68867 +}
68868 +
68869 +#if REISER4_DEBUG
68870 +/** debugging function: print @tap (or "null tap") with @prefix via printk */
68871 +static void print_tap(const char *prefix, const tap_t * tap)
68872 +{
68873 +       if (tap == NULL) {
68874 +               printk("%s: null tap\n", prefix);
68875 +               return;
68876 +       }
68877 +       /* list_empty(): ->linkage is self-linked, i.e. @tap is on no list */
68878 +       printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
68879 +              tap->loaded, !list_empty(&tap->linkage),
68880 +              tap->lh->node,
68881 +              lock_mode_name(tap->mode));
68882 +       print_coord("\tcoord", tap->coord, 0);
68883 +}
68884 +
68885 +/** check [tap-sane] invariant: returns 0 if it holds, else number of failed check */
68886 +static int tap_invariant(const tap_t * tap)
68887 +{
68888 +       /* [tap-sane] invariant; note that a NULL tap is reported as 1 */
68889 +
68890 +       if (tap == NULL)
68891 +               return 1;
68892 +       /* tap->mode is one of
68893 +        *
68894 +        * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
68895 +        */
68896 +       if (tap->mode != ZNODE_NO_LOCK &&
68897 +           tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
68898 +               return 2;
68899 +       /* tap->coord != NULL, and */
68900 +       if (tap->coord == NULL)
68901 +               return 3;
68902 +       /* tap->lh != NULL, and */
68903 +       if (tap->lh == NULL)
68904 +               return 4;
68905 +       /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
68906 +       if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
68907 +               return 5;
68908 +       /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
68909 +       if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
68910 +               return 6;
68911 +       return 0;
68912 +}
68913 +
68914 +/** debugging function: panic, after dumping @tap, if [tap-sane] is violated */
68915 +static void tap_check(const tap_t * tap)
68916 +{
68917 +       const int broken = tap_invariant(tap);
68918 +
68919 +       /* 0 means the invariant holds; anything else names the failed check */
68920 +       if (broken == 0)
68921 +               return;
68922 +       print_tap("broken", tap);
68923 +       reiser4_panic("nikita-2831", "tap broken: %i\n", broken);
68924 +}
68925 +#endif
68926 +
68927 +/* Make Linus happy.
68928 +   Local variables:
68929 +   c-indentation-style: "K&R"
68930 +   mode-name: "LC"
68931 +   c-basic-offset: 8
68932 +   tab-width: 8
68933 +   fill-column: 120
68934 +   scroll-step: 1
68935 +   End:
68936 +*/
68937 diff --git a/fs/reiser4/tap.h b/fs/reiser4/tap.h
68938 new file mode 100644
68939 index 0000000..38ce717
68940 --- /dev/null
68941 +++ b/fs/reiser4/tap.h
68942 @@ -0,0 +1,69 @@
68943 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
68944 +
68945 +/* Tree Access Pointers. See tap.c for more details. */
68946 +
68947 +#if !defined( __REISER4_TAP_H__ )
68948 +#define __REISER4_TAP_H__
68949 +
68950 +#include "forward.h"
68951 +#include "readahead.h"
68952 +
68953 +/**
68954 +    tree_access_pointer aka tap. Combines pointers to a coord_t and to a lock
68955 +    handle; neither of them is embedded in the tap itself.
68956 +    Invariants involving this data-type, see doc/lock-ordering for details:
68957 +
68958 +      [tap-sane]
68959 + */
68960 +struct tree_access_pointer {
68961 +       /* coord tap is at */
68962 +       coord_t *coord;
68963 +       /* lock handle on ->coord->node */
68964 +       lock_handle *lh;
68965 +       /* mode of lock acquired by this tap */
68966 +       znode_lock_mode mode;
68967 +       /* load count: incremented by tap_load(), decremented by tap_relse() */
68968 +       int loaded;
68969 +       /* linkage into the list of taps, see taps_list() */
68970 +       struct list_head linkage;
68971 +       /* read-ahead hint */
68972 +       ra_info_t ra_info;
68973 +};
68974 +
68975 +typedef int (*go_actor_t) (tap_t * tap); /* tap stepping callback; returns int status */
68976 +
68977 +extern int tap_load(tap_t * tap); /* dual to tap_relse() */
68978 +extern void tap_relse(tap_t * tap);
68979 +extern void tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68980 +                    znode_lock_mode mode);
68981 +extern void tap_monitor(tap_t * tap);
68982 +extern void tap_copy(tap_t * dst, tap_t * src);
68983 +extern void tap_done(tap_t * tap);
68984 +extern int tap_move(tap_t * tap, lock_handle * target);
68985 +extern int tap_to_coord(tap_t * tap, coord_t * target);
68986 +
68987 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p); /* units_p != 0: by unit, else by item */
68988 +extern int go_next_unit(tap_t * tap);
68989 +extern int go_prev_unit(tap_t * tap);
68990 +extern int rewind_right(tap_t * tap, int shift);
68991 +extern int rewind_left(tap_t * tap, int shift);
68992 +
68993 +extern struct list_head *taps_list(void);
68994 +
68995 +#define for_all_taps(tap)                                              \
68996 +       for (tap = list_entry(taps_list()->next, tap_t, linkage);       \
68997 +            taps_list() != &tap->linkage;                              \
68998 +            tap = list_entry(tap->linkage.next, tap_t, linkage)) /* ->linkage.next is read after each pass of the body */
68999 +
69000 +/* __REISER4_TAP_H__ */
69001 +#endif
69002 +/* Make Linus happy.
69003 +   Local variables:
69004 +   c-indentation-style: "K&R"
69005 +   mode-name: "LC"
69006 +   c-basic-offset: 8
69007 +   tab-width: 8
69008 +   fill-column: 120
69009 +   scroll-step: 1
69010 +   End:
69011 +*/
69012 diff --git a/fs/reiser4/tree.c b/fs/reiser4/tree.c
69013 new file mode 100644
69014 index 0000000..0a604d5
69015 --- /dev/null
69016 +++ b/fs/reiser4/tree.c
69017 @@ -0,0 +1,1875 @@
69018 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69019 + * reiser4/README */
69020 +
69021 +/*
69022 + * KEYS IN A TREE.
69023 + *
69024 + * The tree consists of nodes located on the disk. Node in the tree is either
69025 + * formatted or unformatted. Formatted node is one that has structure
69026 + * understood by the tree balancing and traversal code. Formatted nodes are
69027 + * further classified into leaf and internal nodes. The latter distinction is
69028 + * (almost) only of historical importance: general structure of leaves and
69029 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
69030 + * that are part of bodies of ordinary files and attributes.
69031 + *
69032 + * Each node in the tree spans some interval in the key space. Key ranges for
69033 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
69034 + * sense, because of the non-unique keys: intersection of key ranges for
69035 + * different nodes is either empty, or consists of exactly one key.
69036 + *
69037 + * Formatted node consists of a sequence of items. Each item spans some
69038 + * interval in key space. Key ranges for all items in a tree are disjoint,
69039 + * modulo non-unique keys again. Items within nodes are ordered in the key
69040 + * order of the smallest key in an item.
69041 + *
69042 + * Particular type of item can be further split into units. Unit is piece of
69043 + * item that can be cut from item and moved into another item of the same
69044 + * type. Units are used by balancing code to repack data during balancing.
69045 + *
69046 + * Unit can be further split into smaller entities (for example, extent unit
69047 + * represents several pages, and it is natural for extent code to operate on
69048 + * particular pages and even bytes within one unit), but this is of no
69049 + * relevance to the generic balancing and lookup code.
69050 + *
69051 + * Although item is said to "span" range or interval of keys, it is not
69052 + * necessary that item contains piece of data addressable by each and every
69053 + * key in this range. For example, compound directory item, consisting of
69054 + * units corresponding to directory entries and keyed by hashes of file names,
69055 + * looks more as having "discrete spectrum": only some disjoint keys inside
69056 + * range occupied by this item really address data.
69057 + *
69058 + * Nonetheless, each item always has well-defined least (minimal) key, that
69059 + * is recorded in item header, stored in the node this item is in. Also, item
69060 + * plugin can optionally define method ->max_key_inside() returning maximal
69061 + * key that can _possibly_ be located within this item. This method is used
69062 + * (mainly) to determine when given piece of data should be merged into
69063 + * existing item, instead of creating new one. Because of this, even though
69064 + * ->max_key_inside() can be larger than any key actually located in the item,
69065 + * intervals
69066 + *
69067 + * [ min_key( item ), ->max_key_inside( item ) ]
69068 + *
69069 + * are still disjoint for all items within the _same_ node.
69070 + *
69071 + * In memory node is represented by znode. It plays several roles:
69072 + *
69073 + *  . something locks are taken on
69074 + *
69075 + *  . something tracked by transaction manager (this is going to change)
69076 + *
69077 + *  . something used to access node data
69078 + *
69079 + *  . something used to maintain tree structure in memory: sibling and
69080 + *  parental linkage.
69081 + *
69082 + *  . something used to organize nodes into "slums"
69083 + *
69084 + * More on znodes see in znode.[ch]
69085 + *
69086 + * DELIMITING KEYS
69087 + *
69088 + *   To simplify balancing, allow some flexibility in locking and speed up
69089 + *   important coord cache optimization, we keep delimiting keys of nodes in
69090 + *   memory. Depending on disk format (implemented by appropriate node plugin)
69091 + *   node on disk can record both left and right delimiting key, only one of
69092 + *   them, or none. Still, our balancing and tree traversal code keep both
69093 + *   delimiting keys for a node that is in memory stored in the znode. When
69094 + *   node is first brought into memory during tree traversal, its left
69095 + *   delimiting key is taken from its parent, and its right delimiting key is
69096 + *   either next key in its parent, or is right delimiting key of parent if
69097 + *   node is the rightmost child of parent.
69098 + *
69099 + *   Physical consistency of delimiting key is protected by special dk
69100 + *   read-write lock. That is, delimiting keys can only be inspected or
69101 + *   modified under this lock. But dk lock is only sufficient for fast
69102 + *   "pessimistic" check, because to simplify code and to decrease lock
69103 + *   contention, balancing (carry) only updates delimiting keys right before
69104 + *   unlocking all locked nodes on the given tree level. For example,
69105 + *   coord-by-key cache scans LRU list of recently accessed znodes. For each
69106 + *   node it first does fast check under dk spin lock. If key looked for is
69107 + *   not between delimiting keys for this node, next node is inspected and so
69108 + *   on. If key is inside of the key range, long term lock is taken on node
69109 + *   and key range is rechecked.
69110 + *
69111 + * COORDINATES
69112 + *
69113 + *   To find something in the tree, you supply a key, and the key is resolved
69114 + *   by coord_by_key() into a coord (coordinate) that is valid as long as the
69115 + *   node the coord points to remains locked.  As mentioned above trees
69116 + *   consist of nodes that consist of items that consist of units. A unit is
69117 + *   the smallest and indivisible piece of tree as far as balancing and tree
69118 + *   search are concerned. Each node, item, and unit can be addressed by
69119 + *   giving its level in the tree and the key occupied by this entity.  A node
69120 + *   knows what the key ranges are of the items within it, and how to find its
69121 + *   items and invoke their item handlers, but it does not know how to access
69122 + *   individual units within its items except through the item handlers.
69123 + *   coord is a structure containing a pointer to the node, the ordinal number
69124 + *   of the item within this node (a sort of item offset), and the ordinal
69125 + *   number of the unit within this item.
69126 + *
69127 + * TREE LOOKUP
69128 + *
69129 + *   There are two types of access to the tree: lookup and modification.
69130 + *
69131 + *   Lookup is a search for the key in the tree. Search can look for either
69132 + *   exactly the key given to it, or for the largest key that is not greater
69133 + *   than the key given to it. This distinction is determined by "bias"
69134 + *   parameter of search routine (coord_by_key()). coord_by_key() either
69135 + *   returns error (key is not in the tree, or some kind of external error
69136 + *   occurred), or successfully resolves key into coord.
69137 + *
69138 + *   This resolution is done by traversing tree top-to-bottom from root level
69139 + *   to the desired level. On levels above twig level (level one above the
69140 + *   leaf level) nodes consist exclusively of internal items. Internal item is
69141 + *   nothing more than pointer to the tree node on the child level. On twig
69142 + *   level nodes consist of internal items intermixed with extent
69143 + *   items. Internal items form normal search tree structure used by traversal
69144 + * to descend through the tree.
69145 + *
69146 + * TREE LOOKUP OPTIMIZATIONS
69147 + *
69148 + * Tree lookup described above is expensive even if all nodes traversed are
69149 + * already in the memory: for each node binary search within it has to be
69150 + * performed and binary searches are CPU consuming and tend to destroy CPU
69151 + * caches.
69152 + *
69153 + * Several optimizations are used to work around this:
69154 + *
69155 + *   . cbk_cache (look-aside cache for tree traversals, see search.c for
69156 + *   details)
69157 + *
69158 + *   . seals (see seal.[ch])
69159 + *
69160 + *   . vroot (see search.c)
69161 + *
69162 + * General search-by-key is layered thusly:
69163 + *
69164 + *                   [check seal, if any]   --ok--> done
69165 + *                           |
69166 + *                         failed
69167 + *                           |
69168 + *                           V
69169 + *                     [vroot defined] --no--> node = tree_root
69170 + *                           |                   |
69171 + *                          yes                  |
69172 + *                           |                   |
69173 + *                           V                   |
69174 + *                       node = vroot            |
69175 + *                                 |             |
69176 + *                                 |             |
69177 + *                                 |             |
69178 + *                                 V             V
69179 + *                            [check cbk_cache for key]  --ok--> done
69180 + *                                        |
69181 + *                                      failed
69182 + *                                        |
69183 + *                                        V
69184 + *                       [start tree traversal from node]
69185 + *
69186 + */
69187 +
69188 +#include "forward.h"
69189 +#include "debug.h"
69190 +#include "dformat.h"
69191 +#include "key.h"
69192 +#include "coord.h"
69193 +#include "plugin/item/static_stat.h"
69194 +#include "plugin/item/item.h"
69195 +#include "plugin/node/node.h"
69196 +#include "plugin/plugin.h"
69197 +#include "txnmgr.h"
69198 +#include "jnode.h"
69199 +#include "znode.h"
69200 +#include "block_alloc.h"
69201 +#include "tree_walk.h"
69202 +#include "carry.h"
69203 +#include "carry_ops.h"
69204 +#include "tap.h"
69205 +#include "tree.h"
69206 +#include "vfs_ops.h"
69207 +#include "page_cache.h"
69208 +#include "super.h"
69209 +#include "reiser4.h"
69210 +#include "inode.h"
69211 +
69212 +#include <linux/fs.h>          /* for struct super_block  */
69213 +#include <linux/spinlock.h>
69214 +
69215 +/* Disk address (block number) never ever used for any real tree node. This is
69216 +   used as block number of "uber" znode.
69217 +
69218 +   Invalid block addresses are 0 by tradition.
69219 +
69220 +*/
69221 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
69222 +
69223 +#define CUT_TREE_MIN_ITERATIONS 64
69224 +
69225 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
69226 +
69227 +/* Return the node plugin (->nplug) of coord->node; @coord and coord->node must be non-NULL (asserted). */
69228 +node_plugin *node_plugin_by_coord(const coord_t * coord)
69229 +{
69230 +       assert("vs-1", coord != NULL);
69231 +       assert("vs-2", coord->node != NULL);
69232 +
69233 +       return coord->node->nplug;
69234 +}
69235 +
69236 +/* Insert item into tree: @key is looked up with coord_by_key() under a write lock; IBK_ALREADY_EXISTS is returned if it is found, insert_by_coord() is called if it is not, and any other lookup result is returned as is.
69237 + * Fields of @coord are updated so that they can be used by a subsequent insert operation. */
69238 +insert_result insert_by_key(reiser4_tree * tree        /* tree to insert new item
69239 +                                                * into */ ,
69240 +                           const reiser4_key * key /* key of new item */ ,
69241 +                           reiser4_item_data * data    /* parameters for item
69242 +                                                        * creation */ ,
69243 +                           coord_t * coord /* resulting insertion coord */ ,
69244 +                           lock_handle * lh    /* resulting lock
69245 +                                                * handle */ ,
69246 +                           tree_level stop_level /** level where to insert */ ,
69247 +                           __u32 flags /* insertion flags */ )
69248 +{
69249 +       int result;
69250 +
69251 +       assert("nikita-358", tree != NULL);
69252 +       assert("nikita-360", coord != NULL);
69253 +
69254 +       result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
69255 +                             FIND_EXACT, stop_level, stop_level,
69256 +                             flags | CBK_FOR_INSERT, NULL /*ra_info */ );
69257 +       switch (result) {
69258 +       default:
69259 +               break;
69260 +       case CBK_COORD_FOUND:
69261 +               result = IBK_ALREADY_EXISTS;
69262 +               break;
69263 +       case CBK_COORD_NOTFOUND:
69264 +               assert("nikita-2017", coord->node != NULL);
69265 +               result = insert_by_coord(coord, data, key, lh, 0 /* flags: @flags is passed to coord_by_key() only */ );
69266 +               break;
69267 +       }
69268 +       return result;
69269 +}
69270 +
69271 +/* Insert item by calling carry(). Helper function used when short-cut
69272 +   insertion is not possible. Returns result of carry(), or an error if the carry pool or carry op cannot be set up. */
69273 +static insert_result insert_with_carry_by_coord(coord_t * coord,       /* coord where to insert */
69274 +                                               lock_handle * lh,       /* lock handle of insertion
69275 +                                                                        * node */
69276 +                                               reiser4_item_data * data,       /* parameters of new
69277 +                                                                                * item */
69278 +                                               const reiser4_key * key,        /* key of new item */
69279 +                                               carry_opcode cop,       /* carry operation to perform */
69280 +                                               cop_insert_flag flags
69281 +                                               /* carry flags; 0 selects the tree's default insert_flags */ )
69282 +{
69283 +       int result;
69284 +       carry_pool *pool;
69285 +       carry_level *lowest_level;
69286 +       carry_insert_data *cdata;
69287 +       carry_op *op;
69288 +
69289 +       assert("umka-314", coord != NULL);
69290 +
69291 +       /* allocate carry_pool, 3 carry_level-s and carry_insert_data in one chunk */
69292 +       pool =
69293 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69294 +                           sizeof(*cdata));
69295 +       if (IS_ERR(pool))
69296 +               return PTR_ERR(pool);
69297 +       lowest_level = (carry_level *) (pool + 1);
69298 +       init_carry_level(lowest_level, pool);
69299 +
69300 +       op = post_carry(lowest_level, cop, coord->node, 0);
69301 +       if (IS_ERR(op) || (op == NULL)) {
69302 +               done_carry_pool(pool);
69303 +               return RETERR(op ? PTR_ERR(op) : -EIO);
69304 +       }
69305 +       cdata = (carry_insert_data *) (lowest_level + 3);
69306 +       cdata->coord = coord;
69307 +       cdata->data = data;
69308 +       cdata->key = key;
69309 +       op->u.insert.d = cdata;
69310 +       if (flags == 0)
69311 +               flags = znode_get_tree(coord->node)->carry.insert_flags;
69312 +       op->u.insert.flags = flags;
69313 +       op->u.insert.type = COPT_ITEM_DATA;
69314 +       op->u.insert.child = NULL;
69315 +       if (lh != NULL) {
69316 +               assert("nikita-3245", lh->node == coord->node);
69317 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
69318 +               lowest_level->tracked = lh;
69319 +       }
69320 +
69321 +       result = carry(lowest_level, NULL);
69322 +       done_carry_pool(pool);
69323 +
69324 +       return result;
69325 +}
69326 +
69327 +/* form carry queue to perform paste of @data with @key at @coord, and launch
69328 +   its execution by calling carry().
69329 +
69330 +   Instruct carry to update @lh if, after balancing, insertion coord moves
69331 +   into a different block.
69332 +
69333 +*/
69334 +static int paste_with_carry(coord_t * coord,   /* coord of paste */
69335 +                           lock_handle * lh,   /* lock handle of node
69336 +                                                * where item is
69337 +                                                * pasted */
69338 +                           reiser4_item_data * data,   /* parameters of new
69339 +                                                        * item */
69340 +                           const reiser4_key * key,    /* key of new item */
69341 +                           unsigned flags /* paste flags; 0 selects the tree's default paste_flags */ )
69342 +{
69343 +       int result;
69344 +       carry_pool *pool;
69345 +       carry_level *lowest_level;
69346 +       carry_insert_data *cdata;
69347 +       carry_op *op;
69348 +
69349 +       assert("umka-315", coord != NULL);
69350 +       assert("umka-316", key != NULL);
69351 +
69352 +       pool =
69353 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69354 +                           sizeof(*cdata));
69355 +       if (IS_ERR(pool))
69356 +               return PTR_ERR(pool);
69357 +       lowest_level = (carry_level *) (pool + 1);
69358 +       init_carry_level(lowest_level, pool);
69359 +
69360 +       op = post_carry(lowest_level, COP_PASTE, coord->node, 0);
69361 +       if (IS_ERR(op) || (op == NULL)) {
69362 +               done_carry_pool(pool);
69363 +               return RETERR(op ? PTR_ERR(op) : -EIO);
69364 +       }
69365 +       cdata = (carry_insert_data *) (lowest_level + 3);
69366 +       cdata->coord = coord;
69367 +       cdata->data = data;
69368 +       cdata->key = key;
69369 +       op->u.paste.d = cdata;
69370 +       if (flags == 0)
69371 +               flags = znode_get_tree(coord->node)->carry.paste_flags;
69372 +       op->u.paste.flags = flags;
69373 +       op->u.paste.type = COPT_ITEM_DATA;
69374 +       if (lh != NULL) {
69375 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
69376 +               lowest_level->tracked = lh;
69377 +       }
69378 +
69379 +       result = carry(lowest_level, NULL);
69380 +       done_carry_pool(pool);
69381 +
69382 +       return result;
69383 +}
69384 +
69385 +/* insert item at the given coord.
69386 +
69387 +   First try to skip carry by directly calling ->create_item() method of node
69388 +   plugin. If this is impossible (there is not enough free space in the node,
69389 +   or leftmost item in the node is created), call insert_with_carry_by_coord()
69390 +   that will do full carry().
69391 +   Returns zload() error, -E_NODE_FULL if the item does not fit and all three COPI_DONT_* flags are set, or the result of the insertion.
69392 +*/
69393 +insert_result insert_by_coord(coord_t * coord  /* coord where to
69394 +                                                * insert. coord->node has
69395 +                                                * to be write locked by
69396 +                                                * caller */ ,
69397 +                             reiser4_item_data * data  /* data to be
69398 +                                                        * inserted */ ,
69399 +                             const reiser4_key * key /* key of new item */ ,
69400 +                             lock_handle * lh  /* lock handle of write
69401 +                                                * lock on node */ ,
69402 +                             __u32 flags /* insertion flags */ )
69403 +{
69404 +       unsigned item_size;
69405 +       int result;
69406 +       znode *node;
69407 +
69408 +       assert("vs-247", coord != NULL);
69409 +       assert("vs-248", data != NULL);
69410 +       assert("vs-249", data->length >= 0);
69411 +       assert("nikita-1191", znode_is_write_locked(coord->node));
69412 +
69413 +       node = coord->node;
69414 +       coord_clear_iplug(coord);
69415 +       result = zload(node);
69416 +       if (result != 0)
69417 +               return result;
69418 +
69419 +       item_size = space_needed(node, NULL, data, 1);
69420 +       if (item_size > znode_free_space(node) &&
69421 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69422 +           && (flags & COPI_DONT_ALLOCATE)) {
69423 +               /* we are forced to use free space of coord->node and new item
69424 +                  does not fit into it.
69425 +
69426 +                  Currently we get here only when we allocate and copy units
69427 +                  of extent item from a node to its left neighbor during
69428 +                  "squalloc"-ing.  If @node (this is left neighbor) does not
69429 +                  have enough free space - we do not want to attempt any
69430 +                  shifting and allocations because we are in squeezing and
69431 +                  everything to the left of @node is tightly packed.
69432 +                */
69433 +               result = -E_NODE_FULL;
69434 +       } else if ((item_size <= znode_free_space(node)) &&
69435 +                  !coord_is_before_leftmost(coord) &&
69436 +                  (node_plugin_by_node(node)->fast_insert != NULL)
69437 +                  && node_plugin_by_node(node)->fast_insert(coord)) {
69438 +               /* shortcut insertion without carry() overhead.
69439 +
69440 +                  Only possible if:
69441 +
69442 +                  - there is enough free space
69443 +
69444 +                  - insertion is not into the leftmost position in a node
69445 +                  (otherwise it would require updating of delimiting key in a
69446 +                  parent)
69447 +
69448 +                  - node plugin agrees with this
69449 +
69450 +                */
69451 +               result =
69452 +                   node_plugin_by_node(node)->create_item(coord, key, data,
69453 +                                                          NULL);
69454 +               znode_make_dirty(node);
69455 +       } else {
69456 +               /* otherwise do full-fledged carry(). */
69457 +               result =
69458 +                   insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69459 +                                              flags);
69460 +       }
69461 +       zrelse(node);
69462 +       return result;
69463 +}
69464 +
69465 +/* @coord is set to leaf level (asserted) and @data is to be inserted to twig level; done by a COP_EXTENT carry operation */
69466 +insert_result
69467 +insert_extent_by_coord(coord_t *
69468 +                      coord
69469 +                      /* coord where to insert. coord->node has to be write locked by caller */
69470 +                      ,
69471 +                      reiser4_item_data * data /* data to be inserted */ ,
69472 +                      const reiser4_key * key /* key of new item */ ,
69473 +                      lock_handle *
69474 +                      lh /* lock handle of write lock on node */ )
69475 +{
69476 +       assert("vs-405", coord != NULL);
69477 +       assert("vs-406", data != NULL);
69478 +       assert("vs-407", data->length > 0);
69479 +       assert("vs-408", znode_is_write_locked(coord->node));
69480 +       assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69481 +
69482 +       return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69483 +                                         0 /*flags */ );
69484 +}
69485 +
69486 +/* Insert into the item at the given coord.
69487 +
69488 +   First try to skip carry by directly calling ->paste() method of item
69489 +   plugin. If this is impossible (there is not enough free space in the node,
69490 +   or we are pasting into leftmost position in the node), call
69491 +   paste_with_carry() that will do full carry().
69492 +   Returns -E_NODE_FULL if data does not fit and all three COPI_DONT_* flags are set; otherwise result of ->paste() or paste_with_carry().
69493 +*/
69494 +/* paste_into_item */
69495 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
69496 +                    lock_handle * lh /* lock handle on node involved */ ,
69497 +                    const reiser4_key * key /* key of unit being pasted */ ,
69498 +                    reiser4_item_data * data /* parameters for new unit */ ,
69499 +                    unsigned flags /* insert/paste flags */ )
69500 +{
69501 +       int result;
69502 +       int size_change;
69503 +       node_plugin *nplug;
69504 +       item_plugin *iplug;
69505 +
69506 +       assert("umka-317", coord != NULL);
69507 +       assert("umka-318", key != NULL);
69508 +
69509 +       iplug = item_plugin_by_coord(coord);
69510 +       nplug = node_plugin_by_coord(coord);
69511 +
69512 +       assert("nikita-1480", iplug == data->iplug);
69513 +
69514 +       size_change = space_needed(coord->node, coord, data, 0);
69515 +       if (size_change > (int)znode_free_space(coord->node) &&
69516 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69517 +           && (flags & COPI_DONT_ALLOCATE)) {
69518 +               /* we are forced to use free space of coord->node and new data
69519 +                  does not fit into it. */
69520 +               return -E_NODE_FULL;
69521 +       }
69522 +
69523 +       /* shortcut paste without carry() overhead.
69524 +
69525 +          Only possible if:
69526 +
69527 +          - there is enough free space
69528 +
69529 +          - paste is not into the leftmost unit in a node (otherwise
69530 +          it would require updating of delimiting key in a parent)
69531 +          (NOTE(review): the code additionally requires coord->unit_pos != 0 in every case - confirm intent)
69532 +          - node plugin agrees with this
69533 +
69534 +          - item plugin agrees with us
69535 +        */
69536 +       if (size_change <= (int)znode_free_space(coord->node) &&
69537 +           (coord->item_pos != 0 ||
69538 +            coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69539 +           coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69540 +           nplug->fast_paste(coord) &&
69541 +           iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69542 +               if (size_change > 0)
69543 +                       nplug->change_item_size(coord, size_change);
69544 +               /* NOTE-NIKITA: huh? where @key is used? */
69545 +               result = iplug->b.paste(coord, data, NULL);
69546 +               if (size_change < 0)
69547 +                       nplug->change_item_size(coord, size_change);
69548 +               znode_make_dirty(coord->node);
69549 +       } else
69550 +               /* otherwise do full-fledged carry(). */
69551 +               result = paste_with_carry(coord, lh, data, key, flags);
69552 +       return result;
69553 +}
69554 +
69555 +/* this either appends to (data->length > 0, via insert_into_item()) or truncates (data->length < 0, via ->shrink_item()) item @coord; returns zload() error or the result of the resize */
69556 +int resize_item(coord_t * coord /* coord of item being resized */ ,
69557 +               reiser4_item_data * data /* parameters of resize */ ,
69558 +               reiser4_key * key /* key of new unit */ ,
69559 +               lock_handle * lh        /* lock handle of node
69560 +                                        * being modified */ ,
69561 +               cop_insert_flag flags /* carry flags */ )
69562 +{
69563 +       int result;
69564 +       znode *node;
69565 +
69566 +       assert("nikita-362", coord != NULL);
69567 +       assert("nikita-363", data != NULL);
69568 +       assert("vs-245", data->length != 0);
69569 +
69570 +       node = coord->node;
69571 +       coord_clear_iplug(coord);
69572 +       result = zload(node);
69573 +       if (result != 0)
69574 +               return result;
69575 +
69576 +       if (data->length < 0)
69577 +               result = node_plugin_by_coord(coord)->shrink_item(coord,
69578 +                                                                 -data->length);
69579 +       else
69580 +               result = insert_into_item(coord, lh, key, data, flags);
69581 +
69582 +       zrelse(node);
69583 +       return result;
69584 +}
69585 +
69586 +/* insert flow @f at @coord by a COP_INSERT_FLOW carry operation; @lh is handed to carry for tracking. Returns result of carry(), or an error if the carry pool or carry op cannot be set up. */
69587 +int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69588 +{
69589 +       int result;
69590 +       carry_pool *pool;
69591 +       carry_level *lowest_level;
69592 +       reiser4_item_data *data;
69593 +       carry_op *op;
69594 +
69595 +       pool =
69596 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69597 +                           sizeof(*data));
69598 +       if (IS_ERR(pool))
69599 +               return PTR_ERR(pool);
69600 +       lowest_level = (carry_level *) (pool + 1);
69601 +       init_carry_level(lowest_level, pool);
69602 +
69603 +       op = post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69604 +                       0 /* operate directly on coord -> node */ );
69605 +       if (IS_ERR(op) || (op == NULL)) {
69606 +               done_carry_pool(pool);
69607 +               return RETERR(op ? PTR_ERR(op) : -EIO);
69608 +       }
69609 +
69610 +       /* these are permanent during insert_flow */
69611 +       data = (reiser4_item_data *) (lowest_level + 3);
69612 +       data->user = 1;
69613 +       data->iplug = item_plugin_by_id(FORMATTING_ID);
69614 +       data->arg = NULL;
69615 +       /* data->length and data->data will be set before calling paste or
69616 +          insert */
69617 +       data->length = 0;
69618 +       data->data = NULL;
69619 +
69620 +       op->u.insert_flow.flags = 0;
69621 +       op->u.insert_flow.insert_point = coord;
69622 +       op->u.insert_flow.flow = f;
69623 +       op->u.insert_flow.data = data;
69624 +       op->u.insert_flow.new_nodes = 0;
69625 +
69626 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
69627 +       lowest_level->tracked = lh;
69628 +
69629 +       result = carry(lowest_level, NULL);
69630 +       done_carry_pool(pool);
69631 +
69632 +       return result;
69633 +}
69634 +
69635 +/* Given a coord in parent node, obtain a znode for the corresponding child. Returns an ERR_PTR (-EIO) if @parent is at or below leaf level or the item at @parent_coord is not internal; otherwise returns whatever zlook()/zget() returned. */
69636 +znode *child_znode(const coord_t * parent_coord        /* coord of pointer to
69637 +                                                * child */ ,
69638 +                  znode * parent /* parent of child */ ,
69639 +                  int incore_p /* if !0 only return child if already in
69640 +                                * memory */ ,
69641 +                  int setup_dkeys_p    /* if !0 update delimiting keys of
69642 +                                        * child */ )
69643 +{
69644 +       znode *child;
69645 +
69646 +       assert("nikita-1374", parent_coord != NULL);
69647 +       assert("nikita-1482", parent != NULL);
69648 +#if REISER4_DEBUG
69649 +       if (setup_dkeys_p)
69650 +               assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69651 +#endif
69652 +       assert("nikita-2947", znode_is_any_locked(parent));
69653 +
69654 +       if (znode_get_level(parent) <= LEAF_LEVEL) {
69655 +               /* trying to get child of leaf node */
69656 +               warning("nikita-1217", "Child of maize?");
69657 +               return ERR_PTR(RETERR(-EIO));
69658 +       }
69659 +       if (item_is_internal(parent_coord)) {
69660 +               reiser4_block_nr addr;
69661 +               item_plugin *iplug;
69662 +               reiser4_tree *tree;
69663 +
69664 +               iplug = item_plugin_by_coord(parent_coord);
69665 +               assert("vs-512", iplug->s.internal.down_link);
69666 +               iplug->s.internal.down_link(parent_coord, NULL, &addr);
69667 +
69668 +               tree = znode_get_tree(parent);
69669 +               if (incore_p)
69670 +                       child = zlook(tree, &addr);
69671 +               else
69672 +                       child =
69673 +                           zget(tree, &addr, parent,
69674 +                                znode_get_level(parent) - 1, get_gfp_mask());
69675 +               if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69676 +                       set_child_delimiting_keys(parent, parent_coord, child);
69677 +       } else {
69678 +               warning("nikita-1483", "Internal item expected");
69679 +               child = ERR_PTR(RETERR(-EIO));
69680 +       }
69681 +       return child;
69682 +}
69683 +
69684 +/* remove znode from transaction: release its block and uncapture its page or jnode */
69685 +static void uncapture_znode(znode * node)
69686 +{
69687 +       struct page *page;
69688 +
69689 +       assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69690 +
69691 +       if (!blocknr_is_fake(znode_get_block(node))) {
69692 +               int ret;
69693 +
69694 +               /* An already allocated block goes right to the atom's delete set. */
69695 +               ret =
69696 +                   reiser4_dealloc_block(znode_get_block(node), 0,
69697 +                                         BA_DEFER | BA_FORMATTED);
69698 +               if (ret)
69699 +                       warning("zam-942",
69700 +                               "can\'t add a block (%llu) number to atom's delete set\n",
69701 +                               (unsigned long long)(*znode_get_block(node)));
69702 +
69703 +               spin_lock_znode(node);
69704 +               /* Here we return the flush reserved block which was reserved at
69705 +                * the moment when this allocated node was marked dirty and has
69706 +                * not yet been used by flush in node relocation procedure.  */
69707 +               if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69708 +                       txn_atom *atom;
69709 +
69710 +                       atom = jnode_get_atom(ZJNODE(node));
69711 +                       assert("zam-939", atom != NULL);
69712 +                       spin_unlock_znode(node);
69713 +                       flush_reserved2grabbed(atom, (__u64) 1);
69714 +                       spin_unlock_atom(atom);
69715 +               } else
69716 +                       spin_unlock_znode(node);
69717 +       } else {
69718 +               /* znode has an assigned block which is counted as "fake
69719 +                  allocated". Return it back to "free blocks". */
69720 +               fake_allocated2free((__u64) 1, BA_FORMATTED);
69721 +       }
69722 +
69723 +       /*
69724 +        * uncapture page from transaction. There is a possibility of a race
69725 +        * with ->releasepage(): reiser4_releasepage() detaches page from this
69726 +        * jnode and we have nothing to uncapture. To avoid this, get a
69727 +        * reference to node->pg under jnode spin lock. uncapture_page() will
69728 +        * deal with released page itself.
69729 +        */
69730 +       spin_lock_znode(node);
69731 +       page = znode_page(node);
69732 +       if (likely(page != NULL)) {
69733 +               /*
69734 +                * uncapture_page() can only be called when we are sure that
69735 +                * znode is pinned in memory, which we are, because
69736 +                * forget_znode() is only called from longterm_unlock_znode().
69737 +                */
69738 +               page_cache_get(page);
69739 +               spin_unlock_znode(node);
69740 +               lock_page(page);
69741 +               uncapture_page(page);
69742 +               unlock_page(page);
69743 +               page_cache_release(page);
69744 +       } else {
69745 +               txn_atom *atom;
69746 +
69747 +               /* handle "flush queued" znodes: wait while atom has running queues */
69748 +               while (1) {
69749 +                       atom = jnode_get_atom(ZJNODE(node));
69750 +                       assert("zam-943", atom != NULL);
69751 +
69752 +                       if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69753 +                           || !atom->nr_running_queues)
69754 +                               break;
69755 +
69756 +                       spin_unlock_znode(node);
69757 +                       atom_wait_event(atom);
69758 +                       spin_lock_znode(node);
69759 +               }
69760 +
69761 +               uncapture_block(ZJNODE(node));
69762 +               spin_unlock_atom(atom);
69763 +               zput(node);
69764 +       }
69765 +}
69766 +
69767 +/* This is called from longterm_unlock_znode() when last lock is released from
69768 +   the node that has been removed from the tree. Here node is removed from the
69769 +   sibling list, its lock is invalidated and it is removed from transaction. */
69770 +void forget_znode(lock_handle * handle)
69771 +{
69772 +       znode *node;
69773 +       reiser4_tree *tree;
69774 +
69775 +       assert("umka-319", handle != NULL);
69776 +
69777 +       node = handle->node;
69778 +       tree = znode_get_tree(node);
69779 +
69780 +       assert("vs-164", znode_is_write_locked(node));
69781 +       assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69782 +       assert_rw_locked(&(node->lock.guard));
69783 +
69784 +       /* We assume that this node was detached from its parent before
69785 +        * unlocking, it gives no way to reach this node from parent through a
69786 +        * down link.  The node should have no children and, thereby, can't be
69787 +        * reached from them by their parent pointers.  The only way to obtain a
69788 +        * reference to the node is to use sibling pointers from its left and
69789 +        * right neighbors.  In the next several lines we remove the node from
69790 +        * the sibling list. */
69791 +
69792 +       write_lock_tree(tree);
69793 +       sibling_list_remove(node);
69794 +       znode_remove(node, tree);
69795 +       write_unlock_tree(tree);
69796 +
69797 +       /* Here we set JNODE_DYING and cancel all pending lock requests.  It
69798 +        * forces all lock requestor threads to repeat iterations of getting
69799 +        * lock on a child, neighbor or parent node.  But, those threads can't
69800 +        * come to this node again, because this node is no longer a child,
69801 +        * neighbor or parent of any other node.  This order of znode
69802 +        * invalidation does not allow other threads to waste cpu time in a busy
69803 +        * loop, trying to lock dying object.  The exception is in the flush
69804 +        * code when we take node directly from atom's capture list.*/
69805 +       invalidate_lock(handle);
69806 +       uncapture_znode(node);
69807 +}
69808 +
69809 +/* Check that internal item at @pointer really contains pointer to @child: returns NS_FOUND if so, NS_NOT_FOUND otherwise. */
69810 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69811 +                                                * @child */ ,
69812 +                      const znode * child /* child znode */ )
69813 +{
69814 +       assert("nikita-1016", pointer != NULL);
69815 +       assert("nikita-1017", child != NULL);
69816 +       assert("nikita-1018", pointer->node != NULL);
69817 +
69818 +       assert("nikita-1325", znode_is_any_locked(pointer->node));
69819 +
69820 +       assert("nikita-2985",
69821 +              znode_get_level(pointer->node) == znode_get_level(child) + 1);
69822 +
69823 +       coord_clear_iplug((coord_t *) pointer);
69824 +
69825 +       if (coord_is_existing_unit(pointer)) {
69826 +               item_plugin *iplug;
69827 +               reiser4_block_nr addr;
69828 +
69829 +               if (item_is_internal(pointer)) {
69830 +                       iplug = item_plugin_by_coord(pointer);
69831 +                       assert("vs-513", iplug->s.internal.down_link);
69832 +                       iplug->s.internal.down_link(pointer, NULL, &addr);
69833 +                       /* compare block number from the down link with that of @child */
69834 +                       if (disk_addr_eq(&addr, znode_get_block(child))) {
69835 +                               return NS_FOUND;
69836 +                       }
69837 +               }
69838 +       }
69839 +       /* warning ("jmacd-1002", "tree pointer incorrect"); */
69840 +       return NS_NOT_FOUND;
69841 +}
69842 +
69843 +/* find coord of pointer to new @child in @parent.
69844 +
69845 +   Find pointer to @left in @parent and set @result to the position right after
69846 +   it (AFTER_UNIT), i.e. where pointer to a given @child will be in.
69847 +   Returns NS_NOT_FOUND when position is set, -EIO if @left was not found.
69848 +*/
69849 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
69850 +                      znode *
69851 +                      child UNUSED_ARG /* child znode, passed locked */ ,
69852 +                      znode * left /* left brother of new node */ ,
69853 +                      coord_t * result /* where result is stored in */ )
69854 +{
69855 +       int ret;
69856 +
69857 +       assert("nikita-1486", parent != NULL);
69858 +       assert("nikita-1487", child != NULL);
69859 +       assert("nikita-1488", result != NULL);
69860 +
69861 +       ret = find_child_ptr(parent, left, result);
69862 +       if (ret != NS_FOUND) {
69863 +               warning("nikita-1489", "Cannot find brother position: %i", ret);
69864 +               return RETERR(-EIO);
69865 +       } else {
69866 +               result->between = AFTER_UNIT;
69867 +               return RETERR(NS_NOT_FOUND);
69868 +       }
69869 +}
69870 +
69871 +/* find coord of pointer to @child in @parent.
69872 +
69873 +   Find the &coord_t in the @parent where pointer to a given @child is in.
69874 +   Cached @child->in_parent is tried first, then lookup by left delimiting key
69875 +   of @child, then scan of all units by find_child_by_addr(). */
69876 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
69877 +                  znode * child /* child znode, passed locked */ ,
69878 +                  coord_t * result /* where result is stored in */ )
69879 +{
69880 +       int lookup_res;
69881 +       node_plugin *nplug;
69882 +       /* left delimiting key of a child */
69883 +       reiser4_key ld;
69884 +       reiser4_tree *tree;
69885 +
69886 +       assert("nikita-934", parent != NULL);
69887 +       assert("nikita-935", child != NULL);
69888 +       assert("nikita-936", result != NULL);
69889 +       assert("zam-356", znode_is_loaded(parent));
69890 +
69891 +       coord_init_zero(result);
69892 +       result->node = parent;
69893 +
69894 +       nplug = parent->nplug;
69895 +       assert("nikita-939", nplug != NULL);
69896 +
69897 +       tree = znode_get_tree(parent);
69898 +       /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
69899 +        * not aliased to ->in_parent of some znode. Otherwise,
69900 +        * parent_coord_to_coord() below would modify data protected by tree
69901 +        * lock. */
69902 +       read_lock_tree(tree);
69903 +       /* fast path. Try to use cached value. Lock tree to keep
69904 +          node->pos_in_parent and pos->*_blocknr consistent. */
69905 +       if (child->in_parent.item_pos + 1 != 0) {
69906 +               parent_coord_to_coord(&child->in_parent, result);
69907 +               if (check_tree_pointer(result, child) == NS_FOUND) {
69908 +                       read_unlock_tree(tree);
69909 +                       return NS_FOUND;
69910 +               }
69911 +
69912 +               child->in_parent.item_pos = (unsigned short)~0;
69913 +       }
69914 +       read_unlock_tree(tree);
69915 +
69916 +       /* if above failed, find some key from @child. We are looking for the
69917 +          least key in a child. */
69918 +       read_lock_dk(tree);
69919 +       ld = *znode_get_ld_key(child);
69920 +       read_unlock_dk(tree);
69921 +       /*
69922 +        * now, lookup parent with key just found. Note, that left delimiting
69923 +        * key doesn't identify node uniquely, because (in extremely rare
69924 +        * case) two nodes can have equal left delimiting keys, if one of them
69925 +        * is completely filled with directory entries that all happened to be
69926 +        * hash collision. But, we check block number in check_tree_pointer()
69927 +        * and, so, are safe.
69928 +        */
69929 +       lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
69930 +       /* update cached position of @child in @parent (->in_parent) */
69931 +       if (lookup_res == NS_FOUND) {
69932 +               write_lock_tree(tree);
69933 +               coord_to_parent_coord(result, &child->in_parent);
69934 +               write_unlock_tree(tree);
69935 +               lookup_res = check_tree_pointer(result, child);
69936 +       }
69937 +       if (lookup_res == NS_NOT_FOUND)
69938 +               lookup_res = find_child_by_addr(parent, child, result);
69939 +       return lookup_res;
69940 +}
69941 +
69942 +/* find coord of pointer to @child in @parent by scanning
69943 +
69944 +   Find the &coord_t in the @parent where pointer to a given @child
69945 +   is in by scanning all internal items in @parent and comparing block
69946 +   numbers in them with that of @child.
69947 +   Found position is cached in @child->in_parent. Returns NS_FOUND or
69948 +   NS_NOT_FOUND. */
69949 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
69950 +                             znode * child /* child znode, passed locked */ ,
69951 +                             coord_t * result /* where result is stored in */ )
69952 +{
69953 +       int ret;
69954 +
69955 +       assert("nikita-1320", parent != NULL);
69956 +       assert("nikita-1321", child != NULL);
69957 +       assert("nikita-1322", result != NULL);
69958 +
69959 +       ret = NS_NOT_FOUND;
69960 +
69961 +       for_all_units(result, parent) {
69962 +               if (check_tree_pointer(result, child) == NS_FOUND) {
69963 +                       write_lock_tree(znode_get_tree(parent));
69964 +                       coord_to_parent_coord(result, &child->in_parent);
69965 +                       write_unlock_tree(znode_get_tree(parent));
69966 +                       ret = NS_FOUND;
69967 +                       break;
69968 +               }
69969 +       }
69970 +       return ret;
69971 +}
69972 +
69973 +/* true, if @addr is "unallocated block number" (just address, with highest bit
69974 +   set): bits under REISER4_BLOCKNR_STATUS_BIT_MASK equal the unallocated value. */
69975 +int is_disk_addr_unallocated(const reiser4_block_nr * addr     /* address to
69976 +                                                                * check */ )
69977 +{
69978 +       assert("nikita-1766", addr != NULL);
69979 +       cassert(sizeof(reiser4_block_nr) == 8);
69980 +       return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
69981 +           REISER4_UNALLOCATED_STATUS_VALUE;
69982 +}
69983 +
69984 +/* returns true (1) if removing bytes of given range of keys [from_key, to_key]
69985 +   causes removing of whole item @from, 0 otherwise */
69986 +static int
69987 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
69988 +                       const reiser4_key * to_key)
69989 +{
69990 +       item_plugin *iplug;
69991 +       reiser4_key key_in_item;
69992 +
69993 +       assert("umka-325", from != NULL);
69994 +       assert("", item_is_extent(from)); /* NOTE(review): assertion label is empty */
69995 +
69996 +       /* check first key: head of item stays if @from_key is greater than item key */
69997 +       item_key_by_coord(from, &key_in_item);
69998 +       if (keygt(from_key, &key_in_item))
69999 +               return 0;
70000 +
70001 +       /* check last key: it is taken as append key of the item minus one */
70002 +       iplug = item_plugin_by_coord(from);
70003 +       assert("vs-611", iplug && iplug->s.file.append_key);
70004 +
70005 +       iplug->s.file.append_key(from, &key_in_item);
70006 +       set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
70007 +
70008 +       if (keylt(to_key, &key_in_item))
70009 +               /* last byte is not removed */
70010 +               return 0;
70011 +       return 1;
70012 +}
70013 +
70014 +/* helper function for prepare_twig_kill(): @left and @right are formatted
70015 + * neighbors of extent item being completely removed. Load and lock neighbors
70016 + * and store lock handles into @kdata for kill_hook_extent(). On failure both are released. */
70017 +static int
70018 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
70019 +{
70020 +       int result;
70021 +       int left_loaded;
70022 +       int right_loaded;
70023 +
70024 +       result = 0;
70025 +       left_loaded = right_loaded = 0;
70026 +
70027 +       if (left != NULL) {
70028 +               result = zload(left);
70029 +               if (result == 0) {
70030 +                       left_loaded = 1;
70031 +                       result = longterm_lock_znode(kdata->left, left,
70032 +                                                    ZNODE_READ_LOCK,
70033 +                                                    ZNODE_LOCK_LOPRI);
70034 +               }
70035 +       }
70036 +       if (result == 0 && right != NULL) {
70037 +               result = zload(right);
70038 +               if (result == 0) {
70039 +                       right_loaded = 1;
70040 +                       result = longterm_lock_znode(kdata->right, right,
70041 +                                                    ZNODE_READ_LOCK,
70042 +                                                    ZNODE_LOCK_HIPRI |
70043 +                                                    ZNODE_LOCK_NONBLOCK);
70044 +               }
70045 +       }
70046 +       if (result != 0) {
70047 +               done_lh(kdata->left);
70048 +               done_lh(kdata->right);
70049 +               if (left_loaded != 0)
70050 +                       zrelse(left);
70051 +               if (right_loaded != 0)
70052 +                       zrelse(right);
70053 +       }
70054 +       return result;
70055 +}
70056 +/* release (zrelse) nodes and lock handles which are stored in @kdata->left and @kdata->right, if any */
70057 +static void done_children(carry_kill_data * kdata)
70058 +{
70059 +       if (kdata->left != NULL && kdata->left->node != NULL) {
70060 +               zrelse(kdata->left->node);
70061 +               done_lh(kdata->left);
70062 +       }
70063 +       if (kdata->right != NULL && kdata->right->node != NULL) {
70064 +               zrelse(kdata->right->node);
70065 +               done_lh(kdata->right);
70066 +       }
70067 +}
70068 +
70069 +/* part of cut_node. It is called when cut_node is called to remove or cut part
70070 +   of extent item. When head of that item is removed - we have to update right
70071 +   delimiting key of left neighbor of extent. When item is removed completely -
70072 +   we have to set sibling link between left and right neighbor of removed
70073 +   extent. This may return -E_DEADLOCK because of trying to get left neighbor
70074 +   locked. So, caller should repeat an attempt.
70075 +*/
70076 +/* Audited by: umka (2002.06.16) */
70077 +static int
70078 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
70079 +{
70080 +       int result;
70081 +       reiser4_key key;
70082 +       lock_handle left_lh;
70083 +       lock_handle right_lh;
70084 +       coord_t left_coord;
70085 +       coord_t *from;
70086 +       znode *left_child;
70087 +       znode *right_child;
70088 +       reiser4_tree *tree;
70089 +       int left_zloaded_here, right_zloaded_here;
70090 +
70091 +       from = kdata->params.from;
70092 +       assert("umka-326", from != NULL);
70093 +       assert("umka-327", kdata->params.to != NULL);
70094 +
70095 +       /* for one extent item only yet */
70096 +       assert("vs-591", item_is_extent(from));
70097 +       assert("vs-592", from->item_pos == kdata->params.to->item_pos);
70098 +
70099 +       if ((kdata->params.from_key
70100 +            && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
70101 +           || from->unit_pos != 0) {
70102 +               /* head of item @from is not removed, there is nothing to
70103 +                  worry about */
70104 +               return 0;
70105 +       }
70106 +
70107 +       result = 0;
70108 +       left_zloaded_here = 0;
70109 +       right_zloaded_here = 0;
70110 +
70111 +       left_child = right_child = NULL;
70112 +
70113 +       coord_dup(&left_coord, from);
70114 +       init_lh(&left_lh);
70115 +       init_lh(&right_lh);
70116 +       if (coord_prev_unit(&left_coord)) {
70117 +               /* @from is leftmost item in its node */
70118 +               if (!locked_left_neighbor) {
70119 +                       result =
70120 +                           reiser4_get_left_neighbor(&left_lh, from->node,
70121 +                                                     ZNODE_READ_LOCK,
70122 +                                                     GN_CAN_USE_UPPER_LEVELS);
70123 +                       switch (result) {
70124 +                       case 0:
70125 +                               break;
70126 +                       case -E_NO_NEIGHBOR:
70127 +                               /* there is no formatted node to the left of
70128 +                                  from->node */
70129 +                               warning("vs-605",
70130 +                                       "extent item has smallest key in "
70131 +                                       "the tree and it is about to be removed");
70132 +                               return 0;
70133 +                       case -E_DEADLOCK:
70134 +                               /* need to restart */
70135 +                       default:
70136 +                               return result;
70137 +                       }
70138 +
70139 +                       /* we have acquired left neighbor of from->node */
70140 +                       result = zload(left_lh.node);
70141 +                       if (result)
70142 +                               goto done;
70143 +
70144 +                       locked_left_neighbor = left_lh.node;
70145 +               } else {
70146 +                       /* squalloc_right_twig_cut should have supplied locked
70147 +                        * left neighbor */
70148 +                       assert("vs-834",
70149 +                              znode_is_write_locked(locked_left_neighbor));
70150 +                       result = zload(locked_left_neighbor);
70151 +                       if (result)
70152 +                               return result;
70153 +               }
70154 +
70155 +               left_zloaded_here = 1;
70156 +               coord_init_last_unit(&left_coord, locked_left_neighbor);
70157 +       }
70158 +
70159 +       if (!item_is_internal(&left_coord)) {
70160 +               /* what else but extent can be on twig level */
70161 +               assert("vs-606", item_is_extent(&left_coord));
70162 +
70163 +               /* there is no left formatted child */
70164 +               if (left_zloaded_here)
70165 +                       zrelse(locked_left_neighbor);
70166 +               done_lh(&left_lh);
70167 +               return 0;
70168 +       }
70169 +
70170 +       tree = znode_get_tree(left_coord.node);
70171 +       left_child = child_znode(&left_coord, left_coord.node, 1, 0);
70172 +
70173 +       if (IS_ERR(left_child)) {
70174 +               result = PTR_ERR(left_child);
70175 +               goto done;
70176 +       }
70177 +
70178 +       /* left child is acquired, calculate new right delimiting key for it
70179 +          and get right child if it is necessary */
70180 +       if (item_removed_completely
70181 +           (from, kdata->params.from_key, kdata->params.to_key)) {
70182 +               /* try to get right child of removed item */
70183 +               coord_t right_coord;
70184 +
70185 +               assert("vs-607",
70186 +                      kdata->params.to->unit_pos ==
70187 +                      coord_last_unit_pos(kdata->params.to));
70188 +               coord_dup(&right_coord, kdata->params.to);
70189 +               if (coord_next_unit(&right_coord)) {
70190 +                       /* @to is rightmost unit in the node */
70191 +                       result =
70192 +                           reiser4_get_right_neighbor(&right_lh, from->node,
70193 +                                                      ZNODE_READ_LOCK,
70194 +                                                      GN_CAN_USE_UPPER_LEVELS);
70195 +                       switch (result) {
70196 +                       case 0:
70197 +                               result = zload(right_lh.node);
70198 +                               if (result)
70199 +                                       goto done;
70200 +
70201 +                               right_zloaded_here = 1;
70202 +                               coord_init_first_unit(&right_coord,
70203 +                                                     right_lh.node);
70204 +                               item_key_by_coord(&right_coord, &key);
70205 +                               break;
70206 +
70207 +                       case -E_NO_NEIGHBOR:
70208 +                               /* there is no formatted node to the right of
70209 +                                  from->node */
70210 +                               read_lock_dk(tree);
70211 +                               key = *znode_get_rd_key(from->node);
70212 +                               read_unlock_dk(tree);
70213 +                               right_coord.node = NULL;
70214 +                               result = 0;
70215 +                               break;
70216 +                       default:
70217 +                               /* real error */
70218 +                               goto done;
70219 +                       }
70220 +               } else {
70221 +                       /* there is an item to the right of @from - take its key */
70222 +                       item_key_by_coord(&right_coord, &key);
70223 +               }
70224 +
70225 +               /* try to get right child of @from */
70226 +               if (right_coord.node && /* there is right neighbor of @from */
70227 +                   item_is_internal(&right_coord)) {   /* it is internal item */
70228 +                       right_child = child_znode(&right_coord,
70229 +                                                 right_coord.node, 1, 0);
70230 +
70231 +                       if (IS_ERR(right_child)) {
70232 +                               result = PTR_ERR(right_child);
70233 +                               goto done;
70234 +                       }
70235 +
70236 +               }
70237 +               /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
70238 +                  update of right delimiting key of left_child */
70239 +               result = prepare_children(left_child, right_child, kdata);
70240 +       } else {
70241 +               /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
70242 +               result = prepare_children(left_child, NULL, kdata);
70243 +       }
70244 +
70245 +      done:
70246 +       if (right_child)
70247 +               zput(right_child);
70248 +       if (right_zloaded_here)
70249 +               zrelse(right_lh.node);
70250 +       done_lh(&right_lh);
70251 +
70252 +       if (left_child)
70253 +               zput(left_child);
70254 +       if (left_zloaded_here)
70255 +               zrelse(locked_left_neighbor);
70256 +       done_lh(&left_lh);
70257 +       return result;
70258 +}
70259 +
70260 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
70261 +   are to be cut completely. Returns result of carry(), or error of carry pool / carry op allocation */
70262 +/* for try_to_merge_with_left, delete_copied, delete_node */
70263 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,       /* first key to be removed */
70264 +                    const reiser4_key * to_key,        /* last key to be removed */
70265 +                    reiser4_key *
70266 +                    smallest_removed /* smallest key actually removed */ )
70267 +{
70268 +       int result;
70269 +       carry_pool *pool;
70270 +       carry_level *lowest_level;
70271 +       carry_cut_data *cut_data;
70272 +       carry_op *op;
70273 +
70274 +       assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
70275 +
70276 +       pool =
70277 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70278 +                           sizeof(*cut_data));
70279 +       if (IS_ERR(pool))
70280 +               return PTR_ERR(pool);
70281 +       lowest_level = (carry_level *) (pool + 1);
70282 +       init_carry_level(lowest_level, pool);
70283 +
70284 +       op = post_carry(lowest_level, COP_CUT, from->node, 0);
70285 +       assert("vs-1509", op != 0);
70286 +       if (IS_ERR(op)) {
70287 +               done_carry_pool(pool);
70288 +               return PTR_ERR(op);
70289 +       }
70290 +       /* cut_data is placed in the pool allocation right after 3 carry levels */
70291 +       cut_data = (carry_cut_data *) (lowest_level + 3);
70292 +       cut_data->params.from = from;
70293 +       cut_data->params.to = to;
70294 +       cut_data->params.from_key = from_key;
70295 +       cut_data->params.to_key = to_key;
70296 +       cut_data->params.smallest_removed = smallest_removed;
70297 +
70298 +       op->u.cut_or_kill.is_cut = 1;
70299 +       op->u.cut_or_kill.u.cut = cut_data;
70300 +
70301 +       result = carry(lowest_level, NULL);
70302 +       done_carry_pool(pool);
70303 +
70304 +       return result;
70305 +}
70306 +
70307 +/* cut part of the node
70308 +
70309 +   Cut part or whole content of node.
70310 +
70311 +   cut data between @from and @to of @from->node and call carry() to make
70312 +   corresponding changes in the tree. @from->node may become empty. If so -
70313 +   pointer to it will be removed. Neighboring nodes are not changed. Smallest
70314 +   removed key is stored in @smallest_removed
70315 +   For extent item on twig level prepare_twig_kill() is called first; if it
70316 +   fails, its error is returned and carry() is not called. */
70317 +int kill_node_content(coord_t * from,  /* coord of the first unit/item that will be eliminated */
70318 +                     coord_t * to,     /* coord of the last unit/item that will be eliminated */
70319 +                     const reiser4_key * from_key,     /* first key to be removed */
70320 +                     const reiser4_key * to_key,       /* last key to be removed */
70321 +                     reiser4_key * smallest_removed,   /* smallest key actually removed */
70322 +                     znode * locked_left_neighbor,     /* this is set when kill_node_content is called with left neighbor
70323 +                                                        * locked (in squalloc_right_twig_cut, namely) */
70324 +                     struct inode *inode,      /* inode of file whose item (or its part) is to be killed. This is necessary to
70325 +                                                  invalidate pages together with item pointing to them */
70326 +                     int truncate)
70327 +{                              /* @truncate: this call is made for file truncate */
70328 +       int result;
70329 +       carry_pool *pool;
70330 +       carry_level *lowest_level;
70331 +       carry_kill_data *kdata;
70332 +       lock_handle *left_child;
70333 +       lock_handle *right_child;
70334 +       carry_op *op;
70335 +
70336 +       assert("umka-328", from != NULL);
70337 +       assert("vs-316", !node_is_empty(from->node));
70338 +       assert("nikita-1812", coord_is_existing_unit(from)
70339 +              && coord_is_existing_unit(to));
70340 +
70341 +       /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
70342 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70343 +                              sizeof(carry_kill_data) +
70344 +                              2 * sizeof(lock_handle) +
70345 +                              5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
70346 +       if (IS_ERR(pool))
70347 +               return PTR_ERR(pool);
70348 +
70349 +       lowest_level = (carry_level *) (pool + 1);
70350 +       init_carry_level(lowest_level, pool);
70351 +
70352 +       kdata = (carry_kill_data *) (lowest_level + 3);
70353 +       left_child = (lock_handle *) (kdata + 1);
70354 +       right_child = left_child + 1;
70355 +
70356 +       init_lh(left_child);
70357 +       init_lh(right_child);
70358 +
70359 +       kdata->params.from = from;
70360 +       kdata->params.to = to;
70361 +       kdata->params.from_key = from_key;
70362 +       kdata->params.to_key = to_key;
70363 +       kdata->params.smallest_removed = smallest_removed;
70364 +       kdata->params.truncate = truncate;
70365 +       kdata->flags = 0;
70366 +       kdata->inode = inode;
70367 +       kdata->left = left_child;
70368 +       kdata->right = right_child;
70369 +       /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70370 +       kdata->buf = (char *)(right_child + 1);
70371 +
70372 +       if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70373 +               /* left child of extent item may have to get updated right
70374 +                  delimiting key and to get linked with right child of extent
70375 +                  @from if it will be removed completely */
70376 +               result = prepare_twig_kill(kdata, locked_left_neighbor);
70377 +               if (result) {
70378 +                       done_children(kdata);
70379 +                       done_carry_pool(pool);
70380 +                       return result;
70381 +               }
70382 +       }
70383 +
70384 +       op = post_carry(lowest_level, COP_CUT, from->node, 0);
70385 +       if (IS_ERR(op) || (op == NULL)) {
70386 +               done_children(kdata);
70387 +               done_carry_pool(pool);
70388 +               return RETERR(op ? PTR_ERR(op) : -EIO);
70389 +       }
70390 +
70391 +       op->u.cut_or_kill.is_cut = 0;
70392 +       op->u.cut_or_kill.u.kill = kdata;
70393 +
70394 +       result = carry(lowest_level, NULL);
70395 +
70396 +       done_children(kdata);
70397 +       done_carry_pool(pool);
70398 +       return result;
70399 +}
70400 +
70401 +void /* stand-in for the tail item kill hook; delete_node() calls it for nodes it removes unread */
70402 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70403 +{
70404 +       if (inode_get_flag(inode, REISER4_HAS_MMAP)) { /* page work is done only if this flag is set */
70405 +               pgoff_t start_pg, end_pg;
70406 +
70407 +               start_pg = start >> PAGE_CACHE_SHIFT;
70408 +               end_pg = (end - 1) >> PAGE_CACHE_SHIFT; /* index of the page holding byte (end - 1) */
70409 +
70410 +               if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70411 +                       /*
70412 +                        * @start is page aligned: kill up to the page boundary.
70413 +                        * The range is asserted to lie within a single page. */
70414 +                       assert("vs-123456", start_pg == end_pg);
70415 +                       reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70416 +                                                truncate);
70417 +               } else if (start_pg != end_pg) {
70418 +                       /*
70419 +                        * page boundary is within killed portion of node.
70420 +                        * NOTE(review): literal 1 is passed here, not @truncate - confirm. */
70421 +                       assert("vs-654321", end_pg - start_pg == 1);
70422 +                       reiser4_invalidate_pages(inode->i_mapping, end_pg,
70423 +                                                end_pg - start_pg, 1);
70424 +               }
70425 +       }
70426 +       inode_sub_bytes(inode, end - start); /* byte accounting happens whether or not the flag is set */
70427 +}
70428 +
70429 +/**
70430 + * Delete whole @node from the reiser4 tree without loading it.
70431 + *
70432 + * @node: node to be deleted; must be write-locked (asserted),
70433 + * @smallest_removed: must not be NULL (asserted); on success it is set to the
70434 + *     left delimiting key of @node,
70435 + * @object: inode pointer, if we truncate a file body; may be NULL,
70436 + * @truncate: true if called for file truncate.
70437 + *
70438 + * @return: 0 if success, error code otherwise.
70439 + *
70440 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70441 + * contains the right value of the smallest removed key from the previous
70442 + * cut_worker() iteration.  This is needed for proper accounting of
70443 + * "i_blocks" and "i_bytes" fields of the @object.
70444 + */
70445 +int delete_node(znode * node, reiser4_key * smallest_removed,
70446 +               struct inode *object, int truncate)
70447 +{
70448 +       lock_handle parent_lock;
70449 +       coord_t cut_from;
70450 +       coord_t cut_to;
70451 +       reiser4_tree *tree;
70452 +       int ret;
70453 +
70454 +       assert("zam-937", node != NULL);
70455 +       assert("zam-933", znode_is_write_locked(node));
70456 +       assert("zam-999", smallest_removed != NULL);
70457 +
70458 +       init_lh(&parent_lock);
70459 +
70460 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70461 +       if (ret)
70462 +               return ret;
70463 +
70464 +       assert("zam-934", !znode_above_root(parent_lock.node));
70465 +
70466 +       ret = zload(parent_lock.node); /* paired with zrelse() at the "failed" label */
70467 +       if (ret)
70468 +               goto failed_nozrelse;
70469 +
70470 +       ret = find_child_ptr(parent_lock.node, node, &cut_from);
70471 +       if (ret)
70472 +               goto failed;
70473 +
70474 +       /* decrement child counter and set parent pointer to NULL before
70475 +          deleting the internal item from parent node because of checks in
70476 +          internal_kill_item_hook (we can delete the last item from the parent
70477 +          node, the parent node is going to be deleted and its c_count should
70478 +          be zero). */
70479 +
70480 +       tree = znode_get_tree(node);
70481 +       write_lock_tree(tree);
70482 +       init_parent_coord(&node->in_parent, NULL);
70483 +       --parent_lock.node->c_count;
70484 +       write_unlock_tree(tree);
70485 +
70486 +       assert("zam-989", item_is_internal(&cut_from));
70487 +
70488 +       /* @node should be deleted after unlocking. */
70489 +       ZF_SET(node, JNODE_HEARD_BANSHEE);
70490 +
70491 +       /* remove a pointer from the parent node to the node being deleted. */
70492 +       coord_dup(&cut_to, &cut_from);
70493 +       /* FIXME: shouldn't this be kill_node_content */
70494 +       ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70495 +       if (ret)
70496 +               /* FIXME(Zam): Should we re-connect the node to its parent if
70497 +                * cut_node fails? */
70498 +               goto failed;
70499 +
70500 +       {
70501 +               reiser4_tree *tree = current_tree; /* NOTE: shadows the outer @tree */
70502 +               __u64 start_offset = 0, end_offset = 0;
70503 +
70504 +               read_lock_tree(tree);
70505 +               write_lock_dk(tree);
70506 +               if (object) {
70507 +                       /* We use @smallest_removed and the left delimiting of
70508 +                        * the current node for @object->i_blocks, i_bytes
70509 +                        * calculation.  We assume that the items after the
70510 +                        * *@smallest_removed key have been deleted from the
70511 +                        * file body. */
70512 +                       start_offset = get_key_offset(znode_get_ld_key(node));
70513 +                       end_offset = get_key_offset(smallest_removed);
70514 +               }
70515 +
70516 +               assert("zam-1021", znode_is_connected(node));
70517 +               if (node->left) /* left neighbor takes over the right delimiting key of @node */
70518 +                       znode_set_rd_key(node->left, znode_get_rd_key(node));
70519 +
70520 +               *smallest_removed = *znode_get_ld_key(node);
70521 +
70522 +               write_unlock_dk(tree);
70523 +               read_unlock_tree(tree);
70524 +
70525 +               if (object) {
70526 +                       /* we used to perform actions which are to be performed on items on their removal from tree in
70527 +                          special item method - kill_hook. Here for optimization reasons we avoid reading node
70528 +                          containing item we remove and can not call item's kill hook. Instead we call function which
70529 +                          does exactly the same things as tail kill hook in assumption that node we avoid reading
70530 +                          contains only one item and that item is a tail one. */
70531 +                       fake_kill_hook_tail(object, start_offset, end_offset,
70532 +                                           truncate);
70533 +               }
70534 +       }
70535 +      failed: /* also reached on success, with ret == 0 */
70536 +       zrelse(parent_lock.node);
70537 +      failed_nozrelse:
70538 +       done_lh(&parent_lock);
70539 +
70540 +       return ret;
70541 +}
70542 +
70543 +static int can_delete(const reiser4_key *key, znode *node) /* keyle(@key, left delimiting key of @node) */
70544 +{
70545 +       int result;
70546 +
70547 +       read_lock_dk(current_tree); /* the delimiting key is read under the dk lock */
70548 +       result = keyle(key, znode_get_ld_key(node)); /* presumably "less than or equal" - confirm */
70549 +       read_unlock_dk(current_tree);
70550 +       return result;
70551 +}
70552 +
70553 +/**
70554 + * Default cut_tree worker: cuts one node per iteration, moving leftward from
70555 + * @tap (this subroutine is not optimal but implementation seems to be easier).
70556 + *
70557 + * @tap: the point deletion process begins from,
70558 + * @from_key: the beginning of the deleted key range,
70559 + * @to_key: the end of the deleted key range,
70560 + * @smallest_removed: the smallest removed key,
70561 + * @object: inode owning the items being cut, may be NULL,
70562 + * @truncate: true if called for file truncate.
70563 + * @progress: set nonzero if progress in item deletion was made; @smallest_removed is valid then.
70564 + *
70565 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
70566 + * operation was interrupted for allowing atom commit .
70567 + */
70568 +int
70569 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70570 +                      const reiser4_key * to_key,
70571 +                      reiser4_key * smallest_removed, struct inode *object,
70572 +                      int truncate, int *progress)
70573 +{
70574 +       lock_handle next_node_lock;
70575 +       coord_t left_coord;
70576 +       int result;
70577 +
70578 +       assert("zam-931", tap->coord->node != NULL);
70579 +       assert("zam-932", znode_is_write_locked(tap->coord->node));
70580 +
70581 +       *progress = 0;
70582 +       init_lh(&next_node_lock);
70583 +
70584 +       while (1) {
70585 +               znode *node;    /* node from which items are cut */
70586 +               node_plugin *nplug;     /* node plugin for @node */
70587 +
70588 +               node = tap->coord->node;
70589 +
70590 +               /* Move next_node_lock to the next node on the left. */
70591 +               result =
70592 +                   reiser4_get_left_neighbor(&next_node_lock, node,
70593 +                                             ZNODE_WRITE_LOCK,
70594 +                                             GN_CAN_USE_UPPER_LEVELS);
70595 +               if (result != 0 && result != -E_NO_NEIGHBOR)
70596 +                       break;
70597 +               /* Check can we delete the node as a whole. */
70598 +               if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70599 +                   can_delete(from_key, node)) {
70600 +                       result = delete_node(node, smallest_removed, object,
70601 +                                            truncate);
70602 +               } else {
70603 +                       result = tap_load(tap);
70604 +                       if (result)
70605 +                               break;  /* not "return": next_node_lock must be released below */
70606 +
70607 +                       /* Prepare the second (right) point for cut_node() */
70608 +                       if (*progress)
70609 +                               coord_init_last_unit(tap->coord, node);
70610 +
70611 +                       else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70612 +                                NULL)
70613 +                               /* set rightmost unit for the items without lookup method */
70614 +                               tap->coord->unit_pos =
70615 +                                   coord_last_unit_pos(tap->coord);
70616 +
70617 +                       nplug = node->nplug;
70618 +
70619 +                       assert("vs-686", nplug);
70620 +                       assert("vs-687", nplug->lookup);
70621 +
70622 +                       /* left_coord is leftmost unit cut from @node */
70623 +                       result = nplug->lookup(node, from_key,
70624 +                                              FIND_MAX_NOT_MORE_THAN,
70625 +                                              &left_coord);
70626 +
70627 +                       if (IS_CBKERR(result))
70628 +                               break;
70629 +
70630 +                       /* adjust coordinates so that they are set to existing units */
70631 +                       if (coord_set_to_right(&left_coord)
70632 +                           || coord_set_to_left(tap->coord)) {
70633 +                               result = 0;
70634 +                               break;
70635 +                       }
70636 +
70637 +                       if (coord_compare(&left_coord, tap->coord) ==
70638 +                           COORD_CMP_ON_RIGHT) {
70639 +                               /* keys from @from_key to @to_key are not in the tree */
70640 +                               result = 0;
70641 +                               break;
70642 +                       }
70643 +
70644 +                       if (left_coord.item_pos != tap->coord->item_pos) {
70645 +                               /* do not allow to cut more than one item. It is added to solve problem of truncating
70646 +                                  partially converted files. If file is partially converted there may exist a twig node
70647 +                                  containing both internal item or items pointing to leaf nodes with formatting items
70648 +                                  and extent item. We do not want to kill internal items being at twig node here
70649 +                                  because cut_tree_worker assumes killing them from the leaf level */
70650 +                               coord_dup(&left_coord, tap->coord);
70651 +                               assert("vs-1652",
70652 +                                      coord_is_existing_unit(&left_coord));
70653 +                               left_coord.unit_pos = 0;
70654 +                       }
70655 +
70656 +                       /* cut data from one node */
70657 +                       // *smallest_removed = *min_key();
70658 +                       result =
70659 +                           kill_node_content(&left_coord, tap->coord, from_key,
70660 +                                             to_key, smallest_removed,
70661 +                                             next_node_lock.node, object,
70662 +                                             truncate);
70663 +                       tap_relse(tap);
70664 +               }
70665 +               if (result)
70666 +                       break;
70667 +
70668 +               ++(*progress);
70669 +
70670 +               /* Check whether all items with keys >= from_key were removed
70671 +                * from the tree. */
70672 +               if (keyle(smallest_removed, from_key))
70673 +                       /* result = 0; */
70674 +                       break;
70675 +
70676 +               if (next_node_lock.node == NULL)
70677 +                       break;
70678 +
70679 +               result = tap_move(tap, &next_node_lock);
70680 +               done_lh(&next_node_lock);
70681 +               if (result)
70682 +                       break;
70683 +
70684 +               /* Break long cut_tree operation (deletion of a large file) if
70685 +                * atom requires commit. */
70686 +               if (*progress > CUT_TREE_MIN_ITERATIONS
70687 +                   && current_atom_should_commit()) {
70688 +                       result = -E_REPEAT;
70689 +                       break;
70690 +               }
70691 +       }
70692 +       done_lh(&next_node_lock); /* every exit path from the loop releases the neighbor lock here */
70693 +       // assert("vs-301", !keyeq(&smallest_removed, min_key()));
70694 +       return result;
70695 +}
70696 +
70697 +/* there is a fundamental problem with optimizing deletes: VFS does it
70698 +   one file at a time.  Another problem is that if an item can be
70699 +   anything, then deleting items must be done one at a time.  It just
70700 +   seems clean to write this to specify a from and a to key, and cut
70701 +   everything between them though.  */
70702 +
70703 +/* use this function with care if deleting more than what is part of a single file. */
70704 +/* do not use this when cutting a single item, it is suboptimal for that */
70705 +
70706 +/* You are encouraged to write plugin specific versions of this.  It
70707 +   cannot be optimal for all plugins because it works item at a time,
70708 +   and some plugins could sometimes work node at a time. Regular files
70709 +   however are not optimizable to work node at a time because of
70710 +   extents needing to free the blocks they point to.
70711 +
70712 +   Optimizations compared to v3 code:
70713 +
70714 +   It does not balance (that task is left to memory pressure code).
70715 +
70716 +   Nodes are deleted only if empty.
70717 +
70718 +   Uses extents.
70719 +
70720 +   Performs read-ahead of formatted nodes whose contents are part of
70721 +   the deletion.
70722 +*/
70723 +
70724 +/**
70725 + * Delete everything from the reiser4 tree between two keys: @from_key and
70726 + * @to_key.
70727 + *
70728 + * @tree: the tree to cut from (in this function it is only checked by an assertion),
70729 + * @from_key: the beginning of the deleted key range,
70730 + * @to_key: the end of the deleted key range,
70731 + * @smallest_removed_p: where to store the smallest removed key; may be NULL,
70732 + * @object: owner of cutting items; may be NULL,
70733 + * @truncate: true if called for file truncate.
70734 + * @progress: set nonzero if progress in item deletion was made; *@smallest_removed_p is valid then.
70735 + *
70736 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
70737 + * operation was interrupted for allowing atom commit .
70738 + */
70739 +
70740 +int
70741 +cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70742 +               const reiser4_key * to_key, reiser4_key * smallest_removed_p,
70743 +               struct inode *object, int truncate, int *progress)
70744 +{
70745 +       lock_handle lock;
70746 +       int result;
70747 +       tap_t tap;
70748 +       coord_t right_coord;
70749 +       reiser4_key smallest_removed; /* scratch target used when the caller passes NULL */
70750 +       int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70751 +                               const reiser4_key *, reiser4_key *,
70752 +                               struct inode *, int, int *);
70753 +       STORE_COUNTERS;
70754 +
70755 +       assert("umka-329", tree != NULL);
70756 +       assert("umka-330", from_key != NULL);
70757 +       assert("umka-331", to_key != NULL);
70758 +       assert("zam-936", keyle(from_key, to_key));
70759 +
70760 +       if (smallest_removed_p == NULL)
70761 +               smallest_removed_p = &smallest_removed;
70762 +
70763 +       init_lh(&lock);
70764 +
70765 +       do {
70766 +               /* Find rightmost item to cut away from the tree. */
70767 +               result = object_lookup(object, to_key, &right_coord, &lock,
70768 +                                      ZNODE_WRITE_LOCK, FIND_MAX_NOT_MORE_THAN,
70769 +                                      TWIG_LEVEL, LEAF_LEVEL, CBK_UNIQUE,
70770 +                                      NULL /*ra_info */ );
70771 +               if (result != CBK_COORD_FOUND)
70772 +                       break;
70773 +               if (object == NULL
70774 +                   || inode_file_plugin(object)->cut_tree_worker == NULL)
70775 +                       cut_tree_worker = cut_tree_worker_common; /* default when the plugin supplies none */
70776 +               else
70777 +                       cut_tree_worker =
70778 +                           inode_file_plugin(object)->cut_tree_worker;
70779 +               tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70780 +               result =
70781 +                   cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70782 +                                   object, truncate, progress);
70783 +               tap_done(&tap);
70784 +
70785 +               preempt_point();
70786 +
70787 +       } while (0); /* single pass; "break" above serves as a forward jump */
70788 +
70789 +       done_lh(&lock);
70790 +
70791 +       if (result) {
70792 +               switch (result) {
70793 +               case -E_NO_NEIGHBOR:
70794 +                       result = 0;
70795 +                       break;
70796 +               case -E_DEADLOCK:
70797 +                       result = -E_REPEAT; /* fall through */
70798 +               case -E_REPEAT:
70799 +               case -ENOMEM:
70800 +               case -ENOENT:
70801 +                       break; /* these are returned without a warning */
70802 +               default:
70803 +                       warning("nikita-2861", "failure: %i", result);
70804 +               }
70805 +       }
70806 +
70807 +       CHECK_COUNTERS;
70808 +       return result;
70809 +}
70810 +
70811 +/* repeat cut_tree_object for as long as it returns -E_REPEAT, i.e. until
70812 + * everything is deleted or another error occurs. unlike cut_file_items, it
70813 + * does not end current transaction if -E_REPEAT is returned by cut_tree_object. */
70814 +int
70815 +cut_tree(reiser4_tree * tree, const reiser4_key * from, const reiser4_key * to,
70816 +        struct inode *inode, int truncate)
70817 +{
70818 +       int result;
70819 +       int progress; /* written by cut_tree_object(); not examined here */
70820 +
70821 +       do {
70822 +               result =
70823 +                   cut_tree_object(tree, from, to, NULL, inode, truncate,
70824 +                                   &progress);
70825 +       } while (result == -E_REPEAT);
70826 +
70827 +       return result;
70828 +}
70829 +
70830 +/* finishing reiser4 initialization: fills in @tree and returns 0 or an error code */
70831 +int init_tree(reiser4_tree * tree      /* pointer to structure being
70832 +                                        * initialized */ ,
70833 +             const reiser4_block_nr * root_block       /* address of a root block
70834 +                                                        * on a disk */ ,
70835 +             tree_level height /* height of a tree */ ,
70836 +             node_plugin * nplug /* default node plugin */ )
70837 +{
70838 +       int result;
70839 +
70840 +       assert("nikita-306", tree != NULL);
70841 +       assert("nikita-307", root_block != NULL);
70842 +       assert("nikita-308", height > 0);
70843 +       assert("nikita-309", nplug != NULL);
70844 +       assert("zam-587", tree->super != NULL); /* @tree->super is expected to be set by the caller */
70845 +
70846 +       tree->root_block = *root_block;
70847 +       tree->height = height;
70848 +       tree->estimate_one_insert = calc_estimate_one_insert(height);
70849 +       tree->nplug = nplug;
70850 +
70851 +       tree->znode_epoch = 1ull;
70852 +
70853 +       cbk_cache_init(&tree->cbk_cache);
70854 +
70855 +       result = znodes_tree_init(tree);
70856 +       if (result == 0)
70857 +               result = jnodes_tree_init(tree);
70858 +       if (result == 0) {
70859 +               tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0, get_gfp_mask());
70860 +               if (IS_ERR(tree->uber)) {
70861 +                       result = PTR_ERR(tree->uber);
70862 +                       tree->uber = NULL; /* never leave an error pointer in @tree->uber */
70863 +               }
70864 +       }
70865 +       return result; /* NOTE(review): nothing is torn down here on error; presumably the caller uses done_tree() */
70866 +}
70867 +
70868 +/* release resources associated with @tree; a NULL @tree is a no-op */
70869 +void done_tree(reiser4_tree * tree /* tree to release */ )
70870 +{
70871 +       if (tree == NULL)
70872 +               return;
70873 +
70874 +       if (tree->uber != NULL) {
70875 +               zput(tree->uber); /* presumably pairs with the zget() in init_tree() */
70876 +               tree->uber = NULL;
70877 +       }
70878 +       znodes_tree_done(tree);
70879 +       jnodes_tree_done(tree);
70880 +       cbk_cache_done(&tree->cbk_cache);
70881 +}
70882 +
70883 +/* Make Linus happy.
70884 +   Local variables:
70885 +   c-indentation-style: "K&R"
70886 +   mode-name: "LC"
70887 +   c-basic-offset: 8
70888 +   tab-width: 8
70889 +   fill-column: 120
70890 +   scroll-step: 1
70891 +   End:
70892 +*/
70893 diff --git a/fs/reiser4/tree.h b/fs/reiser4/tree.h
70894 new file mode 100644
70895 index 0000000..33428b2
70896 --- /dev/null
70897 +++ b/fs/reiser4/tree.h
70898 @@ -0,0 +1,579 @@
70899 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70900 + * reiser4/README */
70901 +
70902 +/* Tree operations. See fs/reiser4/tree.c for comments */
70903 +
70904 +#if !defined( __REISER4_TREE_H__ )
70905 +#define __REISER4_TREE_H__
70906 +
70907 +#include "forward.h"
70908 +#include "debug.h"
70909 +#include "dformat.h"
70910 +#include "plugin/node/node.h"
70911 +#include "plugin/plugin.h"
70912 +#include "znode.h"
70913 +#include "tap.h"
70914 +
70915 +#include <linux/types.h>       /* for __u??  */
70916 +#include <linux/fs.h>          /* for struct super_block  */
70917 +#include <linux/spinlock.h>
70918 +#include <linux/sched.h>       /* for struct task_struct */
70919 +
70920 +/* fictive block number never actually used */
70921 +extern const reiser4_block_nr UBER_TREE_ADDR;
70922 +
70923 +/* &cbk_cache_slot - entry in a coord cache.
70924 +
70925 +   This is an entry in the coord_by_key (cbk) cache, which is represented by
70926 +   &cbk_cache.
70927 +
70928 +*/
70929 +typedef struct cbk_cache_slot {
70930 +       /* cached node */
70931 +       znode *node;
70932 +       /* linkage to the next cbk cache slot in LRU order (list head: cbk_cache.lru) */
70933 +       struct list_head lru;
70934 +} cbk_cache_slot;
70935 +
70936 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
70937 +
70938 +   cbk_cache is supposed to speed up tree lookups by caching results of recent
70939 +   successful lookups (we don't cache negative results as dentry cache
70940 +   does). Cache consists of a relatively small number of entries kept in LRU
70941 +   order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
70942 +   which we can obtain the range of keys that are covered by this znode. Before
70943 +   embarking into real tree traversal we scan cbk_cache slot by slot and for
70944 +   each slot check whether key we are looking for is between minimal and
70945 +   maximal keys for node pointed to by this slot. If no match is found, real
70946 +   tree traversal is performed and if result is successful, appropriate entry
70947 +   is inserted into cache, possibly pulling least recently used entry out of
70948 +   it.
70949 +
70950 +   Tree spin lock is used to protect coord cache. If contention for this
70951 +   lock proves to be too high, finer grained locking can be added.
70952 +   NOTE(review): the struct also has its own rwlock, @guard - confirm which lock applies.
70953 +   Invariants involving parts of this data-type:
70954 +
70955 +      [cbk-cache-invariant]
70956 +*/
70957 +typedef struct cbk_cache {
70958 +       /* serializing lock */
70959 +       rwlock_t guard;
70960 +       int nr_slots; /* presumably the number of entries in @slot - confirm */
70961 +       /* head of LRU list of cache slots */
70962 +       struct list_head lru;
70963 +       /* actual array of slots */
70964 +       cbk_cache_slot *slot;
70965 +} cbk_cache;
70966 +
70967 +
70968 +/* level_lookup_result - possible outcomes of looking up a key at some level.
70969 +   This is used by coord_by_key when traversing the tree downward. */
70970 +typedef enum {
70971 +       /* continue to the next level */
70972 +       LOOKUP_CONT,
70973 +       /* done. Either the required item was found, or we can prove it
70974 +          doesn't exist, or some error occurred. */
70975 +       LOOKUP_DONE,
70976 +       /* restart traversal from the root. Infamous "repetition". */
70977 +       LOOKUP_REST
70978 +} level_lookup_result;
70979 +
70980 +/*    This is the representation of the internal reiser4 tree where all file-system
70981 +   data and meta-data are stored. This structure is passed to all tree
70982 +   manipulation functions. It's different from the super block because:
70983 +   we don't want to limit ourselves to strictly one to one mapping
70984 +   between super blocks and trees, and, because they are logically
70985 +   different: there are things in a super block that have no relation to
70986 +   the tree (bitmaps, journalling area, mount options, etc.) and there
70987 +   are things in a tree that bear no relation to the super block, like
70988 +   tree of znodes.
70989 +
70990 +   At this time, there is only one tree
70991 +   per filesystem, and this struct is part of the super block.  We only
70992 +   call the super block the super block for historical reasons (most
70993 +   other filesystems call the per filesystem metadata the super block).
70994 +*/
70995 +
70996 +struct reiser4_tree {
70997 +       /* block_nr == 0 is fake znode. Write lock it, while changing
70998 +          tree height. */
70999 +       /* disk address of root node of a tree */
71000 +       reiser4_block_nr root_block;
71001 +
71002 +       /* level of the root node. If this is 1, tree consists of root
71003 +          node only */
71004 +       tree_level height;
71005 +
71006 +       /*
71007 +        * this is cached here to avoid calling plugins through function
71008 +        * dereference all the time.
71009 +        */
71010 +       __u64 estimate_one_insert;
71011 +
71012 +       /* cache of recent tree lookup results */
71013 +       cbk_cache cbk_cache;
71014 +
71015 +       /* hash table to look up znodes by block number. */
71016 +       z_hash_table zhash_table;
71017 +       z_hash_table zfake_table;
71018 +       /* hash table to look up jnodes by inode and offset. */
71019 +       j_hash_table jhash_table;
71020 +
71021 +       /* lock protecting:
71022 +          - parent pointers,
71023 +          - sibling pointers,
71024 +          - znode hash table
71025 +          - coord cache
71026 +        */
71027 +       /* NOTE: The "giant" tree lock can be replaced by more spin locks,
71028 +          hoping they will be less contended. We can use one spin lock per one
71029 +          znode hash bucket.  With adding of some code complexity, sibling
71030 +          pointers can be protected by both znode spin locks.  Although that looks
71031 +          more SMP scalable, we should test this locking change on n-ways (n >
71032 +          4) SMP machines.  Current 4-ways machine test does not show that tree
71033 +          lock is contended and it is a bottleneck (2003.07.25). */
71034 +
71035 +       rwlock_t tree_lock;
71036 +
71037 +       /* lock protecting delimiting keys */
71038 +       rwlock_t dk_lock;
71039 +
71040 +       /* spin lock protecting znode_epoch */
71041 +       spinlock_t epoch_lock;
71042 +       /* version stamp used to mark znode updates. See seal.[ch] for more
71043 +        * information. */
71044 +       __u64 znode_epoch;
71045 +
71046 +       znode *uber; /* znode obtained for UBER_TREE_ADDR in init_tree() */
71047 +       node_plugin *nplug; /* default node plugin, set in init_tree() */
71048 +       struct super_block *super;
71049 +       struct {
71050 +               /* carry flags used for insertion of new nodes */
71051 +               __u32 new_node_flags;
71052 +               /* carry flags used for insertion of new extents */
71053 +               __u32 new_extent_flags;
71054 +               /* carry flags used for paste operations */
71055 +               __u32 paste_flags;
71056 +               /* carry flags used for insert operations */
71057 +               __u32 insert_flags;
71058 +       } carry;
71059 +};
71060 +
71061 +extern int init_tree(reiser4_tree * tree,
71062 +                    const reiser4_block_nr * root_block, tree_level height,
71063 +                    node_plugin * default_plugin);
71064 +extern void done_tree(reiser4_tree * tree);
71065 +
71066 +/* cbk flags: options for coord_by_key() */
71067 +typedef enum {
71068 +       /* coord_by_key() is called for insertion. This is necessary because
71069 +          of extents being located at the twig level. For explanation, see
71070 +          comment just above is_next_item_internal().
71071 +        */
71072 +       CBK_FOR_INSERT = (1 << 0),
71073 +       /* coord_by_key() is called with key that is known to be unique */
71074 +       CBK_UNIQUE = (1 << 1),
71075 +       /* coord_by_key() can trust delimiting keys. This option is not user
71076 +          accessible. coord_by_key() will set it automatically. It will be
71077 +          only cleared by special-case in extents-on-the-twig-level handling
71078 +          where it is necessary to insert item with a key smaller than
71079 +          leftmost key in a node. This is necessary because of extents being
71080 +          located at the twig level. For explanation, see comment just above
71081 +          is_next_item_internal().
71082 +        */
71083 +       CBK_TRUST_DK = (1 << 2),
71084 +       CBK_READA = (1 << 3),   /* original: readahead leaves which contain items of certain file */
71085 +       CBK_READDIR_RA = (1 << 4),      /* readdir: readahead whole directory and all its stat datas */
71086 +       CBK_DKSET = (1 << 5),   /* NOTE(review): undocumented in the original */
71087 +       CBK_EXTENDED_COORD = (1 << 6),  /* coord_t is actually an extended coord (original comment truncated - confirm) */
71088 +       CBK_IN_CACHE = (1 << 7),        /* node is already in cache */
71089 +       CBK_USE_CRABLOCK = (1 << 8)     /* use crab_lock instead of long term
71090 +                                        * lock */
71091 +} cbk_flags;
71092 +
71093 +/* insertion outcome. IBK = insert by key. Every value except IBK_INSERT_OK aliases a negated error code */
71094 +typedef enum {
71095 +       IBK_INSERT_OK = 0,
71096 +       IBK_ALREADY_EXISTS = -EEXIST,
71097 +       IBK_IO_ERROR = -EIO,
71098 +       IBK_NO_SPACE = -E_NODE_FULL,
71099 +       IBK_OOM = -ENOMEM
71100 +} insert_result;
71101 +
71102 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND) /* note: evaluates @err twice */
71103 +
71104 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
71105 +                                    lock_handle * lh, void *arg);
71106 +extern int iterate_tree(reiser4_tree * tree, coord_t * coord, lock_handle * lh,
71107 +                       tree_iterate_actor_t actor, void *arg,
71108 +                       znode_lock_mode mode, int through_units_p);
71109 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
71110 +                         znode_lock_request pri, lock_handle * lh);
71111 +
71112 +/* return node plugin of @node */
71113 +static inline node_plugin *node_plugin_by_node(const znode *
71114 +                                              node /* node to query */ )
71115 +{
71116 +       assert("vs-213", node != NULL);
71117 +       assert("vs-214", znode_is_loaded(node));
71118 +
71119 +       return node->nplug;
71120 +}
71121 +
71122 +/* number of items in @node */
71123 +static inline pos_in_node_t node_num_items(const znode * node)
71124 +{
71125 +       assert("nikita-2754", znode_is_loaded(node));
71126 +       assert("nikita-2468",
71127 +              node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
71128 +
71129 +       return node->nr_items;
71130 +}
71131 +
71132 +/* Return the number of items at the present node.  Asserts coord->node !=
71133 +   NULL. */
71134 +static inline unsigned coord_num_items(const coord_t * coord)
71135 +{
71136 +       assert("jmacd-9805", coord->node != NULL);
71137 +
71138 +       return node_num_items(coord->node);
71139 +}
71140 +
71141 +/* true if @node is empty */
71142 +static inline int node_is_empty(const znode * node)
71143 +{
71144 +       return node_num_items(node) == 0;
71145 +}
71146 +
71147 +typedef enum {
71148 +       SHIFTED_SOMETHING = 0,
71149 +       SHIFT_NO_SPACE = -E_NODE_FULL,
71150 +       SHIFT_IO_ERROR = -EIO,
71151 +       SHIFT_OOM = -ENOMEM,
71152 +} shift_result;
71153 +
71154 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
71155 +extern int is_coord_in_node(const coord_t * coord);
71156 +extern int key_in_node(const reiser4_key *, const coord_t *);
71157 +extern void coord_item_move_to(coord_t * coord, int items);
71158 +extern void coord_unit_move_to(coord_t * coord, int units);
71159 +
71160 +/* there are two types of repetitive accesses (ra): intra-syscall
71161 +   (local) and inter-syscall (global). Local ra is used when, during
71162 +   a single syscall, we add/delete several items and units in the
71163 +   same place in a tree. Note that plan-A fragments local ra by
71164 +   separating stat-data and file body in key-space. Global ra is
71165 +   used when a user does repetitive modifications in the same place
71166 +   in a tree.
71167 +
71168 +   Our ra implementation serves the following purposes:
71169 +    1 it affects balancing decisions so that the next operation in a
71170 +      row can be performed faster;
71171 +    2 it affects lower-level read-ahead in page-cache;
71172 +    3 it avoids unnecessary lookups by maintaining some state
71173 +      across several operations (this is only for local ra);
71174 +    4 it leaves room for lazy-micro-balancing: when we start a sequence of
71175 +      operations they are performed without actually doing any intra-node
71176 +      shifts, until we finish sequence or scope of sequence leaves
71177 +      current node, only then we really pack node (local ra only).
71178 +*/
71179 +
71180 +/* another thing that can be useful is to keep per-tree and/or
71181 +   per-process cache of recent lookups. This cache can be organised as a
71182 +   list of block numbers of formatted nodes sorted by starting key in
71183 +   this node. Balancings should invalidate appropriate parts of this
71184 +   cache.
71185 +*/
71186 +
71187 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
71188 +                          coord_t * coord, lock_handle * handle,
71189 +                          znode_lock_mode lock, lookup_bias bias,
71190 +                          tree_level lock_level, tree_level stop_level,
71191 +                          __u32 flags, ra_info_t *);
71192 +
71193 +lookup_result object_lookup(struct inode *object,
71194 +                           const reiser4_key * key,
71195 +                           coord_t * coord,
71196 +                           lock_handle * lh,
71197 +                           znode_lock_mode lock_mode,
71198 +                           lookup_bias bias,
71199 +                           tree_level lock_level,
71200 +                           tree_level stop_level,
71201 +                           __u32 flags, ra_info_t * info);
71202 +
71203 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
71204 +                           reiser4_item_data * data, coord_t * coord,
71205 +                           lock_handle * lh,
71206 +                           tree_level stop_level, __u32 flags);
71207 +insert_result insert_by_coord(coord_t * coord,
71208 +                             reiser4_item_data * data, const reiser4_key * key,
71209 +                             lock_handle * lh, __u32);
71210 +insert_result insert_extent_by_coord(coord_t * coord,
71211 +                                    reiser4_item_data * data,
71212 +                                    const reiser4_key * key, lock_handle * lh);
71213 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
71214 +                    const reiser4_key * to_key,
71215 +                    reiser4_key * smallest_removed);
71216 +int kill_node_content(coord_t * from, coord_t * to,
71217 +                     const reiser4_key * from_key, const reiser4_key * to_key,
71218 +                     reiser4_key * smallest_removed,
71219 +                     znode * locked_left_neighbor, struct inode *inode,
71220 +                     int truncate);
71221 +
71222 +int resize_item(coord_t * coord, reiser4_item_data * data,
71223 +               reiser4_key * key, lock_handle * lh, cop_insert_flag);
71224 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
71225 +                    reiser4_item_data * data, unsigned);
71226 +int insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
71227 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
71228 +                      coord_t * result);
71229 +
71230 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
71231 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
71232 +
71233 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
71234 +
71235 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
71236 +                                 const reiser4_key *, reiser4_key *,
71237 +                                 struct inode *, int, int *);
71238 +extern int cut_tree_object(reiser4_tree *, const reiser4_key *,
71239 +                          const reiser4_key *, reiser4_key *, struct inode *,
71240 +                          int, int *);
71241 +extern int cut_tree(reiser4_tree * tree, const reiser4_key * from,
71242 +                   const reiser4_key * to, struct inode *, int);
71243 +
71244 +extern int delete_node(znode * node, reiser4_key *, struct inode *, int);
71245 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
71246 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
71247 +                             znode * left, coord_t * result);
71248 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
71249 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
71250 +                                    znode * child);
71251 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
71252 +                         int incore_p, int setup_dkeys_p);
71253 +
71254 +extern int cbk_cache_init(cbk_cache * cache);
71255 +extern void cbk_cache_done(cbk_cache * cache);
71256 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
71257 +
71258 +extern char *sprint_address(const reiser4_block_nr * block);
71259 +
71260 +#if REISER4_DEBUG
71261 +extern void print_coord_content(const char *prefix, coord_t * p);
71262 +extern void reiser4_print_address(const char *prefix,
71263 +                       const reiser4_block_nr * block);
71264 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
71265 +                          __u32 flags);
71266 +extern void check_dkeys(znode *node);
71267 +#else
71268 +#define print_coord_content(p, c) noop
71269 +#define reiser4_print_address(p, b) noop
71270 +#endif
71271 +
71272 +extern void forget_znode(lock_handle * handle);
71273 +extern int deallocate_znode(znode * node);
71274 +
71275 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
71276 +
71277 +/* struct used internally to pack all numerous arguments of tree lookup.
71278 +    Used to avoid passing a lot of arguments to helper functions. */
71279 +typedef struct cbk_handle {
71280 +       /* tree we are in */
71281 +       reiser4_tree *tree;
71282 +       /* key we are going after */
71283 +       const reiser4_key *key;
71284 +       /* coord we will store result in */
71285 +       coord_t *coord;
71286 +       /* type of lock to take on target node */
71287 +       znode_lock_mode lock_mode;
71288 +       /* lookup bias. See comments at the declaration of lookup_bias */
71289 +       lookup_bias bias;
71290 +       /* lock level: level starting from which tree traversal starts taking
71291 +        * write locks. */
71292 +       tree_level lock_level;
71293 +       /* level where search will stop. Either item will be found between
71294 +          lock_level and stop_level, or CBK_COORD_NOTFOUND will be
71295 +          returned.
71296 +        */
71297 +       tree_level stop_level;
71298 +       /* level we are currently at */
71299 +       tree_level level;
71300 +       /* block number of @active node. Tree traversal operates on two
71301 +          nodes: active and parent.  */
71302 +       reiser4_block_nr block;
71303 +       /* put here error message to be printed by caller */
71304 +       const char *error;
71305 +       /* result passed back to caller */
71306 +       lookup_result result;
71307 +       /* lock handles for active and parent */
71308 +       lock_handle *parent_lh;
71309 +       lock_handle *active_lh;
71310 +       reiser4_key ld_key;
71311 +       reiser4_key rd_key;
71312 +       /* flags, passed to the cbk routine. Bits of this bitmask are defined
71313 +          in tree.h:cbk_flags enum. */
71314 +       __u32 flags;
71315 +       ra_info_t *ra_info;
71316 +       struct inode *object;
71317 +} cbk_handle;
71318 +
71319 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
71320 +
71321 +/* eottl.c */
71322 +extern int handle_eottl(cbk_handle *h, int *outcome);
71323 +
71324 +int lookup_multikey(cbk_handle * handle, int nr_keys);
71325 +int lookup_couple(reiser4_tree * tree,
71326 +                 const reiser4_key * key1, const reiser4_key * key2,
71327 +                 coord_t * coord1, coord_t * coord2,
71328 +                 lock_handle * lh1, lock_handle * lh2,
71329 +                 znode_lock_mode lock_mode, lookup_bias bias,
71330 +                 tree_level lock_level, tree_level stop_level, __u32 flags,
71331 +                 int *result1, int *result2);
71332 +
71333 +
71334 +static inline void read_lock_tree(reiser4_tree *tree)
71335 +{
71336 +       /* check that tree is not locked */
71337 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71338 +                   LOCK_CNT_NIL(read_locked_tree) &&
71339 +                   LOCK_CNT_NIL(write_locked_tree)));
71340 +       /* check that spinlocks of lower priorities are not held */
71341 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71342 +                   LOCK_CNT_NIL(rw_locked_dk) &&
71343 +                   LOCK_CNT_NIL(spin_locked_stack)));
71344 +
71345 +       read_lock(&(tree->tree_lock));
71346 +
71347 +       LOCK_CNT_INC(read_locked_tree);
71348 +       LOCK_CNT_INC(rw_locked_tree);
71349 +       LOCK_CNT_INC(spin_locked);
71350 +}
71351 +
71352 +static inline void read_unlock_tree(reiser4_tree *tree)
71353 +{
71354 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
71355 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71356 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71357 +
71358 +       LOCK_CNT_DEC(read_locked_tree);
71359 +       LOCK_CNT_DEC(rw_locked_tree);
71360 +       LOCK_CNT_DEC(spin_locked);
71361 +
71362 +       read_unlock(&(tree->tree_lock));
71363 +}
71364 +
71365 +static inline void write_lock_tree(reiser4_tree *tree)
71366 +{
71367 +       /* check that tree is not locked */
71368 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71369 +                   LOCK_CNT_NIL(read_locked_tree) &&
71370 +                   LOCK_CNT_NIL(write_locked_tree)));
71371 +       /* check that spinlocks of lower priorities are not held */
71372 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71373 +                   LOCK_CNT_NIL(rw_locked_dk) &&
71374 +                   LOCK_CNT_NIL(spin_locked_stack)));
71375 +
71376 +       write_lock(&(tree->tree_lock));
71377 +
71378 +       LOCK_CNT_INC(write_locked_tree);
71379 +       LOCK_CNT_INC(rw_locked_tree);
71380 +       LOCK_CNT_INC(spin_locked);
71381 +}
71382 +
71383 +static inline void write_unlock_tree(reiser4_tree *tree)
71384 +{
71385 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
71386 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71387 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71388 +
71389 +       LOCK_CNT_DEC(write_locked_tree);
71390 +       LOCK_CNT_DEC(rw_locked_tree);
71391 +       LOCK_CNT_DEC(spin_locked);
71392 +
71393 +       write_unlock(&(tree->tree_lock));
71394 +}
71395 +
71396 +static inline void read_lock_dk(reiser4_tree *tree)
71397 +{
71398 +       /* check that dk is not locked */
71399 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71400 +                   LOCK_CNT_NIL(read_locked_dk) &&
71401 +                   LOCK_CNT_NIL(write_locked_dk)));
71402 +       /* check that spinlocks of lower priorities are not held */
71403 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
71404 +
71405 +       read_lock(&((tree)->dk_lock));
71406 +
71407 +       LOCK_CNT_INC(read_locked_dk);
71408 +       LOCK_CNT_INC(rw_locked_dk);
71409 +       LOCK_CNT_INC(spin_locked);
71410 +}
71411 +
71412 +static inline void read_unlock_dk(reiser4_tree *tree)
71413 +{
71414 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
71415 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71416 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71417 +
71418 +       LOCK_CNT_DEC(read_locked_dk);
71419 +       LOCK_CNT_DEC(rw_locked_dk);
71420 +       LOCK_CNT_DEC(spin_locked);
71421 +
71422 +       read_unlock(&(tree->dk_lock));
71423 +}
71424 +
71425 +static inline void write_lock_dk(reiser4_tree *tree)
71426 +{
71427 +       /* check that dk is not locked */
71428 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71429 +                   LOCK_CNT_NIL(read_locked_dk) &&
71430 +                   LOCK_CNT_NIL(write_locked_dk)));
71431 +       /* check that spinlocks of lower priorities are not held */
71432 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
71433 +
71434 +       write_lock(&((tree)->dk_lock));
71435 +
71436 +       LOCK_CNT_INC(write_locked_dk);
71437 +       LOCK_CNT_INC(rw_locked_dk);
71438 +       LOCK_CNT_INC(spin_locked);
71439 +}
71440 +
71441 +static inline void write_unlock_dk(reiser4_tree *tree)
71442 +{
71443 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
71444 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71445 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71446 +
71447 +       LOCK_CNT_DEC(write_locked_dk);
71448 +       LOCK_CNT_DEC(rw_locked_dk);
71449 +       LOCK_CNT_DEC(spin_locked);
71450 +
71451 +       write_unlock(&(tree->dk_lock));
71452 +}
71453 +
71454 +/* estimate api. Implementation is in estimate.c */
71455 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71456 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71457 +reiser4_block_nr estimate_insert_flow(tree_level);
71458 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71459 +reiser4_block_nr calc_estimate_one_insert(tree_level);
71460 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
71461 +reiser4_block_nr estimate_insert_cluster(struct inode *);
71462 +reiser4_block_nr estimate_update_cluster(struct inode *);
71463 +
71464 +
71465 +/* __REISER4_TREE_H__ */
71466 +#endif
71467 +
71468 +/* Make Linus happy.
71469 +   Local variables:
71470 +   c-indentation-style: "K&R"
71471 +   mode-name: "LC"
71472 +   c-basic-offset: 8
71473 +   tab-width: 8
71474 +   fill-column: 120
71475 +   scroll-step: 1
71476 +   End:
71477 +*/
71478 diff --git a/fs/reiser4/tree_mod.c b/fs/reiser4/tree_mod.c
71479 new file mode 100644
71480 index 0000000..823edef
71481 --- /dev/null
71482 +++ b/fs/reiser4/tree_mod.c
71483 @@ -0,0 +1,383 @@
71484 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71485 + * reiser4/README */
71486 +
71487 +/*
71488 + * Functions to add/delete new nodes to/from the tree.
71489 + *
71490 + * Functions from this file are used by carry (see carry*) to handle:
71491 + *
71492 + *     . insertion of new formatted node into tree
71493 + *
71494 + *     . addition of new tree root, increasing tree height
71495 + *
71496 + *     . removing tree root, decreasing tree height
71497 + *
71498 + */
71499 +
71500 +#include "forward.h"
71501 +#include "debug.h"
71502 +#include "dformat.h"
71503 +#include "key.h"
71504 +#include "coord.h"
71505 +#include "plugin/plugin.h"
71506 +#include "jnode.h"
71507 +#include "znode.h"
71508 +#include "tree_mod.h"
71509 +#include "block_alloc.h"
71510 +#include "tree_walk.h"
71511 +#include "tree.h"
71512 +#include "super.h"
71513 +
71514 +#include <linux/err.h>
71515 +
71516 +static int add_child_ptr(znode * parent, znode * child);
71517 +/* warning only issued if error is not -E_REPEAT */
71518 +#define ewarning( error, ... )                 \
71519 +       if( ( error ) != -E_REPEAT )            \
71520 +               warning( __VA_ARGS__ )
71521 +
71522 +/* allocate new node on the @level and immediately on the right of @brother; returns znode or ERR_PTR(), never NULL. */
71523 +znode *new_node(znode * brother /* existing left neighbor of new node */ ,
71524 +               tree_level level        /* tree level at which new node is to
71525 +                                        * be allocated */ )
71526 +{
71527 +       znode *result;
71528 +       int retcode;
71529 +       reiser4_block_nr blocknr;
71530 +
71531 +       assert("nikita-930", brother != NULL);
71532 +       assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71533 +
71534 +       retcode = assign_fake_blocknr_formatted(&blocknr);
71535 +       if (retcode == 0) {
71536 +               result =
71537 +                   zget(znode_get_tree(brother), &blocknr, NULL, level,
71538 +                        get_gfp_mask());
71539 +               if (IS_ERR(result)) {
71540 +                       ewarning(PTR_ERR(result), "nikita-929",
71541 +                                "Cannot allocate znode for carry: %li",
71542 +                                PTR_ERR(result));
71543 +                       return result;
71544 +               }
71545 +               /* cheap test, can be executed even when debugging is off */
71546 +               if (!znode_just_created(result)) {
71547 +                       warning("nikita-2213",
71548 +                               "Allocated already existing block: %llu",
71549 +                               (unsigned long long)blocknr);
71550 +                       zput(result);
71551 +                       return ERR_PTR(RETERR(-EIO));
71552 +               }
71553 +
71554 +               assert("nikita-931", result != NULL);
71555 +               result->nplug = znode_get_tree(brother)->nplug;
71556 +               assert("nikita-933", result->nplug != NULL);
71557 +
71558 +               retcode = zinit_new(result, get_gfp_mask());
71559 +               if (retcode == 0) {
71560 +                       ZF_SET(result, JNODE_CREATED);
71561 +                       zrelse(result);
71562 +               } else {
71563 +                       zput(result);
71564 +                       result = ERR_PTR(retcode);
71565 +               }
71566 +       } else {
71567 +               /* failure to allocate new node during balancing.
71568 +                  This should never happen. Ever. Returning -E_REPEAT
71569 +                  is not a viable solution, because "out of disk space"
71570 +                  is not a transient error that will go away by itself.
71571 +                */
71572 +               ewarning(retcode, "nikita-928",
71573 +                        "Cannot allocate block for carry: %i", retcode);
71574 +               result = ERR_PTR(retcode);
71575 +       }
71576 +       assert("nikita-1071", result != NULL);
71577 +       return result;
71578 +}
71579 +
71580 +/* allocate new root and add it to the tree, increasing tree height by one
71581 +
71582 +   This helper function is called by add_new_root(). Returns new root, or
71583 +   ERR_PTR() on failure; pointer to @old_root is inserted into the new root.
71584 +*/
71585 +znode *add_tree_root(znode * old_root /* existing tree root */ ,
71586 +                    znode * fake /* "fake" znode */ )
71587 +{
71588 +       reiser4_tree *tree = znode_get_tree(old_root);
71589 +       znode *new_root = NULL; /* to shut gcc up */
71590 +       int result;
71591 +
71592 +       assert("nikita-1069", old_root != NULL);
71593 +       assert("umka-262", fake != NULL);
71594 +       assert("umka-263", tree != NULL);
71595 +
71596 +       /* "fake" znode---one always hanging just above current root. This
71597 +          node is locked when new root is created or existing root is
71598 +          deleted. Downward tree traversal takes lock on it before taking
71599 +          lock on a root node. This avoids race conditions with root
71600 +          manipulations.
71601 +
71602 +        */
71603 +       assert("nikita-1348", znode_above_root(fake));
71604 +       assert("nikita-1211", znode_is_root(old_root));
71605 +
71606 +       result = 0;
71607 +       if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
71608 +               warning("nikita-1344", "Tree is too tall: %i", tree->height);
71609 +               /* ext2 returns -ENOSPC when it runs out of free inodes with a
71610 +                  following comment (fs/ext2/ialloc.c:441): Is it really
71611 +                  ENOSPC?
71612 +
71613 +                  -EXFULL? -EINVAL?
71614 +                */
71615 +               result = RETERR(-ENOSPC);
71616 +       } else {
71617 +               /* Allocate block for new root. It's not that
71618 +                  important where it will be allocated, as root is
71619 +                  almost always in memory. Moreover, allocate on
71620 +                  flush can be going here.
71621 +                */
71622 +               assert("nikita-1448", znode_is_root(old_root));
71623 +               new_root = new_node(fake, tree->height + 1);
71624 +               if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71625 +                       lock_handle rlh;
71626 +
71627 +                       init_lh(&rlh);
71628 +                       result =
71629 +                           longterm_lock_znode(&rlh, new_root,
71630 +                                               ZNODE_WRITE_LOCK,
71631 +                                               ZNODE_LOCK_LOPRI);
71632 +                       if (result == 0) {
71633 +                               parent_coord_t *in_parent;
71634 +
71635 +                               znode_make_dirty(fake);
71636 +
71637 +                               /* new root is a child of "fake" node */
71638 +                               write_lock_tree(tree);
71639 +
71640 +                               ++tree->height;
71641 +
71642 +                               /* recalculate max balance overhead */
71643 +                               tree->estimate_one_insert =
71644 +                                   estimate_one_insert_item(tree);
71645 +
71646 +                               tree->root_block = *znode_get_block(new_root);
71647 +                               in_parent = &new_root->in_parent;
71648 +                               init_parent_coord(in_parent, fake);
71649 +                               /* manually insert new root into sibling
71650 +                                * list. With this all nodes involved into
71651 +                                * balancing are connected after balancing is
71652 +                                * done---useful invariant to check. */
71653 +                               sibling_list_insert_nolock(new_root, NULL);
71654 +                               write_unlock_tree(tree);
71655 +
71656 +                               /* insert into new root pointer to the
71657 +                                  @old_root. */
71658 +                               assert("nikita-1110",
71659 +                                      WITH_DATA(new_root,
71660 +                                                node_is_empty(new_root)));
71661 +                               write_lock_dk(tree);
71662 +                               znode_set_ld_key(new_root, min_key());
71663 +                               znode_set_rd_key(new_root, max_key());
71664 +                               write_unlock_dk(tree);
71665 +                               if (REISER4_DEBUG) {
71666 +                                       ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71667 +                                       ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71668 +                                       ZF_SET(old_root, JNODE_ORPHAN);
71669 +                               }
71670 +                               result = add_child_ptr(new_root, old_root);
71671 +                               done_lh(&rlh);
71672 +                       }
71673 +                       zrelse(new_root);
71674 +               }
71675 +       }
71676 +       if (result != 0)
71677 +               new_root = ERR_PTR(result);
71678 +       return new_root;
71679 +}
71680 +
71681 +/* fill &reiser4_item_data describing a child pointer
71682 +
71683 +   After this call @data can be used to insert pointer to @child into the
71684 +   parent of @child.
71685 +
71686 +*/
71687 +void build_child_ptr_data(znode * child        /* node pointer to which will be
71688 +                                        * inserted */ ,
71689 +                         reiser4_item_data * data /* where to store result */ )
71690 +{
71691 +       assert("nikita-1116", child != NULL);
71692 +       assert("nikita-1117", data != NULL);
71693 +
71694 +       /* item body is exactly one block number, living in kernel space */
71695 +       data->length = sizeof(reiser4_block_nr);
71696 +       data->user = 0;
71697 +       /*
71698 +        * NOTE: item body is read straight from child's blocknr, so it
71699 +        * reaches on-disk structure in cpu byte order. internal's
71700 +        * create_hook converts it to little endian byte order.
71701 +        */
71702 +       data->data = (char *)znode_get_block(child);
71703 +
71704 +       /* FIXME-VS: hardcoded internal item? */
71705 +       /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71706 +       data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71707 +
71708 +}
71709 +
71710 +/* add pointer to @child into empty @parent, using @child's ld key as item key.
71711 +
71712 +   This is used when pointer to old root is inserted into new root which is
71713 +   empty. Returns zload() error, or whatever ->create_item() of @parent's node
71714 +   plugin returns (@parent is made dirty in either create_item() outcome). */
71715 +static int add_child_ptr(znode * parent, znode * child)
71716 +{
71717 +       coord_t coord;
71718 +       reiser4_item_data data;
71719 +       int result;
71720 +       reiser4_key key;
71721 +
71722 +       assert("nikita-1111", parent != NULL);
71723 +       assert("nikita-1112", child != NULL);
71724 +       assert("nikita-1115",
71725 +              znode_get_level(parent) == znode_get_level(child) + 1);
71726 +
71727 +       result = zload(parent);
71728 +       if (result != 0)
71729 +               return result;
71730 +       assert("nikita-1113", node_is_empty(parent));
71731 +       coord_init_first_unit(&coord, parent);
71732 +
71733 +       build_child_ptr_data(child, &data);
71734 +       data.arg = NULL;
71735 +
71736 +       read_lock_dk(znode_get_tree(parent));
71737 +       key = *znode_get_ld_key(child);
71738 +       read_unlock_dk(znode_get_tree(parent));
71739 +
71740 +       result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71741 +                                                         NULL);
71742 +       znode_make_dirty(parent);
71743 +       zrelse(parent);
71744 +       return result;
71745 +}
71746 +
71747 +/* actually remove tree root: @new_root becomes root, @old_root is reinitialised */
71748 +static int kill_root(reiser4_tree * tree       /* tree from which root is being
71749 +                                                * removed */ ,
71750 +                    znode * old_root /* root node that is being removed */ ,
71751 +                    znode * new_root   /* new root---sole child of *
71752 +                                        * @old_root */ ,
71753 +                    const reiser4_block_nr * new_root_blk      /* disk address of
71754 +                                                                * @new_root */ )
71755 +{
71756 +       znode *uber;
71757 +       int result;
71758 +       lock_handle handle_for_uber;
71759 +
71760 +       assert("umka-265", tree != NULL);
71761 +       assert("nikita-1198", new_root != NULL);
71762 +       assert("nikita-1199",
71763 +              znode_get_level(new_root) + 1 == znode_get_level(old_root));
71764 +
71765 +       assert("nikita-1201", znode_is_write_locked(old_root));
71766 +
71767 +       assert("nikita-1203",
71768 +              disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71769 +
71770 +       init_lh(&handle_for_uber);
71771 +       /* obtain and lock "fake" znode protecting changes in tree height. */
71772 +       result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71773 +                               &handle_for_uber);
71774 +       if (result == 0) {
71775 +               uber = handle_for_uber.node;
71776 +
71777 +               znode_make_dirty(uber);
71778 +
71779 +               /* don't take long term lock on @new_root. Take tree lock. */
71780 +
71781 +               write_lock_tree(tree);
71782 +
71783 +               tree->root_block = *new_root_blk;
71784 +               --tree->height;
71785 +
71786 +               /* recalculate max balance overhead */
71787 +               tree->estimate_one_insert = estimate_one_insert_item(tree);
71788 +
71789 +               assert("nikita-1202",
71790 +                      tree->height == znode_get_level(new_root));
71791 +
71792 +               /* new root is a child of "fake" node */
71793 +               init_parent_coord(&new_root->in_parent, uber);
71794 +               ++uber->c_count;
71795 +
71796 +               /* sibling_list_insert_nolock(new_root, NULL); */
71797 +               write_unlock_tree(tree);
71798 +
71799 +               /* reinitialise old root. */
71800 +               result = node_plugin_by_node(old_root)->init(old_root);
71801 +               znode_make_dirty(old_root);
71802 +               if (result == 0) {
71803 +                       assert("nikita-1279", node_is_empty(old_root));
71804 +                       ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71805 +                       old_root->c_count = 0;
71806 +               }
71807 +       }
71808 +       done_lh(&handle_for_uber);
71809 +
71810 +       return result;
71811 +}
71812 +
71813 +/* remove tree root, decreasing tree height by one
71814 +
71815 +   On entry both the tree root and its only child (the future tree root) are
71816 +   write locked.
71817 +
71818 +   Tree height is protected by special "fake" znode, which has to be locked
71819 +   before the root can be removed. See comments in add_tree_root() for more
71820 +   on this.
71821 +
71822 +   Parent pointers of old and new root have to be updated too. To simplify
71823 +   code the work is split into two parts: outer kill_tree_root() collects
71824 +   all necessary arguments and calls kill_root() to do the actual job.
71825 +
71826 +   Returns result of kill_root(), or error returned by child_znode().
71827 +
71828 +*/
71829 +int kill_tree_root(znode * old_root /* tree root that we are removing */ )
71830 +{
71831 +       coord_t link;
71832 +       znode *child;
71833 +       reiser4_tree *tree;
71834 +       int ret;
71835 +
71836 +       assert("umka-266", current_tree != NULL);
71837 +       assert("nikita-1194", old_root != NULL);
71838 +       assert("nikita-1196", znode_is_root(old_root));
71839 +       assert("nikita-1200", node_num_items(old_root) == 1);
71840 +       assert("nikita-1401", znode_is_write_locked(old_root));
71841 +
71842 +       coord_init_first_unit(&link, old_root);
71843 +       tree = znode_get_tree(old_root);
71844 +
71845 +       /* first unit of @old_root leads to the future root */
71846 +       child = child_znode(&link, old_root, 0, 1);
71847 +       if (IS_ERR(child))
71848 +               return PTR_ERR(child);
71849 +
71850 +       ret = kill_root(tree, old_root, child, znode_get_block(child));
71851 +       /* done with @child */
71852 +       zput(child);
71853 +
71854 +       return ret;
71855 +}
71856 +
71857 +/* Make Linus happy.
71858 +   Local variables:
71859 +   c-indentation-style: "K&R"
71860 +   mode-name: "LC"
71861 +   c-basic-offset: 8
71862 +   tab-width: 8
71863 +   fill-column: 120
71864 +   scroll-step: 1
71865 +   End:
71866 +*/
71867 diff --git a/fs/reiser4/tree_mod.h b/fs/reiser4/tree_mod.h
71868 new file mode 100644
71869 index 0000000..644857c
71870 --- /dev/null
71871 +++ b/fs/reiser4/tree_mod.h
71872 @@ -0,0 +1,29 @@
71873 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71874 + * reiser4/README */
71875 +
71876 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
71877 + * comments. */
71878 +
71879 +#if !defined( __REISER4_TREE_MOD_H__ )
71880 +#define __REISER4_TREE_MOD_H__
71881 +
71882 +#include "forward.h"
71883 +/* entry points for adding/removing tree nodes; see tree_mod.c for details */
71884 +znode *new_node(znode * brother, tree_level level);
71885 +znode *add_tree_root(znode * old_root, znode * fake);
71886 +int kill_tree_root(znode * old_root); /* removes the root; tree height shrinks by one */
71887 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
71888 +
71889 +/* __REISER4_TREE_MOD_H__ */
71890 +#endif
71891 +
71892 +/* Make Linus happy.
71893 +   Local variables:
71894 +   c-indentation-style: "K&R"
71895 +   mode-name: "LC"
71896 +   c-basic-offset: 8
71897 +   tab-width: 8
71898 +   fill-column: 120
71899 +   scroll-step: 1
71900 +   End:
71901 +*/
71902 diff --git a/fs/reiser4/tree_walk.c b/fs/reiser4/tree_walk.c
71903 new file mode 100644
71904 index 0000000..4423d4b
71905 --- /dev/null
71906 +++ b/fs/reiser4/tree_walk.c
71907 @@ -0,0 +1,926 @@
71908 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71909 + * reiser4/README */
71910 +
71911 +/* Routines and macros to:
71912 +
71913 +   get_left_neighbor()
71914 +
71915 +   get_right_neighbor()
71916 +
71917 +   get_parent()
71918 +
71919 +   get_first_child()
71920 +
71921 +   get_last_child()
71922 +
71923 +   various routines to walk the whole tree and do things to it like
71924 +   repack it, or move it to tertiary storage.  Please make them as
71925 +   generic as is reasonable.
71926 +
71927 +*/
71928 +
71929 +#include "forward.h"
71930 +#include "debug.h"
71931 +#include "dformat.h"
71932 +#include "coord.h"
71933 +#include "plugin/item/item.h"
71934 +#include "jnode.h"
71935 +#include "znode.h"
71936 +#include "tree_walk.h"
71937 +#include "tree.h"
71938 +#include "super.h"
71939 +
71940 +/* These macros are used internally in tree_walk.c so that one lock_neighbor()
71941 +   can serve the parent, left and right pointers: GET_NODE_BY_PTR_OFFSET()
71942 +   reads the znode pointer stored @off bytes from the start of znode @node. */
71943 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
71944 +#define FIELD_OFFSET(name)  offsetof(znode, name)
71945 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
71946 +#define LEFT_PTR_OFFSET   FIELD_OFFSET(left)
71947 +#define RIGHT_PTR_OFFSET  FIELD_OFFSET(right)
71948 +
71949 +/* Generic procedure to get and long-term lock a neighbor (left or right
71950 +    sibling, or parent) of @node. The algorithm is the same in all cases; only
71951 +    the znode field differs, so it is selected by @ptr_offset, the byte offset
71952 +    of the neighbor pointer within the znode structure. Must be called with the
71953 +    tree lock held (read mode if @rlocked, write mode otherwise); the lock is
71954 +    dropped around each long-term lock call and is held again on return. */
71955 +static int lock_neighbor(
71956 +                               /* resulting lock handle */
71957 +                               lock_handle * result,
71958 +                               /* znode whose neighbor is to be locked */
71959 +                               znode * node,
71960 +                               /* pointer to neighbor (or parent) znode field offset, in bytes from
71961 +                                  the base address of znode structure  */
71962 +                               int ptr_offset,
71963 +                               /* lock mode for longterm_lock_znode call */
71964 +                               znode_lock_mode mode,
71965 +                               /* lock request for longterm_lock_znode call */
71966 +                               znode_lock_request req,
71967 +                               /* GN_* flags; @rlocked: non-zero if the tree lock is read-held */
71968 +                               int flags, int rlocked)
71969 +{
71970 +       reiser4_tree *tree = znode_get_tree(node);
71971 +       znode *neighbor;
71972 +       int ret;
71973 +
71974 +       assert("umka-236", node != NULL);
71975 +       assert("umka-237", tree != NULL);
71976 +       assert_rw_locked(&(tree->tree_lock));
71977 +       /* translate GN_* flags into extra lock request bits */
71978 +       if (flags & GN_TRY_LOCK)
71979 +               req |= ZNODE_LOCK_NONBLOCK;
71980 +       if (flags & GN_SAME_ATOM)
71981 +               req |= ZNODE_LOCK_DONT_FUSE;
71982 +
71983 +       /* get neighbor's address by using the sibling link, quit the while
71984 +          loop (and return) if the link is not available. */
71985 +       while (1) {
71986 +               neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
71987 +
71988 +               /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
71989 +                * node pointed by it is not connected.
71990 +                *
71991 +                * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
71992 +                * check and allows passing reference to not connected znode to
71993 +                * subsequent longterm_lock_znode() call.  This kills possible
71994 +                * busy loop if we are trying to get longterm lock on locked but
71995 +                * not yet connected parent node. */
71996 +               if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
71997 +                                         || znode_is_connected(neighbor))) {
71998 +                       return RETERR(-E_NO_NEIGHBOR);
71999 +               }
72000 +
72001 +               /* protect it from deletion. */
72002 +               zref(neighbor);
72003 +               /* the tree lock is dropped around the long-term lock request */
72004 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
72005 +
72006 +               ret = longterm_lock_znode(result, neighbor, mode, req);
72007 +
72008 +               /* The lock handle obtains its own reference, release the one from above. */
72009 +               zput(neighbor);
72010 +
72011 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
72012 +
72013 +               /* restart if node we got reference to is being
72014 +                  invalidated. we should not get reference to this node
72015 +                  again. */
72016 +               if (ret == -EINVAL)
72017 +                       continue;
72018 +               if (ret)
72019 +                       return ret;
72020 +
72021 +               /* check if neighbor link still points to just locked znode;
72022 +                  the link could have been changed while the process slept. */
72023 +               if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
72024 +                       return 0;
72025 +
72026 +               /* znode was locked by mistake; unlock it and restart locking
72027 +                  process from beginning. */
72028 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
72029 +               longterm_unlock_znode(result);
72030 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
72031 +       }
72032 +}
72033 +
72034 +/* get parent of @node with a longterm lock; accepts GN_* flags; returns lock_neighbor()'s result. */
72035 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
72036 +                            znode * node /* child node */ ,
72037 +                            znode_lock_mode mode
72038 +                            /* type of lock: read or write */ ,
72039 +                            int flags /* GN_* flags */ )
72040 +{
72041 +       int result;
72042 +       /* lock_neighbor() has to be entered with the tree lock held */
72043 +       read_lock_tree(znode_get_tree(node));
72044 +       result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
72045 +                              ZNODE_LOCK_HIPRI, flags, 1); /* 1: tree lock is read-held */
72046 +       read_unlock_tree(znode_get_tree(node));
72047 +       return result;
72048 +}
72049 +
72050 +/* wrapper function to lock the right or left neighbor of @node, depending on
72051 +   the GN_GO_LEFT bit in @flags; @rlocked is passed through to lock_neighbor() */
72052 +/* Audited by: umka (2002.06.14) */
72053 +static inline int
72054 +lock_side_neighbor(lock_handle * result,
72055 +                  znode * node, znode_lock_mode mode, int flags, int rlocked)
72056 +{
72057 +       int ret;
72058 +       int ptr_offset;
72059 +       znode_lock_request req;
72060 +       /* going left uses a low priority request, going right a high priority one */
72061 +       if (flags & GN_GO_LEFT) {
72062 +               ptr_offset = LEFT_PTR_OFFSET;
72063 +               req = ZNODE_LOCK_LOPRI;
72064 +       } else {
72065 +               ptr_offset = RIGHT_PTR_OFFSET;
72066 +               req = ZNODE_LOCK_HIPRI;
72067 +       }
72068 +
72069 +       ret =
72070 +           lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
72071 +
72072 +       if (ret == -E_NO_NEIGHBOR)      /* when walking left or right, -E_NO_NEIGHBOR
72073 +                                        * does not guarantee that the neighbor is
72074 +                                        * absent from the tree; so we return -ENOENT,
72075 +                                        * which only means that the neighbor was not
72076 +                                        * found in cache */
72077 +               return RETERR(-ENOENT);
72078 +
72079 +       return ret;
72080 +}
72081 +
72082 +#if REISER4_DEBUG
72083 +/* debug-only: assert that sibling links around @node are consistent; always returns 1 */
72084 +int check_sibling_list(znode * node)
72085 +{
72086 +       znode *scan;
72087 +       znode *next;
72088 +
72089 +       assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
72090 +       /* nothing to check for a NULL node or one with JNODE_RIP set */
72091 +       if (node == NULL)
72092 +               return 1;
72093 +
72094 +       if (ZF_ISSET(node, JNODE_RIP))
72095 +               return 1;
72096 +
72097 +       assert("nikita-3270", node != NULL);
72098 +       assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
72099 +       /* walk left: each linked left neighbor must point back through ->right */
72100 +       for (scan = node; znode_is_left_connected(scan); scan = next) {
72101 +               next = scan->left;
72102 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
72103 +                       assert("nikita-3271", znode_is_right_connected(next));
72104 +                       assert("nikita-3272", next->right == scan);
72105 +               } else
72106 +                       break;
72107 +       }
72108 +       for (scan = node; znode_is_right_connected(scan); scan = next) { /* same, walking right */
72109 +               next = scan->right;
72110 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
72111 +                       assert("nikita-3273", znode_is_left_connected(next));
72112 +                       assert("nikita-3274", next->left == scan);
72113 +               } else
72114 +                       break;
72115 +       }
72116 +       return 1;
72117 +}
72118 +
72119 +#endif
72120 +
72121 +/* Znode sibling pointers maintenance. */
72122 +
72123 +/* Znode sibling pointers are established between any neighboring nodes which
72124 +   are in cache.  There are two znode state bits (JNODE_LEFT_CONNECTED,
72125 +   JNODE_RIGHT_CONNECTED); if the left or right sibling pointer contains an
72126 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
72127 +
72128 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
72129 +   take care of searching for znode neighbors (a hash table lookup may be
72130 +   required), establishing sibling pointers between them and setting
72131 +   JNODE_*_CONNECTED state bits. */
72132 +
72133 +/* adjust sibling pointers and `connected' states so that @left and @right
72134 +   become neighbors; works if one of them is NULL (was not found). */
72135 +
72136 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
72137 +void link_left_and_right(znode * left, znode * right)
72138 +{
72139 +       assert("nikita-3275", check_sibling_list(left));
72140 +       assert("nikita-3275", check_sibling_list(right));
72141 +       /* first fix up left->right ... */
72142 +       if (left != NULL) {
72143 +               if (left->right == NULL) {
72144 +                       left->right = right;
72145 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
72146 +
72147 +                       ON_DEBUG(left->right_version =
72148 +                                atomic_inc_return(&delim_key_version);
72149 +                           );
72150 +
72151 +               } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
72152 +                          && left->right != right) {
72153 +
72154 +                       ON_DEBUG(left->right->left_version =
72155 +                                atomic_inc_return(&delim_key_version);
72156 +                                left->right_version =
72157 +                                atomic_inc_return(&delim_key_version););
72158 +
72159 +                       left->right->left = NULL;
72160 +                       left->right = right;
72161 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
72162 +               } else
72163 +                       /*
72164 +                        * there is a race condition in renew_sibling_link()
72165 +                        * and assertions below check that it is only one
72166 +                        * there. Thread T1 calls renew_sibling_link() without
72167 +                        * GN_NO_ALLOC flag. zlook() doesn't find neighbor
72168 +                        * node, but before T1 gets to the
72169 +                        * link_left_and_right(), another thread T2 creates
72170 +                        * neighbor node and connects it. check for
72171 +                        * left->right == NULL above protects T1 from
72172 +                        * overwriting correct left->right pointer installed
72173 +                        * by T2.
72174 +                        */
72175 +                       assert("nikita-3302",
72176 +                              right == NULL || left->right == right);
72177 +       }
72178 +       if (right != NULL) { /* ... then right->left, symmetrically */
72179 +               if (right->left == NULL) {
72180 +                       right->left = left;
72181 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
72182 +
72183 +                       ON_DEBUG(right->left_version =
72184 +                                atomic_inc_return(&delim_key_version);
72185 +                           );
72186 +
72187 +               } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
72188 +                          && right->left != left) {
72189 +
72190 +                       ON_DEBUG(right->left->right_version =
72191 +                                atomic_inc_return(&delim_key_version);
72192 +                                right->left_version =
72193 +                                atomic_inc_return(&delim_key_version););
72194 +
72195 +                       right->left->right = NULL;
72196 +                       right->left = left;
72197 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
72198 +
72199 +               } else
72200 +                       assert("nikita-3303",
72201 +                              left == NULL || right->left == left);
72202 +       }
72203 +       assert("nikita-3275", check_sibling_list(left));
72204 +       assert("nikita-3275", check_sibling_list(right));
72205 +}
72206 +
72207 +/* link @second as the left (if @to_left) or right neighbor of @first. Audited by: umka (2002.06.14) */
72208 +static void link_znodes(znode * first, znode * second, int to_left)
72209 +{
72210 +       if (to_left)
72211 +               link_left_and_right(second, first);
72212 +       else
72213 +               link_left_and_right(first, second);
72214 +}
72215 +
72216 +/* move @coord to the next unit (to the left if GN_GO_LEFT is set in @flags,
72217 +   to the right otherwise), crossing the node boundary if needed. Must be
72218 +   called under the write tree lock (held again on return); the lock keeps a
72219 +   missing sibling link on the parent level from appearing if
72220 +   lock_side_neighbor() fails with -ENOENT. */
72221 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
72222 +{
72223 +       int ret;
72224 +       znode *node;
72225 +       reiser4_tree *tree;
72226 +
72227 +       assert("umka-243", coord != NULL);
72228 +       assert("umka-244", handle != NULL);
72229 +       assert("zam-1069", handle->node == NULL);
72230 +       /* first try to step within the current node */
72231 +       ret =
72232 +           (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
72233 +           coord_next_unit(coord);
72234 +       if (!ret)
72235 +               return 0;
72236 +       /* otherwise long-term read lock the side neighbor of coord->node */
72237 +       ret =
72238 +           lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
72239 +       if (ret)
72240 +               return ret;
72241 +
72242 +       node = handle->node;
72243 +       tree = znode_get_tree(node);
72244 +       write_unlock_tree(tree);
72245 +
72246 +       coord_init_zero(coord);
72247 +
72248 +       /* We avoid synchronous read here if it is specified by flag. */
72249 +       if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
72250 +               ret = jstartio(ZJNODE(handle->node));
72251 +               if (!ret)
72252 +                       ret = -E_REPEAT;
72253 +               goto error_locked;
72254 +       }
72255 +
72256 +       /* the corresponding zrelse() should be called by the clients of
72257 +          far_next_coord(), at the place where this node gets unlocked. */
72258 +       ret = zload(handle->node);
72259 +       if (ret)
72260 +               goto error_locked;
72261 +
72262 +       if (flags & GN_GO_LEFT)
72263 +               coord_init_last_unit(coord, node);
72264 +       else
72265 +               coord_init_first_unit(coord, node);
72266 +       /* the unlock below is skipped on success; it runs only via goto */
72267 +       if (0) {
72268 +             error_locked:
72269 +               longterm_unlock_znode(handle);
72270 +       }
72271 +       write_lock_tree(tree);
72272 +       return ret;
72273 +}
72274 +
72275 +/* Very significant function which performs a step in horizontal direction
72276 +   when the sibling pointer is not available.  Actually, it is the only
72277 +   function which does it.
72278 +   Note: this function does not restore locking status at exit, the
72279 +   caller must take care of proper unlocking and zrelse-ing */
72280 +static int
72281 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
72282 +                  tree_level level, int flags, int *nr_locked)
72283 +{
72284 +       int ret;
72285 +       int to_left = flags & GN_GO_LEFT;
72286 +       reiser4_block_nr da;
72287 +       /* parent of the neighbor node; we set it to parent until not sharing
72288 +          of one parent between child and neighbor node is detected */
72289 +       znode *side_parent = coord->node;
72290 +       reiser4_tree *tree = znode_get_tree(child);
72291 +       znode *neighbor = NULL;
72292 +
72293 +       assert("umka-245", coord != NULL);
72294 +       assert("umka-246", handle != NULL);
72295 +       assert("umka-247", child != NULL);
72296 +       assert("umka-303", tree != NULL);
72297 +
72298 +       init_lh(handle);
72299 +       write_lock_tree(tree);
72300 +       ret = far_next_coord(coord, handle, flags);
72301 +
72302 +       if (ret) {
72303 +               if (ret != -ENOENT) {
72304 +                       write_unlock_tree(tree);
72305 +                       return ret;
72306 +               }
72307 +       } else {
72308 +               item_plugin *iplug;
72309 +               /* far_next_coord() locked a node: count it, it is the side parent */
72310 +               if (handle->node != NULL) {
72311 +                       (*nr_locked)++;
72312 +                       side_parent = handle->node;
72313 +               }
72314 +
72315 +               /* does coord object points to internal item? We do not
72316 +                  support sibling pointers between znode for formatted and
72317 +                  unformatted nodes and return -E_NO_NEIGHBOR in that case. */
72318 +               iplug = item_plugin_by_coord(coord);
72319 +               if (!item_is_internal(coord)) {
72320 +                       link_znodes(child, NULL, to_left);
72321 +                       write_unlock_tree(tree);
72322 +                       /* we know there can't be formatted neighbor */
72323 +                       return RETERR(-E_NO_NEIGHBOR);
72324 +               }
72325 +               write_unlock_tree(tree);
72326 +               /* fetch the neighbor's block number into da */
72327 +               iplug->s.internal.down_link(coord, NULL, &da);
72328 +
72329 +               if (flags & GN_NO_ALLOC) {
72330 +                       neighbor = zlook(tree, &da);
72331 +               } else {
72332 +                       neighbor =
72333 +                           zget(tree, &da, side_parent, level, get_gfp_mask());
72334 +               }
72335 +
72336 +               if (IS_ERR(neighbor)) {
72337 +                       ret = PTR_ERR(neighbor);
72338 +                       return ret; /* the tree lock is not held at this point */
72339 +               }
72340 +
72341 +               if (neighbor)
72342 +                       /* update delimiting keys */
72343 +                       set_child_delimiting_keys(coord->node, coord, neighbor);
72344 +
72345 +               write_lock_tree(tree);
72346 +       }
72347 +       /* also reached with ret == -ENOENT and neighbor == NULL */
72348 +       if (likely(neighbor == NULL ||
72349 +                  (znode_get_level(child) == znode_get_level(neighbor)
72350 +                   && child != neighbor)))
72351 +               link_znodes(child, neighbor, to_left);
72352 +       else {
72353 +               warning("nikita-3532",
72354 +                       "Sibling nodes on the different levels: %i != %i\n",
72355 +                       znode_get_level(child), znode_get_level(neighbor));
72356 +               ret = RETERR(-EIO);
72357 +       }
72358 +
72359 +       write_unlock_tree(tree);
72360 +
72361 +       /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
72362 +       if (neighbor != NULL && (flags & GN_NO_ALLOC))
72363 +               /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72364 +               zput(neighbor);
72365 +
72366 +       return ret;
72367 +}
72368 +
72369 +/* Connect @node on one side (left if GN_GO_LEFT is in @flags, else right), starting from @coord
72370 +   in its parent. Returns 0 or a negative error code. Audited by: umka (2002.06.14) */
72371 +static int connect_one_side(coord_t * coord, znode * node, int flags)
72372 +{
72373 +       coord_t local;
72374 +       lock_handle handle;
72375 +       int nr_locked = 0;      /* incremented by renew_sibling_link(); must start defined */
72376 +       int ret;
72377 +
72378 +       assert("umka-248", coord != NULL);
72379 +       assert("umka-249", node != NULL);
72380 +       /* work on a copy: renew_sibling_link() moves the coord it is given */
72381 +       coord_dup_nocheck(&local, coord);
72382 +
72383 +       init_lh(&handle);
72384 +
72385 +       ret =
72386 +           renew_sibling_link(&local, &handle, node, znode_get_level(node),
72387 +                              flags | GN_NO_ALLOC, &nr_locked);
72388 +
72389 +       if (handle.node != NULL) {
72390 +               /* complementary operations for zload() and lock() in far_next_coord() */
72391 +               zrelse(handle.node);
72392 +               longterm_unlock_znode(&handle);
72393 +       }
72394 +
72395 +       /* we swallow error codes which are not interesting for us because we
72396 +          run renew_sibling_link() only for znode connection. */
72397 +       if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72398 +               return 0;
72399 +
72400 +       return ret;
72401 +}
72402 +
72403 +/* if @child is not in `connected' state, performs hash searches for left and
72404 +   right neighbor nodes and establishes horizontal sibling links */
72405 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72406 +int connect_znode(coord_t * parent_coord, znode * child)
72407 +{
72408 +       reiser4_tree *tree = znode_get_tree(child);
72409 +       int ret = 0;
72410 +
72411 +       assert("zam-330", parent_coord != NULL);
72412 +       assert("zam-331", child != NULL);
72413 +       assert("zam-332", parent_coord->node != NULL);
72414 +       assert("umka-305", tree != NULL);
72415 +
72416 +       /* it is trivial to `connect' root znode because it can't have
72417 +          neighbors */
72418 +       if (znode_above_root(parent_coord->node)) {
72419 +               child->left = NULL;
72420 +               child->right = NULL;
72421 +               ZF_SET(child, JNODE_LEFT_CONNECTED);
72422 +               ZF_SET(child, JNODE_RIGHT_CONNECTED);
72423 +
72424 +               ON_DEBUG(child->left_version =
72425 +                        atomic_inc_return(&delim_key_version);
72426 +                        child->right_version =
72427 +                        atomic_inc_return(&delim_key_version););
72428 +
72429 +               return 0;
72430 +       }
72431 +
72432 +       /* load parent node */
72433 +       coord_clear_iplug(parent_coord);
72434 +       ret = zload(parent_coord->node);
72435 +
72436 +       if (ret != 0)
72437 +               return ret;
72438 +
72439 +       /* protect `connected' state check by tree_lock */
72440 +       read_lock_tree(tree);
72441 +
72442 +       if (!znode_is_right_connected(child)) {
72443 +               read_unlock_tree(tree);
72444 +               /* connect right (default is right) */
72445 +               ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72446 +               if (ret)
72447 +                       goto zrelse_and_ret;
72448 +
72449 +               read_lock_tree(tree);
72450 +       }
72451 +       /* ret temporarily holds the left-connected flag, not an error code */
72452 +       ret = znode_is_left_connected(child);
72453 +
72454 +       read_unlock_tree(tree);
72455 +
72456 +       if (!ret) {
72457 +               ret =
72458 +                   connect_one_side(parent_coord, child,
72459 +                                    GN_NO_ALLOC | GN_GO_LEFT);
72460 +       } else
72461 +               ret = 0;
72462 +
72463 +      zrelse_and_ret:
72464 +       zrelse(parent_coord->node); /* pairs with the zload() of the parent above */
72465 +
72466 +       return ret;
72467 +}
72468 +
72469 +/* this function is like renew_sibling_link() but allocates the neighbor node
72470 +   if it doesn't exist and `connects' it. It may require making two steps in
72471 +   horizontal direction: the first one for neighbor node finding/allocation,
72472 +   the second one for finding the neighbor of the neighbor to connect the
72473 +   freshly allocated znode. */
72474 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72475 +static int
72476 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72477 +{
72478 +       coord_t local;
72479 +       lock_handle empty[2]; /* one slot per renew_sibling_link() call below */
72480 +       reiser4_tree *tree = znode_get_tree(node);
72481 +       znode *neighbor = NULL;
72482 +       int nr_locked = 0;
72483 +       int ret;
72484 +
72485 +       assert("umka-250", coord != NULL);
72486 +       assert("umka-251", node != NULL);
72487 +       assert("umka-307", tree != NULL);
72488 +       assert("umka-308", level <= tree->height);
72489 +
72490 +       /* umka (2002.06.14)
72491 +          Here probably should be a check for given "level" validness.
72492 +          Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72493 +        */
72494 +
72495 +       coord_dup(&local, coord);
72496 +       /* first step: find or allocate the neighbor (GN_NO_ALLOC is cleared) */
72497 +       ret =
72498 +           renew_sibling_link(&local, &empty[0], node, level,
72499 +                              flags & ~GN_NO_ALLOC, &nr_locked);
72500 +       if (ret)
72501 +               goto out;
72502 +
72503 +       /* tree lock is not needed here because we keep parent node(s) locked
72504 +          and reference to neighbor znode incremented */
72505 +       neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72506 +
72507 +       read_lock_tree(tree);
72508 +       ret = znode_is_connected(neighbor);
72509 +       read_unlock_tree(tree);
72510 +       if (ret) {
72511 +               ret = 0;
72512 +               goto out;
72513 +       }
72514 +       /* second step: connect the neighbor itself, without allocating */
72515 +       ret =
72516 +           renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72517 +                              flags | GN_NO_ALLOC, &nr_locked);
72518 +       /* second renew_sibling_link() call is used for znode connection only,
72519 +          so we can live with these errors */
72520 +       if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72521 +               ret = 0;
72522 +
72523 +      out:
72524 +       /* release, in reverse order, every node the calls above left locked */
72525 +       for (--nr_locked; nr_locked >= 0; --nr_locked) {
72526 +               zrelse(empty[nr_locked].node);
72527 +               longterm_unlock_znode(&empty[nr_locked]);
72528 +       }
72529 +
72530 +       if (neighbor != NULL)
72531 +               /* decrement znode reference counter without actually
72532 +                  releasing it. */
72533 +               atomic_dec(&ZJNODE(neighbor)->x_count);
72534 +
72535 +       return ret;
72536 +}
72537 +
72538 +/*
72539 +   reiser4_get_neighbor() -- lock node's neighbor.
72540 +
72541 +   reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
72542 +   given parameter) using sibling link to it. If sibling link is not available
72543 +   (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one
72544 +   level up for information about neighbor's disk address. We lock node's
72545 +   parent, if it is common parent for both 'node' and its neighbor, neighbor's
72546 +   disk address is in next (to left or to right) down link from link that points
72547 +   to original node. If not, we need to lock parent's neighbor, read its content
72548 +   and take first(last) downlink with neighbor's disk address.  That locking
72549 +   could be done by using sibling link and lock_neighbor() function, if sibling
72550 +   link exists. In another case we have to go level up again until we find
72551 +   common parent or valid sibling link. Then go down
72552 +   allocating/connecting/locking/reading nodes until neighbor of first one is
72553 +   locked.
72554 +
72555 +   @neighbor:  result lock handle,
72556 +   @node: a node which we lock neighbor of,
72557 +   @lock_mode: lock mode {LM_READ, LM_WRITE},
72558 +   @flags: logical OR of {GN_*} (see description above) subset.
72559 +
72560 +   @return: 0 if success, negative value if lock was impossible due to an error
72561 +   or lack of neighbor node.
72562 +*/
72563 +
72564 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72565 +int
72566 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72567 +                    znode_lock_mode lock_mode, int flags)
72568 +{
72569 +       reiser4_tree *tree = znode_get_tree(node);
72570 +       lock_handle path[REAL_MAX_ZTREE_HEIGHT];  /* path[0] locks @node's parent, path[i + 1] the parent of path[i].node */
72571 +
72572 +       coord_t coord;
72573 +
72574 +       tree_level base_level;
72575 +       tree_level h = 0;  /* index of the topmost handle currently used in path[] */
72576 +       int ret;
72577 +
72578 +       assert("umka-252", tree != NULL);
72579 +       assert("umka-253", neighbor != NULL);
72580 +       assert("umka-254", node != NULL);
72581 +
72582 +       base_level = znode_get_level(node);
72583 +
72584 +       assert("umka-310", base_level <= tree->height);
72585 +
72586 +       coord_init_zero(&coord);
72587 +
72588 +      again:
72589 +       /* first, try lock_side_neighbor() under the tree read lock; this
72590 +          simple path requires an existing sibling link */
72591 +       read_lock_tree(tree);
72592 +       ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72593 +       read_unlock_tree(tree);
72594 +       if (!ret) {
72595 +               /* GN_LOAD_NEIGHBOR was requested. NOTE(review): this zloads @node, not neighbor->node -- confirm intent */
72596 +               if (flags & GN_LOAD_NEIGHBOR) {
72597 +                       ret = zload(node);
72598 +                       if (ret)
72599 +                               longterm_unlock_znode(neighbor);
72600 +               }
72601 +               return ret;
72602 +       }
72603 +
72604 +       /* only -ENOENT means we may look upward and try to connect
72605 +          @node with its neighbor (if @flags allow us to do it) */
72606 +       if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72607 +               return ret;
72608 +
72609 +       /* before establishing the sibling link we lock the parent node; this
72610 +          is required for renew_neighbor() to work.  */
72611 +       init_lh(&path[0]);
72612 +       ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72613 +       if (ret)
72614 +               return ret;
72615 +       if (znode_above_root(path[0].node)) {
72616 +               longterm_unlock_znode(&path[0]);
72617 +               return RETERR(-E_NO_NEIGHBOR);
72618 +       }
72619 +
72620 +       while (1) {
72621 +               znode *child = (h == 0) ? node : path[h - 1].node;
72622 +               znode *parent = path[h].node;
72623 +
72624 +               ret = zload(parent);
72625 +               if (ret)
72626 +                       break;
72627 +
72628 +               ret = find_child_ptr(parent, child, &coord);
72629 +
72630 +               if (ret) {
72631 +                       zrelse(parent);
72632 +                       break;
72633 +               }
72634 +
72635 +               /* try to establish missing sibling link */
72636 +               ret = renew_neighbor(&coord, child, h + base_level, flags);
72637 +
72638 +               zrelse(parent);
72639 +
72640 +               switch (ret) {
72641 +               case 0:
72642 +                       /* unlocking of parent znode prevents simple
72643 +                          deadlock situation */
72644 +                       done_lh(&path[h]);
72645 +
72646 +                       /* depending on the tree level we are at, we either
72647 +                          repeat the first locking attempt ...  */
72648 +                       if (h == 0)
72649 +                               goto again;
72650 +
72651 +                       /* ... or repeat establishing of sibling link at
72652 +                          one level below. */
72653 +                       --h;
72654 +                       break;
72655 +
72656 +               case -ENOENT:
72657 +                       /* sibling link is not available -- we go
72658 +                          upward. */
72659 +                       init_lh(&path[h + 1]);
72660 +                       ret =
72661 +                           reiser4_get_parent(&path[h + 1], parent,
72662 +                                              ZNODE_READ_LOCK);
72663 +                       if (ret)
72664 +                               goto fail;
72665 +                       ++h;
72666 +                       if (znode_above_root(path[h].node)) {
72667 +                               ret = RETERR(-E_NO_NEIGHBOR);
72668 +                               goto fail;
72669 +                       }
72670 +                       break;
72671 +
72672 +               case -E_DEADLOCK:
72673 +                       /* there was a lock request from a hi-pri locker:
72674 +                          release path handles from the top down while
72675 +                          check_deadlock() stays true; fail once path[0] is dropped. */
72676 +                       for (; check_deadlock(); h--) {
72677 +                               done_lh(&path[h]);
72678 +                               if (h == 0)
72679 +                                       goto fail;
72680 +                       }
72681 +
72682 +                       break;
72683 +
72684 +               default:        /* other errors. */
72685 +                       goto fail;
72686 +               }
72687 +       }
72688 +      fail:
72689 +       ON_DEBUG(check_lock_node_data(node));
72690 +       ON_DEBUG(check_lock_data());
72691 +
72692 +       /* unlock path */
72693 +       do {
72694 +               /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72695 +                  fail; path[0] is already done_lh-ed, therefore
72696 +                  longterm_unlock_znode(&path[h]); is not applicable */
72697 +               done_lh(&path[h]);
72698 +               --h;
72699 +       } while (h + 1 != 0);  /* NOTE(review): ends once h steps below 0; relies on tree_level not being a narrow unsigned type -- confirm */
72700 +
72701 +       return ret;
72702 +}
72703 +
72704 +/* Unlink @node from the sibling list: its neighbours are linked to each other
72705 +   and @node's connected flags are cleared. Audited by: umka (2002.06.14) */
72706 +void sibling_list_remove(znode * node)
72707 +{
72708 +       reiser4_tree *tree;
72709 +
72710 +       assert("umka-255", node != NULL);  /* checked before @node is first used */
72711 +       tree = znode_get_tree(node);
72712 +       assert_rw_write_locked(&(tree->tree_lock));
72713 +       assert("nikita-3275", check_sibling_list(node));
72714 +       /* with both neighbours present, the left one takes over the right one's left delimiting key */
72715 +       write_lock_dk(tree);
72716 +       if (znode_is_right_connected(node) && node->right != NULL &&
72717 +           znode_is_left_connected(node) && node->left != NULL) {
72718 +               assert("zam-32245",
72719 +                      keyeq(znode_get_rd_key(node),
72720 +                            znode_get_ld_key(node->right)));
72721 +               znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72722 +       }
72723 +       write_unlock_dk(tree);
72724 +
72725 +       if (znode_is_right_connected(node) && node->right != NULL) {
72726 +               assert("zam-322", znode_is_left_connected(node->right));
72727 +               node->right->left = node->left;
72728 +               ON_DEBUG(node->right->left_version =
72729 +                        atomic_inc_return(&delim_key_version);
72730 +                   );
72731 +       }
72732 +       if (znode_is_left_connected(node) && node->left != NULL) {
72733 +               assert("zam-323", znode_is_right_connected(node->left));
72734 +               node->left->right = node->right;
72735 +               ON_DEBUG(node->left->right_version =
72736 +                        atomic_inc_return(&delim_key_version);
72737 +                   );
72738 +       }
72739 +
72740 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
72741 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72742 +       ON_DEBUG(node->left = node->right = NULL;
72743 +                node->left_version = atomic_inc_return(&delim_key_version);
72744 +                node->right_version = atomic_inc_return(&delim_key_version););
72745 +       assert("nikita-3276", check_sibling_list(node));
72746 +}
72747 +
72748 +/* disconnect @node from the sibling list: unlike sibling_list_remove(), the neighbours are not linked to each other, their pointers to @node are just set to NULL */
72749 +void sibling_list_drop(znode * node)
72750 +{
72751 +       znode *right;
72752 +       znode *left;
72753 +
72754 +       assert("nikita-2464", node != NULL);
72755 +       assert("nikita-3277", check_sibling_list(node));
72756 +
72757 +       right = node->right;
72758 +       if (right != NULL) {
72759 +               assert("nikita-2465", znode_is_left_connected(right));
72760 +               right->left = NULL;
72761 +               ON_DEBUG(right->left_version =
72762 +                        atomic_inc_return(&delim_key_version);
72763 +                   );
72764 +       }
72765 +       left = node->left;
72766 +       if (left != NULL) {
72767 +               assert("zam-323", znode_is_right_connected(left));
72768 +               left->right = NULL;
72769 +               ON_DEBUG(left->right_version =
72770 +                        atomic_inc_return(&delim_key_version);
72771 +                   );
72772 +       }
72773 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
72774 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72775 +       ON_DEBUG(node->left = node->right = NULL;  /* @node's own links are reset only inside ON_DEBUG() */
72776 +                node->left_version = atomic_inc_return(&delim_key_version);
72777 +                node->right_version = atomic_inc_return(&delim_key_version););
72778 +}
72779 +
72780 +/* Insert @new into the sibling list. Regular balancing inserts the new node
72781 +   after (to the right of) an existing, locked node (@before); the one exception
72782 +   is adding a new tree root node, in which case @before must be NULL. */
72783 +void sibling_list_insert_nolock(znode * new, znode * before)
72784 +{
72785 +       assert("zam-334", new != NULL);
72786 +       assert("nikita-3298", !znode_is_left_connected(new));
72787 +       assert("nikita-3299", !znode_is_right_connected(new));
72788 +       assert("nikita-3300", new->left == NULL);
72789 +       assert("nikita-3301", new->right == NULL);
72790 +       assert("nikita-3278", check_sibling_list(new));
72791 +       assert("nikita-3279", check_sibling_list(before));
72792 +
72793 +       if (before != NULL) {
72794 +               assert("zam-333", znode_is_connected(before));
72795 +               new->right = before->right;
72796 +               new->left = before;
72797 +               ON_DEBUG(new->right_version =
72798 +                        atomic_inc_return(&delim_key_version);
72799 +                        new->left_version =
72800 +                        atomic_inc_return(&delim_key_version););
72801 +               if (before->right != NULL) {
72802 +                       before->right->left = new;
72803 +                       ON_DEBUG(before->right->left_version =
72804 +                                atomic_inc_return(&delim_key_version);
72805 +                           );
72806 +               }
72807 +               before->right = new;
72808 +               ON_DEBUG(before->right_version =
72809 +                        atomic_inc_return(&delim_key_version);
72810 +                   );
72811 +       } else {
72812 +               new->right = NULL;
72813 +               new->left = NULL;
72814 +               ON_DEBUG(new->right_version =
72815 +                        atomic_inc_return(&delim_key_version);
72816 +                        new->left_version =
72817 +                        atomic_inc_return(&delim_key_version););
72818 +       }
72819 +       ZF_SET(new, JNODE_LEFT_CONNECTED);  /* @new is marked connected on both sides even when it has no neighbours */
72820 +       ZF_SET(new, JNODE_RIGHT_CONNECTED);
72821 +       assert("nikita-3280", check_sibling_list(new));
72822 +       assert("nikita-3281", check_sibling_list(before));
72823 +}
72824 +
72825 +/*
72826 +   Local variables:
72827 +   c-indentation-style: "K&R"
72828 +   mode-name: "LC"
72829 +   c-basic-offset: 8
72830 +   tab-width: 8
72831 +   fill-column: 80
72832 +   End:
72833 +*/
72834 diff --git a/fs/reiser4/tree_walk.h b/fs/reiser4/tree_walk.h
72835 new file mode 100644
72836 index 0000000..3d5f09f
72837 --- /dev/null
72838 +++ b/fs/reiser4/tree_walk.h
72839 @@ -0,0 +1,125 @@
72840 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
72841 +
72842 +/* definitions of reiser4 tree walk functions */
72843 +
72844 +#ifndef __FS_REISER4_TREE_WALK_H__
72845 +#define __FS_REISER4_TREE_WALK_H__
72846 +
72847 +#include "debug.h"
72848 +#include "forward.h"
72849 +
72850 +/* establishes horizontal links between cached znodes */
72851 +int connect_znode(coord_t * coord, znode * node);
72852 +
72853 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
72854 +  have the following common arguments:
72855 +
72856 +  return codes:
72857 +
72858 +  @return : 0        - OK,
72859 +
72860 +ZAM-FIXME-HANS: wrong return code name.  Change them all.
72861 +           -ENOENT  - neighbor is not in cache, what is detected by sibling
72862 +                      link absence.
72863 +
72864 +            -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
72865 +                       found (because we are left-/right- most node of the
72866 +                      tree, for example). Also, this return code is for
72867 +                      reiser4_get_parent() when we see no parent link -- it
72868 +                      means that our node is root node.
72869 +
72870 +            -E_DEADLOCK - deadlock detected (request from high-priority process
72871 +                      received). Other error codes conform to
72872 +                      /usr/include/asm/errno.h .
72873 +*/
72874 +
72875 +int
72876 +reiser4_get_parent_flags(lock_handle * result, znode * node,
72877 +                        znode_lock_mode mode, int flags);
72878 +
72879 +/* bit definitions for the `flags' argument of reiser4_get_neighbor() */
72880 +typedef enum {
72881 +       /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
72882 +        * find a not allocated, not connected neighbor by going through upper
72883 +        * levels */
72884 +       GN_CAN_USE_UPPER_LEVELS = 0x1,
72885 +       /* lock the left neighbor instead of the right one */
72886 +       GN_GO_LEFT = 0x2,
72887 +       /* automatically load neighbor node content */
72888 +       GN_LOAD_NEIGHBOR = 0x4,
72889 +       /* return -E_REPEAT if the lock cannot be taken */
72890 +       GN_TRY_LOCK = 0x8,
72891 +       /* used internally in tree_walk.c, causes renew_sibling to not
72892 +          allocate neighbor znode, but only search for it in znode cache */
72893 +       GN_NO_ALLOC = 0x10,
72894 +       /* do not go across atom boundaries */
72895 +       GN_SAME_ATOM = 0x20,
72896 +       /* allow locking of not connected nodes */
72897 +       GN_ALLOW_NOT_CONNECTED = 0x40,
72898 +       /* avoid synchronous jload; instead, call jstartio() and return -E_REPEAT. */
72899 +       GN_ASYNC = 0x80
72900 +} znode_get_neigbor_flags;
72901 +
72902 +/* A commonly used wrapper: reiser4_get_parent_flags() called with GN_ALLOW_NOT_CONNECTED as its flags. */
72903 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
72904 +                                    znode_lock_mode mode)
72905 +{
72906 +       return reiser4_get_parent_flags(result, node, mode,
72907 +                                       GN_ALLOW_NOT_CONNECTED);
72908 +}
72909 +
72910 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72911 +                        znode_lock_mode lock_mode, int flags);
72912 +
72913 +/* wrappers for the most common usages of reiser4_get_neighbor(); they only force or clear GN_GO_LEFT in @flags */
72914 +static inline int
72915 +reiser4_get_left_neighbor(lock_handle * result, znode * node,
72916 +                         znode_lock_mode lock_mode, int flags)
72917 +{
72918 +       /* @lock_mode is typed znode_lock_mode to match reiser4_get_neighbor() */
72919 +       return reiser4_get_neighbor(result, node, lock_mode, flags | GN_GO_LEFT);
72920 +}
72921 +
72922 +static inline int
72923 +reiser4_get_right_neighbor(lock_handle * result, znode * node,
72924 +                          znode_lock_mode lock_mode, int flags)
72925 +{
72926 +       ON_DEBUG(check_lock_node_data(node));
72927 +       ON_DEBUG(check_lock_data());
72928 +       /* @lock_mode is typed znode_lock_mode to match reiser4_get_neighbor() */
72929 +       return reiser4_get_neighbor(result, node, lock_mode, flags & (~GN_GO_LEFT));
72930 +}
72931 +
72932 +extern void sibling_list_remove(znode * node);
72933 +extern void sibling_list_drop(znode * node);
72934 +extern void sibling_list_insert_nolock(znode * new, znode * before);
72935 +extern void link_left_and_right(znode * left, znode * right);
72936 +
72937 +/* Callbacks invoked by tree_walk(); each comment below completes "called when tree_walk() ..." */
72938 +struct tree_walk_actor {
72939 +       /* ... meets a formatted node, */
72940 +       int (*process_znode) (tap_t *, void *);
72941 +       /* ... meets an extent, */
72942 +       int (*process_extent) (tap_t *, void *);
72943 +       /* ... begins tree traversal, or repeats it after -E_REPEAT was returned by
72944 +        * the node or extent processing functions. */
72945 +       int (*before) (void *);
72946 +};
72947 +
72948 +#if REISER4_DEBUG
72949 +int check_sibling_list(znode * node);
72950 +#else
72951 +#define check_sibling_list(n) (1)
72952 +#endif
72953 +
72954 +#endif                         /* __FS_REISER4_TREE_WALK_H__ */
72955 +
72956 +/*
72957 +   Local variables:
72958 +   c-indentation-style: "K&R"
72959 +   mode-name: "LC"
72960 +   c-basic-offset: 8
72961 +   tab-width: 8
72962 +   fill-column: 120
72963 +   End:
72964 +*/
72965 diff --git a/fs/reiser4/txnmgr.c b/fs/reiser4/txnmgr.c
72966 new file mode 100644
72967 index 0000000..ae3f9f2
72968 --- /dev/null
72969 +++ b/fs/reiser4/txnmgr.c
72970 @@ -0,0 +1,3158 @@
72971 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72972 + * reiser4/README */
72973 +
72974 +/* Joshua MacDonald wrote the first draft of this code. */
72975 +
72976 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
72977 +filesystem scales only as well as its worst locking design.  You need to
72978 +substantially restructure this code. Josh was not as experienced a programmer
72979 +as you.  Particularly review how the locking style differs from what you did
72980 +for znodes using hi-lo priority locking, and present to me an opinion on
72981 +whether the differences are well founded.  */
72982 +
72983 +/* I cannot help but to disagree with the sentiment above. Locking of
72984 + * transaction manager is _not_ badly designed, and, at the very least, is not
72985 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
72986 + * locking on znodes, especially on the root node of the tree. --nikita,
72987 + * 2003.10.13 */
72988 +
72989 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles.  The
72990 +   txnmgr processes capture_block requests and manages the relationship between jnodes and
72991 +   atoms through the various stages of a transcrash, and it also oversees the fusion and
72992 +   capture-on-copy processes.  The main difficulty with this task is maintaining a
72993 +   deadlock-free lock ordering between atoms and jnodes/handles.  The reason for the
72994 +   difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
72995 +   must be broken.  The main requirement is that atom-fusion be deadlock free, so once you
72996 +   hold the atom_lock you may then wait to acquire any jnode or handle lock.  This implies
72997 +   that any time you check the atom-pointer of a jnode or handle and then try to lock that
72998 +   atom, you must use trylock() and possibly reverse the order.
72999 +
73000 +   This code implements the design documented at:
73001 +
73002 +     http://namesys.com/txn-doc.html
73003 +
73004 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
73005 +above document and reference the new.  Be sure to provide some credit to Josh.  I already have some writings on this
73006 +topic in v4.html, but they are lacking in details present in the above.  Cure that.  Remember to write for the bright 12
73007 +year old --- define all technical terms used.
73008 +
73009 +*/
73010 +
73011 +/* Thoughts on the external transaction interface:
73012 +
73013 +   In the current code, a TRANSCRASH handle is created implicitly by init_context() (which
73014 +   creates state that lasts for the duration of a system call and is called at the start
73015 +   of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
73016 +   occupying the scope of a single system call.  We wish to give certain applications an
73017 +   interface to begin and close (commit) transactions.  Since our implementation of
73018 +   transactions does not yet support isolation, allowing an application to open a
73019 +   transaction implies trusting it to later close the transaction.  Part of the
73020 +   transaction interface will be aimed at enabling that trust, but the interface for
73021 +   actually using transactions is fairly narrow.
73022 +
73023 +   BEGIN_TRANSCRASH: Returns a transcrash identifier.  It should be possible to translate
73024 +   this identifier into a string that a shell-script could use, allowing you to start a
73025 +   transaction by issuing a command.  Once open, the transcrash should be set in the task
73026 +   structure, and there should be options (I suppose) to allow it to be carried across
73027 +   fork/exec.  A transcrash has several options:
73028 +
73029 +     - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
73030 +     on writes (WRITE_FUSING) and allow "dirty reads".  If the application wishes to
73031 +     capture on reads as well, it should set READ_FUSING.
73032 +
73033 +     - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
73034 +     eventually close (or else the machine must crash).  If the application dies an
73035 +     unexpected death with an open transcrash, for example, or if it hangs for a long
73036 +     duration, one solution (to avoid crashing the machine) is to simply close it anyway.
73037 +     This is a dangerous option, but it is one way to solve the problem until isolated
73038 +     transcrashes are available for untrusted applications.
73039 +
73040 +     It seems to be what databases do, though it is unclear how one avoids a DoS attack
73041 +     creating a vulnerability based on resource starvation.  Guaranteeing that some
73042 +     minimum amount of computational resources are made available would seem more correct
73043 +     than guaranteeing some amount of time.  When we again have someone to code the work,
73044 +     this issue should be considered carefully.  -Hans
73045 +
73046 +   RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
73047 +   many dirty blocks it expects.  The reserve_blocks interface should be called at a point
73048 +   where it is safe for the application to fail, because the system may not be able to
73049 +   grant the allocation and the application must be able to back-out.  For this reason,
73050 +   the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
73051 +   the application may also wish to extend the allocation after beginning its transcrash.
73052 +
73053 +   CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
73054 +   modifications that require transaction protection.  When isolated transactions are
73055 +   supported the CLOSE operation is replaced by either COMMIT or ABORT.  For example, if a
73056 +   RESERVE_BLOCKS call fails for the application, it should "abort" by calling
73057 +   CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
73058 +   why, for safety, the application should call RESERVE_BLOCKS before making any changes).
73059 +
73060 +   For actually implementing these out-of-system-call-scoped transcrashes, the
73061 +   reiser4_context has a "txn_handle *trans" pointer that may be set to an open
73062 +   transcrash.  Currently there are no dynamically-allocated transcrashes, but there is a
73063 +   "kmem_cache_t *_txnh_slab" created for that purpose in this file.
73064 +*/
73065 +
73066 +/* Extending the other system call interfaces for future transaction features:
73067 +
73068 +   Specialized applications may benefit from passing flags to the ordinary system call
73069 +   interface such as read(), write(), or stat().  For example, the application specifies
73070 +   WRITE_FUSING by default but wishes to add that a certain read() command should be
73071 +   treated as READ_FUSING.  But which read?  Is it the directory-entry read, the stat-data
73072 +   read, or the file-data read?  These issues are straight-forward, but there are a lot of
73073 +   them and adding the necessary flags-passing code will be tedious.
73074 +
73075 +   When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
73076 +   flag, which specifies that although it is a read operation being requested, a
73077 +   write-lock should be taken.  The reason is that read-locks are shared while write-locks
73078 +   are exclusive, so taking a read-lock when a later-write is known in advance will often
73079 +   lead to deadlock.  If a reader knows it will write later, it should issue read
73080 +   requests with the RMW flag set.
73081 +*/
73082 +
73083 +/*
73084 +   The znode/atom deadlock avoidance.
73085 +
73086 +   FIXME(Zam): writing of this comment is in progress.
73087 +
73088 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
73089 +   locking of atoms, which makes the reiser4 locking scheme more complex.  It had
73090 +   deadlocks until we implemented deadlock avoidance algorithms.  Those deadlocks
73091 +   looked as follows: one stopped thread waits for a long-term lock on a znode,
73092 +   while the thread that owns that lock waits until fusion with another atom is
73093 +   allowed.
73094 +
73095 +   The source of the deadlocks is an optimization of not capturing index nodes
73096 +   for read.  Let's prove it.  Suppose we have dumb node capturing scheme which
73097 +   unconditionally captures each block before locking it.
73098 +
73099 +   That scheme has no deadlocks.  Let's begin with a thread whose stage is
73100 +   ASTAGE_CAPTURE_WAIT and which waits for a znode lock.  The thread can't wait
73101 +   for a capture because its stage allows fusion with any atom except those
73102 +   that are currently being committed.  The atom commit process can't deadlock
73103 +   because the commit procedure does not acquire locks and does not fuse with
73104 +   other atoms.  Reiser4 does capturing right before going to sleep inside the
73105 +   longterm_lock_znode() function, which means the znode we want to lock is
73106 +   already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage.  If we
73107 +   continue the analysis we see that no process in the sequence can be
73108 +   waiting for atom fusion.  Thus there are no deadlocks of the described kind.
73109 +
73110 +   The capturing optimization makes the deadlocks possible.  A thread can wait
73111 +   for a lock whose owner did not capture that node.  The lock owner's current
73112 +   atom is not fused with the first atom and does not get the ASTAGE_CAPTURE_WAIT
73113 +   state.  A deadlock is possible when that atom meets another one which is
73114 +   already in ASTAGE_CAPTURE_WAIT.
73115 +
73116 +   The deadlock avoidance scheme includes two algorithms:
73117 +
73118 +   First algorithm is used when a thread captures a node which is locked but not
73119 +   captured by another thread.  Those nodes are marked MISSED_IN_CAPTURE at the
73120 +   moment we skip their capturing.  If such a node (marked MISSED_IN_CAPTURE) is
73121 +   being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the
73122 +   routine which forces all lock owners to join with current atom is executed.
73123 +
73124 +   Second algorithm does not allow to skip capturing of already captured nodes.
73125 +
73126 +   Both algorithms together prevent waiting for a longterm lock without atom
73127 +   fusion with the atoms of all lock owners; such waiting is the key ingredient
73128 +   of atom/znode locking deadlocks.
73129 +*/
73130 +
73131 +/*
73132 + * Transactions and mmap(2).
73133 + *
73134 + *     1. Transactions are not supported for accesses through mmap(2), because
73135 + *     this would effectively amount to user-level transactions whose duration
73136 + *     is beyond control of the kernel.
73137 + *
73138 + *     2. That said, we still want to preserve some decency with regard to
73139 + *     mmap(2). During normal write(2) call, following sequence of events
73140 + *     happens:
73141 + *
73142 + *         1. page is created;
73143 + *
73144 + *         2. jnode is created, dirtied and captured into current atom.
73145 + *
73146 + *         3. extent is inserted and modified.
73147 + *
73148 + *     Steps (2) and (3) take place under long term lock on the twig node.
73149 + *
73150 + *     When file is accessed through mmap(2) page is always created during
73151 + *     page fault. After this (in reiser4_readpage()->readpage_extent()):
73152 + *
73153 + *         1. if access is made to non-hole page new jnode is created, (if
73154 + *         necessary)
73155 + *
73156 + *         2. if access is made to the hole page, jnode is not created (XXX
73157 + *         not clear why).
73158 + *
73159 + *     Also, even if page is created by write page fault it is not marked
73160 + *     dirty immediately by handle_mm_fault(). Probably this is to avoid races
73161 + *     with page write-out.
73162 + *
73163 + *     Dirty bit installed by hardware is only transferred to the struct page
73164 + *     later, when page is unmapped (in zap_pte_range(), or
73165 + *     try_to_unmap_one()).
73166 + *
73167 + *     So, with mmap(2) we have to handle following irksome situations:
73168 + *
73169 + *         1. there exists modified page (clean or dirty) without jnode
73170 + *
73171 + *         2. there exists modified page (clean or dirty) with clean jnode
73172 + *
73173 + *         3. clean page which is a part of atom can be transparently modified
73174 + *         at any moment through mapping without becoming dirty.
73175 + *
73176 + *     (1) and (2) can lead to the out-of-memory situation: ->writepage()
73177 + *     doesn't know what to do with such pages and ->sync_sb()/->writepages()
73178 + *     don't see them, because these methods operate on atoms.
73179 + *
73180 + *     (3) can lead to the loss of data: suppose we have dirty page with dirty
73181 + *     captured jnode captured by some atom. As part of early flush (for
73182 + *     example) page was written out. Dirty bit was cleared on both page and
73183 + *     jnode. After this page is modified through mapping, but kernel doesn't
73184 + *     notice and just discards page and jnode as part of commit. (XXX
73185 + *     actually it doesn't, because to reclaim page ->releasepage() has to be
73186 + *     called and before this dirty bit will be transferred to the struct
73187 + *     page).
73188 + *
73189 + */
73190 +
73191 +#include "debug.h"
73192 +#include "txnmgr.h"
73193 +#include "jnode.h"
73194 +#include "znode.h"
73195 +#include "block_alloc.h"
73196 +#include "tree.h"
73197 +#include "wander.h"
73198 +#include "ktxnmgrd.h"
73199 +#include "super.h"
73200 +#include "page_cache.h"
73201 +#include "reiser4.h"
73202 +#include "vfs_ops.h"
73203 +#include "inode.h"
73204 +#include "flush.h"
73205 +
73206 +#include <asm/atomic.h>
73207 +#include <linux/types.h>
73208 +#include <linux/fs.h>
73209 +#include <linux/mm.h>
73210 +#include <linux/slab.h>
73211 +#include <linux/pagemap.h>
73212 +#include <linux/writeback.h>
73213 +#include <linux/swap.h>                /* for totalram_pages */
73214 +
73215 +static void atom_free(txn_atom * atom);
73216 +
73217 +static int commit_txnh(txn_handle * txnh);
73218 +
73219 +static void wakeup_atom_waitfor_list(txn_atom * atom);
73220 +static void wakeup_atom_waiting_list(txn_atom * atom);
73221 +
73222 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
73223 +
73224 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
73225 +
73226 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
73227 +
73228 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
73229 +                              txn_capture mode);
73230 +
73231 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
73232 +
73233 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
73234 +
73235 +void invalidate_list(struct list_head *);
73236 +
73237 +/* GENERIC STRUCTURES */
73238 +
73239 +typedef struct _txn_wait_links txn_wait_links;
73240 +/* NOTE(review): presumably one record per waiter, linked onto an atom's wait lists through the two list_head members; the callbacks' contract is not visible here -- confirm */
73241 +struct _txn_wait_links {
73242 +       lock_stack *_lock_stack;
73243 +       struct list_head _fwaitfor_link;
73244 +       struct list_head _fwaiting_link;
73245 +       int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
73246 +       int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
73247 +};
73248 +
73249 +/* FIXME: In theory, we should be using the slab cache init & destructor
73250 +   methods instead of, e.g., jnode_init, etc. */
73251 +static kmem_cache_t *_atom_slab = NULL; /* slab cache of txn_atom objects, created in init_txnmgr_static() */
73252 +/* Slab cache of txn_handle objects; this is for user-visible, cross system-call transactions. */
73253 +static kmem_cache_t *_txnh_slab = NULL;
73254 +
73255 +/**
73256 + * init_txnmgr_static - create transaction manager slab caches
73257 + *
73258 + * Creates the "txn_atom" and "txn_handle" slab caches; part of reiser4 module
73259 + * initialization. Returns 0, or RETERR(-ENOMEM) if either cache is not created.
73260 + */
73261 +int init_txnmgr_static(void)
73262 +{
73263 +       assert("jmacd-600", _atom_slab == NULL);
73264 +       assert("jmacd-601", _txnh_slab == NULL);
73265 +
73266 +       ON_DEBUG(atomic_set(&flush_cnt, 0));
73267 +
73268 +       _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
73269 +                                      SLAB_HWCACHE_ALIGN |
73270 +                                      SLAB_RECLAIM_ACCOUNT, NULL, NULL);
73271 +       if (_atom_slab == NULL)
73272 +               return RETERR(-ENOMEM);
73273 +
73274 +       _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
73275 +                             SLAB_HWCACHE_ALIGN, NULL, NULL);
73276 +       if (_txnh_slab == NULL) {
73277 +               kmem_cache_destroy(_atom_slab); /* undo the first cache so nothing is left half-initialized */
73278 +               _atom_slab = NULL; /* keeps the "== NULL" asserts above valid for a later retry */
73279 +               return RETERR(-ENOMEM);
73280 +       }
73281 +
73282 +       return 0;
73283 +}
73284 +
73285 +/**
73286 + * done_txnmgr_static - delete txn_atom and txn_handle caches
73287 + *
73288 + * Called on reiser4 module unload or system shutdown; hands both cache pointers to destroy_reiser4_cache().
73289 + */
73290 +void done_txnmgr_static(void)
73291 +{
73292 +       destroy_reiser4_cache(&_atom_slab);
73293 +       destroy_reiser4_cache(&_txnh_slab);
73294 +}
73295 +
73296 +/**
73297 + * init_txnmgr - initialize a new transaction manager
73298 + * @mgr: pointer to transaction manager embedded in reiser4 super block
73299 + *
73300 + * Called on mount. Resets the atom counters and list, tmgr_lock and commit_semaphore.
73301 + */
73302 +void init_txnmgr(txn_mgr *mgr)
73303 +{
73304 +       assert("umka-169", mgr != NULL);
73305 +
73306 +       mgr->atom_count = 0;
73307 +       mgr->id_count = 1; /* next atom_id to hand out; atoms take it via id_count++ */
73308 +       INIT_LIST_HEAD(&mgr->atoms_list);
73309 +       spin_lock_init(&mgr->tmgr_lock);
73310 +       sema_init(&mgr->commit_semaphore, 1); /* count of 1: taken with down()/up() around log writing at commit */
73311 +}
73312 +
73313 +/**
73314 + * done_txnmgr - stop transaction manager
73315 + * @mgr: pointer to transaction manager embedded in reiser4 super block
73316 + *
73317 + * Called on umount. Only asserts that no atoms remain; nothing is released here.
73318 + */
73319 +void done_txnmgr(txn_mgr *mgr)
73320 +{
73321 +       assert("umka-170", mgr != NULL);
73322 +       assert("umka-1701", list_empty_careful(&mgr->atoms_list));
73323 +       assert("umka-1702", mgr->atom_count == 0);
73324 +}
73325 +
73326 +/* Initialize a transaction handle: record @mode, no atom, no flags, fresh hlock and empty txnh_link. */
73327 +/* Audited by: umka (2002.06.13) */
73328 +static void txnh_init(txn_handle * txnh, txn_mode mode)
73329 +{
73330 +       assert("umka-171", txnh != NULL);
73331 +
73332 +       txnh->mode = mode;
73333 +       txnh->atom = NULL;
73334 +       set_gfp_mask(); /* NOTE(review): defined elsewhere; its effect is not visible from this function */
73335 +       txnh->flags = 0;
73336 +       spin_lock_init(&txnh->hlock);
73337 +       INIT_LIST_HEAD(&txnh->txnh_link);
73338 +}
73339 +
73340 +#if REISER4_DEBUG
73341 +/* Debug-only check: a handle is clean when it has no atom and LOCK_CNT_NIL(spin_locked_txnh) holds. */
73342 +static int txnh_isclean(txn_handle * txnh)
73343 +{
73344 +       assert("umka-172", txnh != NULL);
73345 +       return txnh->atom == NULL &&
73346 +               LOCK_CNT_NIL(spin_locked_txnh);
73347 +}
73348 +#endif
73349 +
73350 +/* Initialize an atom: zero it, mark it ASTAGE_FREE, stamp start_time and set up all its lists and lock. */
73351 +static void atom_init(txn_atom * atom)
73352 +{
73353 +       int level;
73354 +
73355 +       assert("umka-173", atom != NULL);
73356 +       /* zeroing first means every counter and flag not set below starts at 0 */
73357 +       memset(atom, 0, sizeof(txn_atom));
73358 +
73359 +       atom->stage = ASTAGE_FREE;
73360 +       atom->start_time = jiffies; /* later compared against tmgr.atom_max_age in atom_is_dotard() */
73361 +       /* one dirty list per index 0..REAL_MAX_ZTREE_HEIGHT */
73362 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
73363 +               INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73364 +
73365 +       INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73366 +       INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73367 +       INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73368 +       INIT_LIST_HEAD(&atom->inodes);
73369 +       spin_lock_init(&atom->alock);
73370 +       /* list of transaction handles */
73371 +       INIT_LIST_HEAD(&atom->txnh_list);
73372 +       /* link to transaction manager's list of atoms */
73373 +       INIT_LIST_HEAD(&atom->atom_link);
73374 +       INIT_LIST_HEAD(&atom->fwaitfor_list);
73375 +       INIT_LIST_HEAD(&atom->fwaiting_list);
73376 +       blocknr_set_init(&atom->delete_set);
73377 +       blocknr_set_init(&atom->wandered_map);
73378 +
73379 +       init_atom_fq_parts(atom);
73380 +}
73381 +
73382 +#if REISER4_DEBUG
73383 +/* Debug-only check that an atom is in its pristine state: ASTAGE_FREE, zero counters, all lists empty. */
73384 +static int atom_isclean(txn_atom * atom)
73385 +{
73386 +       int level;
73387 +
73388 +       assert("umka-174", atom != NULL);
73389 +       /* any node left on a per-level dirty list makes the atom not clean */
73390 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73391 +               if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73392 +                       return 0;
73393 +               }
73394 +       }
73395 +
73396 +       return  atom->stage == ASTAGE_FREE &&
73397 +               atom->txnh_count == 0 &&
73398 +               atom->capture_count == 0 &&
73399 +               atomic_read(&atom->refcount) == 0 &&
73400 +               (&atom->atom_link == atom->atom_link.next && /* atom_link points at itself ... */
73401 +                &atom->atom_link == atom->atom_link.prev) && /* ... i.e. it is not linked into any list */
73402 +               list_empty_careful(&atom->txnh_list) &&
73403 +               list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73404 +               list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73405 +               list_empty_careful(ATOM_WB_LIST(atom)) &&
73406 +               list_empty_careful(&atom->fwaitfor_list) &&
73407 +               list_empty_careful(&atom->fwaiting_list) &&
73408 +               atom_fq_parts_are_clean(atom);
73409 +}
73410 +#endif
73411 +
73412 +/* Begin a transaction in this context.  Currently this uses the reiser4_context's
73413 +   trans_in_ctx, i.e. the handle is embedded in the context itself.  Eventually
73414 +   this will be extended to allow transaction handles to span several contexts. */
73415 +/* Audited by: umka (2002.06.13) */
73416 +void txn_begin(reiser4_context * context)
73417 +{
73418 +       assert("jmacd-544", context->trans == NULL);
73419 +
73420 +       context->trans = &context->trans_in_ctx;
73421 +
73422 +       /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73423 +          transcrash.  Default should be TXN_WRITE_FUSING.  Also, the handle lives
73424 +          inside the context right now, but we would like to allow for dynamically
73425 +          allocated transcrashes that span multiple system calls.
73426 +        */
73427 +       txnh_init(context->trans, TXN_WRITE_FUSING);
73428 +}
73429 +
73430 +/* Finish the context's transaction handle; returns commit_txnh()'s result, or 0 if there was nothing to commit. */
73431 +int txn_end(reiser4_context * context)
73432 +{
73433 +       int ret = 0; /* int, matching both commit_txnh() and this function's return type */
73434 +       txn_handle *txnh;
73435 +
73436 +       assert("umka-283", context != NULL);
73437 +       assert("nikita-3012", schedulable());
73438 +       assert("vs-24", context == get_current_context());
73439 +       assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73440 +       /* only a handle that is attached to an atom needs committing */
73441 +       txnh = context->trans;
73442 +       if (txnh != NULL) {
73443 +               if (txnh->atom != NULL)
73444 +                       ret = commit_txnh(txnh);
73445 +               assert("jmacd-633", txnh_isclean(txnh));
73446 +               context->trans = NULL; /* lets a following txn_begin() pass its "trans == NULL" assert */
73447 +       }
73448 +       return ret;
73449 +}
73450 +/* End the context's current transaction and begin a new one, with a preemption point in between. */
73451 +void txn_restart(reiser4_context * context)
73452 +{
73453 +       txn_end(context); /* return value (commit result) is not propagated */
73454 +       preempt_point();
73455 +       txn_begin(context);
73456 +}
73457 +/* Convenience wrapper: txn_restart() on the context returned by get_current_context(). */
73458 +void txn_restart_current(void)
73459 +{
73460 +       txn_restart(get_current_context());
73461 +}
73462 +
73463 +/* TXN_ATOM */
73464 +
73465 +/* Get the atom belonging to a txnh, which is not locked.  Return txnh locked. Locks atom, if atom
73466 +   is not NULL.  This performs the necessary spin_trylock to break the lock-ordering cycle.  May
73467 +   return NULL (then only the txnh is locked on return). */
73468 +static txn_atom *txnh_get_atom(txn_handle * txnh)
73469 +{
73470 +       txn_atom *atom;
73471 +
73472 +       assert("umka-180", txnh != NULL);
73473 +       assert_spin_not_locked(&(txnh->hlock));
73474 +
73475 +       while (1) {
73476 +               spin_lock_txnh(txnh);
73477 +               atom = txnh->atom;
73478 +
73479 +               if (atom == NULL)
73480 +                       break;
73481 +
73482 +               if (spin_trylock_atom(atom))
73483 +                       break;
73484 +               /* trylock failed: pin the atom so it survives while the txnh lock is dropped */
73485 +               atomic_inc(&atom->refcount);
73486 +               /* re-take both locks, atom first */
73487 +               spin_unlock_txnh(txnh);
73488 +               spin_lock_atom(atom);
73489 +               spin_lock_txnh(txnh);
73490 +               /* handle still points at the same atom: drop the temporary reference and finish */
73491 +               if (txnh->atom == atom) {
73492 +                       atomic_dec(&atom->refcount);
73493 +                       break;
73494 +               }
73495 +               /* handle's atom changed meanwhile: release everything and retry */
73496 +               spin_unlock_txnh(txnh);
73497 +               atom_dec_and_unlock(atom);
73498 +       }
73499 +
73500 +       return atom;
73501 +}
73502 +
73503 +/* Get the current context's atom and return it spin-locked, or NULL if the handle has no atom. */
73504 +txn_atom *get_current_atom_locked_nocheck(void)
73505 +{
73506 +       reiser4_context *cx;
73507 +       txn_atom *atom;
73508 +       txn_handle *txnh;
73509 +
73510 +       cx = get_current_context();
73511 +       assert("zam-437", cx != NULL);
73512 +
73513 +       txnh = cx->trans;
73514 +       assert("zam-435", txnh != NULL);
73515 +
73516 +       atom = txnh_get_atom(txnh);
73517 +       /* txnh_get_atom() returns with the txnh locked; only the atom lock is kept for the caller */
73518 +       spin_unlock_txnh(txnh);
73519 +       return atom;
73520 +}
73521 +
73522 +/* Get the atom belonging to a jnode, which is initially locked.  Return with
73523 +   both jnode and atom locked.  This performs the necessary spin_trylock to
73524 +   break the lock-ordering cycle.  Assumes the jnode is already locked, and
73525 +   returns NULL (jnode still locked) if atom is not set. */
73526 +txn_atom *jnode_get_atom(jnode * node)
73527 +{
73528 +       txn_atom *atom;
73529 +
73530 +       assert("umka-181", node != NULL);
73531 +
73532 +       while (1) {
73533 +               assert_spin_locked(&(node->guard));
73534 +
73535 +               atom = node->atom;
73536 +               /* node is not in any atom */
73537 +               if (atom == NULL)
73538 +                       break;
73539 +
73540 +               /* If atom is not locked, grab the lock and return */
73541 +               if (spin_trylock_atom(atom))
73542 +                       break;
73543 +
73544 +               /* At least one jnode belongs to this atom, which guarantees that
73545 +                * atom->refcount > 0, so we can safely increment refcount. */
73546 +               atomic_inc(&atom->refcount);
73547 +               spin_unlock_jnode(node);
73548 +
73549 +               /* re-acquire spin locks in the right order */
73550 +               spin_lock_atom(atom);
73551 +               spin_lock_jnode(node);
73552 +
73553 +               /* check if node still points to the same atom. */
73554 +               if (node->atom == atom) {
73555 +                       atomic_dec(&atom->refcount);
73556 +                       break;
73557 +               }
73558 +
73559 +               /* releasing of atom lock and reference requires not holding
73560 +                * locks on jnodes.  */
73561 +               spin_unlock_jnode(node);
73562 +
73563 +               /* We are not sure that this atom has references other than
73564 +                * ours, so we must call the function which frees the atom if
73565 +                * the last reference is released. */
73566 +               atom_dec_and_unlock(atom);
73567 +
73568 +               /* lock jnode again for getting valid node->atom pointer
73569 +                * value. */
73570 +               spin_lock_jnode(node);
73571 +       }
73572 +
73573 +       return atom;
73574 +}
73575 +
73576 +/* Returns true if @check is dirty and in the same atom as @node (and, for a znode, connected).
73577 +   Used by flush code to decide whether the next node (in some direction) is suitable for
73578 +   flushing.  If @alloc_check is set, jnode_is_flushprepped(@check) must equal @alloc_value. */
73579 +int
73580 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73581 +{
73582 +       int compat;
73583 +       txn_atom *atom;
73584 +
73585 +       assert("umka-182", node != NULL);
73586 +       assert("umka-183", check != NULL);
73587 +
73588 +       /* Not sure what this function is supposed to do if supplied with @check that is
73589 +          neither formatted nor unformatted (bitmap or so). */
73590 +       assert("nikita-2373", jnode_is_znode(check)
73591 +              || jnode_is_unformatted(check));
73592 +
73593 +       /* Need a lock on CHECK to get its atom and to check various state bits.
73594 +          Don't need a lock on NODE once we get the atom lock. */
73595 +       /* It is not enough to lock two nodes and check (node->atom ==
73596 +          check->atom) because atom could be locked and being fused at that
73597 +          moment, jnodes of the atom of that state (being fused) can point to
73598 +          different objects, but the atom is the same. */
73599 +       spin_lock_jnode(check);
73600 +
73601 +       atom = jnode_get_atom(check);
73602 +       /* a @check that belongs to no atom is never compatible */
73603 +       if (atom == NULL) {
73604 +               compat = 0;
73605 +       } else {
73606 +               compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73607 +
73608 +               if (compat && jnode_is_znode(check)) {
73609 +                       compat &= znode_is_connected(JZNODE(check));
73610 +               }
73611 +
73612 +               if (compat && alloc_check) {
73613 +                       compat &= (alloc_value == jnode_is_flushprepped(check));
73614 +               }
73615 +               /* jnode_get_atom() returned the atom locked */
73616 +               spin_unlock_atom(atom);
73617 +       }
73618 +
73619 +       spin_unlock_jnode(check);
73620 +
73621 +       return compat;
73622 +}
73623 +
73624 +/* Decrement the atom's reference count and if it falls to zero, free it. */
73625 +void atom_dec_and_unlock(txn_atom * atom)
73626 +{
73627 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73628 +       /* Entered with the atom locked (asserted below); every path releases that lock. */
73629 +       assert("umka-186", atom != NULL);
73630 +       assert_spin_locked(&(atom->alock));
73631 +       assert("zam-1039", atomic_read(&atom->refcount) > 0);
73632 +
73633 +       if (atomic_dec_and_test(&atom->refcount)) {
73634 +               /* take txnmgr lock and atom lock in proper order. */
73635 +               if (!spin_trylock_txnmgr(mgr)) {
73636 +                       /* This atom should exist after we re-acquire its
73637 +                        * spinlock, so we increment its reference counter. */
73638 +                       atomic_inc(&atom->refcount);
73639 +                       spin_unlock_atom(atom);
73640 +                       spin_lock_txnmgr(mgr);
73641 +                       spin_lock_atom(atom);
73642 +                       /* someone took a new reference while the atom was unlocked: do not free it */
73643 +                       if (!atomic_dec_and_test(&atom->refcount)) {
73644 +                               spin_unlock_atom(atom);
73645 +                               spin_unlock_txnmgr(mgr);
73646 +                               return;
73647 +                       }
73648 +               }
73649 +               assert_spin_locked(&(mgr->tmgr_lock));
73650 +               atom_free(atom); /* drops the atom lock itself before freeing */
73651 +               spin_unlock_txnmgr(mgr);
73652 +       } else
73653 +               spin_unlock_atom(atom);
73654 +}
73655 +
73656 +/* Create new atom and connect it to given transaction handle.  This adds the
73657 +   atom to the transaction manager's list and sets its reference count to 1, an
73658 +   artificial reference which is kept until it commits.  We play strange games
73659 +   to avoid allocation under jnode & txnh spinlocks.*/
73660 +/* Never returns 0: -ENOMEM if allocation fails, otherwise -E_REPEAT (atom assigned, or txnh already had one). */
73661 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73662 +{
73663 +       txn_atom *atom;
73664 +       txn_mgr *mgr;
73665 +
73666 +       if (REISER4_DEBUG && rofs_tree(current_tree)) {
73667 +               warning("nikita-3366", "Creating atom on rofs");
73668 +               dump_stack();
73669 +       }
73670 +       /* reuse an atom the caller already allocated, if any; allocate before taking spinlocks */
73671 +       if (*atom_alloc == NULL) {
73672 +               (*atom_alloc) = kmem_cache_alloc(_atom_slab, get_gfp_mask());
73673 +
73674 +               if (*atom_alloc == NULL)
73675 +                       return RETERR(-ENOMEM);
73676 +       }
73677 +
73678 +       /* and, also, txnmgr spin lock should be taken before jnode and txnh
73679 +          locks. */
73680 +       mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73681 +       spin_lock_txnmgr(mgr);
73682 +       spin_lock_txnh(txnh);
73683 +
73684 +       /* Check whether new atom still needed */
73685 +       if (txnh->atom != NULL) {
73686 +               /* NOTE-NIKITA probably it is rather better to free
73687 +                * atom_alloc here than thread it up to try_capture(). */
73688 +               /* *atom_alloc is left set here, so the unused atom stays with the caller */
73689 +               spin_unlock_txnh(txnh);
73690 +               spin_unlock_txnmgr(mgr);
73691 +
73692 +               return -E_REPEAT;
73693 +       }
73694 +       /* take ownership of the preallocated atom; the caller's pointer is cleared */
73695 +       atom = *atom_alloc;
73696 +       *atom_alloc = NULL;
73697 +
73698 +       atom_init(atom);
73699 +
73700 +       assert("jmacd-17", atom_isclean(atom));
73701 +
73702 +        /*
73703 +        * do not use spin_lock_atom because we have broken lock ordering here
73704 +        * which is ok, as long as @atom is new and inaccessible for others.
73705 +        */
73706 +       spin_lock(&(atom->alock));
73707 +
73708 +       /* add atom to the end of transaction manager's list of atoms */
73709 +       list_add_tail(&atom->atom_link, &mgr->atoms_list);
73710 +       atom->atom_id = mgr->id_count++;
73711 +       mgr->atom_count += 1;
73712 +
73713 +       /* Release txnmgr lock */
73714 +       spin_unlock_txnmgr(mgr);
73715 +
73716 +       /* One reference until it commits. */
73717 +       atomic_inc(&atom->refcount);
73718 +       atom->stage = ASTAGE_CAPTURE_FUSE;
73719 +       atom->super = reiser4_get_current_sb();
73720 +       capture_assign_txnh_nolock(atom, txnh);
73721 +
73722 +       spin_unlock(&(atom->alock));
73723 +       spin_unlock_txnh(txnh);
73724 +
73725 +       return -E_REPEAT;
73726 +}
73727 +
73728 +/* Return true if an atom is currently "open": stage is above 0 and below ASTAGE_PRE_COMMIT. */
73729 +static int atom_isopen(const txn_atom * atom)
73730 +{
73731 +       assert("umka-185", atom != NULL);
73732 +
73733 +       return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73734 +}
73735 +
73736 +/* Return the number of pointers to this atom that must be updated during fusion.  This
73737 +   approximates the amount of work to be done.  Fusion chooses the atom with fewer
73738 +   pointers to fuse into the atom with more pointers.  Computed as txnh_count + capture_count. */
73739 +static int atom_pointer_count(const txn_atom * atom)
73740 +{
73741 +       assert("umka-187", atom != NULL);
73742 +
73743 +       /* This is a measure of the amount of work needed to fuse this atom
73744 +        * into another. */
73745 +       return atom->txnh_count + atom->capture_count;
73746 +}
73747 +
73748 +/* Called holding both the atom lock and the txnmgr lock, this removes the atom from the
73749 +   transaction manager list and frees it.  The atom lock is dropped here; the txnmgr lock is not. */
73750 +static void atom_free(txn_atom * atom)
73751 +{
73752 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73753 +
73754 +       assert("umka-188", atom != NULL);
73755 +       assert_spin_locked(&(atom->alock));
73756 +
73757 +       /* Remove from the txn_mgr's atom list */
73758 +       assert_spin_locked(&(mgr->tmgr_lock));
73759 +       mgr->atom_count -= 1;
73760 +       list_del_init(&atom->atom_link);
73761 +
73762 +       /* Clean the atom */
73763 +       assert("jmacd-16",
73764 +              (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73765 +       atom->stage = ASTAGE_FREE;
73766 +
73767 +       blocknr_set_destroy(&atom->delete_set);
73768 +       blocknr_set_destroy(&atom->wandered_map);
73769 +       /* NOTE(review): this assertion reuses the label "jmacd-16" from the stage check above */
73770 +       assert("jmacd-16", atom_isclean(atom));
73771 +       /* unlock before the memory goes back to the slab */
73772 +       spin_unlock_atom(atom);
73773 +
73774 +       kmem_cache_free(_atom_slab, atom);
73775 +}
73776 +/* Return true once jiffies has passed atom->start_time + tmgr.atom_max_age, i.e. the atom is too old. */
73777 +static int atom_is_dotard(const txn_atom * atom)
73778 +{
73779 +       return time_after(jiffies, atom->start_time +
73780 +                         get_current_super_private()->tmgr.atom_max_age);
73781 +}
73782 +/* With the atom locked: true when exactly one of its handles is not a waiter (txnh_count == nr_waiters + 1). */
73783 +static int atom_can_be_committed(txn_atom * atom)
73784 +{
73785 +       assert_spin_locked(&(atom->alock));
73786 +       assert("zam-885", atom->txnh_count > atom->nr_waiters);
73787 +       return atom->txnh_count == atom->nr_waiters + 1;
73788 +}
73789 +
73790 +/* Return true if an atom should commit now: ATOM_FORCE_COMMIT is set, its pointer
73791 +   count exceeds tmgr.atom_max_size, or it is older than tmgr.atom_max_age. */
73792 +static int atom_should_commit(const txn_atom * atom)
73793 +{
73794 +       assert("umka-189", atom != NULL);
73795 +       return
73796 +           (atom->flags & ATOM_FORCE_COMMIT) ||
73797 +           ((unsigned)atom_pointer_count(atom) >
73798 +            get_current_super_private()->tmgr.atom_max_size)
73799 +           || atom_is_dotard(atom);
73800 +}
73801 +
73802 +/* Return the atom_should_commit() verdict for the current atom, or 0 if there is no current atom. */
73803 +int current_atom_should_commit(void)
73804 +{
73805 +       txn_atom *atom;
73806 +       int result = 0;
73807 +
73808 +       atom = get_current_atom_locked_nocheck();
73809 +       if (atom) {
73810 +               result = atom_should_commit(atom);
73811 +               spin_unlock_atom(atom); /* the atom came back locked from the getter above */
73812 +       }
73813 +       return result;
73814 +}
73815 +/* Return true if the atom should be committed urgently: its pinned-page estimate exceeds 1/8 of RAM, or flushed > 100. */
73816 +static int atom_should_commit_asap(const txn_atom * atom)
73817 +{
73818 +       unsigned int captured;
73819 +       unsigned int pinnedpages;
73820 +
73821 +       assert("nikita-3309", atom != NULL);
73822 +       /* NOTE(review): capture_count is shifted right by PAGE_CACHE_SHIFT before the multiply -- confirm this order is intended */
73823 +       captured = (unsigned)atom->capture_count;
73824 +       pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73825 +
73826 +       return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
73827 +}
73828 +/* Return the first jnode on @head (linked via capture_link) that passes the @flags filter, or NULL if none does. */
73829 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
73830 +{
73831 +       jnode *first_dirty;
73832 +
73833 +       list_for_each_entry(first_dirty, head, capture_link) {
73834 +               if (!(flags & JNODE_FLUSH_COMMIT)) {
73835 +                       /*
73836 +                        * not a commit-time flush: skip jnodes which "heard
73837 +                        * banshee" or have active I/O
73838 +                        */
73839 +                       if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
73840 +                           JF_ISSET(first_dirty, JNODE_WRITEBACK))
73841 +                               continue;
73842 +               }
73843 +               return first_dirty;
73844 +       }
73845 +       return NULL;
73846 +}
73847 +
73848 +/* Get first dirty node from the atom's dirty_nodes[n] lists (levels 1 and up first, list 0 last);
73849 +   return NULL if no list yields a node accepted under @flags.  Atom must be locked. */
73850 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
73851 +{
73852 +       jnode *first_dirty;
73853 +       tree_level level;
73854 +
73855 +       assert_spin_locked(&(atom->alock));
73856 +
73857 +       /* The flush starts from LEAF_LEVEL (=1). */
73858 +       for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73859 +               if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
73860 +                       continue;
73861 +
73862 +               first_dirty =
73863 +                   find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
73864 +                                            flags);
73865 +               if (first_dirty)
73866 +                       return first_dirty;
73867 +       }
73868 +
73869 +       /* znode-above-root is on the list #0. */
73870 +       return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
73871 +}
73872 +/* Walk the atom's writeback list: nodes no longer under writeback go to @fq if dirty, else to the clean list. */
73873 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
73874 +{
73875 +       jnode *cur;
73876 +
73877 +       assert("zam-905", atom_is_protected(atom));
73878 +
73879 +       cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
73880 +       while (ATOM_WB_LIST(atom) != &cur->capture_link) {
73881 +               jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); /* fetched first: cur may leave this list below */
73882 +
73883 +               spin_lock_jnode(cur);
73884 +               if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
73885 +                       if (JF_ISSET(cur, JNODE_DIRTY)) {
73886 +                               queue_jnode(fq, cur);
73887 +                       } else {
73888 +                               /* move jnode to atom's clean list */
73889 +                               list_move_tail(&cur->capture_link,
73890 +                                             ATOM_CLEAN_LIST(atom));
73891 +                       }
73892 +               }
73893 +               spin_unlock_jnode(cur);
73894 +
73895 +               cur = next;
73896 +       }
73897 +}
73898 +
73899 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
73900 + * jnodes to disk.  Returns write_fq()'s result, or the error from get_fq_for_current_atom(). */
73901 +static int submit_wb_list(void)
73902 +{
73903 +       int ret;
73904 +       flush_queue_t *fq;
73905 +
73906 +       fq = get_fq_for_current_atom();
73907 +       if (IS_ERR(fq))
73908 +               return PTR_ERR(fq);
73909 +       /* presumably fq->atom is returned locked by the call above, given the unlock below -- TODO confirm */
73910 +       dispatch_wb_list(fq->atom, fq);
73911 +       spin_unlock_atom(fq->atom);
73912 +
73913 +       ret = write_fq(fq, NULL, 1);
73914 +       fq_put(fq);
73915 +
73916 +       return ret;
73917 +}
73918 +
73919 +/* Wait for completion of all writes, re-submitting the atom writeback list before and after the wait. */
73920 +static int current_atom_complete_writes(void)
73921 +{
73922 +       int ret;
73923 +
73924 +       /* Each jnode from that list was modified and dirtied when it had i/o
73925 +        * request running already. After i/o completion we have to resubmit
73926 +        * them to disk again.*/
73927 +       ret = submit_wb_list();
73928 +       if (ret < 0)
73929 +               return ret;
73930 +
73931 +       /* Wait for all i/o to complete */
73932 +       ret = current_atom_finish_all_fq();
73933 +       if (ret)
73934 +               return ret;
73935 +
73936 +       /* Scan wb list again; all i/o should be completed, we re-submit dirty
73937 +        * nodes to disk */
73938 +       ret = submit_wb_list();
73939 +       if (ret < 0)
73940 +               return ret;
73941 +
73942 +       /* Wait for all nodes we just submitted */
73943 +       return current_atom_finish_all_fq();
73944 +}
73945 +
73946 +#define TOOMANYFLUSHES (1 << 13) /* flush-loop iteration count above which commit_current_atom() starts warning */
73947 +
73948 +/* Called with the atom locked and no open "active" transaction handles except
73949 +   ours, this function calls flush_current_atom() until all dirty nodes are
73950 +   processed.  Then it initiates commit processing.
73951 +
73952 +   Called by the single remaining open "active" txnh, which is closing. Other
73953 +   open txnhs belong to processes which wait atom commit in commit_txnh()
73954 +   routine. They are counted as "waiters" in atom->nr_waiters.  Therefore as
73955 +   long as we hold the atom lock none of the jnodes can be captured and/or
73956 +   locked.
73957 +   On success the atom is in ASTAGE_DONE and still locked on return.
73958 +   Return value is an error code if commit fails.
73959 +*/
73960 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
73961 +{
73962 +       reiser4_super_info_data *sbinfo = get_current_super_private();
73963 +       long ret = 0;
73964 +       /* how many times jnode_flush() was called as a part of attempt to
73965 +        * commit this atom. */
73966 +       int flushiters;
73967 +
73968 +       assert("zam-888", atom != NULL && *atom != NULL);
73969 +       assert_spin_locked(&((*atom)->alock));
73970 +       assert("zam-887", get_current_context()->trans->atom == *atom);
73971 +       assert("jmacd-151", atom_isopen(*atom));
73972 +
73973 +       /* lock ordering: delete_sema and commit_sema are unordered */
73974 +       assert("nikita-3184",
73975 +              get_current_super_private()->delete_sema_owner != current);
73976 +       /* keep flushing for as long as flush_current_atom() asks for a repeat */
73977 +       for (flushiters = 0;; ++flushiters) {
73978 +               ret =
73979 +                   flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
73980 +                                      JNODE_FLUSH_COMMIT,
73981 +                                      LONG_MAX /* nr_to_write */ ,
73982 +                                      nr_submitted, atom, NULL);
73983 +               if (ret != -E_REPEAT)
73984 +                       break;
73985 +
73986 +               /* if atom's dirty list contains one znode which is
73987 +                  HEARD_BANSHEE and is locked we have to allow lock owner to
73988 +                  continue and uncapture that znode */
73989 +               preempt_point();
73990 +               /* re-fetch the atom: *atom is refreshed before every retry */
73991 +               *atom = get_current_atom_locked();
73992 +               if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
73993 +                       warning("nikita-3176",
73994 +                               "Flushing like mad: %i", flushiters);
73995 +                       info_atom("atom", *atom);
73996 +                       DEBUGON(flushiters > (1 << 20));
73997 +               }
73998 +       }
73999 +
74000 +       if (ret)
74001 +               return ret;
74002 +
74003 +       assert_spin_locked(&((*atom)->alock));
74004 +       /* other non-waiting handles exist: give up for now, returning with the atom unlocked */
74005 +       if (!atom_can_be_committed(*atom)) {
74006 +               spin_unlock_atom(*atom);
74007 +               return RETERR(-E_REPEAT);
74008 +       }
74009 +       /* nothing captured: skip the write-out and go straight to ASTAGE_DONE */
74010 +       if ((*atom)->capture_count == 0)
74011 +               goto done;
74012 +
74013 +       /* Up to this point we have been flushing and after flush is called we
74014 +          return -E_REPEAT.  Now we can commit.  We cannot return -E_REPEAT
74015 +          at this point, commit should be successful. */
74016 +       atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
74017 +       ON_DEBUG(((*atom)->committer = current));
74018 +       spin_unlock_atom(*atom);
74019 +       /* NOTE: on failure below we return with the atom unlocked and left in ASTAGE_PRE_COMMIT */
74020 +       ret = current_atom_complete_writes();
74021 +       if (ret)
74022 +               return ret;
74023 +
74024 +       assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
74025 +
74026 +       /* isolate critical code path which should be executed by only one
74027 +        * thread using tmgr semaphore */
74028 +       down(&sbinfo->tmgr.commit_semaphore);
74029 +
74030 +       ret = reiser4_write_logs(nr_submitted);
74031 +       if (ret < 0)
74032 +               reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
74033 +
74034 +       /* The atom->ovrwr_nodes list is processed under commit semaphore held
74035 +          because of bitmap nodes which are captured by special way in
74036 +          bitmap_pre_commit_hook(), that way does not include
74037 +          capture_fuse_wait() as a capturing of other nodes does -- the commit
74038 +          semaphore is used for transaction isolation instead. */
74039 +       invalidate_list(ATOM_OVRWR_LIST(*atom));
74040 +       up(&sbinfo->tmgr.commit_semaphore);
74041 +
74042 +       invalidate_list(ATOM_CLEAN_LIST(*atom));
74043 +       invalidate_list(ATOM_WB_LIST(*atom));
74044 +       assert("zam-927", list_empty(&(*atom)->inodes));
74045 +
74046 +       spin_lock_atom(*atom);
74047 + done:
74048 +       atom_set_stage(*atom, ASTAGE_DONE);
74049 +       ON_DEBUG((*atom)->committer = NULL);
74050 +
74051 +       /* Atom's state changes, so wake up everybody waiting for this
74052 +          event. */
74053 +       wakeup_atom_waiting_list(*atom);
74054 +
74055 +       /* Decrement the "until commit" reference, at least one txnh (the caller) is
74056 +          still open. */
74057 +       atomic_dec(&(*atom)->refcount);
74058 +
74059 +       assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
74060 +       assert("jmacd-1062", (*atom)->capture_count == 0);
74061 +       BUG_ON((*atom)->capture_count != 0); /* same condition as the assert above, checked with BUG_ON as well */
74062 +       assert_spin_locked(&((*atom)->alock));
74063 +
74064 +       return ret;
74065 +}
74066 +
74067 +/* TXN_TXNH */
74068 +
74069 +/**
74070 + * force_commit_atom - commit current atom and wait for commit completion
74071 + * @txnh: transaction handle; its ->atom is the atom that gets committed
74072 + *
74073 + * Commits current atom and waits for commit completion; current atom and @txnh
74074 + * have to be spinlocked before call, this function unlocks them. Returns 0.
74075 + */
74076 +int force_commit_atom(txn_handle *txnh)
74077 +{
74078 +       txn_atom *atom;
74079 +
74080 +       assert("zam-837", txnh != NULL);
74081 +       assert_spin_locked(&(txnh->hlock));
74082 +       assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
74083 +
74084 +       atom = txnh->atom;
74085 +
74086 +       assert("zam-834", atom != NULL);
74087 +       assert_spin_locked(&(atom->alock));
74088 +
74089 +       /*
74090 +        * Set flags for atom and txnh: forcing atom commit and waiting for
74091 +        * commit completion
74092 +        */
74093 +       txnh->flags |= TXNH_WAIT_COMMIT;
74094 +       atom->flags |= ATOM_FORCE_COMMIT;
74095 +
74096 +       spin_unlock_txnh(txnh);
74097 +       spin_unlock_atom(atom);
74098 +
74099 +       /* commit is here */
74100 +       txn_restart_current();
74101 +       return 0;
74102 +}
74103 +
74104 +/* Called to force commit of any outstanding atoms.  @commit_all_atoms controls
74105 + * whether we commit all atoms, including new ones which are created after this
74106 + * function is called.  Returns 0, or the error from force_commit_atom(). */
74107 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
74108 +{
74109 +       int ret;
74110 +       txn_atom *atom;
74111 +       txn_mgr *mgr;
74112 +       txn_handle *txnh;
74113 +       unsigned long start_time = jiffies;
74114 +       reiser4_context *ctx = get_current_context();
74115 +
74116 +       assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
74117 +       assert("nikita-3058", commit_check_locks());
74118 +
74119 +       txn_restart_current();
74120 +
74121 +       mgr = &get_super_private(super)->tmgr;
74122 +
74123 +       txnh = ctx->trans;
74124 +
74125 +      again:
74126 +
74127 +       spin_lock_txnmgr(mgr);
74128 +
74129 +       list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
74130 +               spin_lock_atom(atom);
74131 +
74132 +               /* Commit any atom which can be committed.  If @commit_all_atoms
74133 +                * is not set we commit only atoms which were created before
74134 +                * this call is started. */
74135 +               if (commit_all_atoms
74136 +                   || time_before_eq(atom->start_time, start_time)) {
74137 +                       if (atom->stage <= ASTAGE_POST_COMMIT) {
74138 +                               spin_unlock_txnmgr(mgr);
74139 +
74140 +                               if (atom->stage < ASTAGE_PRE_COMMIT) {
74141 +                                       spin_lock_txnh(txnh);
74142 +                                       /* Add force-context txnh */
74143 +                                       capture_assign_txnh_nolock(atom, txnh);
74144 +                                       ret = force_commit_atom(txnh);
74145 +                                       if (ret)
74146 +                                               return ret;
74147 +                               } else
74148 +                                       /* wait atom commit */
74149 +                                       atom_wait_event(atom);
74150 +
74151 +                               goto again;
74152 +                       }
74153 +               }
74154 +
74155 +               spin_unlock_atom(atom);
74156 +       }
74157 +
74158 +#if REISER4_DEBUG
74159 +       if (commit_all_atoms) {
74160 +               reiser4_super_info_data *sbinfo = get_super_private(super);
74161 +               spin_lock_reiser4_super(sbinfo);
74162 +               assert("zam-813",
74163 +                      sbinfo->blocks_fake_allocated_unformatted == 0);
74164 +               assert("zam-812", sbinfo->blocks_fake_allocated == 0);
74165 +               spin_unlock_reiser4_super(sbinfo);
74166 +       }
74167 +#endif
74168 +
74169 +       spin_unlock_txnmgr(mgr);
74170 +
74171 +       return 0;
74172 +}
74173 +
74174 +/* Check whether commit_some_atoms() can commit @atom; returns non-zero if so.
74175 + * Locking is up to the caller. */
74176 +static int atom_is_committable(txn_atom * atom)
74177 +{
74178 +       return
74179 +           atom->stage < ASTAGE_PRE_COMMIT &&
74180 +           atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
74181 +}
74182 +
74183 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
74184 + * lock (mgr->daemon->guard) at exit. Always returns 0. */
74185 +int commit_some_atoms(txn_mgr * mgr)
74186 +{
74187 +       int ret = 0;
74188 +       txn_atom *atom;
74189 +       txn_handle *txnh;
74190 +       reiser4_context *ctx;
74191 +       struct list_head *pos, *tmp;
74192 +
74193 +       ctx = get_current_context();
74194 +       assert("nikita-2444", ctx != NULL);
74195 +
74196 +       txnh = ctx->trans;
74197 +       spin_lock_txnmgr(mgr);
74198 +
74199 +       /*
74200 +        * this is to keep gcc from complaining that atom might be used
74201 +        * uninitialized
74202 +        */
74203 +       atom = NULL;
74204 +
74205 +       /* look for atom to commit */
74206 +       list_for_each_safe(pos, tmp, &mgr->atoms_list) {
74207 +               atom = list_entry(pos, txn_atom, atom_link);
74208 +               /*
74209 +                * first test without taking atom spin lock, whether it is
74210 +                * eligible for committing at all
74211 +                */
74212 +               if (atom_is_committable(atom)) {
74213 +                       /* now, take spin lock and re-check */
74214 +                       spin_lock_atom(atom);
74215 +                       if (atom_is_committable(atom))
74216 +                               break;
74217 +                       spin_unlock_atom(atom);
74218 +               }
74219 +       }
74220 +
74221 +       ret = (&mgr->atoms_list == pos); /* true if loop ended without break */
74222 +       spin_unlock_txnmgr(mgr);
74223 +
74224 +       if (ret) {
74225 +               /* nothing found */
74226 +               spin_unlock(&mgr->daemon->guard);
74227 +               return 0;
74228 +       }
74229 +
74230 +       spin_lock_txnh(txnh);
74231 +
74232 +       BUG_ON(atom == NULL);
74233 +       /* Set the atom to force committing */
74234 +       atom->flags |= ATOM_FORCE_COMMIT;
74235 +
74236 +       /* Add force-context txnh */
74237 +       capture_assign_txnh_nolock(atom, txnh);
74238 +
74239 +       spin_unlock_txnh(txnh);
74240 +       spin_unlock_atom(atom);
74241 +
74242 +       /* we are about to release daemon spin lock, notify daemon it
74243 +          has to rescan atoms */
74244 +       mgr->daemon->rescan = 1;
74245 +       spin_unlock(&mgr->daemon->guard);
74246 +       txn_restart_current();
74247 +       return 0;
74248 +}
74249 +
74250 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
74251 +{
74252 +       int atom_stage;
74253 +       txn_atom *atom_2;
74254 +       int repeat;
74255 +
74256 +       assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
74257 +
74258 +       atom_stage = atom->stage;
74259 +       repeat = 0;
74260 +
74261 +       if (!spin_trylock_txnmgr(tmgr)) {
74262 +               atomic_inc(&atom->refcount);
74263 +               spin_unlock_atom(atom);
74264 +               spin_lock_txnmgr(tmgr);
74265 +               spin_lock_atom(atom);
74266 +               repeat = 1; /* atom lock was dropped above */
74267 +               if (atom->stage != atom_stage) {
74268 +                       spin_unlock_txnmgr(tmgr);
74269 +                       atom_dec_and_unlock(atom);
74270 +                       return -E_REPEAT;
74271 +               }
74272 +               atomic_dec(&atom->refcount);
74273 +       }
74274 +
74275 +       list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
74276 +               if (atom == atom_2)
74277 +                       continue;
74278 +               /*
74279 +                * if trylock does not succeed we simply skip that atom and do
74280 +                * not fuse with it.
74281 +                */
74282 +               if (spin_trylock_atom(atom_2)) {
74283 +                       if (atom_2->stage < ASTAGE_PRE_COMMIT) {
74284 +                               spin_unlock_txnmgr(tmgr);
74285 +                               capture_fuse_into(atom_2, atom);
74286 +                               /* all locks are lost, we can only repeat here */
74287 +                               return -E_REPEAT;
74288 +                       }
74289 +                       spin_unlock_atom(atom_2);
74290 +               }
74291 +       }
74292 +       atom->flags |= ATOM_CANCEL_FUSION; /* no atom to fuse with was found */
74293 +       spin_unlock_txnmgr(tmgr);
74294 +       if (repeat) {
74295 +               spin_unlock_atom(atom);
74296 +               return -E_REPEAT;
74297 +       }
74298 +       return 0;
74299 +}
74300 +
74301 +/* Calls jnode_flush for current atom if it exists; if not, just take another
74302 +   atom and call jnode_flush() for it.  If current transaction handle has
74303 +   already assigned atom (current atom) we have to close current transaction
74304 +   prior to switch to another atom or do something with current atom. This
74305 +   code tries to flush current atom.
74306 +
74307 +   flush_some_atom() is called as part of memory clearing process. It is
74308 +   invoked from balance_dirty_pages(), pdflushd, and entd.
74309 +
74310 +   If we can flush no nodes, atom is committed, because this frees memory.
74311 +
74312 +   If atom is too large or too old it is committed also.
74313 +*/
74314 +int
74315 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
74316 +               int flags)
74317 +{
74318 +       reiser4_context *ctx = get_current_context();
74319 +       txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
74320 +       txn_handle *txnh = ctx->trans;
74321 +       txn_atom *atom;
74322 +       int ret;
74323 +
74324 +       BUG_ON(wbc->nr_to_write == 0);
74325 +       BUG_ON(*nr_submitted != 0);
74326 +       assert("zam-1042", txnh != NULL);
74327 +      repeat:
74328 +       if (txnh->atom == NULL) {
74329 +               /* current atom is not available, take first from txnmgr */
74330 +               spin_lock_txnmgr(tmgr);
74331 +
74332 +               /* traverse the list of all atoms */
74333 +               list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74334 +                       /* lock atom before checking its state */
74335 +                       spin_lock_atom(atom);
74336 +
74337 +                       /*
74338 +                        * we need an atom which is not being committed and
74339 +                        * which has no flushers (jnode_flush() add one flusher
74340 +                        * at the beginning and subtract one at the end).
74341 +                        */
74342 +                       if (atom->stage < ASTAGE_PRE_COMMIT &&
74343 +                           atom->nr_flushers == 0) {
74344 +                               spin_lock_txnh(txnh);
74345 +                               capture_assign_txnh_nolock(atom, txnh);
74346 +                               spin_unlock_txnh(txnh);
74347 +
74348 +                               goto found;
74349 +                       }
74350 +
74351 +                       spin_unlock_atom(atom);
74352 +               }
74353 +
74354 +               /*
74355 +                * Write throttling in case no atom can be
74356 +                * flushed/committed.
74357 +                */
74358 +               if (!current_is_pdflush() && !wbc->nonblocking) {
74359 +                       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74360 +                               spin_lock_atom(atom);
74361 +                               /* Repeat the check from the above. */
74362 +                               if (atom->stage < ASTAGE_PRE_COMMIT
74363 +                                   && atom->nr_flushers == 0) {
74364 +                                       spin_lock_txnh(txnh);
74365 +                                       capture_assign_txnh_nolock(atom, txnh);
74366 +                                       spin_unlock_txnh(txnh);
74367 +
74368 +                                       goto found;
74369 +                               }
74370 +                               if (atom->stage <= ASTAGE_POST_COMMIT) {
74371 +                                       spin_unlock_txnmgr(tmgr);
74372 +                                       /*
74373 +                                        * we just wait until atom's flusher
74374 +                                        * makes a progress in flushing or
74375 +                                        * committing the atom
74376 +                                        */
74377 +                                       atom_wait_event(atom);
74378 +                                       goto repeat;
74379 +                               }
74380 +                               spin_unlock_atom(atom);
74381 +                       }
74382 +               }
74383 +               spin_unlock_txnmgr(tmgr);
74384 +               return 0;
74385 +             found:
74386 +               spin_unlock_txnmgr(tmgr);
74387 +       } else
74388 +               atom = get_current_atom_locked();
74389 +
74390 +       BUG_ON(atom->super != ctx->super);
74391 +       assert("vs-35", atom->super == ctx->super);
74392 +       if (start) {
74393 +               spin_lock_jnode(start);
74394 +               ret = (atom == start->atom) ? 1 : 0;
74395 +               spin_unlock_jnode(start);
74396 +               if (ret == 0)
74397 +                       start = NULL;
74398 +       }
74399 +       ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74400 +       if (ret == 0) {
74401 +               /* flush_current_atom returns 0 only if it submitted nothing
74402 +                  for write */
74403 +               BUG_ON(*nr_submitted != 0);
74404 +               if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74405 +                       if (atom->capture_count < tmgr->atom_min_size &&
74406 +                           !(atom->flags & ATOM_CANCEL_FUSION)) {
74407 +                               ret = txn_try_to_fuse_small_atom(tmgr, atom);
74408 +                               if (ret == -E_REPEAT) {
74409 +                                       preempt_point();
74410 +                                       goto repeat;
74411 +                               }
74412 +                       }
74413 +                       /* if early flushing could not make more nodes clean,
74414 +                        * or atom is too old/large,
74415 +                        * we force current atom to commit */
74416 +                       /* wait for commit completion but only if this
74417 +                        * wouldn't stall pdflushd and ent thread. */
74418 +                       if (!wbc->nonblocking && !ctx->entd)
74419 +                               txnh->flags |= TXNH_WAIT_COMMIT;
74420 +                       atom->flags |= ATOM_FORCE_COMMIT;
74421 +               }
74422 +               spin_unlock_atom(atom);
74423 +       } else if (ret == -E_REPEAT) {
74424 +               if (*nr_submitted == 0) {
74425 +                       /* let others who hamper flushing (hold longterm locks,
74426 +                          for instance) free the way for flush */
74427 +                       preempt_point();
74428 +                       goto repeat;
74429 +               }
74430 +               ret = 0;
74431 +       }
74432 +/*
74433 +       if (*nr_submitted > wbc->nr_to_write)
74434 +               warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74435 +*/
74436 +       txn_restart(ctx);
74437 +
74438 +       return ret;
74439 +}
74440 +
74441 +/* Uncapture every node on list @head until it is empty (thereby removing the nodes from transaction). */
74442 +void invalidate_list(struct list_head *head)
74443 +{
74444 +       while (!list_empty(head)) {
74445 +               jnode *node;
74446 +
74447 +               node = list_entry(head->next, jnode, capture_link);
74448 +               spin_lock_jnode(node);
74449 +               uncapture_block(node);
74450 +               jput(node);
74451 +       }
74452 +}
74453 +
74454 +static void init_wlinks(txn_wait_links * wlinks)
74455 +{
74456 +       wlinks->_lock_stack = get_current_lock_stack(); /* atom_wait_event() sleeps on it */
74457 +       INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74458 +       INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74459 +       wlinks->waitfor_cb = NULL; /* no callbacks installed */
74460 +       wlinks->waiting_cb = NULL;
74461 +}
74462 +
74463 +/* Add ourselves to @atom's fwaitfor list and sleep until somebody wakes us up */
74464 +void atom_wait_event(txn_atom * atom)
74465 +{
74466 +       txn_wait_links _wlinks;
74467 +
74468 +       assert_spin_locked(&(atom->alock));
74469 +       assert("nikita-3156",
74470 +              lock_stack_isclean(get_current_lock_stack()) ||
74471 +              atom->nr_running_queues > 0);
74472 +
74473 +       init_wlinks(&_wlinks);
74474 +       list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
74475 +       atomic_inc(&atom->refcount);
74476 +       spin_unlock_atom(atom);
74477 +
74478 +       prepare_to_sleep(_wlinks._lock_stack);
74479 +       go_to_sleep(_wlinks._lock_stack);
74480 +
74481 +       spin_lock_atom(atom);
74482 +       list_del(&_wlinks._fwaitfor_link);
74483 +       atom_dec_and_unlock(atom);
74484 +}
74485 +
74486 +void atom_set_stage(txn_atom * atom, txn_stage stage)
74487 +{
74488 +       assert("nikita-3535", atom != NULL);
74489 +       assert_spin_locked(&(atom->alock));
74490 +       assert("nikita-3536", ASTAGE_FREE <= stage && stage <= ASTAGE_INVALID);
74491 +       /* Excelsior! The stage may only grow, it never goes back. */
74492 +       assert("nikita-3537", stage >= atom->stage);
74493 +       if (atom->stage != stage) {
74494 +               atom->stage = stage;
74495 +               atom_send_event(atom); /* notify waiters only on a real change */
74496 +       }
74497 +}
74498 +
74499 +/* wake all threads which wait for an event on @atom; @atom must be spin-locked */
74500 +void atom_send_event(txn_atom * atom)
74501 +{
74502 +       assert_spin_locked(&(atom->alock));
74503 +       wakeup_atom_waitfor_list(atom);
74504 +}
74505 +
74506 +/* Tells txn manager code whether the owner of this txn_handle should wait for atom commit
74507 +   completion (for example, because it does fsync(2)): non-zero iff TXNH_WAIT_COMMIT is set */
74508 +static int should_wait_commit(txn_handle * h)
74509 +{
74510 +       return h->flags & TXNH_WAIT_COMMIT;
74511 +}
74512 +
74513 +typedef struct commit_data {
74514 +       txn_atom *atom; /* refreshed by each try_commit_txnh() call */
74515 +       txn_handle *txnh; /* handle being committed */
74516 +       long nr_written; /* passed to flush_current_atom()/commit_current_atom() */
74517 +       /* as an optimization we start committing atom by first trying to
74518 +        * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
74519 +        * allows to reduce stalls due to other threads waiting for atom in
74520 +        * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
74521 +        * preliminary flushes. */
74522 +       int preflush;
74523 +       /* have we waited on atom. */
74524 +       int wait;
74525 +       int failed; /* commit_current_atom() failed with other than -E_REPEAT */
74526 +       int wake_ktxnmgrd_up; /* commit_txnh() kicks ktxnmgrd when set */
74527 +} commit_data;
74528 +
74529 +/*
74530 + * Called from commit_txnh() repeatedly, until it returns 0 (atom committed or
74531 + * nothing more to do; cd->atom is then spin-locked).  Non-zero means "retry".
74532 + */
74533 +static int try_commit_txnh(commit_data * cd)
74534 +{
74535 +       int result;
74536 +
74537 +       assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74538 +
74539 +       /* Get the atom and txnh locked. */
74540 +       cd->atom = txnh_get_atom(cd->txnh);
74541 +       assert("jmacd-309", cd->atom != NULL);
74542 +       spin_unlock_txnh(cd->txnh);
74543 +
74544 +       if (cd->wait) {
74545 +               cd->atom->nr_waiters--;
74546 +               cd->wait = 0;
74547 +       }
74548 +
74549 +       if (cd->atom->stage == ASTAGE_DONE)
74550 +               return 0;
74551 +
74552 +       if (cd->failed)
74553 +               return 0;
74554 +
74555 +       if (atom_should_commit(cd->atom)) {
74556 +               /* if atom is _very_ large schedule it for commit as soon as
74557 +                * possible. */
74558 +               if (atom_should_commit_asap(cd->atom)) {
74559 +                       /*
74560 +                        * When atom is in PRE_COMMIT or later stage following
74561 +                        * invariant (encoded   in    atom_can_be_committed())
74562 +                        * holds:  there is exactly one non-waiter transaction
74563 +                        * handle opened  on this atom.  When  thread wants to
74564 +                        * wait  until atom  commits (for  example  sync()) it
74565 +                        * waits    on    atom  event     after     increasing
74566 +                        * atom->nr_waiters (see below in  this  function). It
74567 +                        * cannot be guaranteed that atom is already committed
74568 +                        * after    receiving event,  so     loop has   to  be
74569 +                        * re-started. But  if  atom switched into  PRE_COMMIT
74570 +                        * stage and became  too  large, we cannot  change its
74571 +                        * state back   to CAPTURE_WAIT (atom  stage can  only
74572 +                        * increase monotonically), hence this check.
74573 +                        */
74574 +                       if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74575 +                               atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74576 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
74577 +               }
74578 +               if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74579 +                       /*
74580 +                        * this  thread (transaction  handle  that is) doesn't
74581 +                        * want to commit  atom. Notify waiters that handle is
74582 +                        * closed. This can happen, for  example, when we  are
74583 +                        * under  VFS directory lock  and don't want to commit
74584 +                        * atom  right   now to  avoid  stalling other threads
74585 +                        * working in the same directory.
74586 +                        */
74587 +
74588 +                       /* Wake  the ktxnmgrd up if  the ktxnmgrd is needed to
74589 +                        * commit this  atom: no  atom  waiters  and only  one
74590 +                        * (our) open transaction handle. */
74591 +                       cd->wake_ktxnmgrd_up =
74592 +                           cd->atom->txnh_count == 1 &&
74593 +                           cd->atom->nr_waiters == 0;
74594 +                       atom_send_event(cd->atom);
74595 +                       result = 0;
74596 +               } else if (!atom_can_be_committed(cd->atom)) {
74597 +                       if (should_wait_commit(cd->txnh)) {
74598 +                               /* sync(): wait for commit */
74599 +                               cd->atom->nr_waiters++;
74600 +                               cd->wait = 1;
74601 +                               atom_wait_event(cd->atom);
74602 +                               result = RETERR(-E_REPEAT);
74603 +                       } else {
74604 +                               result = 0;
74605 +                       }
74606 +               } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74607 +                       /*
74608 +                        * optimization: flush  atom without switching it into
74609 +                        * ASTAGE_CAPTURE_WAIT.
74610 +                        *
74611 +                        * But don't  do this for  ktxnmgrd, because  ktxnmgrd
74612 +                        * should never block on atom fusion.
74613 +                        */
74614 +                       result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74615 +                                                   LONG_MAX, &cd->nr_written,
74616 +                                                   &cd->atom, NULL);
74617 +                       if (result == 0) {
74618 +                               spin_unlock_atom(cd->atom);
74619 +                               cd->preflush = 0;
74620 +                               result = RETERR(-E_REPEAT);
74621 +                       } else  /* Atom wasn't flushed
74622 +                                * completely. Rinse. Repeat. */
74623 +                               --cd->preflush;
74624 +               } else {
74625 +                       /* We change   atom state  to   ASTAGE_CAPTURE_WAIT to
74626 +                          prevent atom fusion and count  ourself as an active
74627 +                          flusher */
74628 +                       atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74629 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
74630 +
74631 +                       result =
74632 +                           commit_current_atom(&cd->nr_written, &cd->atom);
74633 +                       if (result != 0 && result != -E_REPEAT)
74634 +                               cd->failed = 1;
74635 +               }
74636 +       } else
74637 +               result = 0;
74638 +
74639 +#if REISER4_DEBUG
74640 +       if (result == 0)
74641 +               assert_spin_locked(&(cd->atom->alock));
74642 +#endif
74643 +
74644 +       /* perfectly valid assertion, except that when atom/txnh is not locked
74645 +        * fusion can take place, and cd->atom points nowhere. */
74646 +       /*
74647 +          assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74648 +        */
74649 +       return result;
74650 +}
74651 +
74652 +/* Called to commit a transaction handle.  This decrements the atom's number of open
74653 +   handles and if it is the last handle to commit and the atom should commit, initiates
74654 +   atom commit.  Always returns 0; written blocks are only counted in cd.nr_written */
74655 +static int commit_txnh(txn_handle * txnh)
74656 +{
74657 +       commit_data cd;
74658 +       assert("umka-192", txnh != NULL);
74659 +
74660 +       memset(&cd, 0, sizeof cd);
74661 +       cd.txnh = txnh;
74662 +       cd.preflush = 10;
74663 +
74664 +       /* calls try_commit_txnh() until either atom commits, or error
74665 +        * happens */
74666 +       while (try_commit_txnh(&cd) != 0)
74667 +               preempt_point();
74668 +
74669 +       spin_lock_txnh(txnh);
74670 +
74671 +       cd.atom->txnh_count -= 1;
74672 +       txnh->atom = NULL;
74673 +       /* remove transaction handle from atom's list of transaction handles */
74674 +       list_del_init(&txnh->txnh_link);
74675 +
74676 +       spin_unlock_txnh(txnh);
74677 +       atom_dec_and_unlock(cd.atom);
74678 +       /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
74679 +        * because it takes time) by current thread, we do that work
74680 +        * asynchronously by ktxnmgrd daemon. */
74681 +       if (cd.wake_ktxnmgrd_up)
74682 +               ktxnmgrd_kick(&get_current_super_private()->tmgr);
74683 +
74684 +       return 0;
74685 +}
74686 +
74687 +/* TRY_CAPTURE */
74688 +
74689 +/* This routine attempts a single block-capture request.  It may return -E_REPEAT if some
74690 +   condition indicates that the request should be retried, and it may block if the
74691 +   txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74692 +
74693 +   This routine encodes the basic logic of block capturing described by:
74694 +
74695 +     http://namesys.com/v4/v4.html
74696 +
74697 +   Our goal here is to ensure that any two blocks that contain dependent modifications
74698 +   should commit at the same time.  This function enforces this discipline by initiating
74699 +   fusion whenever a transaction handle belonging to one atom requests to read or write a
74700 +   block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74701 +
74702 +   In addition, this routine handles the initial assignment of atoms to blocks and
74703 +   transaction handles.  These are possible outcomes of this function:
74704 +
74705 +   1. The block and handle are already part of the same atom: return immediate success
74706 +
74707 +   2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74708 +      the handle to the block's atom.
74709 +
74710 +   3. The handle is assigned but the block is not: call capture_assign_block to assign
74711 +      the block to the handle's atom.
74712 +
74713 +   4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74714 +      to fuse atoms.
74715 +
74716 +   5. Neither block nor handle are assigned: create a new atom and assign them both.
74717 +
74718 +   6. A read request for a non-captured block: return immediate success.
74719 +
74720 +   This function acquires and releases the handle's spinlock.  This function is called
74721 +   under the jnode lock and if the return value is 0, it returns with the jnode lock still
74722 +   held.  If the return is -E_REPEAT or some other error condition, the jnode lock is
74723 +   released.  The external interface (try_capture) manages re-acquiring the jnode lock
74724 +   in the failure case.
74725 +*/
74726 +static int try_capture_block(
74727 +       txn_handle * txnh, jnode * node, txn_capture mode,
74728 +       txn_atom ** atom_alloc)
74729 +{
74730 +       txn_atom *block_atom;
74731 +       txn_atom *txnh_atom;
74732 +
74733 +       /* Should not call capture for READ_NONCOM requests, handled in try_capture. */
74734 +       assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74735 +
74736 +       /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74737 +        * node->tree somewhere. */
74738 +       assert("umka-194", txnh != NULL);
74739 +       assert("umka-195", node != NULL);
74740 +
74741 +       /* The jnode is already locked!  Being called from try_capture(). */
74742 +       assert_spin_locked(&(node->guard));
74743 +       block_atom = node->atom;
74744 +
74745 +       /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
74746 +          let us touch the atoms themselves. */
74747 +       spin_lock_txnh(txnh);
74748 +       txnh_atom = txnh->atom;
74749 +       /* Process of capturing continues into one of four branches depends on
74750 +          which atoms from (block atom (node->atom), current atom (txnh->atom))
74751 +          exist. */
74752 +       if (txnh_atom == NULL) {
74753 +               if (block_atom == NULL) {
74754 +                       spin_unlock_txnh(txnh);
74755 +                       spin_unlock_jnode(node);
74756 +                       /* assign empty atom to the txnh and repeat */
74757 +                       return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74758 +               } else {
74759 +                       atomic_inc(&block_atom->refcount);
74760 +                       /* node spin-lock isn't needed anymore */
74761 +                       spin_unlock_jnode(node);
74762 +                       if (!spin_trylock_atom(block_atom)) {
74763 +                               spin_unlock_txnh(txnh);
74764 +                               spin_lock_atom(block_atom);
74765 +                               spin_lock_txnh(txnh);
74766 +                       }
74767 +                       /* re-check state after getting txnh and the node
74768 +                        * atom spin-locked */
74769 +                       if (node->atom != block_atom || txnh->atom != NULL) {
74770 +                               spin_unlock_txnh(txnh);
74771 +                               atom_dec_and_unlock(block_atom);
74772 +                               return RETERR(-E_REPEAT);
74773 +                       }
74774 +                       atomic_dec(&block_atom->refcount);
74775 +                       if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74776 +                           (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74777 +                            block_atom->txnh_count != 0))
74778 +                               return capture_fuse_wait(txnh, block_atom, NULL, mode);
74779 +                       capture_assign_txnh_nolock(block_atom, txnh);
74780 +                       spin_unlock_txnh(txnh);
74781 +                       spin_unlock_atom(block_atom);
74782 +                       return RETERR(-E_REPEAT);
74783 +               }
74784 +       } else {
74785 +               /* It is time to perform deadlock prevention check over the
74786 +                  node we want to capture.  It is possible this node was locked
74787 +                  for read without capturing it. The optimization which allows
74788 +                  to do it helps us in keeping atoms independent as long as
74789 +                  possible but it may cause lock/fuse deadlock problems.
74790 +
74791 +                  A number of similar deadlock situations with locked but not
74792 +                  captured nodes were found.  In each situation there are two
74793 +                  or more threads: one of them does flushing while another one
74794 +                  does routine balancing or tree lookup.  The flushing thread
74795 +                  (F) sleeps in long term locking request for node (N), another
74796 +                  thread (A) sleeps in trying to capture some node already
74797 +                  belonging the atom F, F has a state which prevents
74798 +                  immediately fusion .
74799 +
74800 +                  Deadlocks of this kind cannot happen if node N was properly
74801 +                  captured by thread A. The F thread fuse atoms before locking
74802 +                  therefore current atom of thread F and current atom of thread
74803 +                  A became the same atom and thread A may proceed.  This does
74804 +                  not work if node N was not captured because the fusion of
74805 +                  atom does not happens.
74806 +
74807 +                  The following scheme solves the deadlock: If
74808 +                  longterm_lock_znode locks and does not capture a znode, that
74809 +                  znode is marked as MISSED_IN_CAPTURE.  A node marked this way
74810 +                  is processed by the code below which restores the missed
74811 +                  capture and fuses current atoms of all the node lock owners
74812 +                  by calling the fuse_not_fused_lock_owners() function. */
74813 +               if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
74814 +                       JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
74815 +                       if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
74816 +                               spin_unlock_txnh(txnh);
74817 +                               spin_unlock_jnode(node);
74818 +                               fuse_not_fused_lock_owners(txnh, JZNODE(node));
74819 +                               return RETERR(-E_REPEAT);
74820 +                       }
74821 +               }
74822 +               if (block_atom == NULL) {
74823 +                       atomic_inc(&txnh_atom->refcount);
74824 +                       spin_unlock_txnh(txnh);
74825 +                       if (!spin_trylock_atom(txnh_atom)) {
74826 +                               spin_unlock_jnode(node);
74827 +                               spin_lock_atom(txnh_atom);
74828 +                               spin_lock_jnode(node);
74829 +                       }
74830 +                       if (txnh->atom != txnh_atom || node->atom != NULL
74831 +                               || JF_ISSET(node, JNODE_IS_DYING)) {
74832 +                               spin_unlock_jnode(node);
74833 +                               atom_dec_and_unlock(txnh_atom);
74834 +                               return RETERR(-E_REPEAT);
74835 +                       }
74836 +                       atomic_dec(&txnh_atom->refcount);
74837 +                       capture_assign_block_nolock(txnh_atom, node);
74838 +                       spin_unlock_atom(txnh_atom);
74839 +               } else {
74840 +                       if (txnh_atom != block_atom) {
74841 +                               if (mode & TXN_CAPTURE_DONT_FUSE) {
74842 +                                       spin_unlock_txnh(txnh);
74843 +                                       spin_unlock_jnode(node);
74844 +                                       /* we are in a "no-fusion" mode and @node is
74845 +                                        * already part of transaction. */
74846 +                                       return RETERR(-E_NO_NEIGHBOR);
74847 +                               }
74848 +                               return capture_init_fusion(node, txnh, mode);
74849 +                       }
74850 +                       spin_unlock_txnh(txnh);
74851 +               }
74852 +       }
74853 +       return 0;
74854 +}
74855 +
74856 +static txn_capture
74857 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
74858 +{
74859 +       txn_capture cap_mode;
74860 +
74861 +       assert_spin_locked(&(node->guard));
74862 +       /* Choose the capture mode for @node; 0 means no capture is needed. */
74863 +       /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
74864 +       /* A write lock, or a node already in an atom, gets a write capture. */
74865 +       if (lock_mode == ZNODE_WRITE_LOCK) {
74866 +               cap_mode = TXN_CAPTURE_WRITE;
74867 +       } else if (node->atom != NULL) {
74868 +               cap_mode = TXN_CAPTURE_WRITE;
74869 +       } else if (0 &&         /* txnh->mode == TXN_READ_FUSING && */
74870 +                  jnode_get_level(node) == LEAF_LEVEL) {
74871 +               /* NOTE-NIKITA TXN_READ_FUSING is not currently used (hence "0 &&") */
74872 +               /* We only need a READ_FUSING capture at the leaf level.  This
74873 +                  is because the internal levels of the tree (twigs included)
74874 +                  are redundant from the point of the user that asked for a
74875 +                  read-fusing transcrash.  The user only wants to read-fuse
74876 +                  atoms due to reading uncommitted data that another user has
74877 +                  written.  It is the file system that reads/writes the
74878 +                  internal tree levels, the user only reads/writes leaves. */
74879 +               cap_mode = TXN_CAPTURE_READ_ATOMIC;
74880 +       } else {
74881 +               /* Not a write lock and the node belongs to no atom: there's no
74882 +                * reason to capture. */
74883 +               /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
74884 +               return 0;
74885 +       }
74886 +       /* Pass the caller's non-blocking / don't-fuse flags through. */
74887 +       cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
74888 +       assert("nikita-3186", cap_mode != 0);
74889 +       return cap_mode;
74890 +}
74891 +
74892 +/* This is an external interface to try_capture_block(), it calls
74893 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
74894 +
74895 +   @node:         node to capture,
74896 +   @lock_mode:    read or write lock is used in capture mode calculation,
74897 +   @flags:        see txn_capture flags enumeration,
74898 +   @can_coc     : can copy-on-capture
74899 +
74900 +   @return: 0 - node was successfully captured, -E_REPEAT - capture request
74901 +            cannot be processed immediately as it was requested in flags,
74902 +           < 0 - other errors.
74903 +*/
74904 +int try_capture(jnode *node, znode_lock_mode lock_mode,
74905 +               txn_capture flags)
74906 +{
74907 +       txn_atom *atom_alloc = NULL;
74908 +       txn_capture cap_mode;
74909 +       txn_handle *txnh = get_current_context()->trans;
74910 +       int ret;
74911 +
74912 +       assert_spin_locked(&(node->guard));
74913 +
74914 +      repeat:
74915 +       if (JF_ISSET(node, JNODE_IS_DYING))
74916 +               return RETERR(-EINVAL);
74917 +       if (node->atom != NULL && txnh->atom == node->atom)
74918 +               return 0;
74919 +       cap_mode = build_capture_mode(node, lock_mode, flags);
74920 +       if (cap_mode == 0 ||
74921 +           (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
74922 +               /* Mark this node as "MISSED".  It helps in further deadlock
74923 +                * analysis */
74924 +               if (jnode_is_znode(node))
74925 +                       JF_SET(node, JNODE_MISSED_IN_CAPTURE);
74926 +               return 0;
74927 +       }
74928 +       /* Repeat try_capture as long as -E_REPEAT is returned. */
74929 +       ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
74930 +       /* Regardless of non_blocking:
74931 +
74932 +          If ret == 0 then jnode is still locked.
74933 +          If ret != 0 then jnode is unlocked.
74934 +        */
74935 +#if REISER4_DEBUG
74936 +       if (ret == 0)
74937 +               assert_spin_locked(&(node->guard));
74938 +       else
74939 +               assert_spin_not_locked(&(node->guard));
74940 +#endif
74941 +       assert_spin_not_locked(&(txnh->guard));
74942 +
74943 +       if (ret == -E_REPEAT) {
74944 +               /* E_REPEAT implies all locks were released, therefore we need
74945 +                  to take the jnode's lock again. */
74946 +               spin_lock_jnode(node);
74947 +
74948 +               /* Although this may appear to be a busy loop, it is not.
74949 +                  There are several conditions that cause E_REPEAT to be
74950 +                  returned by the call to try_capture_block, all cases
74951 +                  indicating some kind of state change that means you should
74952 +                  retry the request and will get a different result.  In some
74953 +                  cases this could be avoided with some extra code, but
74954 +                  generally it is done because the necessary locks were
74955 +                  released as a result of the operation and repeating is the
74956 +                  simplest thing to do (less bug potential).  The cases are:
74957 +                  atom fusion returns E_REPEAT after it completes (jnode and
74958 +                  txnh were unlocked); race conditions in assign_block,
74959 +                  assign_txnh, and init_fusion return E_REPEAT (trylock
74960 +                  failure); after going to sleep in capture_fuse_wait
74961 +                  (request was blocked but may now succeed).  I'm not quite
74962 +                  sure how capture_copy works yet, but it may also return
74963 +                  E_REPEAT.  When the request is legitimately blocked, the
74964 +                  requestor goes to sleep in fuse_wait, so this is not a busy
74965 +                  loop. */
74966 +               /* NOTE-NIKITA: still don't understand:
74967 +
74968 +                  try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
74969 +
74970 +                  looks like busy loop?
74971 +                */
74972 +               goto repeat;
74973 +       }
74974 +
74975 +       /* free extra atom object that was possibly allocated by
74976 +          try_capture_block().
74977 +
74978 +          Do this before acquiring jnode spin lock to
74979 +          minimize time spent under lock. --nikita */
74980 +       if (atom_alloc != NULL) {
74981 +               kmem_cache_free(_atom_slab, atom_alloc);
74982 +       }
74983 +
74984 +       if (ret != 0) {
74985 +               if (ret == -E_BLOCK) {
74986 +                       assert("nikita-3360",
74987 +                              cap_mode & TXN_CAPTURE_NONBLOCKING);
74988 +                       ret = -E_REPEAT;
74989 +               }
74990 +
74991 +               /* Failure means jnode is not locked.  FIXME_LATER_JMACD May
74992 +                  want to fix the above code to avoid releasing the lock and
74993 +                  re-acquiring it, but there are cases were failure occurs
74994 +                  when the lock is not held, and those cases would need to be
74995 +                  modified to re-take the lock. */
74996 +               spin_lock_jnode(node);
74997 +       }
74998 +
74999 +       /* Jnode is still locked. */
75000 +       assert_spin_locked(&(node->guard));
75001 +       return ret;
75002 +}
75003 +
75004 +static void release_two_atoms(txn_atom *one, txn_atom *two)
75005 +{      /* the caller, fuse_not_fused_lock_owners(), passes both atoms spin-locked */
75006 +       spin_unlock_atom(one);  /* re-locked below, after atom_dec_and_unlock(two) */
75007 +       atom_dec_and_unlock(two);
75008 +       spin_lock_atom(one);
75009 +       atom_dec_and_unlock(one);
75010 +}
75011 +
75012 +/* NOTE: this describes try_capture() above, not fuse_not_fused_lock_owners() below.
75013 +   That function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
75014 +   returned by that routine.  The txn_capture request mode is computed there depending on
75015 +   the transaction handle's type and the lock request.  It is called from the depths of
75016 +   the lock manager with the jnode lock held and it always returns with the jnode lock held.
75017 +*/
75018 +
75019 +/* fuse all 'active' atoms of lock owners of given node. */
75020 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
75021 +{
75022 +       lock_handle *lh;
75023 +       int repeat;
75024 +       txn_atom *atomh, *atomf;
75025 +       reiser4_context *me = get_current_context();
75026 +       reiser4_context *ctx = NULL;
75027 +
75028 +       assert_spin_not_locked(&(ZJNODE(node)->guard));
75029 +       assert_spin_not_locked(&(txnh->hlock));
75030 +       /* every pass re-reads @txnh's atom and rescans the owner list from its head */
75031 + repeat:
75032 +       repeat = 0;
75033 +       atomh = txnh_get_atom(txnh);
75034 +       spin_unlock_txnh(txnh);
75035 +       assert("zam-692", atomh != NULL);
75036 +       /* NOTE(review): atomh is presumably spin-locked by txnh_get_atom() - every exit path below unlocks it */
75037 +       spin_lock_zlock(&node->lock);
75038 +       /* inspect list of lock owners */
75039 +       list_for_each_entry(lh, &node->lock.owners, owners_link) {
75040 +               ctx = get_context_by_lock_stack(lh->owner);
75041 +               if (ctx == me)
75042 +                       continue;
75043 +               /* below we use two assumptions to avoid additional spin-locks
75044 +                  for checking the condition :
75045 +
75046 +                  1) if the lock stack has lock, the transaction should be
75047 +                  opened, i.e. ctx->trans != NULL;
75048 +
75049 +                  2) reading of well-aligned ctx->trans->atom is atomic, if it
75050 +                  equals to the address of spin-locked atomh, we take that
75051 +                  the atoms are the same, nothing has to be captured. */
75052 +               if (atomh != ctx->trans->atom) {
75053 +                       reiser4_wake_up(lh->owner);
75054 +                       repeat = 1;
75055 +                       break;
75056 +               }
75057 +       }
75058 +       if (repeat) {
75059 +               if (!spin_trylock_txnh(ctx->trans)) {
75060 +                       spin_unlock_zlock(&node->lock);
75061 +                       spin_unlock_atom(atomh);
75062 +                       goto repeat;
75063 +               }
75064 +               atomf = ctx->trans->atom;
75065 +               if (atomf == NULL) {
75066 +                       capture_assign_txnh_nolock(atomh, ctx->trans);
75067 +                       /* release zlock lock _after_ assigning the atom to the
75068 +                        * transaction handle, otherwise the lock owner thread
75069 +                        * may unlock all znodes, exit kernel context and here
75070 +                        * we would access an invalid transaction handle. */
75071 +                       spin_unlock_zlock(&node->lock);
75072 +                       spin_unlock_atom(atomh);
75073 +                       spin_unlock_txnh(ctx->trans);
75074 +                       goto repeat;
75075 +               }
75076 +               assert("zam-1059", atomf != atomh);
75077 +               spin_unlock_zlock(&node->lock);
75078 +               atomic_inc(&atomh->refcount);
75079 +               atomic_inc(&atomf->refcount);
75080 +               spin_unlock_txnh(ctx->trans);
75081 +               if (atomf > atomh) {    /* lock both atoms in ascending address order; atomh is already held */
75082 +                       spin_lock_atom(atomf);
75083 +               } else {
75084 +                       spin_unlock_atom(atomh);
75085 +                       spin_lock_atom(atomf);
75086 +                       spin_lock_atom(atomh);
75087 +               }
75088 +               if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
75089 +                       release_two_atoms(atomf, atomh);        /* same atom or one is not open: release both, rescan */
75090 +                       goto repeat;
75091 +               }
75092 +               atomic_dec(&atomh->refcount);
75093 +               atomic_dec(&atomf->refcount);
75094 +               capture_fuse_into(atomf, atomh);        /* NOTE(review): presumably releases both atom locks - confirm */
75095 +               goto repeat;
75096 +       }
75097 +       spin_unlock_zlock(&node->lock);
75098 +       spin_unlock_atom(atomh);
75099 +}
75100 +
75101 +/* This is the interface to capture unformatted nodes via their struct page
75102 +   reference. Currently it is only used in reiser4_invalidatepage */
75103 +int try_capture_page_to_invalidate(struct page *pg)
75104 +{
75105 +       int ret;
75106 +       jnode *node;
75107 +
75108 +       assert("umka-292", pg != NULL);
75109 +       assert("nikita-2597", PageLocked(pg));
75110 +       /* on failure jnode_of_page() returns an error pointer, which is passed back as the error code */
75111 +       if (IS_ERR(node = jnode_of_page(pg))) {
75112 +               return PTR_ERR(node);
75113 +       }
75114 +       /* try_capture() asserts that the jnode spin lock is held; the page lock is dropped around the call */
75115 +       spin_lock_jnode(node);
75116 +       unlock_page(pg);
75117 +
75118 +       ret = try_capture(node, ZNODE_WRITE_LOCK, 0);
75119 +       spin_unlock_jnode(node);
75120 +       jput(node);     /* NOTE(review): presumably pairs with a reference taken by jnode_of_page() - confirm */
75121 +       lock_page(pg);  /* the caller passed the page in locked; hand it back locked */
75122 +       return ret;
75123 +}
75124 +
75125 +/* This informs the transaction manager when a node is deleted.  Add the block to the
75126 +   atom's delete set and uncapture the block.
75127 +   @pg must be locked by the caller (asserted); it is locked again whenever this returns.
75128 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
75129 +explanations.  find all the functions that use it, and unless there is some very
75130 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
75131 +move the loop to inside the function.
75132 +
75133 +VS-FIXME-HANS: can this code be at all streamlined?  In particular, can you lock and unlock the jnode fewer times?
75134 +  */
75135 +void uncapture_page(struct page *pg)
75136 +{
75137 +       jnode *node;
75138 +       txn_atom *atom;
75139 +
75140 +       assert("umka-199", pg != NULL);
75141 +       assert("nikita-3155", PageLocked(pg));
75142 +
75143 +       clear_page_dirty_for_io(pg);
75144 +
75145 +       reiser4_wait_page_writeback(pg);
75146 +
75147 +       node = jprivate(pg);
75148 +       BUG_ON(node == NULL);
75149 +
75150 +       spin_lock_jnode(node);
75151 +       /* NOTE(review): a non-NULL atom is presumably returned spin-locked - it is unlocked at the end */
75152 +       atom = jnode_get_atom(node);
75153 +       if (atom == NULL) {
75154 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
75155 +               spin_unlock_jnode(node);
75156 +               return;
75157 +       }
75158 +
75159 +       /* We can remove jnode from transaction even if it is on flush queue
75160 +        * prepped list, we only need to be sure that flush queue is not being
75161 +        * written by write_fq().  write_fq() does not use atom spin lock for
75162 +        * protection of the prepped nodes list, instead write_fq() increments
75163 +        * atom's nr_running_queues counters for the time when prepped list is
75164 +        * not protected by spin lock.  Here we check this counter if we want
75165 +        * to remove jnode from flush queue and, if the counter is not zero,
75166 +        * wait all write_fq() for this atom to complete. This is not
75167 +        * significant overhead. */
75168 +       while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
75169 +               spin_unlock_jnode(node);
75170 +               /*
75171 +                * at this moment we want to wait for "atom event", viz. wait
75172 +                * until @node can be removed from flush queue. But
75173 +                * atom_wait_event() cannot be called with page locked, because
75174 +                * it deadlocks with jnode_extent_write(). Unlock page, after
75175 +                * making sure (through page_cache_get()) that it cannot be
75176 +                * released from memory.
75177 +                */
75178 +               page_cache_get(pg);
75179 +               unlock_page(pg);
75180 +               atom_wait_event(atom);
75181 +               lock_page(pg);
75182 +               /*
75183 +                * page may have been detached by ->writepage()->releasepage().
75184 +                */
75185 +               reiser4_wait_page_writeback(pg);
75186 +               spin_lock_jnode(node);
75187 +               page_cache_release(pg);
75188 +               atom = jnode_get_atom(node);    /* re-read: the atom was not held across the wait */
75189 +/* VS-FIXME-HANS: improve the commenting in this function */
75190 +               if (atom == NULL) {
75191 +                       spin_unlock_jnode(node);
75192 +                       return;
75193 +               }
75194 +       }
75195 +       uncapture_block(node);  /* NOTE(review): jnode is not unlocked here afterwards - presumably done inside; confirm */
75196 +       spin_unlock_atom(atom);
75197 +       jput(node);     /* NOTE(review): presumably drops the reference taken by jref() at capture time - confirm */
75198 +}
75199 +
75200 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
75201 + * inode's tree of jnodes */
75202 +void uncapture_jnode(jnode * node)
75203 +{
75204 +       txn_atom *atom;
75205 +       /* the caller holds the jnode spin lock and the jnode has no page attached (both asserted) */
75206 +       assert_spin_locked(&(node->guard));
75207 +       assert("", node->pg == 0);
75208 +
75209 +       atom = jnode_get_atom(node);
75210 +       if (atom == NULL) {
75211 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
75212 +               spin_unlock_jnode(node);        /* not captured: just drop the caller's jnode lock */
75213 +               return;
75214 +       }
75215 +       /* NOTE(review): the jnode lock is not dropped here - presumably uncapture_block() does it; confirm */
75216 +       uncapture_block(node);
75217 +       spin_unlock_atom(atom);
75218 +       jput(node);
75219 +}
75220 +
75221 +/* No-locking version of assign_txnh.  Sets the transaction handle's atom pointer,
75222 +   increases atom refcount and txnh_count, adds to txnh_list. */
75223 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
75224 +{
75225 +       assert("umka-200", atom != NULL);
75226 +       assert("umka-201", txnh != NULL);
75227 +       /* "nolock": both the handle and the atom are already spin-locked by the caller */
75228 +       assert_spin_locked(&(txnh->hlock));
75229 +       assert_spin_locked(&(atom->alock));
75230 +       assert("jmacd-824", txnh->atom == NULL);
75231 +       assert("nikita-3540", atom_isopen(atom));
75232 +       BUG_ON(txnh->atom != NULL);     /* same condition as "jmacd-824" above, as a BUG_ON */
75233 +       /* the handle's pointer to the atom is counted in atom->refcount */
75234 +       atomic_inc(&atom->refcount);
75235 +       txnh->atom = atom;
75236 +       set_gfp_mask();
75237 +       list_add_tail(&txnh->txnh_link, &atom->txnh_list);
75238 +       atom->txnh_count += 1;
75239 +}
75240 +
75241 +/* No-locking version of assign_block.  Sets the block's atom pointer, references the
75242 +   block, adds it to the atom's clean list (the node must not be dirty), increments capture_count. */
75243 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
75244 +{
75245 +       assert("umka-202", atom != NULL);
75246 +       assert("umka-203", node != NULL);
75247 +       assert_spin_locked(&(node->guard));
75248 +       assert_spin_locked(&(atom->alock));
75249 +       assert("jmacd-323", node->atom == NULL);
75250 +       BUG_ON(!list_empty_careful(&node->capture_link));       /* node must not be on any capture list yet */
75251 +       assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
75252 +
75253 +       /* Pointer from jnode to atom is not counted in atom->refcount. */
75254 +       node->atom = atom;
75255 +
75256 +       list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
75257 +       atom->capture_count += 1;
75258 +       /* reference to jnode is acquired by atom. */
75259 +       jref(node);
75260 +
75261 +       ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
75262 +
75263 +       LOCK_CNT_INC(t_refs);
75264 +}
75265 +
75266 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
75267 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
75268 +{
75269 +       assert_spin_locked(&(node->guard));
75270 +       assert_spin_locked(&(atom->alock));
75271 +       assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
75272 +
75273 +       JF_SET(node, JNODE_DIRTY);
75274 +
75275 +       get_current_context()->nr_marked_dirty++;
75276 +
75277 +       /* We grab2flush_reserve one additional block only if node was
75278 +          not CREATED and jnode_flush sorted it into neither the
75279 +          relocate set nor the overwrite set. If node is in overwrite or
75280 +          relocate set we assume that atom's flush reserved counter was
75281 +          already adjusted. */
75282 +       if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
75283 +           && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
75284 +           && !jnode_is_cluster_page(node)) {
75285 +               assert("vs-1093", !blocknr_is_fake(&node->blocknr));
75286 +               assert("vs-1506", *jnode_get_block(node) != 0);
75287 +               grabbed2flush_reserved_nolock(atom, (__u64) 1);
75288 +               JF_SET(node, JNODE_FLUSH_RESERVED);
75289 +       }
75290 +       /* a node that is on a flush queue keeps its current list position */
75291 +       if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75292 +               /* If the atom is not set yet, it will be added to the appropriate list in
75293 +                  capture_assign_block_nolock. */
75294 +               /* Sometimes a node is set dirty before being captured -- the case for new
75295 +                  jnodes.  In that case the jnode will be added to the appropriate list
75296 +                  in capture_assign_block_nolock. Another reason not to re-link jnode is
75297 +                  that jnode is on a flush queue (see flush.c for details) */
75298 +
75299 +               int level = jnode_get_level(node);
75300 +
75301 +               assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
75302 +               assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
75303 +               assert("nikita-2607", 0 <= level);
75304 +               assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
75305 +
75306 +               /* move node to the atom's dirty list for its tree level */
75307 +               list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
75308 +               ON_DEBUG(count_jnode
75309 +                        (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
75310 +       }
75311 +}
75312 +
75313 +/* Set the dirty status for this (spin locked) jnode. */
75314 +void jnode_make_dirty_locked(jnode * node)
75315 +{
75316 +       assert("umka-204", node != NULL);
75317 +       assert_spin_locked(&(node->guard));
75318 +       /* debug builds only: warn (with a stack dump) when dirtying on a read-only fs */
75319 +       if (REISER4_DEBUG && rofs_jnode(node)) {
75320 +               warning("nikita-3365", "Dirtying jnode on rofs");
75321 +               dump_stack();
75322 +       }
75323 +
75324 +       /* Fast check for already dirty node */
75325 +       if (!JF_ISSET(node, JNODE_DIRTY)) {
75326 +               txn_atom *atom;
75327 +
75328 +               atom = jnode_get_atom(node);
75329 +               assert("vs-1094", atom);
75330 +               /* Check jnode dirty status again because node spin lock might
75331 +                * be released inside jnode_get_atom(). */
75332 +               if (likely(!JF_ISSET(node, JNODE_DIRTY)))
75333 +                       do_jnode_make_dirty(node, atom);
75334 +               spin_unlock_atom(atom); /* do_jnode_make_dirty() asserts the atom lock is held; drop it here */
75335 +       }
75336 +}
75337 +
75338 +/* Set the dirty status for this znode. */
75339 +void znode_make_dirty(znode * z)
75340 +{
75341 +       jnode *node;
75342 +       struct page *page;
75343 +       /* the caller must hold the long-term write lock on @z (asserted below) */
75344 +       assert("umka-204", z != NULL);
75345 +       assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
75346 +       assert("nikita-3560", znode_is_write_locked(z));
75347 +
75348 +       node = ZJNODE(z);
75349 +       /* znode is longterm locked, we can check dirty bit without spinlock */
75350 +       if (JF_ISSET(node, JNODE_DIRTY)) {
75351 +               /* znode is dirty already. All we have to do is to change znode version */
75352 +               z->version = znode_build_version(jnode_get_tree(node));
75353 +               return;
75354 +       }
75355 +
75356 +       spin_lock_jnode(node);
75357 +       jnode_make_dirty_locked(node);
75358 +       page = jnode_page(node);
75359 +       if (page != NULL) {
75360 +               /* this is useful assertion (allows one to check that no
75361 +                * modifications are lost due to update of in-flight page),
75362 +                * but it requires locking on page to check PG_writeback
75363 +                * bit. */
75364 +               /* assert("nikita-3292",
75365 +                  !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75366 +               page_cache_get(page);
75367 +
75368 +               /* jnode lock is not needed for the rest of
75369 +                * znode_make_dirty(). */
75370 +               spin_unlock_jnode(node);
75371 +               /* reiser4 file write code calls set_page_dirty for
75372 +                * unformatted nodes, for formatted nodes we do it here. */
75373 +               set_page_dirty_internal(page);
75374 +               page_cache_release(page);
75375 +               /* bump version counter in znode */
75376 +               z->version = znode_build_version(jnode_get_tree(node));
75377 +       } else {
75378 +               assert("zam-596", znode_above_root(JZNODE(node)));
75379 +               spin_unlock_jnode(node);
75380 +       }
75381 +
75382 +       assert("nikita-1900", znode_is_write_locked(z));
75383 +       assert("jmacd-9777", node->atom != NULL);
75384 +}
75385 +
75386 +int sync_atom(txn_atom * atom)
75387 +{
75388 +       int result;
75389 +       txn_handle *txnh;
75390 +       /* Act on @atom according to its commit stage; a NULL @atom yields 0. */
75391 +       txnh = get_current_context()->trans;
75392 +       /* NOTE(review): a non-NULL @atom is presumably passed in spin-locked (the last branch unlocks it) - confirm */
75393 +       result = 0;
75394 +       if (atom != NULL) {
75395 +               if (atom->stage < ASTAGE_PRE_COMMIT) {
75396 +                       spin_lock_txnh(txnh);
75397 +                       capture_assign_txnh_nolock(atom, txnh); /* BUGs if the current handle already has an atom */
75398 +                       result = force_commit_atom(txnh);
75399 +               } else if (atom->stage < ASTAGE_POST_COMMIT) {
75400 +                       /* wait atom commit */
75401 +                       atom_wait_event(atom);
75402 +                       /* try once more */
75403 +                       result = RETERR(-E_REPEAT);
75404 +               } else
75405 +                       spin_unlock_atom(atom); /* stage is ASTAGE_POST_COMMIT or later: nothing to do, result stays 0 */
75406 +       }
75407 +       return result;
75408 +}
75409 +
75410 +#if REISER4_DEBUG
75411 +
75412 +/* Debug-only: move jnode's accounting from @old_list's counter to @new_list's;
75413 +   call this after atom->capture_count is updated */
75414 +void
75415 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75416 +           atom_list new_list, int check_lists)
75417 +{
75418 +       struct list_head *pos;
75419 +
75420 +       assert("zam-1018", atom_is_protected(atom));
75421 +       assert_spin_locked(&(node->guard));
75422 +       assert("", NODE_LIST(node) == old_list);
75423 +       /* take @node off the counter of the list it is leaving */
75424 +       switch (NODE_LIST(node)) {
75425 +       case NOT_CAPTURED:
75426 +               break;
75427 +       case DIRTY_LIST:
75428 +               assert("", atom->dirty > 0);
75429 +               atom->dirty--;
75430 +               break;
75431 +       case CLEAN_LIST:
75432 +               assert("", atom->clean > 0);
75433 +               atom->clean--;
75434 +               break;
75435 +       case FQ_LIST:
75436 +               assert("", atom->fq > 0);
75437 +               atom->fq--;
75438 +               break;
75439 +       case WB_LIST:
75440 +               assert("", atom->wb > 0);
75441 +               atom->wb--;
75442 +               break;
75443 +       case OVRWR_LIST:
75444 +               assert("", atom->ovrwr > 0);
75445 +               atom->ovrwr--;
75446 +               break;
75447 +       default:
75448 +               impossible("", "");
75449 +       }
75450 +       /* ... and add it to the counter of the list it is entering */
75451 +       switch (new_list) {
75452 +       case NOT_CAPTURED:
75453 +               break;
75454 +       case DIRTY_LIST:
75455 +               atom->dirty++;
75456 +               break;
75457 +       case CLEAN_LIST:
75458 +               atom->clean++;
75459 +               break;
75460 +       case FQ_LIST:
75461 +               atom->fq++;
75462 +               break;
75463 +       case WB_LIST:
75464 +               atom->wb++;
75465 +               break;
75466 +       case OVRWR_LIST:
75467 +               atom->ovrwr++;
75468 +               break;
75469 +       default:
75470 +               impossible("", "");
75471 +       }
75472 +       ASSIGN_NODE_LIST(node, new_list);
75473 +       if (0 && check_lists) { /* list walk below is disabled by the "0 &&" */
75474 +               int count;
75475 +               tree_level level;
75476 +
75477 +               count = 0;
75478 +
75479 +               /* flush queue list */
75480 +               /*check_fq(atom); */
75481 +
75482 +               /* dirty list */
75483 +               count = 0;
75484 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75485 +                       list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75486 +                               count++;
75487 +               }
75488 +               if (count != atom->dirty)
75489 +                       warning("", "dirty counter %d, real %d\n", atom->dirty,
75490 +                               count);
75491 +
75492 +               /* clean list */
75493 +               count = 0;
75494 +               list_for_each(pos, ATOM_CLEAN_LIST(atom))
75495 +                       count++;
75496 +               if (count != atom->clean)
75497 +                       warning("", "clean counter %d, real %d\n", atom->clean,
75498 +                               count);
75499 +
75500 +               /* wb list */
75501 +               count = 0;
75502 +               list_for_each(pos, ATOM_WB_LIST(atom))
75503 +                       count++;
75504 +               if (count != atom->wb)
75505 +                       warning("", "wb counter %d, real %d\n", atom->wb,
75506 +                               count);
75507 +
75508 +               /* overwrite list */
75509 +               count = 0;
75510 +               list_for_each(pos, ATOM_OVRWR_LIST(atom))
75511 +                       count++;
75512 +
75513 +               if (count != atom->ovrwr)
75514 +                       warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75515 +                               count);
75516 +       }
75517 +       assert("vs-1624", atom->num_queued == atom->fq);
75518 +       if (atom->capture_count !=
75519 +           atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75520 +               printk
75521 +                   ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75522 +                    atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75523 +                    atom->wb, atom->fq);
75524 +               assert("vs-1622",
75525 +                      atom->capture_count ==
75526 +                      atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75527 +                      atom->fq);
75528 +       }
75529 +}
75530 +
75531 +#endif
75532 +
75533 +/* Set JNODE_OVRWR on node and move it to the tail of its atom's overwrite
75534 + * list; atom lock and jnode lock should be taken before calling this. */
75535 +void jnode_make_wander_nolock(jnode * node)
75536 +{
75537 +       txn_atom *atom;
75538 +
75539 +       assert("nikita-2431", node != NULL);
75540 +       assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75541 +       assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75542 +       assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75543 +       assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
75544 +
75545 +       atom = node->atom;
75546 +       /* the node must already belong to an atom, and that atom be protected */
75547 +       assert("zam-895", atom != NULL);
75548 +       assert("zam-894", atom_is_protected(atom));
75549 +
75550 +       JF_SET(node, JNODE_OVRWR);
75551 +       /* move node to atom's overwrite list */
75552 +       list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75553 +       ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75554 +}
75555 +
75556 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken (and
75557 + * released again) inside this function. */
75558 +void jnode_make_wander(jnode * node)
75559 +{
75560 +       txn_atom *atom;
75561 +
75562 +       spin_lock_jnode(node);
75563 +       atom = jnode_get_atom(node);
75564 +       assert("zam-913", atom != NULL);
75565 +       assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75566 +       /* NOTE(review): jnode_get_atom() presumably returns the atom locked */
75567 +       jnode_make_wander_nolock(node);
75568 +       spin_unlock_atom(atom);
75569 +       spin_unlock_jnode(node);
75570 +}
75571 +
75572 +/* this just sets RELOC bit; node->guard must be locked, @fq is not used here */
75573 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75574 +{
75575 +       assert_spin_locked(&(node->guard));
75576 +       assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75577 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75578 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75579 +       assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75580 +       assert("nikita-3367", !blocknr_is_fake(jnode_get_block(node)));
75581 +       jnode_set_reloc(node);
75582 +}
75583 +
75584 +/* Make znode RELOC and put it on flush queue @fq; locks the jnode internally */
75585 +void znode_make_reloc(znode * z, flush_queue_t * fq)
75586 +{
75587 +       jnode *node;
75588 +       txn_atom *atom;
75589 +
75590 +       node = ZJNODE(z);
75591 +       spin_lock_jnode(node);
75592 +       /* NOTE(review): jnode_get_atom() presumably returns the atom locked */
75593 +       atom = jnode_get_atom(node);
75594 +       assert("zam-919", atom != NULL);
75595 +
75596 +       jnode_make_reloc_nolock(fq, node);
75597 +       queue_jnode(fq, node);
75598 +
75599 +       spin_unlock_atom(atom);
75600 +       spin_unlock_jnode(node);
75601 +
75602 +}
75603 +
75604 +/* Make unformatted node RELOC and put it on flush queue; takes no locks itself */
75605 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75606 +{
75607 +       assert("vs-1479", jnode_is_unformatted(node));
75608 +       /* node->guard must already be held: jnode_make_reloc_nolock asserts it */
75609 +       jnode_make_reloc_nolock(fq, node);
75610 +       queue_jnode(fq, node);
75611 +}
75612 +/* Write-lock the uber znode of @s's tree, reserve one block and dirty it. */
75613 +int capture_super_block(struct super_block *s)
75614 +{
75615 +       int result;
75616 +       znode *uber;
75617 +       lock_handle lh;
75618 +
75619 +       init_lh(&lh);
75620 +       result = get_uber_znode(get_tree(s),
75621 +                               ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75622 +       if (result)
75623 +               return result;
75624 +
75625 +       uber = lh.node;
75626 +       /* Grabbing one block for superblock */
75627 +       result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75628 +       /* dirty the uber znode only if the block was really reserved */
75629 +       if (result == 0)
75630 +               znode_make_dirty(uber);
75631 +
75632 +       /* release the lock handle on the failure path too (it used to leak) */
75633 +       done_lh(&lh);
75634 +       return result;
75635 +}
75636 +
75637 +/* Wake up handles on the atom's WAITFOR list, subject to their waitfor_cb */
75638 +static void wakeup_atom_waitfor_list(txn_atom * atom)
75639 +{
75640 +       txn_wait_links *wlinks;
75641 +
75642 +       assert("umka-210", atom != NULL);
75643 +
75644 +       /* atom is locked */
75645 +       list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75646 +               if (wlinks->waitfor_cb == NULL ||
75647 +                   wlinks->waitfor_cb(atom, wlinks))
75648 +                       /* Wake up: no callback, or it returned non-zero. */
75649 +                       reiser4_wake_up(wlinks->_lock_stack);
75650 +       }
75651 +}
75652 +
75653 +/* Wake up handles on the atom's WAITING list, subject to their waiting_cb */
75654 +static void wakeup_atom_waiting_list(txn_atom * atom)
75655 +{
75656 +       txn_wait_links *wlinks;
75657 +
75658 +       assert("umka-211", atom != NULL);
75659 +
75660 +       /* atom is locked */
75661 +       list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75662 +               if (wlinks->waiting_cb == NULL ||
75663 +                   wlinks->waiting_cb(atom, wlinks))
75664 +                       /* Wake up: no callback, or it returned non-zero. */
75665 +                       reiser4_wake_up(wlinks->_lock_stack);
75666 +       }
75667 +}
75668 +
75669 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75670 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75671 +{
75672 +       assert("nikita-3330", atom != NULL);
75673 +       assert_spin_locked(&(atom->alock));
75674 +       /* non-zero result means "wake the waiter"; @wlinks is not examined */
75675 +       /* atom->txnh_count == 1 is for waking waiters up if we are releasing
75676 +        * last transaction handle. */
75677 +       return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75678 +}
75679 +
75680 +/* The general purpose of this function is to wait on the first of two possible events.
75681 +   The situation is that a handle (and its atom atomh) is blocked trying to capture a
75682 +   block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state.  The
75683 +   handle's atom (atomh) is not in the CAPTURE_WAIT state.  However, atomh could fuse with
75684 +   another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75685 +   needs to unblock the handle to avoid deadlock.  When the txnh is unblocked it will
75686 +   proceed and fuse the two atoms in the CAPTURE_WAIT state.
75687 +
75688 +   In other words, if either atomh or atomf change state, the handle will be awakened,
75689 +   thus there are two lists per atom: WAITING and WAITFOR.
75690 +
75691 +   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75692 +   close but it is not assigned to an atom of its own.
75693 +   Returns -E_BLOCK for a NONBLOCKING request, otherwise -E_REPEAT after sleeping.
75694 +   Locking: TXNH_LOCK and the atom locks (atomf, and atomh unless NULL) are held on
75695 +   entry; all of them are released here.  No jnode lock is touched by this function.
75696 +*/
75697 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75698 +                   txn_atom * atomh, txn_capture mode)
75699 +{
75700 +       int ret;
75701 +       txn_wait_links wlinks;
75702 +
75703 +       assert("umka-213", txnh != NULL);
75704 +       assert("umka-214", atomf != NULL);
75705 +
75706 +       if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75707 +               spin_unlock_txnh(txnh);
75708 +               spin_unlock_atom(atomf);
75709 +
75710 +               if (atomh) {
75711 +                       spin_unlock_atom(atomh);
75712 +               }
75713 +
75714 +               return RETERR(-E_BLOCK);
75715 +       }
75716 +
75717 +       /* Initialize the waiting list links. */
75718 +       init_wlinks(&wlinks);
75719 +
75720 +       /* Add txnh to atomf's waitfor list, unlock atomf. */
75721 +       list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75722 +       wlinks.waitfor_cb = wait_for_fusion;
75723 +       atomic_inc(&atomf->refcount);
75724 +       spin_unlock_atom(atomf);
75725 +
75726 +       if (atomh) {
75727 +               /* Add txnh to atomh's waiting list, unlock atomh. */
75728 +               list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75729 +               atomic_inc(&atomh->refcount);
75730 +               spin_unlock_atom(atomh);
75731 +       }
75732 +
75733 +       /* Go to sleep. */
75734 +       spin_unlock_txnh(txnh);
75735 +
75736 +       ret = prepare_to_sleep(wlinks._lock_stack);
75737 +       if (ret == 0) {
75738 +               go_to_sleep(wlinks._lock_stack);
75739 +               ret = RETERR(-E_REPEAT);
75740 +       }
75741 +
75742 +       /* Remove from the waitfor list. */
75743 +       spin_lock_atom(atomf);
75744 +
75745 +       list_del(&wlinks._fwaitfor_link);
75746 +       atom_dec_and_unlock(atomf);
75747 +
75748 +       if (atomh) {
75749 +               /* Remove from the waiting list. */
75750 +               spin_lock_atom(atomh);
75751 +               list_del(&wlinks._fwaiting_link);
75752 +               atom_dec_and_unlock(atomh);
75753 +       }
75754 +       return ret;
75755 +}
75756 +
75757 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
75758 +{
75759 +       txn_atom *lower = (one < two) ? one : two;
75760 +       txn_atom *upper = (one < two) ? two : one;
75761 +
75762 +       assert("zam-1067", one != two);
75763 +
75764 +       /* Always take the lock of the atom with the lesser address first,
75765 +        * so that every caller locks a given pair in the same order,
75766 +        * whichever way round the two arguments were passed. */
75767 +       spin_lock_atom(lower);
75768 +       spin_lock_atom(upper);
75769 +}
75770 +
75771 +
75772 +/* Perform the necessary work to prepare for fusing two atoms, which involves
75773 + * acquiring two atom locks in the proper order.  If the node's atom is in a
75774 + * later stage than the handle's atom (and is not a CAPTURE_WAIT atom without
75775 + * transaction handles) the handle's request is put to sleep in
75776 + * capture_fuse_wait.  Otherwise the handle's atom is fused into the node's
75777 + * atom by capture_fuse_into.  Called with the txnh and jnode spin locks held;
75778 + * both are dropped here.  -E_REPEAT tells the caller to retry the capture.
75779 + */
75780 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75781 +{
75782 +       txn_atom * txnh_atom = txnh->atom;
75783 +       txn_atom * block_atom = node->atom;
75784 +       /* take a reference on both atoms before the spin locks are dropped */
75785 +       atomic_inc(&txnh_atom->refcount);
75786 +       atomic_inc(&block_atom->refcount);
75787 +
75788 +       spin_unlock_txnh(txnh);
75789 +       spin_unlock_jnode(node);
75790 +
75791 +       lock_two_atoms(txnh_atom, block_atom);
75792 +       /* the atom pointers may have changed while nothing was locked */
75793 +       if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75794 +               release_two_atoms(txnh_atom, block_atom);
75795 +               return RETERR(-E_REPEAT);
75796 +       }
75797 +       /* pointers are unchanged: drop the two temporary references */
75798 +       atomic_dec(&txnh_atom->refcount);
75799 +       atomic_dec(&block_atom->refcount);
75800 +
75801 +       assert ("zam-1066", atom_isopen(txnh_atom));
75802 +
75803 +       if (txnh_atom->stage >= block_atom->stage ||
75804 +           (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
75805 +               capture_fuse_into(txnh_atom, block_atom);
75806 +               return RETERR(-E_REPEAT);
75807 +       }
75808 +       spin_lock_txnh(txnh);
75809 +       return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
75810 +}
75811 +
75812 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
75813 +   the small list to point to the large atom.  Returns the length of the small list. */
75814 +static int
75815 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
75816 +                        struct list_head *small_head)
75817 +{
75818 +       int count = 0;
75819 +       jnode *node;
75820 +
75821 +       assert("umka-218", large != NULL);
75822 +       assert("umka-219", large_head != NULL);
75823 +       assert("umka-220", small_head != NULL);
75824 +       /* small atom should be locked also. */
75825 +       assert_spin_locked(&(large->alock));
75826 +
75827 +       /* For every jnode on small's capture list... */
75828 +       list_for_each_entry(node, small_head, capture_link) {
75829 +               count += 1;
75830 +
75831 +               /* With the jnode lock held, update atom pointer. */
75832 +               spin_lock_jnode(node);
75833 +               node->atom = large;
75834 +               spin_unlock_jnode(node);
75835 +       }
75836 +
75837 +       /* Splice small onto the tail of large; small_head is left empty. */
75838 +       list_splice_init(small_head, large_head->prev);
75839 +
75840 +       return count;
75841 +}
75842 +
75843 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
75844 +   the small list to point to the large atom.  Returns the length of the small list. */
75845 +static int
75846 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
75847 +                       struct list_head *small_head)
75848 +{
75849 +       int count = 0;
75850 +       txn_handle *txnh;
75851 +
75852 +       assert("umka-221", large != NULL);
75853 +       assert("umka-222", large_head != NULL);
75854 +       assert("umka-223", small_head != NULL);
75855 +
75856 +       /* Adjust every txnh to the new atom. */
75857 +       list_for_each_entry(txnh, small_head, txnh_link) {
75858 +               count += 1;
75859 +
75860 +               /* With the txnh lock held, update atom pointer. */
75861 +               spin_lock_txnh(txnh);
75862 +               txnh->atom = large;
75863 +               spin_unlock_txnh(txnh);
75864 +       }
75865 +
75866 +       /* Splice small onto the tail of large; small_head is left empty. */
75867 +       list_splice_init(small_head, large_head->prev);
75868 +
75869 +       return count;
75870 +}
75871 +
75872 +/* This function fuses two atoms.  The captured nodes and handles belonging to SMALL are
75873 +   added to LARGE and their ->atom pointers are all updated.  The associated counts are
75874 +   updated as well, and any waiting handles belonging to either are awakened.  Finally the
75875 +   smaller atom's refcount is decremented.  Both atoms must be locked by the caller and
75876 +   both locks are released before returning. */
75877 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
75878 +{
75879 +       int level;
75880 +       unsigned zcount = 0;
75881 +       unsigned tcount = 0;
75882 +
75883 +       assert("umka-224", small != NULL);
75884 +       assert("umka-225", large != NULL);
75885 +
75886 +       assert_spin_locked(&(large->alock));
75887 +       assert_spin_locked(&(small->alock));
75888 +
75889 +       assert("jmacd-201", atom_isopen(small));
75890 +       assert("jmacd-202", atom_isopen(large));
75891 +
75892 +       /* Splice and update the per-level dirty jnode lists */
75893 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75894 +               zcount +=
75895 +                   capture_fuse_jnode_lists(large,
75896 +                                            ATOM_DIRTY_LIST(large, level),
75897 +                                            ATOM_DIRTY_LIST(small, level));
75898 +       }
75899 +
75900 +       /* Splice and update the [clean,dirty] jnode and txnh lists */
75901 +       zcount +=
75902 +           capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
75903 +                                    ATOM_CLEAN_LIST(small));
75904 +       zcount +=
75905 +           capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
75906 +                                    ATOM_OVRWR_LIST(small));
75907 +       zcount +=
75908 +           capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
75909 +                                    ATOM_WB_LIST(small));
75910 +       zcount +=
75911 +           capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
75912 +       tcount +=
75913 +           capture_fuse_txnh_lists(large, &large->txnh_list,
75914 +                                   &small->txnh_list);
75915 +
75916 +       /* Check our accounting. */
75917 +       assert("jmacd-1063",
75918 +              zcount + small->num_queued == small->capture_count);
75919 +       assert("jmacd-1065", tcount == small->txnh_count);
75920 +
75921 +       /* sum numbers of waiters threads */
75922 +       large->nr_waiters += small->nr_waiters;
75923 +       small->nr_waiters = 0;
75924 +
75925 +       /* splice flush queues */
75926 +       fuse_fq(large, small);
75927 +
75928 +       /* update counter of jnode on every atom' list */
75929 +       ON_DEBUG(large->dirty += small->dirty;
75930 +                small->dirty = 0;
75931 +                large->clean += small->clean;
75932 +                small->clean = 0;
75933 +                large->ovrwr += small->ovrwr;
75934 +                small->ovrwr = 0;
75935 +                large->wb += small->wb;
75936 +                small->wb = 0;
75937 +                large->fq += small->fq;
75938 +                small->fq = 0;);
75939 +
75940 +       /* count flushers in result atom */
75941 +       large->nr_flushers += small->nr_flushers;
75942 +       small->nr_flushers = 0;
75943 +
75944 +       /* update counts of flushed nodes */
75945 +       large->flushed += small->flushed;
75946 +       small->flushed = 0;
75947 +
75948 +       /* Transfer list counts to large. */
75949 +       large->txnh_count += small->txnh_count;
75950 +       large->capture_count += small->capture_count;
75951 +
75952 +       /* Add all txnh references to large. */
75953 +       atomic_add(small->txnh_count, &large->refcount);
75954 +       atomic_sub(small->txnh_count, &small->refcount);
75955 +
75956 +       /* Reset small counts */
75957 +       small->txnh_count = 0;
75958 +       small->capture_count = 0;
75959 +
75960 +       /* Assign the oldest start_time, merge flags. */
75961 +       large->start_time = min(large->start_time, small->start_time);
75962 +       large->flags |= small->flags;
75963 +
75964 +       /* Merge blocknr sets. */
75965 +       blocknr_set_merge(&small->delete_set, &large->delete_set);
75966 +       blocknr_set_merge(&small->wandered_map, &large->wandered_map);
75967 +
75968 +       /* Merge allocated/deleted file counts */
75969 +       large->nr_objects_deleted += small->nr_objects_deleted;
75970 +       large->nr_objects_created += small->nr_objects_created;
75971 +
75972 +       small->nr_objects_deleted = 0;
75973 +       small->nr_objects_created = 0;
75974 +
75975 +       /* Merge allocated blocks counts */
75976 +       large->nr_blocks_allocated += small->nr_blocks_allocated;
75977 +
75978 +       large->nr_running_queues += small->nr_running_queues;
75979 +       small->nr_running_queues = 0;
75980 +
75981 +       /* Merge blocks reserved for overwrite set. */
75982 +       large->flush_reserved += small->flush_reserved;
75983 +       small->flush_reserved = 0;
75984 +
75985 +       if (large->stage < small->stage) {
75986 +               /* Large only needs to notify if it has changed state. */
75987 +               atom_set_stage(large, small->stage);
75988 +               wakeup_atom_waiting_list(large);
75989 +       }
75990 +
75991 +       atom_set_stage(small, ASTAGE_INVALID);
75992 +
75993 +       /* Notify any waiters--small needs to unload its wait lists.  Waiters
75994 +          actually remove themselves from the list before returning from the
75995 +          fuse_wait function. */
75996 +       wakeup_atom_waiting_list(small);
75997 +
75998 +       /* Unlock atoms */
75999 +       spin_unlock_atom(large);
76000 +       atom_dec_and_unlock(small);
76001 +}
76002 +
76003 +/* TXNMGR STUFF */
76004 +
76005 +/* Release a block from the atom, reversing the effects of being captured,
76006 +   do not release atom's reference to jnode due to holding spin-locks.
76007 +   Currently this is only called when the atom commits.  Called with the
76008 +   jnode spin lock held; that lock is dropped before returning.
76009 +   NOTE: this function does not release a (journal) reference to jnode
76010 +   due to locking optimizations, you should call jput() somewhere after
76011 +   calling uncapture_block(). */
76012 +void uncapture_block(jnode * node)
76013 +{
76014 +       txn_atom *atom;
76015 +
76016 +       assert("umka-226", node != NULL);
76017 +       atom = node->atom;
76018 +       assert("umka-228", atom != NULL);
76019 +
76020 +       assert("jmacd-1021", node->atom == atom);
76021 +       assert_spin_locked(&(node->guard));
76022 +       assert("jmacd-1023", atom_is_protected(atom));
76023 +       /* clear every per-capture state flag of the jnode */
76024 +       JF_CLR(node, JNODE_DIRTY);
76025 +       JF_CLR(node, JNODE_RELOC);
76026 +       JF_CLR(node, JNODE_OVRWR);
76027 +       JF_CLR(node, JNODE_CREATED);
76028 +       JF_CLR(node, JNODE_WRITEBACK);
76029 +       JF_CLR(node, JNODE_REPACK);
76030 +
76031 +       list_del_init(&node->capture_link);
76032 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
76033 +               assert("zam-925", atom_isopen(atom));
76034 +               assert("vs-1623", NODE_LIST(node) == FQ_LIST);
76035 +               ON_DEBUG(atom->num_queued--);
76036 +               JF_CLR(node, JNODE_FLUSH_QUEUED);
76037 +       }
76038 +       atom->capture_count -= 1;
76039 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
76040 +       node->atom = NULL;
76041 +
76042 +       spin_unlock_jnode(node);
76043 +       LOCK_CNT_DEC(t_refs);
76044 +}
76045 +
76046 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
76047 +   bitmap-based allocator code for adding modified bitmap blocks to the
76048 +   transaction. @atom and @node are spin locked; a jnode reference is taken */
76049 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
76050 +{
76051 +       assert("zam-538", atom_is_protected(atom));
76052 +       assert_spin_locked(&(node->guard));
76053 +       assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
76054 +       assert("zam-543", node->atom == NULL);
76055 +       assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
76056 +       /* link at the head of the overwrite list and account the capture */
76057 +       list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
76058 +       jref(node);
76059 +       node->atom = atom;
76060 +       atom->capture_count++;
76061 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
76062 +}
76063 +
76064 +
76065 +#if REISER4_DEBUG
76066 +/* Debug helper: printk a one-line summary of @atom ("no atom" if NULL). */
76067 +void info_atom(const char *prefix, const txn_atom * atom)
76068 +{
76069 +       if (atom == NULL) {
76070 +               printk("%s: no atom\n", prefix);
76071 +               return;
76072 +       }
76073 +
76074 +       printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
76075 +              " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
76076 +              atomic_read(&atom->refcount), atom->atom_id, atom->flags,
76077 +              atom->txnh_count, atom->capture_count, atom->stage,
76078 +              atom->start_time, atom->flushed);
76079 +}
76080 +
76081 +#endif
76082 +
76083 +static int count_deleted_blocks_actor(txn_atom * atom,
76084 +                                     const reiser4_block_nr * a,
76085 +                                     const reiser4_block_nr * b, void *data)
76086 +{
76087 +       reiser4_block_nr *total = data;
76088 +
76089 +       assert("zam-995", data != NULL);
76090 +       assert("zam-996", a != NULL);
76091 +
76092 +       /* add one when @b is NULL, otherwise add the value @b points to */
76093 +       *total += (b != NULL) ? *b : 1;
76094 +
76095 +       return 0;
76096 +}
76097 +/* Sum the delete_set entries of every open atom of the current super block. */
76098 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
76099 +{
76100 +       reiser4_block_nr result;
76101 +       txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
76102 +       txn_atom *atom;
76103 +
76104 +       result = 0;
76105 +       /* atoms_list is walked under the txnmgr lock, each atom under its own */
76106 +       spin_lock_txnmgr(tmgr);
76107 +       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
76108 +               spin_lock_atom(atom);
76109 +               if (atom_isopen(atom))
76110 +                       blocknr_set_iterator(
76111 +                               atom, &atom->delete_set,
76112 +                               count_deleted_blocks_actor, &result, 0);
76113 +               spin_unlock_atom(atom);
76114 +       }
76115 +       spin_unlock_txnmgr(tmgr);
76116 +
76117 +       return result;
76118 +}
76119 +
76120 +/*
76121 + * Local variables:
76122 + * c-indentation-style: "K&R"
76123 + * mode-name: "LC"
76124 + * c-basic-offset: 8
76125 + * tab-width: 8
76126 + * fill-column: 79
76127 + * End:
76128 + */
76129 diff --git a/fs/reiser4/txnmgr.h b/fs/reiser4/txnmgr.h
76130 new file mode 100644
76131 index 0000000..07c80c4
76132 --- /dev/null
76133 +++ b/fs/reiser4/txnmgr.h
76134 @@ -0,0 +1,703 @@
76135 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76136 + * reiser4/README */
76137 +
76138 +/* data-types and function declarations for transaction manager. See txnmgr.c
76139 + * for details. */
76140 +
76141 +#ifndef __REISER4_TXNMGR_H__
76142 +#define __REISER4_TXNMGR_H__
76143 +
76144 +#include "forward.h"
76145 +#include "dformat.h"
76146 +
76147 +#include <linux/fs.h>
76148 +#include <linux/mm.h>
76149 +#include <linux/types.h>
76150 +#include <linux/spinlock.h>
76151 +#include <asm/atomic.h>
76152 +#include <asm/semaphore.h>
76153 +
76154 +/* TYPE DECLARATIONS */
76155 +
76156 +/* This enumeration describes the possible types of a capture request (try_capture).
76157 +   A capture request dynamically assigns a block to the calling thread's transaction
76158 +   handle.  Each value is a distinct bit, so types and options can be or-ed together. */
76159 +typedef enum {
76160 +       /* A READ_ATOMIC request indicates that a block will be read and that the caller's
76161 +          atom should fuse in order to ensure that the block commits atomically with the
76162 +          caller. */
76163 +       TXN_CAPTURE_READ_ATOMIC = (1 << 0),
76164 +
76165 +       /* A READ_NONCOM request indicates that a block will be read and that the caller is
76166 +          willing to read a non-committed block without causing atoms to fuse. */
76167 +       TXN_CAPTURE_READ_NONCOM = (1 << 1),
76168 +
76169 +       /* A READ_MODIFY request indicates that a block will be read but that the caller
76170 +          wishes for the block to be captured as it will be written.  This capture request
76171 +          mode is not currently used, but eventually it will be useful for preventing
76172 +          deadlock in read-modify-write cycles. */
76173 +       TXN_CAPTURE_READ_MODIFY = (1 << 2),
76174 +
76175 +       /* A WRITE capture request indicates that a block will be modified and that atoms
76176 +          should fuse to make the commit atomic. */
76177 +       TXN_CAPTURE_WRITE = (1 << 3),
76178 +
76179 +       /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
76180 +          exclusive type designation from extra bits that may be supplied -- see
76181 +          below. */
76182 +       TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
76183 +                            TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
76184 +                            TXN_CAPTURE_WRITE),
76185 +
76186 +       /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
76187 +          indicate modification will occur. */
76188 +       TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
76189 +
76190 +       /* An option to try_capture, NONBLOCKING indicates that the caller would
76191 +          prefer not to sleep waiting for an aging atom to commit. */
76192 +       TXN_CAPTURE_NONBLOCKING = (1 << 4),
76193 +
76194 +       /* An option to try_capture to prevent atom fusion, just simple capturing is allowed */
76195 +       TXN_CAPTURE_DONT_FUSE = (1 << 5)
76196 +       /* NOTE: CAPTURE_TYPE() below is a preprocessor macro, not an enumerator. */
76197 +       /* This macro selects only the exclusive capture request types, stripping out any
76198 +          options that were supplied (i.e., NONBLOCKING). */
76199 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
76200 +} txn_capture;
76201 +
76202 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
76203 +   difference is in the handling of read requests.  A WRITE_FUSING transaction handle
76204 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
76205 +   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
76206 +typedef enum {
76207 +       TXN_WRITE_FUSING = (1 << 0),
76208 +       TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,  /* READ implies WRITE */
76209 +} txn_mode;
76210 +
76211 +/* Every atom has a stage, which is one of these exclusive values: */
76212 +typedef enum {
76213 +       /* Initially an atom is free. */
76214 +       ASTAGE_FREE = 0,
76215 +
76216 +       /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
76217 +          blocks and fuse with other atoms. */
76218 +       ASTAGE_CAPTURE_FUSE = 1,
76219 +
76220 +       /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
76221 +
76222 +       /* When an atom reaches a certain age it must do all it can to commit.  An atom in
76223 +          the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
76224 +          atoms in the CAPTURE_FUSE stage. */
76225 +       ASTAGE_CAPTURE_WAIT = 2,
76226 +
76227 +       /* Waiting for I/O before commit.  Copy-on-capture (see
76228 +          http://namesys.com/v4/v4.html). */
76229 +       ASTAGE_PRE_COMMIT = 3,
76230 +
76231 +       /* Post-commit overwrite I/O.  Steal-on-capture. */
76232 +       ASTAGE_POST_COMMIT = 4,
76233 +
76234 +       /* Atom which waits for the removal of the last reference (presumably
76235 +        * to the atom itself) before it is deleted from memory  */
76236 +       ASTAGE_DONE = 5,
76237 +
76238 +       /* invalid atom. */
76239 +       ASTAGE_INVALID = 6,
76240 +
76241 +} txn_stage;
76242 +
76243 +/* Certain flags may be set in the txn_atom->flags field. */
76244 +typedef enum {
76245 +       /* Indicates that the atom should commit as soon as possible. */
76246 +       ATOM_FORCE_COMMIT = (1 << 0),
76247 +       /* to avoid endless loop, mark the atom (which was considered as too
76248 +        * small) after failed attempt to fuse it. */
76249 +       ATOM_CANCEL_FUSION = (1 << 1)
76250 +} txn_flags;
76251 +
76252 +/* Flags for controlling commit_txnh */
76253 +typedef enum {
76254 +       /* Wait commit atom completion in commit_txnh */
76255 +       TXNH_WAIT_COMMIT = 0x2,
76256 +       /* Don't commit atom when this handle is closed */
76257 +       TXNH_DONT_COMMIT = 0x4
76258 +} txn_handle_flags_t;
76259 +
76260 +/* TYPE DEFINITIONS */
76261 +
76262 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
76263 +   fields, so typically an operation on the atom through either of these objects must (1)
76264 +   lock the object, (2) read the atom pointer, (3) lock the atom.
76265 +
76266 +   During atom fusion, the process holds locks on both atoms at once.  Then, it iterates
76267 +   through the list of handles and pages held by the smaller of the two atoms.  For each
76268 +   handle and page referencing the smaller atom, the fusing process must: (1) lock the
76269 +   object, and (2) update the atom pointer.
76270 +
76271 +   You can see that there is a conflict of lock ordering here, so the more-complex
76272 +   procedure should have priority, i.e., the fusing process has priority so that it is
76273 +   guaranteed to make progress and to avoid restarts.
76274 +
76275 +   This decision, however, means additional complexity for acquiring the atom lock in the
76276 +   first place.
76277 +
76278 +   The general original procedure followed in the code was:
76279 +
76280 +       TXN_OBJECT *obj = ...;
76281 +       TXN_ATOM   *atom;
76282 +
76283 +       spin_lock (& obj->_lock);
76284 +
76285 +       atom = obj->_atom;
76286 +
76287 +       if (! spin_trylock_atom (atom))
76288 +         {
76289 +           spin_unlock (& obj->_lock);
76290 +           RESTART OPERATION, THERE WAS A RACE;
76291 +         }
76292 +
76293 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
76294 +
76295 +   It has however been found that this wastes a lot of CPU in a manner that is
76296 +   hard to profile. So, proper refcounting was added to atoms, and the new
76297 +   standard locking sequence is as follows:
76298 +
76299 +       TXN_OBJECT *obj = ...;
76300 +       TXN_ATOM   *atom;
76301 +
76302 +       spin_lock (& obj->_lock);
76303 +
76304 +       atom = obj->_atom;
76305 +
76306 +       if (! spin_trylock_atom (atom))
76307 +         {
76308 +           atomic_inc (& atom->refcount);
76309 +           spin_unlock (& obj->_lock);
76310 +           spin_lock (&atom->_lock);
76311 +           atomic_dec (& atom->refcount);
76312 +           // HERE atom is locked
76313 +           spin_unlock (&atom->_lock);
76314 +           RESTART OPERATION, THERE WAS A RACE;
76315 +         }
76316 +
76317 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
76318 +
76319 +   (core of this is implemented in trylock_throttle() function)
76320 +
76321 +   See the jnode_get_atom() function for a common case.
76322 +
76323 +   As an additional (and important) optimization allowing to avoid restarts,
76324 +   it is possible to re-check required pre-conditions at the HERE point in
76325 +   code above and proceed without restarting if they are still satisfied.
76326 +*/
76327 +
76328 +/* A block number set consists of only the list head. */
76329 +struct blocknr_set {
76330 +       struct list_head entries;
76331 +};
76332 +
76333 +/* An atomic transaction: this is the underlying system representation
76334 +   of a transaction, not the one seen by clients.
76335 +
76336 +   Invariants involving this data-type:
76337 +
76338 +      [sb-fake-allocated]
76339 +*/
76340 +struct txn_atom {
76341 +       /* The spinlock protecting the atom, held during fusion and various other state
76342 +          changes. */
76343 +       spinlock_t alock;
76344 +
76345 +       /* The atom's reference counter, increasing (in case of a duplication
76346 +          of an existing reference or when we are sure that some other
76347 +          reference exists) may be done without taking spinlock, decrementing
76348 +          of the ref. counter requires a spinlock to be held.
76349 +
76350 +          Each transaction handle counts in ->refcount. All jnodes count as
76351 +          one reference acquired in atom_begin_andlock(), released in
76352 +          commit_current_atom().
76353 +        */
76354 +       atomic_t refcount;
76355 +
76356 +       /* The atom_id identifies the atom in persistent records such as the log. */
76357 +       __u32 atom_id;
76358 +
76359 +       /* Flags holding any of the txn_flags enumerated values (e.g.,
76360 +          ATOM_FORCE_COMMIT). */
76361 +       __u32 flags;
76362 +
76363 +       /* Number of open handles. */
76364 +       __u32 txnh_count;
76365 +
76366 +       /* The number of znodes captured by this atom.  Equal to the sum of lengths of the
76367 +          dirty_nodes[level] and clean_nodes lists. */
76368 +       __u32 capture_count;
76369 +
76370 +#if REISER4_DEBUG
76371 +       int clean;
76372 +       int dirty;
76373 +       int ovrwr;
76374 +       int wb;
76375 +       int fq;
76376 +#endif
76377 +
76378 +       __u32 flushed;
76379 +
76380 +       /* Current transaction stage. */
76381 +       txn_stage stage;
76382 +
76383 +       /* Start time. */
76384 +       unsigned long start_time;
76385 +
76386 +       /* The atom's delete set. It collects block numbers of the nodes
76387 +          which were deleted during the transaction. */
76388 +       blocknr_set delete_set;
76389 +
76390 +       /* The atom's wandered_block mapping. */
76391 +       blocknr_set wandered_map;
76392 +
76393 +       /* The transaction's list of dirty captured nodes--per level.  Index
76394 +          by (level). dirty_nodes[0] is for znode-above-root */
76395 +       struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76396 +
76397 +       /* The transaction's list of clean captured nodes. */
76398 +       struct list_head clean_nodes;
76399 +
76400 +       /* The atom's overwrite set */
76401 +       struct list_head ovrwr_nodes;
76402 +
76403 +       /* nodes which are being written to disk */
76404 +       struct list_head writeback_nodes;
76405 +
76406 +       /* list of inodes */
76407 +       struct list_head inodes;
76408 +
76409 +       /* List of handles associated with this atom. */
76410 +       struct list_head txnh_list;
76411 +
76412 +       /* Transaction list link: list of atoms in the transaction manager. */
76413 +       struct list_head atom_link;
76414 +
76415 +       /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76416 +       struct list_head fwaitfor_list;
76417 +
76418 +       /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76419 +       struct list_head fwaiting_list;
76420 +
76421 +       /* Numbers of objects which were deleted/created in this transaction
76422 +          thereby numbers of objects IDs which were released/deallocated. */
76423 +       int nr_objects_deleted;
76424 +       int nr_objects_created;
76425 +       /* number of blocks allocated during the transaction */
76426 +       __u64 nr_blocks_allocated;
76427 +       /* All atom's flush queue objects are on this list  */
76428 +       struct list_head flush_queues;
76429 +#if REISER4_DEBUG
76430 +       /* number of flush queues for this atom. */
76431 +       int nr_flush_queues;
76432 +       /* Number of jnodes which were removed from atom's lists and put
76433 +          on flush_queue */
76434 +       int num_queued;
76435 +#endif
76436 +       /* number of threads who wait for this atom to complete commit */
76437 +       int nr_waiters;
76438 +       /* number of threads which do jnode_flush() over this atom */
76439 +       int nr_flushers;
76440 +       /* number of flush queues which are IN_USE and jnodes from fq->prepped
76441 +          are submitted to disk by the write_fq() routine. */
76442 +       int nr_running_queues;
76443 +       /* A counter of grabbed unformatted nodes, see a description of the
76444 +        * reiser4 space reservation scheme at block_alloc.c */
76445 +       reiser4_block_nr flush_reserved;
76446 +#if REISER4_DEBUG
76447 +       void *committer;
76448 +#endif
76449 +       struct super_block *super;
76450 +};
76451 +
76452 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76453 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76454 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76455 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76456 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76457 +
76458 +#define NODE_LIST(node) (node)->list
76459 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76460 +ON_DEBUG(void
76461 +        count_jnode(txn_atom *, jnode *, atom_list old_list,
76462 +                    atom_list new_list, int check_lists));
76463 +
76464 +typedef struct protected_jnodes {
76465 +       struct list_head inatom; /* link to atom's list these structures */
76466 +       struct list_head nodes; /* head of list of protected nodes */
76467 +} protected_jnodes;
76468 +
76469 +/* A transaction handle: the client obtains and commits this handle which is assigned by
76470 +   the system to a txn_atom. */
76471 +struct txn_handle {
76472 +       /* Spinlock protecting ->atom pointer */
76473 +       spinlock_t hlock;
76474 +
76475 +       /* Flags for controlling commit_txnh() behavior */
76476 +       /* from txn_handle_flags_t */
76477 +       txn_handle_flags_t flags;
76478 +
76479 +       /* Whether it is READ_FUSING or WRITE_FUSING. */
76480 +       txn_mode mode;
76481 +
76482 +       /* If assigned, the atom it is part of. */
76483 +       txn_atom *atom;
76484 +
76485 +       /* Transaction list link. Head is in txn_atom. */
76486 +       struct list_head txnh_link;
76487 +};
76488 +
76489 +/* The transaction manager: one is contained in the reiser4_super_info_data */
76490 +struct txn_mgr {
76491 +       /* A spinlock protecting the atom list, id_count, flush_control (NOTE(review): no flush_control field is visible in this struct) */
76492 +       spinlock_t tmgr_lock;
76493 +
76494 +       /* List of atoms. */
76495 +       struct list_head atoms_list;
76496 +
76497 +       /* Number of atoms. */
76498 +       int atom_count;
76499 +
76500 +       /* A counter used to assign atom->atom_id values. */
76501 +       __u32 id_count;
76502 +
76503 +       /* A semaphore object for commit serialization. */
76504 +       struct semaphore commit_semaphore;
76505 +
76506 +       /* Link in the list of all txnmgrs served by a particular daemon. */
76507 +       struct list_head linkage;
76508 +
76509 +       /* Description of the daemon for this txnmgr. */
76510 +       ktxnmgrd_context *daemon;
76511 +
76512 +       /* Parameters.  Adjustable through mount options. */
76513 +       unsigned int atom_max_size;
76514 +       unsigned int atom_max_age;
76515 +       unsigned int atom_min_size;
76516 +       /* Max number of concurrent flushers for one atom, 0 - unlimited. */
76517 +       unsigned int atom_max_flushers;
76518 +       struct dentry *debugfs_atom_count; /* presumably the debugfs entry for atom_count -- confirm */
76519 +       struct dentry *debugfs_id_count; /* presumably the debugfs entry for id_count -- confirm */
76520 +};
76521 +
76522 +/* FUNCTION DECLARATIONS */
76523 +
76524 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
76525 +   are prefixed with "txn_".  For comments, see txnmgr.c. */
76526 +
76527 +extern int init_txnmgr_static(void);
76528 +extern void done_txnmgr_static(void);
76529 +
76530 +extern void init_txnmgr(txn_mgr *);
76531 +extern void done_txnmgr(txn_mgr *);
76532 +
76533 +extern int txn_reserve(int reserved);
76534 +
76535 +extern void txn_begin(reiser4_context * context);
76536 +extern int txn_end(reiser4_context * context);
76537 +
76538 +extern void txn_restart(reiser4_context * context);
76539 +extern void txn_restart_current(void);
76540 +
76541 +extern int txnmgr_force_commit_all(struct super_block *, int);
76542 +extern int current_atom_should_commit(void);
76543 +
76544 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
76545 +
76546 +extern int commit_some_atoms(txn_mgr *);
76547 +extern int force_commit_atom(txn_handle *);
76548 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76549 +
76550 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76551 +
76552 +extern void atom_set_stage(txn_atom * atom, txn_stage stage);
76553 +
76554 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76555 +                          int alloc_value);
76556 +extern void atom_dec_and_unlock(txn_atom * atom);
76557 +
76558 +extern int try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76559 +extern int try_capture_page_to_invalidate(struct page *pg);
76560 +
76561 +extern void uncapture_page(struct page *pg);
76562 +extern void uncapture_block(jnode *);
76563 +extern void uncapture_jnode(jnode *);
76564 +
76565 +extern int capture_inode(struct inode *);
76566 +extern int uncapture_inode(struct inode *);
76567 +
76568 +extern txn_atom *get_current_atom_locked_nocheck(void);
76569 +
76570 +#if REISER4_DEBUG
76571 +
76572 +/**
76573 + * atom_is_protected - check that the caller has exclusive access to an atom
76574 + * @atom: atom to be checked
76575 + *
76576 + * Always returns 1.  Unless the atom has reached ASTAGE_PRE_COMMIT or a later
76577 + * stage, assert_spin_locked() is applied to its spinlock first.
76578 + */
76579 +static inline int atom_is_protected(txn_atom *atom)
76580 +{
76581 +       /* only atoms that have not reached a commit stage need the lock */
76582 +       if (atom->stage < ASTAGE_PRE_COMMIT)
76583 +               assert_spin_locked(&(atom->alock));
76584 +       return 1;
76585 +}
76586 +
76587 +#endif
76588 +
76589 +/* Fetch the current atom via get_current_atom_locked_nocheck(); it must not be NULL. */
76590 +static inline txn_atom *get_current_atom_locked(void)
76591 +{
76592 +       txn_atom *atom = get_current_atom_locked_nocheck();
76593 +
76594 +       /* unlike the _nocheck variant, a missing current atom is an error */
76595 +       assert("zam-761", atom != NULL);
76596 +
76597 +       return atom;
76598 +}
76599 +
76600 +extern txn_atom *jnode_get_atom(jnode *);
76601 +
76602 +extern void atom_wait_event(txn_atom *);
76603 +extern void atom_send_event(txn_atom *);
76604 +
76605 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76606 +extern int capture_super_block(struct super_block *s);
76607 +int capture_bulk(jnode **, int count);
76608 +
76609 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
76610 +   calling convention of these three routines. */
76611 +extern void blocknr_set_init(blocknr_set * bset);
76612 +extern void blocknr_set_destroy(blocknr_set * bset);
76613 +extern void blocknr_set_merge(blocknr_set * from, blocknr_set * into);
76614 +extern int blocknr_set_add_extent(txn_atom * atom,
76615 +                                 blocknr_set * bset,
76616 +                                 blocknr_set_entry ** new_bsep,
76617 +                                 const reiser4_block_nr * start,
76618 +                                 const reiser4_block_nr * len);
76619 +extern int blocknr_set_add_pair(txn_atom * atom, blocknr_set * bset,
76620 +                               blocknr_set_entry ** new_bsep,
76621 +                               const reiser4_block_nr * a,
76622 +                               const reiser4_block_nr * b);
76623 +
76624 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76625 +                                   const reiser4_block_nr *, void *);
76626 +
76627 +extern int blocknr_set_iterator(txn_atom * atom, blocknr_set * bset,
76628 +                               blocknr_set_actor_f actor, void *data,
76629 +                               int delete);
76630 +
76631 +/* flush code takes care about how to fuse flush queues */
76632 +extern void flush_init_atom(txn_atom * atom);
76633 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76634 +
76635 +static inline void spin_lock_atom(txn_atom *atom)
76636 +{
76637 +       /* lock ordering: no txnh, jnode, zlock, dk or tree lock may be held here */
76638 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76639 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
76640 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
76641 +                   LOCK_CNT_NIL(rw_locked_dk) &&
76642 +                   LOCK_CNT_NIL(rw_locked_tree)));
76643 +
76644 +       spin_lock(&(atom->alock));
76645 +       /* record the acquisition; spin_unlock_atom() undoes both increments */
76646 +       LOCK_CNT_INC(spin_locked_atom);
76647 +       LOCK_CNT_INC(spin_locked);
76648 +}
76649 +
76650 +static inline int spin_trylock_atom(txn_atom *atom)
76651 +{
76652 +       /* returns 1 with the atom lock held and accounted for, 0 otherwise */
76653 +       if (!spin_trylock(&(atom->alock)))
76654 +               return 0;
76655 +       LOCK_CNT_INC(spin_locked_atom);
76656 +       LOCK_CNT_INC(spin_locked);
76657 +       return 1;
76658 +}
76659 +
76660 +static inline void spin_unlock_atom(txn_atom *atom)
76661 +{
76662 +       assert_spin_locked(&(atom->alock));
76663 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76664 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76665 +       /* undo the accounting done when the lock was taken, then release it */
76666 +       LOCK_CNT_DEC(spin_locked_atom);
76667 +       LOCK_CNT_DEC(spin_locked);
76668 +
76669 +       spin_unlock(&(atom->alock));
76670 +}
76671 +
76672 +static inline void spin_lock_txnh(txn_handle *txnh)
76673 +{
76674 +       /* lock ordering: no dk, zlock or tree lock may be held here */
76675 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76676 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
76677 +                   LOCK_CNT_NIL(rw_locked_tree)));
76678 +
76679 +       spin_lock(&(txnh->hlock));
76680 +       /* record the acquisition; spin_unlock_txnh() undoes both increments */
76681 +       LOCK_CNT_INC(spin_locked_txnh);
76682 +       LOCK_CNT_INC(spin_locked);
76683 +}
76684 +
76685 +static inline int spin_trylock_txnh(txn_handle *txnh)
76686 +{
76687 +       /* returns 1 with the handle lock held and accounted for, 0 otherwise */
76688 +       if (!spin_trylock(&(txnh->hlock)))
76689 +               return 0;
76690 +       LOCK_CNT_INC(spin_locked_txnh);
76691 +       LOCK_CNT_INC(spin_locked);
76692 +       return 1;
76693 +}
76694 +
76695 +static inline void spin_unlock_txnh(txn_handle *txnh)
76696 +{
76697 +       assert_spin_locked(&(txnh->hlock));
76698 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76699 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76700 +       /* NOTE(review): the two labels above are the same ones spin_unlock_atom() uses */
76701 +       LOCK_CNT_DEC(spin_locked_txnh);
76702 +       LOCK_CNT_DEC(spin_locked);
76703 +
76704 +       spin_unlock(&(txnh->hlock));
76705 +}
76706 +/* Same lock-ordering check that spin_lock_txnmgr() asserts; the tmgr argument is unused. */
76707 +#define spin_ordering_pred_txnmgr(tmgr)                \
76708 +       ( LOCK_CNT_NIL(spin_locked_atom) &&     \
76709 +         LOCK_CNT_NIL(spin_locked_txnh) &&     \
76710 +         LOCK_CNT_NIL(spin_locked_jnode) &&    \
76711 +         LOCK_CNT_NIL(spin_locked_zlock) &&    \
76712 +         LOCK_CNT_NIL(rw_locked_dk) &&         \
76713 +         LOCK_CNT_NIL(rw_locked_tree) )
76714 +
76715 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
76716 +{
76717 +       /* lock ordering: no atom, txnh, jnode, zlock, dk or tree lock may be held here */
76718 +       assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76719 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
76720 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
76721 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
76722 +                   LOCK_CNT_NIL(rw_locked_dk) &&
76723 +                   LOCK_CNT_NIL(rw_locked_tree)));
76724 +
76725 +       spin_lock(&(mgr->tmgr_lock));
76726 +       /* record the acquisition; spin_unlock_txnmgr() undoes both increments */
76727 +       LOCK_CNT_INC(spin_locked_txnmgr);
76728 +       LOCK_CNT_INC(spin_locked);
76729 +}
76730 +
76731 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76732 +{
76733 +       /* returns 1 with the manager lock held and accounted for, 0 otherwise */
76734 +       if (!spin_trylock(&(mgr->tmgr_lock)))
76735 +               return 0;
76736 +       LOCK_CNT_INC(spin_locked_txnmgr);
76737 +       LOCK_CNT_INC(spin_locked);
76738 +       return 1;
76739 +}
76740 +
76741 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76742 +{
76743 +       assert_spin_locked(&(mgr->tmgr_lock));
76744 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76745 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76746 +       /* NOTE(review): the two labels above are the same ones spin_unlock_atom() uses */
76747 +       LOCK_CNT_DEC(spin_locked_txnmgr);
76748 +       LOCK_CNT_DEC(spin_locked);
76749 +
76750 +       spin_unlock(&(mgr->tmgr_lock));
76751 +}
76752 +
76753 +typedef enum {
76754 +       FQ_IN_USE = 0x1
76755 +} flush_queue_state_t;
76756 +
76757 +typedef struct flush_queue flush_queue_t;
76758 +
76759 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76760 +   is filled by the jnode_flush() routine, and written to disk under memory
76761 +   pressure or at atom commit time. */
76762 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
76763 +   field and fq->prepped list can be modified if atom is spin-locked and fq
76764 +   object is "in-use" state.  For read-only traversal of the fq->prepped list
76765 +   and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
76766 +   only have atom spin-locked. */
76767 +struct flush_queue {
76768 +       /* linkage element is the first in this structure to make debugging
76769 +          easier.  See field in atom struct for description of list. */
76770 +       struct list_head alink;
76771 +       /* A spinlock to protect changes of fq state and fq->atom pointer */
76772 +       spinlock_t guard;
76773 +       /* flush_queue state: [in_use | ready] */
76774 +       flush_queue_state_t state;
76775 +       /* A list which contains queued nodes, queued nodes are removed from any
76776 +        * atom's list and put on this ->prepped one. */
76777 +       struct list_head prepped;
76778 +       /* number of submitted i/o requests */
76779 +       atomic_t nr_submitted;
76780 +       /* number of i/o errors */
76781 +       atomic_t nr_errors;
76782 +       /* An atom this flush queue is attached to */
76783 +       txn_atom *atom;
76784 +       /* A semaphore for waiting on i/o completion */
76785 +       struct semaphore io_sem;
76786 +#if REISER4_DEBUG
76787 +       /* A thread which took this fq in exclusive use, NULL if fq is free,
76788 +        * used for debugging. */
76789 +       struct task_struct *owner;
76790 +#endif
76791 +};
76792 +
76793 +extern int fq_by_atom(txn_atom *, flush_queue_t **);
76794 +extern void fq_put_nolock(flush_queue_t *);
76795 +extern void fq_put(flush_queue_t *);
76796 +extern void fuse_fq(txn_atom * to, txn_atom * from);
76797 +extern void queue_jnode(flush_queue_t *, jnode *);
76798 +
76799 +extern int write_fq(flush_queue_t *, long *, int);
76800 +extern int current_atom_finish_all_fq(void);
76801 +extern void init_atom_fq_parts(txn_atom *);
76802 +
76803 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76804 +
76805 +extern void znode_make_dirty(znode * node);
76806 +extern void jnode_make_dirty_locked(jnode * node);
76807 +
76808 +extern int sync_atom(txn_atom * atom);
76809 +
76810 +#if REISER4_DEBUG
76811 +extern int atom_fq_parts_are_clean(txn_atom *);
76812 +#endif
76813 +
76814 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
76815 +extern flush_queue_t *get_fq_for_current_atom(void);
76816 +
76817 +void protected_jnodes_init(protected_jnodes * list);
76818 +void protected_jnodes_done(protected_jnodes * list);
76819 +void invalidate_list(struct list_head * head);
76820 +
76821 +#if REISER4_DEBUG
76822 +void info_atom(const char *prefix, const txn_atom * atom);
76823 +#else
76824 +#define info_atom(p,a) noop
76825 +#endif
76826 +
76827 +# endif                                /* __REISER4_TXNMGR_H__ */
76828 +
76829 +/* Make Linus happy.
76830 +   Local variables:
76831 +   c-indentation-style: "K&R"
76832 +   mode-name: "LC"
76833 +   c-basic-offset: 8
76834 +   tab-width: 8
76835 +   fill-column: 120
76836 +   End:
76837 +*/
76838 diff --git a/fs/reiser4/type_safe_hash.h b/fs/reiser4/type_safe_hash.h
76839 new file mode 100644
76840 index 0000000..b2fdacd
76841 --- /dev/null
76842 +++ b/fs/reiser4/type_safe_hash.h
76843 @@ -0,0 +1,320 @@
76844 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76845 + * reiser4/README */
76846 +
76847 +/* A hash table class that uses hash chains (singly-linked) and is
76848 +   parametrized to provide type safety.  */
76849 +
76850 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
76851 +#define __REISER4_TYPE_SAFE_HASH_H__
76852 +
76853 +#include "debug.h"
76854 +
76855 +#include <asm/errno.h>
76856 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
76857 +   based on the object type.  You need to declare the item type before
76858 +   this definition, define it after this definition. */
76859 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE)                                                     \
76860 +                                                                                              \
76861 +typedef struct PREFIX##_hash_table_  PREFIX##_hash_table;                                     \
76862 +typedef struct PREFIX##_hash_link_   PREFIX##_hash_link;                                      \
76863 +                                                                                              \
76864 +struct PREFIX##_hash_table_                                                                   \
76865 +{                                                                                             \
76866 +  ITEM_TYPE  **_table;                                                                        \
76867 +  __u32        _buckets;                                                                      \
76868 +};                                                                                            \
76869 +                                                                                              \
76870 +struct PREFIX##_hash_link_                                                                    \
76871 +{                                                                                             \
76872 +  ITEM_TYPE *_next;                                                                           \
76873 +}
76874 +
76875 +/* Step 2: Define the object type of the hash: give it field of type
76876 +   PREFIX_hash_link. */
76877 +
76878 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
76879 +   the type and field name used in step 2.  The arguments are:
76880 +
76881 +   ITEM_TYPE    The item type being hashed
76882 +   KEY_TYPE     The type of key being hashed
76883 +   KEY_NAME     The name of the key field within the item
76884 +   LINK_NAME    The name of the link field within the item (which you must make type PREFIX_hash_link)
76885 +   HASH_FUNC    The name of the hash function (or macro, takes const pointer to key)
76886 +   EQ_FUNC      The name of the equality function (or macro, takes const pointer to two keys)
76887 +
76888 +   It implements these functions:
76889 +
76890 +   prefix_hash_init           Initialize the table given its size.
76891 +   prefix_hash_insert         Insert an item
76892 +   prefix_hash_insert_index   Insert an item w/ precomputed hash_index
76893 +   prefix_hash_find           Find an item by key
76894 +   prefix_hash_find_index     Find an item w/ precomputed hash_index
76895 +   prefix_hash_remove         Remove an item, returns 1 if found, 0 if not found
76896 +   prefix_hash_remove_index   Remove an item w/ precomputed hash_index
76897 +
76898 +   If you'd like something to be done differently, feel free to ask me
76899 +   for modifications.  Additional features that could be added but
76900 +   have not been:
76901 +
76902 +   prefix_hash_remove_key           Find and remove an item by key
76903 +   prefix_hash_remove_key_index     Find and remove an item by key w/ precomputed hash_index
76904 +
76905 +   The hash_function currently receives only the key as an argument,
76906 +   meaning it must somehow know the number of buckets.  If this is a
76907 +   problem let me know.
76908 +
76909 +   This hash table uses a single-linked hash chain.  This means
76910 +   insertion is fast but deletion requires searching the chain.
76911 +
76912 +   There is also the doubly-linked hash chain approach, under which
76913 +   deletion requires no search but the code is longer and it takes two
76914 +   pointers per item.
76915 +
76916 +   The circularly-linked approach has the shortest code but requires
76917 +   two pointers per bucket, doubling the size of the bucket array (in
76918 +   addition to two pointers per item).
76919 +*/
76920 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC)  \
76921 +                                                                                       \
76922 +static __inline__ void                                                                 \
76923 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG,                            \
76924 +                    __u32                hash UNUSED_ARG)                              \
76925 +{                                                                                      \
76926 +       assert("nikita-2780", hash < table->_buckets);                                  \
76927 +}                                                                                      \
76928 +                                                                                       \
76929 +static __inline__ int                                                                  \
76930 +PREFIX##_hash_init (PREFIX##_hash_table *hash,                                         \
76931 +                   __u32                buckets)                                       \
76932 +{                                                                                      \
76933 +  hash->_table   = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets);              \
76934 +  hash->_buckets = buckets;                                                            \
76935 +  if (hash->_table == NULL)                                                            \
76936 +    {                                                                                  \
76937 +      return RETERR(-ENOMEM);                                                          \
76938 +    }                                                                                  \
76939 +  memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets);                             \
76940 +  ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets));                      \
76941 +  return 0;                                                                            \
76942 +}                                                                                      \
76943 +                                                                                       \
76944 +static __inline__ void                                                                 \
76945 +PREFIX##_hash_done (PREFIX##_hash_table *hash)                                         \
76946 +{                                                                                      \
76947 +  if (REISER4_DEBUG && hash->_table != NULL) {                                          \
76948 +           __u32 i;                                                                    \
76949 +           for (i = 0 ; i < hash->_buckets ; ++ i)                                     \
76950 +                   assert("nikita-2905", hash->_table[i] == NULL);                     \
76951 +  }                                                                                     \
76952 +  if (hash->_table != NULL)                                                            \
76953 +    KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets);                                \
76954 +  hash->_table = NULL;                                                                 \
76955 +}                                                                                      \
76956 +                                                                                       \
76957 +static __inline__ void                                                                 \
76958 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item)                                          \
76959 +{                                                                                      \
76960 +       prefetch(item->LINK_NAME._next);                                                \
76961 +}                                                                                      \
76962 +                                                                                       \
76963 +static __inline__ void                                                                 \
76964 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash,                              \
76965 +                              __u32                index)                              \
76966 +{                                                                                      \
76967 +       prefetch(hash->_table[index]);                                                  \
76968 +}                                                                                      \
76969 +                                                                                       \
76970 +static __inline__ ITEM_TYPE*                                                           \
76971 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash,                                   \
76972 +                         __u32                hash_index,                              \
76973 +                         KEY_TYPE const      *find_key)                                \
76974 +{                                                                                      \
76975 +  ITEM_TYPE *item;                                                                     \
76976 +                                                                                       \
76977 +  PREFIX##_check_hash(hash, hash_index);                                               \
76978 +                                                                                       \
76979 +  for (item  = hash->_table[hash_index];                                               \
76980 +       item != NULL;                                                                   \
76981 +       item  = item->LINK_NAME._next)                                                  \
76982 +    {                                                                                  \
76983 +      prefetch(item->LINK_NAME._next);                                                 \
76984 +      prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME));                 \
76985 +      if (EQ_FUNC (& item->KEY_NAME, find_key))                                                \
76986 +        {                                                                              \
76987 +          return item;                                                                 \
76988 +        }                                                                              \
76989 +    }                                                                                  \
76990 +                                                                                       \
76991 +  return NULL;                                                                         \
76992 +}                                                                                      \
76993 +                                                                                       \
76994 +static __inline__ ITEM_TYPE*                                                           \
76995 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash,                               \
76996 +                             __u32                hash_index,                          \
76997 +                             KEY_TYPE const      *find_key)                            \
76998 +{                                                                                      \
76999 +  ITEM_TYPE ** item = &hash->_table[hash_index];                                        \
77000 +                                                                                       \
77001 +  PREFIX##_check_hash(hash, hash_index);                                               \
77002 +                                                                                        \
77003 +  while (*item != NULL) {                                                               \
77004 +    prefetch(&(*item)->LINK_NAME._next);                                               \
77005 +    if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) {                                       \
77006 +      ITEM_TYPE *found;                                                                \
77007 +                                                                                       \
77008 +      found = *item;                                                                   \
77009 +      *item = found->LINK_NAME._next;                                                   \
77010 +      found->LINK_NAME._next = hash->_table[hash_index];                               \
77011 +      hash->_table[hash_index] = found;                                                        \
77012 +      return found;                                                                     \
77013 +    }                                                                                   \
77014 +    item = &(*item)->LINK_NAME._next;                                                   \
77015 +  }                                                                                    \
77016 +  return NULL;                                                                         \
77017 +}                                                                                      \
77018 +                                                                                       \
77019 +static __inline__ int                                                                  \
77020 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash,                                 \
77021 +                           __u32                hash_index,                            \
77022 +                           ITEM_TYPE           *del_item)                              \
77023 +{                                                                                      \
77024 +  ITEM_TYPE ** hash_item_p = &hash->_table[hash_index];                                 \
77025 +                                                                                       \
77026 +  PREFIX##_check_hash(hash, hash_index);                                               \
77027 +                                                                                        \
77028 +  while (*hash_item_p != NULL) {                                                        \
77029 +    prefetch(&(*hash_item_p)->LINK_NAME._next);                                                \
77030 +    if (*hash_item_p == del_item) {                                                     \
77031 +      *hash_item_p = (*hash_item_p)->LINK_NAME._next;                                   \
77032 +      return 1;                                                                         \
77033 +    }                                                                                   \
77034 +    hash_item_p = &(*hash_item_p)->LINK_NAME._next;                                     \
77035 +  }                                                                                    \
77036 +  return 0;                                                                            \
77037 +}                                                                                      \
77038 +                                                                                       \
77039 +static __inline__ void                                                                 \
77040 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash,                                 \
77041 +                           __u32                hash_index,                            \
77042 +                           ITEM_TYPE           *ins_item)                              \
77043 +{                                                                                      \
77044 +  PREFIX##_check_hash(hash, hash_index);                                               \
77045 +                                                                                       \
77046 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
77047 +  hash->_table[hash_index]  = ins_item;                                                        \
77048 +}                                                                                      \
77049 +                                                                                       \
77050 +static __inline__ void                                                                 \
77051 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash,                             \
77052 +                               __u32                hash_index,                        \
77053 +                               ITEM_TYPE           *ins_item)                          \
77054 +{                                                                                      \
77055 +  PREFIX##_check_hash(hash, hash_index);                                               \
77056 +                                                                                       \
77057 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
77058 +  smp_wmb();                                                                           \
77059 +  hash->_table[hash_index]  = ins_item;                                                        \
77060 +}                                                                                      \
77061 +                                                                                       \
77062 +static __inline__ ITEM_TYPE*                                                           \
77063 +PREFIX##_hash_find (PREFIX##_hash_table *hash,                                         \
77064 +                   KEY_TYPE const      *find_key)                                      \
77065 +{                                                                                      \
77066 +  return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key);         \
77067 +}                                                                                      \
77068 +                                                                                       \
77069 +static __inline__ ITEM_TYPE*                                                           \
77070 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash,                                     \
77071 +                       KEY_TYPE const      *find_key)                                  \
77072 +{                                                                                      \
77073 +  return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key);     \
77074 +}                                                                                      \
77075 +                                                                                       \
77076 +static __inline__ int                                                                  \
77077 +PREFIX##_hash_remove (PREFIX##_hash_table *hash,                                       \
77078 +                     ITEM_TYPE           *del_item)                                    \
77079 +{                                                                                      \
77080 +  return PREFIX##_hash_remove_index (hash,                                             \
77081 +                                     HASH_FUNC(hash, &del_item->KEY_NAME), del_item);  \
77082 +}                                                                                      \
77083 +                                                                                       \
77084 +static __inline__ int                                                                  \
77085 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash,                                   \
77086 +                     ITEM_TYPE           *del_item)                                    \
77087 +{                                                                                      \
77088 +  return PREFIX##_hash_remove (hash, del_item);                                                \
77089 +}                                                                                      \
77090 +                                                                                       \
77091 +static __inline__ void                                                                 \
77092 +PREFIX##_hash_insert (PREFIX##_hash_table *hash,                                       \
77093 +                     ITEM_TYPE           *ins_item)                                    \
77094 +{                                                                                      \
77095 +  return PREFIX##_hash_insert_index (hash,                                             \
77096 +                                     HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item);  \
77097 +}                                                                                      \
77098 +                                                                                       \
77099 +static __inline__ void                                                                 \
77100 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash,                                   \
77101 +                         ITEM_TYPE           *ins_item)                                \
77102 +{                                                                                      \
77103 +  return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME),           \
77104 +                                         ins_item);                                    \
77105 +}                                                                                      \
77106 +                                                                                       \
77107 +static __inline__ ITEM_TYPE *                                                          \
77108 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind)                             \
77109 +{                                                                                      \
77110 +  ITEM_TYPE *first;                                                                    \
77111 +                                                                                       \
77112 +  for (first = NULL; ind < hash->_buckets; ++ ind) {                                   \
77113 +    first = hash->_table[ind];                                                         \
77114 +    if (first != NULL)                                                                 \
77115 +      break;                                                                           \
77116 +  }                                                                                    \
77117 +  return first;                                                                                \
77118 +}                                                                                      \
77119 +                                                                                       \
77120 +static __inline__ ITEM_TYPE *                                                          \
77121 +PREFIX##_hash_next (PREFIX##_hash_table *hash,                                         \
77122 +                   ITEM_TYPE           *item)                                          \
77123 +{                                                                                      \
77124 +  ITEM_TYPE  *next;                                                                    \
77125 +                                                                                       \
77126 +  if (item == NULL)                                                                    \
77127 +    return NULL;                                                                       \
77128 +  next = item->LINK_NAME._next;                                                                \
77129 +  if (next == NULL)                                                                    \
77130 +    next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1);           \
77131 +  return next;                                                                         \
77132 +}                                                                                      \
77133 +                                                                                       \
77134 +typedef struct {} PREFIX##_hash_dummy
77135 +
77136 +#define for_all_ht_buckets(table, head)                                        \
77137 +for ((head) = &(table) -> _table[ 0 ] ;                                        \
77138 +     (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
77139 +
77140 +#define for_all_in_bucket(bucket, item, next, field)                           \
77141 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ;      \
77142 +     (item) != NULL ;                                                          \
77143 +     (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
77144 +
77145 +#define for_all_in_htable(table, prefix, item, next)   \
77146 +for ((item) = prefix ## _hash_first ((table), 0),      \
77147 +     (next) = prefix ## _hash_next ((table), (item)) ; \
77148 +     (item) != NULL ;                                  \
77149 +     (item) = (next),                                  \
77150 +     (next) = prefix ## _hash_next ((table), (item)))
77151 +
77152 +/* __REISER4_TYPE_SAFE_HASH_H__ */
77153 +#endif
77154 +
77155 +/* Make Linus happy.
77156 +   Local variables:
77157 +   c-indentation-style: "K&R"
77158 +   mode-name: "LC"
77159 +   c-basic-offset: 8
77160 +   tab-width: 8
77161 +   fill-column: 120
77162 +   End:
77163 +*/
77164 diff --git a/fs/reiser4/vfs_ops.c b/fs/reiser4/vfs_ops.c
77165 new file mode 100644
77166 index 0000000..8324e07
77167 --- /dev/null
77168 +++ b/fs/reiser4/vfs_ops.c
77169 @@ -0,0 +1,267 @@
77170 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77171 + * reiser4/README */
77172 +
77173 +/* Interface to VFS. Reiser4 dentry_operations and helper functions
77174 +   (stat-data and nlink updates, writeout, error handling) are defined here. */
77175 +
77176 +#include "forward.h"
77177 +#include "debug.h"
77178 +#include "dformat.h"
77179 +#include "coord.h"
77180 +#include "plugin/item/item.h"
77181 +#include "plugin/file/file.h"
77182 +#include "plugin/security/perm.h"
77183 +#include "plugin/disk_format/disk_format.h"
77184 +#include "plugin/plugin.h"
77185 +#include "plugin/plugin_set.h"
77186 +#include "plugin/object.h"
77187 +#include "txnmgr.h"
77188 +#include "jnode.h"
77189 +#include "znode.h"
77190 +#include "block_alloc.h"
77191 +#include "tree.h"
77192 +#include "vfs_ops.h"
77193 +#include "inode.h"
77194 +#include "page_cache.h"
77195 +#include "ktxnmgrd.h"
77196 +#include "super.h"
77197 +#include "reiser4.h"
77198 +#include "entd.h"
77199 +#include "status_flags.h"
77200 +#include "flush.h"
77201 +#include "dscale.h"
77202 +
77203 +#include <linux/profile.h>
77204 +#include <linux/types.h>
77205 +#include <linux/mount.h>
77206 +#include <linux/vfs.h>
77207 +#include <linux/mm.h>
77208 +#include <linux/buffer_head.h>
77209 +#include <linux/dcache.h>
77210 +#include <linux/list.h>
77211 +#include <linux/pagemap.h>
77212 +#include <linux/slab.h>
77213 +#include <linux/seq_file.h>
77214 +#include <linux/init.h>
77215 +#include <linux/module.h>
77216 +#include <linux/writeback.h>
77217 +#include <linux/blkdev.h>
77218 +#include <linux/quotaops.h>
77219 +#include <linux/security.h>
77220 +#include <linux/reboot.h>
77221 +#include <linux/rcupdate.h>
77222 +
77223 +
77224 +/* update inode stat-data by calling plugin */
77225 +int reiser4_update_sd(struct inode *object)
77226 +{
77227 +       file_plugin *fplug;
77228 +
77229 +       assert("nikita-2338", object != NULL);
77230 +       /* check for read-only file system. */
77231 +       if (IS_RDONLY(object))
77232 +               return 0;
77233 +
77234 +       fplug = inode_file_plugin(object);
77235 +       assert("nikita-2339", fplug != NULL);
77236 +       return fplug->write_sd_by_inode(object);
77237 +}
77238 +
77239 +/* helper function: increase inode nlink count and call plugin method to save
77240 +   updated stat-data.
77241 +
77242 +   Used by link/create and during creation of dot and dotdot in mkdir
77243 +*/
77244 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
77245 +                     struct inode *parent /* parent where new entry will be */
77246 +                     ,
77247 +                     int write_sd_p    /* true if stat-data has to be
77248 +                                        * updated */ )
77249 +{
77250 +       file_plugin *fplug;
77251 +       int result;
77252 +
77253 +       assert("nikita-1351", object != NULL);
77254 +
77255 +       fplug = inode_file_plugin(object);
77256 +       assert("nikita-1445", fplug != NULL);
77257 +
77258 +       /* ask plugin whether it can add yet another link to this
77259 +          object */
77260 +       if (!fplug->can_add_link(object))
77261 +               return RETERR(-EMLINK);
77262 +
77263 +       assert("nikita-2211", fplug->add_link != NULL);
77264 +       /* call plugin to do actual addition of link */
77265 +       result = fplug->add_link(object, parent);
77266 +
77267 +       /* optionally update stat data */
77268 +       if (result == 0 && write_sd_p)
77269 +               result = fplug->write_sd_by_inode(object);
77270 +       return result;
77271 +}
77272 +
77273 +/* helper function: decrease inode nlink count and call plugin method to save
77274 +   updated stat-data.
77275 +
77276 +   Used by unlink/create
77277 +*/
77278 +int reiser4_del_nlink(struct inode *object     /* object from which link is
77279 +                                                * removed */ ,
77280 +                     struct inode *parent /* parent where entry was */ ,
77281 +                     int write_sd_p    /* true if stat-data has to be
77282 +                                        * updated */ )
77283 +{
77284 +       file_plugin *fplug;
77285 +       int result;
77286 +
77287 +       assert("nikita-1349", object != NULL);
77288 +
77289 +       fplug = inode_file_plugin(object);
77290 +       assert("nikita-1350", fplug != NULL);
77291 +       assert("nikita-1446", object->i_nlink > 0);
77292 +       assert("nikita-2210", fplug->rem_link != NULL);
77293 +
77294 +       /* call plugin to do actual deletion of link */
77295 +       result = fplug->rem_link(object, parent);
77296 +
77297 +       /* optionally update stat data */
77298 +       if (result == 0 && write_sd_p)
77299 +               result = fplug->write_sd_by_inode(object);
77300 +       return result;
77301 +}
77302 +
77303 +
77304 +
77305 +
77306 +/* Release reiser4 dentry. This is d_op->d_release() method. */
77307 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
77308 +{
77309 +       reiser4_free_dentry_fsdata(dentry);
77310 +}
77311 +
77312 +/*
77313 + * Called by reiser4_sync_inodes(), during speculative write-back (through
77314 + * pdflush, or balance_dirty_pages()).
77315 + */
77316 +void writeout(struct super_block *sb, struct writeback_control *wbc)
77317 +{
77318 +       long written = 0;
77319 +       int repeats = 0;
77320 +       int result;
77321 +       struct address_space *mapping;
77322 +
77323 +       /*
77324 +        * Performs early flushing, trying to free some memory. If there is
77325 +        * nothing to flush, commits some atoms.
77326 +        */
77327 +
77328 +       /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
77329 +          sys_fsync(). */
77330 +       if (wbc->sync_mode != WB_SYNC_NONE) {
77331 +               txnmgr_force_commit_all(sb, 0);
77332 +               return;
77333 +       }
77334 +
77335 +       BUG_ON(get_super_fake(sb) == NULL);
77336 +       mapping = get_super_fake(sb)->i_mapping;
77337 +       do {
77338 +               long nr_submitted = 0;
77339 +               jnode *node = NULL;
77340 +
77341 +               /* do not submit more requests if the write queue is congested */
77342 +               if (wbc->nonblocking &&
77343 +                   bdi_write_congested(mapping->backing_dev_info)) {
77344 +                       blk_run_address_space(mapping);
77345 +                       wbc->encountered_congestion = 1;
77346 +                       break;
77347 +               }
77348 +               repeats++;
77349 +               BUG_ON(wbc->nr_to_write <= 0);
77350 +
77351 +               if (get_current_context()->entd) {
77352 +                       entd_context *ent = get_entd_context(sb);
77353 +
77354 +                       if (ent->cur_request->node)
77355 +                               /*
77356 +                                * this is ent thread and it managed to capture
77357 +                                * requested page itself - start flush from
77358 +                                * that page
77359 +                                */
77360 +                               node = jref(ent->cur_request->node);
77361 +               }
77362 +
77363 +               result = flush_some_atom(node, &nr_submitted, wbc,
77364 +                                        JNODE_FLUSH_WRITE_BLOCKS);
77365 +               if (result != 0)
77366 +                       warning("nikita-31001", "Flush failed: %i", result);
77367 +               if (node)
77368 +                       jput(node);
77369 +               if (!nr_submitted)
77370 +                       break;
77371 +
77372 +               wbc->nr_to_write -= nr_submitted;
77373 +               written += nr_submitted;
77374 +       } while (wbc->nr_to_write > 0);
77375 +}
77376 +
77377 +
77378 +void reiser4_throttle_write(struct inode *inode)
77379 +{
77380 +       txn_restart_current();
77381 +       balance_dirty_pages_ratelimited(inode->i_mapping);
77382 +}
77383 +
77384 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77385 +const int REISER4_MAGIC_OFFSET = 16 * 4096;    /* offset to magic string from the
77386 +                                                * beginning of device */
77387 +
77388 +
77389 +
77390 +/*
77391 + * Reiser4 initialization/shutdown.
77392 + *
77393 + * Code below performs global reiser4 initialization that is done either as
77394 + * part of kernel initialization (when reiser4 is statically built-in), or
77395 + * during reiser4 module load (when compiled as module).
77396 + */
77397 +
77398 +
77399 +void reiser4_handle_error(void)
77400 +{
77401 +       struct super_block *sb = reiser4_get_current_sb();
77402 +
77403 +       if (!sb)
77404 +               return;
77405 +       reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77406 +                            "Filesystem error occured");
77407 +       switch (get_super_private(sb)->onerror) {
77408 +       case 0:
77409 +               reiser4_panic("foobar-42", "Filesystem error occured\n");
77410 +       case 1:
77411 +       default:
77412 +               if (sb->s_flags & MS_RDONLY)
77413 +                       return;
77414 +               sb->s_flags |= MS_RDONLY;
77415 +               break;
77416 +       }
77417 +}
77418 +
77419 +struct dentry_operations reiser4_dentry_operations = {
77420 +       .d_revalidate = NULL,
77421 +       .d_hash = NULL,
77422 +       .d_compare = NULL,
77423 +       .d_delete = NULL,
77424 +       .d_release = reiser4_d_release,
77425 +       .d_iput = NULL,
77426 +};
77427 +
77428 +/* Make Linus happy.
77429 +   Local variables:
77430 +   c-indentation-style: "K&R"
77431 +   mode-name: "LC"
77432 +   c-basic-offset: 8
77433 +   tab-width: 8
77434 +   fill-column: 120
77435 +   End:
77436 +*/
77437 diff --git a/fs/reiser4/vfs_ops.h b/fs/reiser4/vfs_ops.h
77438 new file mode 100644
77439 index 0000000..8908a1a
77440 --- /dev/null
77441 +++ b/fs/reiser4/vfs_ops.h
77442 @@ -0,0 +1,58 @@
77443 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77444 + * reiser4/README */
77445 +
77446 +/* vfs_ops.c's exported symbols */
77447 +
77448 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
77449 +#define __FS_REISER4_VFS_OPS_H__
77450 +
77451 +#include "forward.h"
77452 +#include "coord.h"
77453 +#include "seal.h"
77454 +#include "plugin/file/file.h"
77455 +#include "super.h"
77456 +#include "readahead.h"
77457 +
77458 +#include <linux/types.h>       /* for loff_t */
77459 +#include <linux/fs.h>          /* for struct address_space */
77460 +#include <linux/dcache.h>      /* for struct dentry */
77461 +#include <linux/mm.h>
77462 +#include <linux/backing-dev.h>
77463 +
77464 +/* address space operations */
77465 +int reiser4_writepage(struct page *, struct writeback_control *);
77466 +int reiser4_set_page_dirty(struct page *);
77467 +int reiser4_readpages(struct file *, struct address_space *,
77468 +                     struct list_head *pages, unsigned nr_pages);
77469 +void reiser4_invalidatepage(struct page *, unsigned long offset);
77470 +int reiser4_releasepage(struct page *, gfp_t);
77471 +
77472 +extern int reiser4_update_sd(struct inode *);
77473 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77474 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77475 +
77476 +
77477 +extern int reiser4_start_up_io(struct page *page);
77478 +extern void reiser4_throttle_write(struct inode *);
77479 +extern int jnode_is_releasable(jnode *);
77480 +
77481 +#define CAPTURE_APAGE_BURST (1024l)
77482 +void writeout(struct super_block *, struct writeback_control *);
77483 +
77484 +
77485 +extern void reiser4_handle_error(void);
77486 +
77487 +
77488 +/* __FS_REISER4_VFS_OPS_H__ */
77489 +#endif
77490 +
77491 +/* Make Linus happy.
77492 +   Local variables:
77493 +   c-indentation-style: "K&R"
77494 +   mode-name: "LC"
77495 +   c-basic-offset: 8
77496 +   tab-width: 8
77497 +   fill-column: 120
77498 +   scroll-step: 1
77499 +   End:
77500 +*/
77501 diff --git a/fs/reiser4/wander.c b/fs/reiser4/wander.c
77502 new file mode 100644
77503 index 0000000..80c6bb2
77504 --- /dev/null
77505 +++ b/fs/reiser4/wander.c
77506 @@ -0,0 +1,1799 @@
77507 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77508 + * reiser4/README */
77509 +
77510 +/* Reiser4 Wandering Log */
77511 +
77512 +/* You should read http://www.namesys.com/txn-doc.html
77513 +
77514 +   That describes how filesystem operations are performed as atomic
77515 +   transactions, and how we try to arrange it so that we can write most of the
77516 +   data only once while performing the operation atomically.
77517 +
77518 +   For the purposes of this code, it is enough for it to understand that it
77519 +   has been told a given block should be written either once, or twice (if
77520 +   twice then once to the wandered location and once to the real location).
77521 +
77522 +   This code guarantees that those blocks that are defined to be part of an
77523 +   atom either all take effect or none of them take effect.
77524 +
77525 +   Relocate set nodes are submitted to write by the jnode_flush() routine, and
77526 +   the overwrite set is submitted by reiser4_write_log().  This is because with
77527 +   the overwrite set we seek to optimize writes, and with the relocate set we
77528 +   seek to cause disk order to correlate with the parent first pre-order.
77529 +
77530 +   reiser4_write_log() allocates and writes wandered blocks and maintains
77531 +   additional on-disk structures of the atom as wander records (each wander
77532 +   record occupies one block) for storing of the "wandered map" (a table which
77533 +   contains a relation between wandered and real block numbers) and other
77534 +   information which might be needed at transaction recovery time.
77535 +
77536 +   The wander records are unidirectionally linked into a circle: each wander
77537 +   record contains a block number of the next wander record, the last wander
77538 +   record points to the first one.
77539 +
77540 +   One wander record (named "tx head" in this file) has a format which is
77541 +   different from the other wander records. The "tx head" has a reference to the
77542 +   "tx head" block of the previously committed atom.  Also, "tx head" contains
77543 +   fs information (the free blocks counter, and the oid allocator state) which
77544 +   is logged in a special way.
77545 +
77546 +   There are two journal control blocks, named journal header and journal
77547 +   footer which have fixed on-disk locations.  The journal header has a
77548 +   reference to the "tx head" block of the last committed atom.  The journal
77549 +   footer points to the "tx head" of the last flushed atom.  The atom is
77550 +   "played" when all blocks from its overwrite set are written to disk the
77551 +   second time (i.e. written to their real locations).
77552 +
77553 +   NOTE: People who know reiserfs internals and its journal structure might be
77554 +   confused with these terms journal footer and journal header. There is a table
77555 +   with terms of similar semantics in reiserfs (reiser3) and reiser4:
77556 +
77557 +   REISER3 TERM        |  REISER4 TERM         | DESCRIPTION
77558 +   --------------------+-----------------------+----------------------------
77559 +   commit record       |  journal header       | atomic write of this record
77560 +                       |                       | ends transaction commit
77561 +   --------------------+-----------------------+----------------------------
77562 +   journal header      |  journal footer       | atomic write of this record
77563 +                       |                       | ends post-commit writes.
77564 +                       |                       | After successful
77565 +                       |                       | writing of this journal
77566 +                       |                       | blocks (in reiser3) or
77567 +                       |                       | wandered blocks/records are
77568 +                       |                       | free for re-use.
77569 +   --------------------+-----------------------+----------------------------
77570 +
77571 +   The atom commit process is the following:
77572 +
77573 +   1. The overwrite set is taken from atom's clean list, and its size is
77574 +      counted.
77575 +
77576 +   2. The number of necessary wander records (including tx head) is calculated,
77577 +      and the wander record blocks are allocated.
77578 +
77579 +   3. Allocate wandered blocks and populate wander records by wandered map.
77580 +
77581 +   4. submit write requests for wander records and wandered blocks.
77582 +
77583 +   5. wait until submitted write requests complete.
77584 +
77585 +   6. update journal header: change the pointer to the block number of just
77586 +   written tx head, submit an i/o for modified journal header block and wait
77587 +   for i/o completion.
77588 +
77589 +   NOTE: The special logging for bitmap blocks and some reiser4 super block
77590 +   fields makes processes of atom commit, flush and recovering a bit more
77591 +   complex (see comments in the source code for details).
77592 +
77593 +   The atom playing process is the following:
77594 +
77595 +   1. Write atom's overwrite set in-place.
77596 +
77597 +   2. Wait on i/o.
77598 +
77599 +   3. Update journal footer: change the pointer to block number of tx head
77600 +   block of the atom we are currently flushing, submit an i/o, wait on i/o
77601 +   completion.
77602 +
77603 +   4. Free disk space which was used for wandered blocks and wander records.
77604 +
77605 +   After the freeing of wandered blocks and wander records we have that journal
77606 +   footer points to the on-disk structure which might be overwritten soon.
77607 +   Neither the log writer nor the journal recovery procedure use that pointer
77608 +   for accessing the data.  When the journal recovery procedure finds the oldest
77609 +   transaction it compares the journal footer pointer value with the "prev_tx"
77610 +   pointer value in tx head, if values are equal the oldest not flushed
77611 +   transaction is found.
77612 +
77613 +   NOTE on disk space leakage: the information about which blocks and how many
77614 +   blocks are allocated for wandered blocks and wander records is not written to
77615 +   the disk because of the special logging for bitmaps and some super block
77616 +   counters.  After a system crash reiser4 does not remember the allocation of
77617 +   those objects, thus we have no disk space leakage of that kind.
77618 +*/
77619 +
77620 +/* Special logging of reiser4 super block fields. */
77621 +
77622 +/* There are some reiser4 super block fields (free block count and OID allocator
77623 +   state (number of files and next free OID)) which are logged separately from
77624 +   super block to avoid unnecessary atom fusion.
77625 +
77626 +   So, the reiser4 super block need not be captured by a transaction which
77627 +   allocates/deallocates disk blocks or creates/deletes file objects.  Moreover,
77628 +   the reiser4 on-disk super block is not touched when such a transaction is
77629 +   committed and flushed.  Those "counters logged specially" are logged in "tx
77630 +   head" blocks and in the journal footer block.
77631 +
77632 +   A step-by-step description of special logging:
77633 +
77634 +   0. The per-atom information about deleted or created files and allocated or
77635 +   freed blocks is collected during the transaction.  The atom's
77636 +   ->nr_objects_created and ->nr_objects_deleted are for object
77637 +   deletion/creation tracking, the numbers of allocated and freed blocks are
77638 +   calculated using atom's delete set and atom's capture list -- all new and
77639 +   relocated nodes should be on atom's clean list and should have JNODE_RELOC
77640 +   bit set.
77641 +
77642 +   1. The "logged specially" reiser4 super block fields have their "committed"
77643 +   versions in the reiser4 in-memory super block.  They get modified only at
77644 +   atom commit time.  The atom's commit thread has an exclusive access to those
77645 +   "committed" fields because the log writer implementation supports only one
77646 +   atom commit at a time (there is a per-fs "commit" semaphore).  At
77647 +   that time "committed" counters are modified using per-atom information
77648 +   collected during the transaction. These counters are stored on disk as a
77649 +   part of tx head block when atom is committed.
77650 +
77651 +   2. When the atom is flushed the value of the free block counter and the OID
77652 +   allocator state get written to the journal footer block.  A special journal
77653 +   procedure (journal_recover_sb_data()) takes those values from the journal
77654 +   footer and updates the reiser4 in-memory super block.
77655 +
77656 +   NOTE: That means free block count and OID allocator state are logged
77657 +   separately from the reiser4 super block regardless of the fact that the
77658 +   reiser4 super block has fields to store both the free block counter and the
77659 +   OID allocator.
77660 +
77661 +   Writing the whole super block at commit time requires knowing true values of
77662 +   all its fields without changes made by not yet committed transactions. It is
77663 +   possible by having their "committed" version of the super block like the
77664 +   reiser4 bitmap blocks have "committed" and "working" versions.  However,
77665 +   another scheme was implemented which stores special logged values in the
77666 +   unused free space inside transaction head block.  In my opinion it has an
77667 +   advantage of not writing whole super block when only part of it was
77668 +   modified. */
77669 +
77670 +#include "debug.h"
77671 +#include "dformat.h"
77672 +#include "txnmgr.h"
77673 +#include "jnode.h"
77674 +#include "znode.h"
77675 +#include "block_alloc.h"
77676 +#include "page_cache.h"
77677 +#include "wander.h"
77678 +#include "reiser4.h"
77679 +#include "super.h"
77680 +#include "vfs_ops.h"
77681 +#include "writeout.h"
77682 +#include "inode.h"
77683 +#include "entd.h"
77684 +
77685 +#include <linux/types.h>
77686 +#include <linux/fs.h>          /* for struct super_block  */
77687 +#include <linux/mm.h>          /* for struct page */
77688 +#include <linux/pagemap.h>
77689 +#include <linux/bio.h>         /* for struct bio */
77690 +#include <linux/blkdev.h>
77691 +
77692 +static int write_jnodes_to_disk_extent(
77693 +       jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77694 +
77695 +/* The commit_handle is a container for objects needed at atom commit time  */
77696 +struct commit_handle {
77697 +       /* A pointer to atom's list of OVRWR nodes */
77698 +       struct list_head *overwrite_set;
77699 +       /* atom's overwrite set size */
77700 +       int overwrite_set_size;
77701 +       /* jnodes for wander record blocks */
77702 +       struct list_head tx_list;
77703 +       /* number of wander records */
77704 +       __u32 tx_size;
77705 +       /* 'committed' sb counters are saved here until atom is completely
77706 +          flushed  */
77707 +       __u64 free_blocks;
77708 +       __u64 nr_files;
77709 +       __u64 next_oid;
77710 +       /* A pointer to the atom which is being committed */
77711 +       txn_atom *atom;
77712 +       /* A pointer to current super block */
77713 +       struct super_block *super;
77714 +       /* The counter of modified bitmaps */
77715 +       reiser4_block_nr nr_bitmap;
77716 +};
77717 +
77718 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77719 +{
77720 +       memset(ch, 0, sizeof(struct commit_handle));
77721 +       INIT_LIST_HEAD(&ch->tx_list);
77722 +
77723 +       ch->atom = atom;
77724 +       ch->super = reiser4_get_current_sb();
77725 +}
77726 +
77727 +static void done_commit_handle(struct commit_handle *ch)
77728 +{
77729 +       assert("zam-690", list_empty(&ch->tx_list));
77730 +}
77731 +
77732 +static inline int reiser4_use_write_barrier(struct super_block * s)
77733 +{
77734 +       return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77735 +}
77736 +
77737 +static void disable_write_barrier(struct super_block * s)
77738 +{
77739 +       notice("zam-1055", "%s does not support write barriers,"
77740 +              " using synchronous write instead.", s->s_id);
77741 +       set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77742 +}
77743 +
77744 +
77745 +/* fill journal header block data  */
77746 +static void format_journal_header(struct commit_handle *ch)
77747 +{
77748 +       struct reiser4_super_info_data *sbinfo;
77749 +       struct journal_header *header;
77750 +       jnode *txhead;
77751 +
77752 +       sbinfo = get_super_private(ch->super);
77753 +       assert("zam-479", sbinfo != NULL);
77754 +       assert("zam-480", sbinfo->journal_header != NULL);
77755 +
77756 +       txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77757 +
77758 +       jload(sbinfo->journal_header);
77759 +
77760 +       header = (struct journal_header *)jdata(sbinfo->journal_header);
77761 +       assert("zam-484", header != NULL);
77762 +
77763 +       put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77764 +                     &header->last_committed_tx);
77765 +
77766 +       jrelse(sbinfo->journal_header);
77767 +}
77768 +
77769 +/* fill journal footer block data (last flushed tx head + sb counters) */
77770 +static void format_journal_footer(struct commit_handle *ch)
77771 +{
77772 +       struct reiser4_super_info_data *sbinfo;
77773 +       struct journal_footer *footer;
77774 +       jnode *tx_head;
77775 +
77776 +       sbinfo = get_super_private(ch->super);
77777 +
77778 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77779 +
77780 +       assert("zam-493", sbinfo != NULL);
77781 +       assert("zam-494", sbinfo->journal_footer != NULL);
77782 +
77783 +       check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77784 +
77785 +       footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77786 +       assert("zam-495", footer != NULL);
77787 +       /* every field below is stored as a little-endian 64-bit value */
77788 +       put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77789 +                     &footer->last_flushed_tx);
77790 +       put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77791 +
77792 +       put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77793 +       put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77794 +
77795 +       jrelse(sbinfo->journal_footer);
77796 +}
77797 +
77798 +/* wander record capacity depends on current block size */
77799 +static int wander_record_capacity(const struct super_block *super)
77800 +{
77801 +       return (super->s_blocksize -
77802 +               sizeof(struct wander_record_header)) /
77803 +           sizeof(struct wander_entry);
77804 +}
77805 +
77806 +/* Fill first wander record (tx head) in accordance with the supplied data */
77807 +static void format_tx_head(struct commit_handle *ch)
77808 +{
77809 +       jnode *tx_head;
77810 +       jnode *next;
77811 +       struct tx_header *header;
77812 +
77813 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77814 +       assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77815 +
77816 +       next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77817 +       if (&ch->tx_list == &next->capture_link)
77818 +               next = tx_head;
77819 +
77820 +       header = (struct tx_header *)jdata(tx_head);
77821 +
77822 +       assert("zam-460", header != NULL);
77823 +       assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77824 +
77825 +       memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77826 +       memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77827 +
77828 +       put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77829 +       put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77830 +                     &header->prev_tx);
77831 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77832 +       put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
77833 +       put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
77834 +       put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
77835 +}
77836 +
77837 +/* prepare ordinary wander record block (fill all service fields) */
77838 +static void
77839 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
77840 +{
77841 +       struct wander_record_header *LRH;
77842 +       jnode *next;
77843 +
77844 +       assert("zam-464", node != NULL);
77845 +
77846 +       LRH = (struct wander_record_header *)jdata(node);
77847 +       next = list_entry(node->capture_link.next, jnode, capture_link);
77848 +
77849 +       if (&ch->tx_list == &next->capture_link)
77850 +               next = list_entry(ch->tx_list.next, jnode, capture_link);
77851 +
77852 +       assert("zam-465", LRH != NULL);
77853 +       assert("zam-463",
77854 +              ch->super->s_blocksize > sizeof(struct wander_record_header));
77855 +
77856 +       memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
77857 +       memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
77858 +
77859 +       put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
77860 +       put_unaligned(cpu_to_le32(serial), &LRH->serial);
77861 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
77862 +}
77863 +
77864 +/* add one wandered map entry to formatted wander record */
77865 +static void
77866 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
77867 +           const reiser4_block_nr * b)
77868 +{
77869 +       char *data;
77870 +       struct wander_entry *pairs;
77871 +
77872 +       data = jdata(node);
77873 +       assert("zam-451", data != NULL);
77874 +
77875 +       pairs =
77876 +           (struct wander_entry *)(data + sizeof(struct wander_record_header));
77877 +
77878 +       put_unaligned(cpu_to_le64(*a), &pairs[index].original);
77879 +       put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
77880 +}
77881 +
77882 +/* currently, wander records contain only the wandered map, whose size depends
77883 +   on the overwrite set size */
77884 +static void get_tx_size(struct commit_handle *ch)
77885 +{
77886 +       assert("zam-440", ch->overwrite_set_size != 0);
77887 +       assert("zam-695", ch->tx_size == 0);
77888 +
77889 +       /* count all ordinary wander records
77890 +          (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
77891 +          for tx head block */
77892 +       ch->tx_size =
77893 +           (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
77894 +           2;
77895 +}
77896 +
77897 +/* A special structure for using in store_wmap_actor() for saving its state
77898 +   between calls */
77899 +struct store_wmap_params {
77900 +       jnode *cur;             /* jnode of current wander record to fill */
77901 +       int idx;                /* free element index in wander record  */
77902 +       int capacity;           /* capacity  */
77903 +
77904 +#if REISER4_DEBUG
77905 +       struct list_head *tx_list;
77906 +#endif
77907 +};
77908 +
77909 +/* an actor for use in blocknr_set_iterator routine which populates the list
77910 +   of pre-formatted wander records by wandered map info */
77911 +static int
77912 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
77913 +                const reiser4_block_nr * b, void *data)
77914 +{
77915 +       struct store_wmap_params *params = data;
77916 +
77917 +       if (params->idx >= params->capacity) {
77918 +               /* a new wander record should be taken from the tx_list */
77919 +               params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
77920 +               assert("zam-454",
77921 +                      params->tx_list != &params->cur->capture_link);
77922 +
77923 +               params->idx = 0;
77924 +       }
77925 +
77926 +       store_entry(params->cur, params->idx, a, b);
77927 +       params->idx++;
77928 +
77929 +       return 0;
77930 +}
77931 +
77932 +/* This function is called after the Relocate set gets written to disk, the
77933 +   Overwrite set is written to wandered locations and all wander records are
77934 +   written also. The updated journal header block contains a pointer (block
77935 +   number) to the first wander record of the just written transaction */
77936 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
77937 +{
77938 +       struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77939 +       jnode *jh = sbinfo->journal_header;
77940 +       jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
77941 +       int ret;
77942 +
77943 +       format_journal_header(ch);
77944 +
77945 +       ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
77946 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
77947 +       if (ret)
77948 +               return ret;
77949 +
77950 +       // blk_run_address_space(sbinfo->fake->i_mapping);
77951 +       /*blk_run_queues(); */
77952 +
77953 +       ret = jwait_io(jh, WRITE);
77954 +
77955 +       if (ret)
77956 +               return ret;
77957 +
77958 +       sbinfo->last_committed_tx = *jnode_get_block(head);
77959 +
77960 +       return 0;
77961 +}
77962 +
77963 +/* This function is called after write-back is finished. We update journal
77964 +   footer block and free blocks which were occupied by wandered blocks and
77965 +   transaction wander records */
77966 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
77967 +{
77968 +       reiser4_super_info_data *sbinfo = get_super_private(ch->super);
77969 +
77970 +       jnode *jf = sbinfo->journal_footer;
77971 +
77972 +       int ret;
77973 +
77974 +       format_journal_footer(ch);
77975 +
77976 +       ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
77977 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
77978 +       if (ret)
77979 +               return ret;
77980 +
77981 +       // blk_run_address_space(sbinfo->fake->i_mapping);
77982 +       /*blk_run_queue(); */
77983 +
77984 +       ret = jwait_io(jf, WRITE);
77985 +       if (ret)
77986 +               return ret;
77987 +
77988 +       return 0;
77989 +}
77990 +
77991 +/* free block numbers of wander records of already written in place transaction */
77992 +static void dealloc_tx_list(struct commit_handle *ch)
77993 +{
77994 +       while (!list_empty(&ch->tx_list)) {
77995 +               jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
77996 +               list_del(&cur->capture_link);
77997 +               ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
77998 +               reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
77999 +                                     BA_FORMATTED);
78000 +
78001 +               unpin_jnode_data(cur);
78002 +               drop_io_head(cur);
78003 +       }
78004 +}
78005 +
78006 +/* An actor for use in block_nr_iterator() routine which frees wandered blocks
78007 +   from atom's overwrite set. */
78008 +static int
78009 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
78010 +                  const reiser4_block_nr * a UNUSED_ARG,
78011 +                  const reiser4_block_nr * b, void *data UNUSED_ARG)
78012 +{
78013 +
78014 +       assert("zam-499", b != NULL);
78015 +       assert("zam-500", *b != 0);
78016 +       assert("zam-501", !blocknr_is_fake(b));
78017 +
78018 +       reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
78019 +       return 0;
78020 +}
78021 +
78022 +/* free wandered block locations of already written in place transaction */
78023 +static void dealloc_wmap(struct commit_handle *ch)
78024 +{
78025 +       assert("zam-696", ch->atom != NULL);
78026 +
78027 +       blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
78028 +                            dealloc_wmap_actor, NULL, 1);
78029 +}
78030 +
78031 +/* helper function for allocating wandered blocks, which refills the set of
78032 +   block numbers needed for wandered blocks  */
78033 +static int
78034 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
78035 +{
78036 +       reiser4_blocknr_hint hint;
78037 +       int ret;
78038 +
78039 +       reiser4_block_nr wide_len = count;
78040 +
78041 +       /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
78042 +          ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
78043 +          reserved allocation area so as to get the best qualities of fixed
78044 +          journals? */
78045 +       blocknr_hint_init(&hint);
78046 +       hint.block_stage = BLOCK_GRABBED;
78047 +
78048 +       ret = reiser4_alloc_blocks(&hint, start, &wide_len,
78049 +                                  BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
78050 +       *len = (int)wide_len;
78051 +
78052 +       return ret;
78053 +}
78054 +
78055 +/*
78056 + * roll back changes made before issuing BIO in the case of IO error.
78057 + */
78058 +static void undo_bio(struct bio *bio)
78059 +{
78060 +       int i;
78061 +
78062 +       for (i = 0; i < bio->bi_vcnt; ++i) {
78063 +               struct page *pg;
78064 +               jnode *node;
78065 +
78066 +               pg = bio->bi_io_vec[i].bv_page;
78067 +               ClearPageWriteback(pg);
78068 +               node = jprivate(pg);
78069 +               spin_lock_jnode(node);
78070 +               JF_CLR(node, JNODE_WRITEBACK);
78071 +               JF_SET(node, JNODE_DIRTY);
78072 +               spin_unlock_jnode(node);
78073 +       }
78074 +       bio_put(bio);
78075 +}
78076 +
78077 +/* put overwrite set back to atom's clean list */
78078 +static void put_overwrite_set(struct commit_handle *ch)
78079 +{
78080 +       jnode *cur;
78081 +
78082 +       list_for_each_entry(cur, ch->overwrite_set, capture_link)
78083 +               jrelse_tail(cur);
78084 +}
78085 +
78086 +/* Count overwrite set size, grab disk space for wandered blocks allocation.
78087 +   Since we have a separate list for atom's overwrite set we just scan the list
78088 +   and count bitmap and other non-leaf nodes, for whose wandered blocks we
78089 +   have to grab space. */
78090 +static int get_overwrite_set(struct commit_handle *ch)
78091 +{
78092 +       int ret;
78093 +       jnode *cur;
78094 +       __u64 nr_not_leaves = 0;
78095 +#if REISER4_DEBUG
78096 +       __u64 nr_formatted_leaves = 0;
78097 +       __u64 nr_unformatted_leaves = 0;
78098 +#endif
78099 +
78100 +       assert("zam-697", ch->overwrite_set_size == 0);
78101 +
78102 +       ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
78103 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78104 +
78105 +       while (ch->overwrite_set != &cur->capture_link) {
78106 +               jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
78107 +
78108 +               /* Count bitmap locks for getting correct statistics what number
78109 +                * of blocks were cleared by the transaction commit. */
78110 +               if (jnode_get_type(cur) == JNODE_BITMAP)
78111 +                       ch->nr_bitmap++;
78112 +
78113 +               assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
78114 +                      || jnode_get_type(cur) == JNODE_BITMAP);
78115 +
78116 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
78117 +                       /* we replace fake znode by another (real)
78118 +                          znode which is suggested by disk_layout
78119 +                          plugin */
78120 +
78121 +                       /* FIXME: it looks like fake znode should be
78122 +                          replaced by jnode supplied by
78123 +                          disk_layout. */
78124 +
78125 +                       struct super_block *s = reiser4_get_current_sb();
78126 +                       reiser4_super_info_data *sbinfo =
78127 +                           get_current_super_private();
78128 +
78129 +                       if (sbinfo->df_plug->log_super) {
78130 +                               jnode *sj = sbinfo->df_plug->log_super(s);
78131 +
78132 +                               assert("zam-593", sj != NULL);
78133 +
78134 +                               if (IS_ERR(sj))
78135 +                                       return PTR_ERR(sj);
78136 +
78137 +                               spin_lock_jnode(sj);
78138 +                               JF_SET(sj, JNODE_OVRWR);
78139 +                               insert_into_atom_ovrwr_list(ch->atom, sj);
78140 +                               spin_unlock_jnode(sj);
78141 +
78142 +                               /* jload it as the rest of overwrite set */
78143 +                               jload_gfp(sj, get_gfp_mask(), 0);
78144 +
78145 +                               ch->overwrite_set_size++;
78146 +                       }
78147 +                       spin_lock_jnode(cur);
78148 +                       uncapture_block(cur);
78149 +                       jput(cur);
78150 +
78151 +               } else {
78152 +                       int ret;
78153 +                       ch->overwrite_set_size++;
78154 +                       ret = jload_gfp(cur, get_gfp_mask(), 0);
78155 +                       if (ret)
78156 +                               reiser4_panic("zam-783",
78157 +                                             "cannot load e-flushed jnode back (ret = %d)\n",
78158 +                                             ret);
78159 +               }
78160 +
78161 +               /* Count not leaves here because we have to grab disk space
78162 +                * for wandered blocks. They were not counted as "flush
78163 +                * reserved". Counting should be done _after_ nodes are pinned
78164 +                * into memory by jload(). */
78165 +               if (!jnode_is_leaf(cur))
78166 +                       nr_not_leaves++;
78167 +               else {
78168 +#if REISER4_DEBUG
78169 +                       /* at this point @cur either has JNODE_FLUSH_RESERVED
78170 +                        * or is eflushed. Locking is not strong enough to
78171 +                        * write an assertion checking for this. */
78172 +                       if (jnode_is_znode(cur))
78173 +                               nr_formatted_leaves++;
78174 +                       else
78175 +                               nr_unformatted_leaves++;
78176 +#endif
78177 +                       JF_CLR(cur, JNODE_FLUSH_RESERVED);
78178 +               }
78179 +
78180 +               cur = next;
78181 +       }
78182 +
78183 +       /* Grab space for writing (wandered blocks) of not leaves found in
78184 +        * overwrite set. */
78185 +       ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
78186 +       if (ret)
78187 +               return ret;
78188 +
78189 +       /* Disk space for allocation of wandered blocks of leaf nodes already
78190 +        * reserved as "flush reserved", move it to grabbed space counter. */
78191 +       spin_lock_atom(ch->atom);
78192 +       assert("zam-940",
78193 +              nr_formatted_leaves + nr_unformatted_leaves <=
78194 +              ch->atom->flush_reserved);
78195 +       flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
78196 +       spin_unlock_atom(ch->atom);
78197 +
78198 +       return ch->overwrite_set_size;
78199 +}
78200 +
78201 +/**
78202 + * write_jnodes_to_disk_extent - submit write request
78203 + * @first: first jnode of the list
78204 + * @nr: number of jnodes on the list
78205 + * @block_p: points to the number of the first disk block of the target region
78206 + * @fq: flush queue passed to add_fq_to_bio() for every submitted bio
78207 + * @flags: WRITEOUT_BARRIER selects WRITE_BARRIER instead of WRITE as the bio
78208 + *         operation; no other bit is examined here
78209 + *
78210 + * Submits write requests for @nr jnodes beginning from @first; the other
78211 + * jnodes follow @first on the doubly-linked "capture" list.  All jnodes are
78212 + * written to the disk region of @nr blocks starting at block number *@block_p.
78213 + * If @fq is not NULL it means that waiting for i/o completion will be done
78214 + * more efficiently by using flush_queue_t objects.  This function does the
78215 + * low-level work: bio construction and page state manipulation.  Returns 0,
78216 + * -ENOMEM if no bio can be allocated, -EOPNOTSUPP if a bio got BIO_EOPNOTSUPP.
78217 + *
78218 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
78219 + * aggregated in this function instead of being left to the layers below
78220 + *
78221 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
78222 + * Why that layer needed? Why BIOs cannot be constructed here?
78223 + */
78224 +static int write_jnodes_to_disk_extent(
78225 +       jnode *first, int nr, const reiser4_block_nr *block_p,
78226 +       flush_queue_t *fq, int flags)
78227 +{
78228 +       struct super_block *super = reiser4_get_current_sb();
78229 +       int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
78230 +       int max_blocks;
78231 +       jnode *cur = first;
78232 +       reiser4_block_nr block;
78233 +
78234 +       assert("zam-571", first != NULL);
78235 +       assert("zam-572", block_p != NULL);
78236 +       assert("zam-570", nr > 0);
78237 +
78238 +       block = *block_p;
78239 +       max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES); /* cap on pages per bio */
78240 +
78241 +       while (nr > 0) { /* one bio per iteration */
78242 +               struct bio *bio;
78243 +               int nr_blocks = min(nr, max_blocks);
78244 +               int i;
78245 +               int nr_used; /* pages actually added to this bio */
78246 +
78247 +               bio = bio_alloc(GFP_NOIO, nr_blocks);
78248 +               if (!bio)
78249 +                       return RETERR(-ENOMEM);
78250 +
78251 +               bio->bi_bdev = super->s_bdev;
78252 +               bio->bi_sector = block * (super->s_blocksize >> 9); /* block number -> 512-byte sectors */
78253 +               for (nr_used = 0, i = 0; i < nr_blocks; i++) {
78254 +                       struct page *pg;
78255 +
78256 +                       pg = jnode_page(cur);
78257 +                       assert("zam-573", pg != NULL);
78258 +
78259 +                       page_cache_get(pg);
78260 +
78261 +                       lock_and_wait_page_writeback(pg);
78262 +
78263 +                       if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
78264 +                               /*
78265 +                                * underlying device is satiated. Stop adding
78266 +                                * pages to the bio and undo the page lock and reference.
78267 +                                */
78268 +                               unlock_page(pg);
78269 +                               page_cache_release(pg);
78270 +                               break;
78271 +                       }
78272 +
78273 +                       spin_lock_jnode(cur);
78274 +                       assert("nikita-3166",
78275 +                              pg->mapping == jnode_get_mapping(cur));
78276 +                       assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
78277 +#if REISER4_DEBUG
78278 +                       spin_lock(&cur->load);
78279 +                       assert("nikita-3165", !jnode_is_releasable(cur));
78280 +                       spin_unlock(&cur->load);
78281 +#endif
78282 +                       JF_SET(cur, JNODE_WRITEBACK);
78283 +                       JF_CLR(cur, JNODE_DIRTY);
78284 +                       ON_DEBUG(cur->written++);
78285 +                       spin_unlock_jnode(cur);
78286 +
78287 +                       ClearPageError(pg);
78288 +                       set_page_writeback(pg);
78289 +
78290 +                       if (get_current_context()->entd) {
78291 +                               /* this is ent thread */
78292 +                               entd_context *ent = get_entd_context(super);
78293 +                               struct wbq *rq, *next;
78294 +
78295 +                               spin_lock(&ent->guard);
78296 +
78297 +                               if (pg == ent->cur_request->page) {
78298 +                                       /*
78299 +                                        * entd is called for this page. This
78300 +                                        * request is not in the todo list
78301 +                                        */
78302 +                                       ent->cur_request->written = 1;
78303 +                               } else {
78304 +                                       /*
78305 +                                        * if we have written a page that a queued writepage
78306 +                                        * request is waiting for - move that request to the done list.
78307 +                                        */
78308 +                                       list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
78309 +                                               assert("", rq->magic == WBQ_MAGIC);
78310 +                                               if (pg == rq->page) {
78311 +                                                       /*
78312 +                                                        * remove request from
78313 +                                                        * entd's queue, but do
78314 +                                                        * not wake up a thread
78315 +                                                        * which put this
78316 +                                                        * request
78317 +                                                        */
78318 +                                                       list_del_init(&rq->link);
78319 +                                                       ent->nr_todo_reqs --;
78320 +                                                       list_add_tail(&rq->link, &ent->done_list);
78321 +                                                       ent->nr_done_reqs ++;
78322 +                                                       rq->written = 1;
78323 +                                                       break;
78324 +                                               }
78325 +                                       }
78326 +                               }
78327 +                               spin_unlock(&ent->guard);
78328 +                       }
78329 +
78330 +                       clear_page_dirty_for_io(pg);
78331 +
78332 +                       unlock_page(pg);
78333 +
78334 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
78335 +                       nr_used++;
78336 +               }
78337 +               if (nr_used > 0) {
78338 +                       assert("nikita-3453",
78339 +                              bio->bi_size == super->s_blocksize * nr_used);
78340 +                       assert("nikita-3454", bio->bi_vcnt == nr_used);
78341 +
78342 +                       /* Check if we are allowed to write at all */
78343 +                       if (super->s_flags & MS_RDONLY)
78344 +                               undo_bio(bio);
78345 +                       else {
78346 +                               int not_supported;
78347 +
78348 +                               add_fq_to_bio(fq, bio);
78349 +                               bio_get(bio); /* extra reference so the flag can be read after submission */
78350 +                               reiser4_submit_bio(write_op, bio);
78351 +                               not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
78352 +                               bio_put(bio);
78353 +                               if (not_supported)
78354 +                                       return -EOPNOTSUPP;
78355 +                       }
78356 +
78357 +                       block += nr_used - 1; /* last block covered by this bio */
78358 +                       update_blocknr_hint_default(super, &block);
78359 +                       block += 1; /* first block of the next bio */
78360 +               } else {
78361 +                       bio_put(bio);
78362 +               }
78363 +               nr -= nr_used; /* NOTE(review): no progress is made when nr_used == 0 - confirm bio_add_page() cannot fail on an empty bio */
78364 +       }
78365 +
78366 +       return 0;
78367 +}
78368 +
78369 +/* Scans the list @head for runs of j-nodes whose disk block numbers are
78370 +   consecutive and submits one write request per run. *@nr_submitted, if not NULL,
78371 +   grows by the length of each submitted run. Returns 0 or the first submit error. */
78372 +int
78373 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
78374 +                long *nr_submitted, int flags)
78375 +{
78376 +       int ret;
78377 +       jnode *beg = list_entry(head->next, jnode, capture_link);
78378 +
78379 +       while (head != &beg->capture_link) {
78380 +               int nr = 1; /* length of the run that starts at @beg */
78381 +               jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78382 +
78383 +               while (head != &cur->capture_link) {
78384 +                       if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78385 +                               break; /* @cur is not the next block on disk: the run ends here */
78386 +                       ++nr;
78387 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
78388 +               }
78389 +
78390 +               ret = write_jnodes_to_disk_extent(
78391 +                       beg, nr, jnode_get_block(beg), fq, flags);
78392 +               if (ret)
78393 +                       return ret;
78394 +
78395 +               if (nr_submitted)
78396 +                       *nr_submitted += nr;
78397 +
78398 +               beg = cur; /* first j-node after the run, or the list head */
78399 +       }
78400 +
78401 +       return 0;
78402 +}
78403 +
78404 +/* add @len (block of a jnode, wandered block) pairs to the current atom's wandered map, starting at @cur and *@block_p */
78405 +static int
78406 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78407 +{
78408 +       int ret;
78409 +       blocknr_set_entry *new_bsep = NULL;
78410 +       reiser4_block_nr block; /* wandered block paired with @cur */
78411 +
78412 +       txn_atom *atom;
78413 +
78414 +       assert("zam-568", block_p != NULL);
78415 +       block = *block_p;
78416 +       assert("zam-569", len > 0);
78417 +
78418 +       while ((len--) > 0) {
78419 +               do { /* retried for as long as blocknr_set_add_pair() answers -E_REPEAT */
78420 +                       atom = get_current_atom_locked();
78421 +                       assert("zam-536",
78422 +                              !blocknr_is_fake(jnode_get_block(cur)));
78423 +                       ret =
78424 +                           blocknr_set_add_pair(atom, &atom->wandered_map,
78425 +                                                &new_bsep,
78426 +                                                jnode_get_block(cur), &block);
78427 +               } while (ret == -E_REPEAT);
78428 +
78429 +               if (ret) {
78430 +                       /* deallocate blocks which were not added to wandered map.
78431 +                          NOTE(review): @len is already decremented here - is the current block counted? */
78432 +                       reiser4_block_nr wide_len = len;
78433 +
78434 +                       reiser4_dealloc_blocks(&block, &wide_len,
78435 +                                              BLOCK_NOT_COUNTED,
78436 +                                              BA_FORMATTED
78437 +                                              /* formatted, without defer */ );
78438 +
78439 +                       return ret;
78440 +               }
78441 +
78442 +               spin_unlock_atom(atom);
78443 +
78444 +               cur = list_entry(cur->capture_link.next, jnode, capture_link);
78445 +               ++block; /* next jnode on the list gets the next wandered block */
78446 +       }
78447 +
78448 +       return 0;
78449 +}
78450 +
78451 +/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
78452 +   submit IO for allocated blocks.  We assume that current atom is in a stage
78453 +   when any atom fusion is impossible and atom is unlocked and it is safe.
78454 +   Returns 0 or the first error from allocation, wandered map update or submission. */
78455 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78456 +{
78457 +       reiser4_block_nr block; /* first block of the region obtained last */
78458 +       int rest; /* overwrite set nodes that still need a wandered block */
78459 +       int len; /* length of the region obtained last */
78460 +       int ret;
78461 +
78462 +       jnode *cur;
78463 +
78464 +       assert("zam-534", ch->overwrite_set_size > 0);
78465 +
78466 +       rest = ch->overwrite_set_size;
78467 +
78468 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78469 +       while (ch->overwrite_set != &cur->capture_link) {
78470 +               assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78471 +
78472 +               ret = get_more_wandered_blocks(rest, &block, &len);
78473 +               if (ret)
78474 +                       return ret;
78475 +
78476 +               rest -= len;
78477 +
78478 +               ret = add_region_to_wmap(cur, len, &block);
78479 +               if (ret)
78480 +                       return ret;
78481 +
78482 +               ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78483 +               if (ret)
78484 +                       return ret;
78485 +
78486 +               while ((len--) > 0) { /* step @cur over the @len jnodes just submitted */
78487 +                       assert("zam-604",
78488 +                              ch->overwrite_set != &cur->capture_link);
78489 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
78490 +               }
78491 +       }
78492 +
78493 +       return 0;
78494 +}
78495 +
78496 +/* allocate ch->tx_size blocks with one jnode each on ch->tx_list, format them as a tx head
78497 +   followed by wander records, store the wandered map in them and submit them; returns 0 or an error */
78498 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78499 +{
78500 +       reiser4_blocknr_hint hint;
78501 +       reiser4_block_nr allocated = 0;
78502 +       reiser4_block_nr first, len;
78503 +       jnode *cur;
78504 +       jnode *txhead;
78505 +       int ret;
78506 +       reiser4_context *ctx;
78507 +       reiser4_super_info_data *sbinfo;
78508 +
78509 +       assert("zam-698", ch->tx_size > 0);
78510 +       assert("zam-699", list_empty_careful(&ch->tx_list));
78511 +
78512 +       ctx = get_current_context();
78513 +       sbinfo = get_super_private(ctx->super); /* NOTE(review): sbinfo is not used below */
78514 +
78515 +       while (allocated < (unsigned)ch->tx_size) {
78516 +               len = (ch->tx_size - allocated); /* ask for everything still missing */
78517 +
78518 +               blocknr_hint_init(&hint);
78519 +
78520 +               hint.block_stage = BLOCK_GRABBED;
78521 +
78522 +               /* FIXME: there should be some block allocation policy for
78523 +                  nodes which contain wander records */
78524 +
78525 +               /* We assume that disk space for wandered record blocks can be
78526 +                * taken from reserved area. */
78527 +               ret = reiser4_alloc_blocks(&hint, &first, &len,
78528 +                                          BA_FORMATTED | BA_RESERVED |
78529 +                                          BA_USE_DEFAULT_SEARCH_START);
78530 +               blocknr_hint_done(&hint);
78531 +
78532 +               if (ret)
78533 +                       return ret;
78534 +
78535 +               allocated += len; /* @len now counts the blocks obtained by this call */
78536 +
78537 +               /* create jnodes for all wander records */
78538 +               while (len--) {
78539 +                       cur = alloc_io_head(&first);
78540 +
78541 +                       if (cur == NULL) {
78542 +                               ret = RETERR(-ENOMEM);
78543 +                               goto free_not_assigned;
78544 +                       }
78545 +
78546 +                       ret = jinit_new(cur, get_gfp_mask());
78547 +
78548 +                       if (ret != 0) {
78549 +                               jfree(cur);
78550 +                               goto free_not_assigned;
78551 +                       }
78552 +
78553 +                       pin_jnode_data(cur);
78554 +
78555 +                       list_add_tail(&cur->capture_link, &ch->tx_list);
78556 +
78557 +                       first++;
78558 +               }
78559 +       }
78560 +
78561 +       { /* format a on-disk linked list of wander records */
78562 +               int serial = 1;
78563 +
78564 +               txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78565 +               format_tx_head(ch);
78566 +
78567 +               cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78568 +               while (&ch->tx_list != &cur->capture_link) {
78569 +                       format_wander_record(ch, cur, serial++);
78570 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
78571 +               }
78572 +       }
78573 +
78574 +       { /* Fill wander records with Wandered Set */
78575 +               struct store_wmap_params params;
78576 +               txn_atom *atom;
78577 +
78578 +               params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78579 +
78580 +               params.idx = 0;
78581 +               params.capacity =
78582 +                   wander_record_capacity(reiser4_get_current_sb());
78583 +
78584 +               atom = get_current_atom_locked();
78585 +               blocknr_set_iterator(atom, &atom->wandered_map,
78586 +                                    &store_wmap_actor, &params, 0);
78587 +               spin_unlock_atom(atom);
78588 +       }
78589 +
78590 +       { /* relse all jnodes from tx_list */
78591 +               cur = list_entry(ch->tx_list.next, jnode, capture_link);
78592 +               while (&ch->tx_list != &cur->capture_link) {
78593 +                       jrelse(cur);
78594 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
78595 +               }
78596 +       }
78597 +
78598 +       ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78599 +
78600 +       return ret;
78601 +
78602 +      free_not_assigned:
78603 +       /* We deallocate blocks not yet assigned to jnodes on tx_list; the caller invalidates the
78604 +          tx list. NOTE(review): @len was already decremented for the block that failed - confirm */
78605 +       reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78606 +
78607 +       return ret;
78608 +}
78609 +
78610 +static int commit_tx(struct commit_handle *ch)
78611 +{
78612 +       flush_queue_t *fq;
78613 +       int barrier;
78614 +       int ret;
78615 +       /* Writes overwrite set to wandered blocks, writes the tx records, then updates the journal header. */
78616 +       /* Grab more space for wandered records. */
78617 +       ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78618 +       if (ret)
78619 +               return ret;
78620 +
78621 +       fq = get_fq_for_current_atom();
78622 +       if (IS_ERR(fq))
78623 +               return PTR_ERR(fq);
78624 +
78625 +       spin_unlock_atom(fq->atom); /* presumably the fq is returned with its atom locked - confirm */
78626 +       do { /* do/while (0) only provides a common break target */
78627 +               ret = alloc_wandered_blocks(ch, fq);
78628 +               if (ret)
78629 +                       break;
78630 +               ret = alloc_tx(ch, fq);
78631 +               if (ret)
78632 +                       break;
78633 +       } while (0);
78634 +
78635 +       fq_put(fq);
78636 +       if (ret)
78637 +               return ret;
78638 + repeat_wo_barrier:
78639 +       barrier = reiser4_use_write_barrier(ch->super);
78640 +       if (!barrier) { /* no barrier: finish all flush queues before the header is updated */
78641 +               ret = current_atom_finish_all_fq();
78642 +               if (ret)
78643 +                       return ret;
78644 +       }
78645 +       ret = update_journal_header(ch, barrier);
78646 +       if (barrier) {
78647 +               if (ret) {
78648 +                       if (ret == -EOPNOTSUPP) { /* barrier write rejected: disable barriers and redo */
78649 +                               disable_write_barrier(ch->super);
78650 +                               goto repeat_wo_barrier;
78651 +                       }
78652 +                       return ret;
78653 +               }
78654 +               ret = current_atom_finish_all_fq(); /* with a barrier the flush queues are finished afterwards */
78655 +       }
78656 +       return ret;
78657 +}
78658 +
78659 +
78660 +static int write_tx_back(struct commit_handle * ch)
78661 +{
78662 +       flush_queue_t *fq;
78663 +       int ret;
78664 +       int barrier;
78665 +       /* Submits ch->overwrite_set at the jnodes' own block numbers, then updates the journal footer. */
78666 +       post_commit_hook();
78667 +       fq = get_fq_for_current_atom();
78668 +       if (IS_ERR(fq))
78669 +               return  PTR_ERR(fq);
78670 +       spin_unlock_atom(fq->atom); /* presumably the fq is returned with its atom locked - confirm */
78671 +       ret = write_jnode_list(
78672 +               ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78673 +       fq_put(fq);
78674 +       if (ret)
78675 +               return ret;
78676 + repeat_wo_barrier:
78677 +       barrier = reiser4_use_write_barrier(ch->super);
78678 +       if (!barrier) { /* no barrier: finish all flush queues before the footer is updated */
78679 +               ret = current_atom_finish_all_fq();
78680 +               if (ret)
78681 +                       return ret;
78682 +       }
78683 +       ret = update_journal_footer(ch, barrier);
78684 +       if (barrier) {
78685 +               if (ret) {
78686 +                       if (ret == -EOPNOTSUPP) { /* barrier write rejected: disable barriers and redo */
78687 +                               disable_write_barrier(ch->super);
78688 +                               goto repeat_wo_barrier;
78689 +                       }
78690 +                       return ret;
78691 +               }
78692 +               ret = current_atom_finish_all_fq();
78693 +       }
78694 +       if (ret)
78695 +               return ret;
78696 +       post_write_back_hook(); /* reached on success only */
78697 +       return 0;
78698 +}
78699 +
78700 +/* Write the journal for the current atom: commit its overwrite set to wandered
78701 +   blocks (commit_tx) and then write it back in place (write_tx_back). We assume
78702 +   that all captured blocks are marked as RELOC or WANDER (Relocate or Overwrite
78703 +   set) and all nodes from Relocate set are submitted to write. *@nr_submitted
78704 +   grows by overwrite_set_size - nr_bitmap. Returns 0 or a negative error code. */
78705 +int reiser4_write_logs(long *nr_submitted)
78706 +{
78707 +       txn_atom *atom;
78708 +       struct super_block *super = reiser4_get_current_sb();
78709 +       reiser4_super_info_data *sbinfo = get_super_private(super);
78710 +       struct commit_handle ch;
78711 +       int ret;
78712 +
78713 +       writeout_mode_enable();
78714 +
78715 +       /* block allocator may add j-nodes to the clean_list */
78716 +       ret = pre_commit_hook();
78717 +       if (ret)        /* only writeout mode has to be undone at this point */
78718 +               goto out_mode_disable;
78719 +
78720 +       /* No locks are required if we take atom which stage >=
78721 +        * ASTAGE_PRE_COMMIT */
78722 +       atom = get_current_context()->trans->atom;
78723 +       assert("zam-965", atom != NULL);
78724 +
78725 +       /* relocate set is on the atom->clean_nodes list after
78726 +        * current_atom_complete_writes() finishes. It can be safely
78727 +        * uncaptured after commit_semaphore is taken, because any atom that
78728 +        * captures these nodes is guaranteed to commit after current one.
78729 +        *
78730 +        * This can only be done after pre_commit_hook(), because it is where
78731 +        * early flushed jnodes with CREATED bit are transferred to the
78732 +        * overwrite list. */
78733 +       invalidate_list(ATOM_CLEAN_LIST(atom));
78734 +       spin_lock_atom(atom);
78735 +       /* There might be waiters for the relocate nodes which we have
78736 +        * released, wake them up. */
78737 +       atom_send_event(atom);
78738 +       spin_unlock_atom(atom);
78739 +
78740 +       if (REISER4_DEBUG) {
78741 +               int level;
78742 +
78743 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78744 +                       assert("nikita-3352",
78745 +                              list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78746 +       }
78747 +
78748 +       sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78749 +       sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78750 +
78751 +       init_commit_handle(&ch, atom);
78752 +
78753 +       ch.free_blocks = sbinfo->blocks_free_committed;
78754 +       ch.nr_files = sbinfo->nr_files_committed;
78755 +       /* ZAM-FIXME-HANS: email me what the contention level is for the super
78756 +        * lock. */
78757 +       ch.next_oid = oid_next(super);
78758 +
78759 +       /* count overwrite set and place it in a separate list */
78760 +       ret = get_overwrite_set(&ch);
78761 +
78762 +       if (ret <= 0) {
78763 +               /* It is possible that overwrite set is empty here, it means
78764 +                  all captured nodes are clean */
78765 +               goto up_and_ret;
78766 +       }
78767 +
78768 +       /* Inform the caller about what number of dirty pages will be
78769 +        * submitted to disk. */
78770 +       *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78771 +
78772 +       /* count all records needed for storing of the wandered set */
78773 +       get_tx_size(&ch);
78774 +
78775 +       ret = commit_tx(&ch);
78776 +       if (ret)
78777 +               goto up_and_ret;
78778 +
78779 +       spin_lock_atom(atom);
78780 +       atom_set_stage(atom, ASTAGE_POST_COMMIT);
78781 +       spin_unlock_atom(atom);
78782 +
78783 +       ret = write_tx_back(&ch);
78784 +       post_write_back_hook(); /* NOTE(review): write_tx_back() already calls this on success */
78785 +
78786 +      up_and_ret:
78787 +       if (ret) {
78788 +               /* there could be fq attached to current atom; the only way to
78789 +                  remove them is: */
78790 +               current_atom_finish_all_fq();
78791 +       }
78792 +
78793 +       /* free blocks of flushed transaction */
78794 +       dealloc_tx_list(&ch);
78795 +       dealloc_wmap(&ch);
78796 +
78797 +       put_overwrite_set(&ch);
78798 +
78799 +       done_commit_handle(&ch);
78800 +      out_mode_disable:
78801 +       writeout_mode_disable();
78802 +
78803 +       return ret;
78804 +}
78805 +
78806 +/* consistency checks for journal data/control blocks: header, footer, log
78807 +   records, transactions head blocks. All functions return zero on success. */
78808 +
78809 +static int check_journal_header(const jnode * node UNUSED_ARG)
78810 +{
78811 +       /* FIXME: journal header has no magic field yet, so @node is not examined and the check always succeeds. */
78812 +       return 0;
78813 +}
78814 +
78815 +/* wait for write completion for all jnodes from given list; returns the number of their pages that have PageError set */
78816 +static int wait_on_jnode_list(struct list_head *head)
78817 +{
78818 +       jnode *scan;
78819 +       int ret = 0; /* count of pages found with the error flag */
78820 +
78821 +       list_for_each_entry(scan, head, capture_link) {
78822 +               struct page *pg = jnode_page(scan);
78823 +
78824 +               if (pg) { /* jnodes without a page are skipped */
78825 +                       if (PageWriteback(pg))
78826 +                               wait_on_page_writeback(pg);
78827 +
78828 +                       if (PageError(pg))
78829 +                               ret++;
78830 +               }
78831 +       }
78832 +
78833 +       return ret;
78834 +}
78835 +
78836 +static int check_journal_footer(const jnode * node UNUSED_ARG)
78837 +{
78838 +       /* FIXME: journal footer has no magic field yet, so @node is not examined and the check always succeeds. */
78839 +       return 0;
78840 +}
78841 +
78842 +static int check_tx_head(const jnode * node)
78843 +{
78844 +       struct tx_header *header = (struct tx_header *)jdata(node);
78845 +       /* the data of @node must start with TX_HEADER_MAGIC; otherwise warn and return -EIO */
78846 +       if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
78847 +               warning("zam-627", "tx head at block %s corrupted\n",
78848 +                       sprint_address(jnode_get_block(node)));
78849 +               return RETERR(-EIO);
78850 +       }
78851 +
78852 +       return 0;
78853 +}
78854 +
78855 +static int check_wander_record(const jnode * node)
78856 +{
78857 +       struct wander_record_header *RH =
78858 +           (struct wander_record_header *)jdata(node);
78859 +       /* the data of @node must start with WANDER_RECORD_MAGIC; otherwise warn and return -EIO */
78860 +       if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
78861 +           0) {
78862 +               warning("zam-628", "wander record at block %s corrupted\n",
78863 +                       sprint_address(jnode_get_block(node)));
78864 +               return RETERR(-EIO);
78865 +       }
78866 +
78867 +       return 0;
78868 +}
78869 +
78870 +/* fill the commit_handle with what update_journal_footer needs (free_blocks, nr_files, next_oid from the tx head) and link @tx_head into ch->tx_list; returns 0 or the jload() error */
78871 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
78872 +{
78873 +       struct tx_header *TXH;
78874 +       int ret;
78875 +
78876 +       ret = jload(tx_head);
78877 +       if (ret)
78878 +               return ret; /* @ch is left untouched and @tx_head is not linked */
78879 +
78880 +       TXH = (struct tx_header *)jdata(tx_head);
78881 +
78882 +       ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks)); /* on-disk fields are little-endian */
78883 +       ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
78884 +       ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
78885 +
78886 +       jrelse(tx_head);
78887 +
78888 +       list_add(&tx_head->capture_link, &ch->tx_list);
78889 +
78890 +       return 0;
78891 +}
78892 +
78893 +/* Replay one transaction: rebuild its overwrite set from the wander records,
78894 +   write it in place and update the journal footer.
78895 +   @s: super block; @tx_head: jnode of the tx head block; @log_rec_block_p:
78896 +   block of the first wander record; @end_block: block number that ends the
78897 +   record chain; @nr_wander_records: number of records the chain must contain.
78898 +   Returns 0 or a negative error code. Every failure leaves through the common
78899 +   cleanup, so no jnode stays loaded or linked to the on-stack commit handle. */
78900 +static int replay_transaction(const struct super_block *s,
78901 +                             jnode * tx_head,
78902 +                             const reiser4_block_nr * log_rec_block_p,
78903 +                             const reiser4_block_nr * end_block,
78904 +                             unsigned int nr_wander_records)
78905 +{
78906 +       reiser4_block_nr log_rec_block = *log_rec_block_p;
78907 +       struct commit_handle ch;
78908 +       LIST_HEAD(overwrite_set);
78909 +       jnode *log;
78910 +       int ret;
78911 +
78912 +       init_commit_handle(&ch, NULL);
78913 +       ch.overwrite_set = &overwrite_set;
78914 +
78915 +       /* the footer update below uses the values read from the tx head */
78916 +       ret = restore_commit_handle(&ch, tx_head);
78917 +       if (ret)
78918 +               goto out_done;
78919 +
78920 +       while (log_rec_block != *end_block) {
78921 +               struct wander_record_header *header;
78922 +               struct wander_entry *entry;
78923 +               int i;
78924 +
78925 +               if (nr_wander_records == 0) {
78926 +                       warning("zam-631",
78927 +                               "number of wander records in the linked list"
78928 +                               " greater than number stored in tx head.\n");
78929 +                       ret = RETERR(-EIO);
78930 +                       goto free_ow_set;
78931 +               }
78932 +
78933 +               log = alloc_io_head(&log_rec_block);
78934 +               if (log == NULL) {
78935 +                       ret = RETERR(-ENOMEM);
78936 +                       goto free_ow_set;
78937 +               }
78938 +
78939 +               ret = jload(log);
78940 +               if (ret < 0) {
78941 +                       drop_io_head(log);
78942 +                       goto free_ow_set;
78943 +               }
78944 +
78945 +               ret = check_wander_record(log);
78946 +               if (ret) {
78947 +                       jrelse(log);
78948 +                       drop_io_head(log);
78949 +                       goto free_ow_set;
78950 +               }
78951 +
78952 +               header = (struct wander_record_header *)jdata(log);
78953 +               log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
78954 +               entry = (struct wander_entry *)(header + 1);
78955 +
78956 +               /* restore overwrite set from wander record content */
78957 +               for (i = 0; i < wander_record_capacity(s); i++) {
78958 +                       reiser4_block_nr block;
78959 +                       jnode *node;
78960 +
78961 +                       block = le64_to_cpu(get_unaligned(&entry->wandered));
78962 +                       if (block == 0)
78963 +                               break;
78964 +
78965 +                       node = alloc_io_head(&block);
78966 +                       if (node == NULL) {
78967 +                               ret = RETERR(-ENOMEM);
78968 +                               jrelse(log);
78969 +                               drop_io_head(log);
78970 +                               goto free_ow_set;
78971 +                       }
78972 +
78973 +                       ret = jload(node);
78974 +                       if (ret < 0) {
78975 +                               drop_io_head(node);
78976 +                               jrelse(log);
78977 +                               drop_io_head(log);
78978 +                               goto free_ow_set;
78979 +                       }
78980 +
78981 +                       /* loaded at the wandered block; retarget to the original */
78982 +                       block = le64_to_cpu(get_unaligned(&entry->original));
78983 +                       assert("zam-603", block != 0);
78984 +                       jnode_set_block(node, &block);
78985 +
78986 +                       list_add_tail(&node->capture_link, ch.overwrite_set);
78987 +                       ++entry;
78988 +               }
78989 +
78990 +               jrelse(log);
78991 +               drop_io_head(log);
78992 +
78993 +               --nr_wander_records;
78994 +       }
78995 +
78996 +       if (nr_wander_records != 0) {
78997 +               warning("zam-632", "number of wander records in the linked list"
78998 +                       " less than number stored in tx head.\n");
78999 +               ret = RETERR(-EIO);
79000 +               goto free_ow_set;
79001 +       }
79002 +
79003 +       {                       /* write wandered set in place */
79004 +               int werr = write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
79005 +
79006 +               /* wait even after a failed submission: earlier bios may be in flight */
79007 +               if (wait_on_jnode_list(ch.overwrite_set) != 0 && werr == 0)
79008 +                       werr = RETERR(-EIO);
79009 +               if (werr) {     /* never update the footer for a partial replay */
79010 +                       ret = werr;
79011 +                       goto free_ow_set;
79012 +               }
79013 +       }
79014 +
79015 +       ret = update_journal_footer(&ch, 0);
79016 +
79017 +      free_ow_set:
79018 +       while (!list_empty(ch.overwrite_set)) {
79019 +               jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
79020 +               list_del_init(&cur->capture_link);
79021 +               jrelse(cur);
79022 +               drop_io_head(cur);
79023 +       }
79024 +
79025 +       list_del_init(&tx_head->capture_link);
79026 +      out_done:
79027 +       done_commit_handle(&ch);
79028 +       return ret;
79029 +}
79030 +
79031 +/* Find the oldest committed but not yet replayed transaction and replay it.
79032 + * Such a transaction was committed and the journal header block was updated,
79033 + * but writing the atom's overwrite set in place and updating the journal
79034 + * footer block did not complete; replay_transaction() finishes that work from
79035 + * the wandered copies. Returns 0 when nothing is left to replay, -E_REPEAT
79036 + * after one transaction was replayed (call again), or an error code. */
79037 +static int replay_oldest_transaction(struct super_block *s)
79038 +{
79039 +       reiser4_super_info_data *sbinfo = get_super_private(s);
79040 +       jnode *jf = sbinfo->journal_footer;
79041 +       unsigned int total;
79042 +       struct journal_footer *F;
79043 +       struct tx_header *T;
79044 +
79045 +       reiser4_block_nr prev_tx;
79046 +       reiser4_block_nr last_flushed_tx;
79047 +       reiser4_block_nr log_rec_block = 0;
79048 +
79049 +       jnode *tx_head;
79050 +
79051 +       int ret;
79052 +       /* fetch the last flushed transaction location from the footer */
79053 +       if ((ret = jload(jf)) < 0)
79054 +               return ret;
79055 +
79056 +       F = (struct journal_footer *)jdata(jf);
79057 +
79058 +       last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
79059 +
79060 +       jrelse(jf);
79061 +
79062 +       if (sbinfo->last_committed_tx == last_flushed_tx) {
79063 +               /* all transactions are replayed */
79064 +               return 0;
79065 +       }
79066 +
79067 +       prev_tx = sbinfo->last_committed_tx;
79068 +
79069 +       /* searching for oldest not flushed transaction */
79070 +       while (1) {
79071 +               tx_head = alloc_io_head(&prev_tx);
79072 +               if (!tx_head)
79073 +                       return RETERR(-ENOMEM);
79074 +
79075 +               ret = jload(tx_head);
79076 +               if (ret < 0) {
79077 +                       drop_io_head(tx_head);
79078 +                       return ret;
79079 +               }
79080 +
79081 +               ret = check_tx_head(tx_head);
79082 +               if (ret) {
79083 +                       jrelse(tx_head);
79084 +                       drop_io_head(tx_head);
79085 +                       return ret;
79086 +               }
79087 +
79088 +               T = (struct tx_header *)jdata(tx_head);
79089 +               /* step back along the on-disk chain of transaction heads */
79090 +               prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
79091 +               /* found the head whose predecessor was the last one flushed */
79092 +               if (prev_tx == last_flushed_tx)
79093 +                       break;
79094 +
79095 +               jrelse(tx_head);
79096 +               drop_io_head(tx_head);
79097 +       }
79098 +       /* the loop exits via break, so tx_head (and T) is still loaded here */
79099 +       total = le32_to_cpu(get_unaligned(&T->total));
79100 +       log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
79101 +
79102 +       pin_jnode_data(tx_head);
79103 +       jrelse(tx_head);
79104 +       /* total counts the tx head too, so total - 1 wander records follow */
79105 +       ret =
79106 +           replay_transaction(s, tx_head, &log_rec_block,
79107 +                              jnode_get_block(tx_head), total - 1);
79108 +
79109 +       unpin_jnode_data(tx_head);
79110 +       drop_io_head(tx_head);
79111 +       /* -E_REPEAT asks the caller to come back for the next transaction */
79112 +       if (ret)
79113 +               return ret;
79114 +       return -E_REPEAT;
79115 +}
79116 +
79117 +/* The current reiser4 journal implementation is optimized not to capture the
79118 +   super block when only certain super block fields are modified. Currently,
79119 +   the set is (<free block count>, <OID allocator>). These fields are logged in
79120 +   a special way: they are stored in each transaction head block at atom
79121 +   commit time and written to the journal footer block at atom flush time.
79122 +   To get this info from the journal footer block into the in-memory super
79123 +   block there is a special function, reiser4_journal_recover_sb_data(),
79124 +   which should be called after the disk format plugin re-reads the super
79125 +   block after journal replaying.
79126 +*/
79127 +
79128 +/* copy the information from journal footer to in-memory super block */
79129 +int reiser4_journal_recover_sb_data(struct super_block *s)
79130 +{
79131 +       reiser4_super_info_data *sbinfo = get_super_private(s);
79132 +       struct journal_footer *jf;
79133 +       int ret;
79134 +       /* returns 0, or the error from jload()/check_journal_footer() */
79135 +       assert("zam-673", sbinfo->journal_footer != NULL);
79136 +
79137 +       ret = jload(sbinfo->journal_footer);
79138 +       if (ret != 0)
79139 +               return ret;
79140 +       /* a bad footer still goes through "out" to balance the jload() above */
79141 +       ret = check_journal_footer(sbinfo->journal_footer);
79142 +       if (ret != 0)
79143 +               goto out;
79144 +
79145 +       jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
79146 +       /* with no flushed transaction the in-memory values are left as is */
79147 +       /* was there at least one flushed transaction?  */
79148 +       if (jf->last_flushed_tx) {
79149 +
79150 +               /* restore free block counter logged in this transaction */
79151 +               reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
79152 +
79153 +               /* restore oid allocator state */
79154 +               oid_init_allocator(s,
79155 +                                  le64_to_cpu(get_unaligned(&jf->nr_files)),
79156 +                                  le64_to_cpu(get_unaligned(&jf->next_oid)));
79157 +       }
79158 +      out:
79159 +       jrelse(sbinfo->journal_footer);
79160 +       return ret;
79161 +}
79162 +
79163 +/* reiser4 replay journal procedure; returns 0 on success or an error code */
79164 +int reiser4_journal_replay(struct super_block *s)
79165 +{
79166 +       reiser4_super_info_data *sbinfo = get_super_private(s);
79167 +       jnode *jh, *jf;
79168 +       struct journal_header *header;
79169 +       int ret;
79170 +
79171 +       assert("zam-582", sbinfo != NULL);
79172 +
79173 +       jh = sbinfo->journal_header;
79174 +       jf = sbinfo->journal_footer;
79175 +
79176 +       if (!jh || !jf) {
79177 +               /* it is possible that disk layout does not support journal
79178 +                  structures, we just warn about this */
79179 +               warning("zam-583",
79180 +                       "journal control blocks were not loaded by disk layout plugin.  "
79181 +                       "journal replaying is not possible.\n");
79182 +               return 0;
79183 +       }
79184 +
79185 +       /* Validate the journal footer block. Its contents are not used here:
79186 +          replay_oldest_transaction() re-reads last_flushed_tx from it */
79187 +       ret = jload(jf);
79188 +       if (ret < 0)
79189 +               return ret;
79190 +
79191 +       ret = check_journal_footer(jf);
79192 +       if (ret) {
79193 +               jrelse(jf);
79194 +               return ret;
79195 +       }
79196 +
79197 +       jrelse(jf);
79198 +
79199 +       /* store last committed transaction info in reiser4 in-memory super
79200 +          block */
79201 +       ret = jload(jh);
79202 +       if (ret < 0)
79203 +               return ret;
79204 +
79205 +       ret = check_journal_header(jh);
79206 +       if (ret) {
79207 +               jrelse(jh);
79208 +               return ret;
79209 +       }
79210 +
79211 +       header = (struct journal_header *)jdata(jh);
79212 +       sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
79213 +
79214 +       jrelse(jh);
79215 +
79216 +       /* replay committed transactions one at a time, oldest first */
79217 +       do {
79218 +               ret = replay_oldest_transaction(s);
79219 +       } while (ret == -E_REPEAT);
79220 +
79221 +       return ret;
79222 +}
79223 +
79224 +/* load journal control block (either journal header or journal footer block) */
79225 +static int
79226 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
79227 +{
79228 +       int ret;
79229 +       /* on any failure *node ends up NULL, so a later unload is a no-op */
79230 +       *node = alloc_io_head(block);
79231 +       if (!(*node))
79232 +               return RETERR(-ENOMEM);
79233 +
79234 +       ret = jload(*node);
79235 +
79236 +       if (ret) {
79237 +               drop_io_head(*node);
79238 +               *node = NULL;
79239 +               return ret;
79240 +       }
79241 +       /* pin before jrelse(); unload_journal_control_block() unpins later */
79242 +       pin_jnode_data(*node);
79243 +       jrelse(*node);
79244 +
79245 +       return 0;
79246 +}
79247 +
79248 +/* unpin and drop a journal header or footer jnode; no-op if *node is NULL */
79249 +static void unload_journal_control_block(jnode ** node)
79250 +{
79251 +       if (*node) {
79252 +               unpin_jnode_data(*node);
79253 +               drop_io_head(*node);
79254 +               *node = NULL;   /* makes a repeated call harmless */
79255 +       }
79256 +}
79257 +
79258 +/* release journal control blocks (header and footer) of super block @s */
79259 +void done_journal_info(struct super_block *s)
79260 +{
79261 +       reiser4_super_info_data *sbinfo = get_super_private(s);
79262 +
79263 +       assert("zam-476", sbinfo != NULL);
79264 +       /* unload skips NULL pointers, so partially loaded state is fine */
79265 +       unload_journal_control_block(&sbinfo->journal_header);
79266 +       unload_journal_control_block(&sbinfo->journal_footer);
79267 +       rcu_barrier();  /* NOTE(review): presumably waits for RCU-deferred jnode frees - confirm */
79268 +}
79269 +
79270 +/* load journal control blocks; returns 0 or the error of the failed load */
79271 +int init_journal_info(struct super_block *s)
79272 +{
79273 +       reiser4_super_info_data *sbinfo = get_super_private(s);
79274 +       journal_location *loc;
79275 +       int ret;
79276 +
79277 +       loc = &sbinfo->jloc;
79278 +       /* both block numbers must already be set in sbinfo->jloc */
79279 +       assert("zam-651", loc != NULL);
79280 +       assert("zam-652", loc->header != 0);
79281 +       assert("zam-653", loc->footer != 0);
79282 +
79283 +       ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
79284 +
79285 +       if (ret)
79286 +               return ret;
79287 +
79288 +       ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
79289 +       /* all-or-nothing: undo the header load if the footer load failed */
79290 +       if (ret) {
79291 +               unload_journal_control_block(&sbinfo->journal_header);
79292 +       }
79293 +
79294 +       return ret;
79295 +}
79296 +
79297 +/* Make Linus happy.
79298 +   Local variables:
79299 +   c-indentation-style: "K&R"
79300 +   mode-name: "LC"
79301 +   c-basic-offset: 8
79302 +   tab-width: 8
79303 +   fill-column: 80
79304 +   End:
79305 +*/
79306 diff --git a/fs/reiser4/wander.h b/fs/reiser4/wander.h
79307 new file mode 100644
79308 index 0000000..0e3b334
79309 --- /dev/null
79310 +++ b/fs/reiser4/wander.h
79311 @@ -0,0 +1,135 @@
79312 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
79313 +
79314 +#if !defined (__FS_REISER4_WANDER_H__)
79315 +#define __FS_REISER4_WANDER_H__
79316 +
79317 +#include "dformat.h"
79318 +
79319 +#include <linux/fs.h>          /* for struct super_block  */
79320 +
79321 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES   */
79322 +
79323 +#define TX_HEADER_MAGIC  "TxMagic4"
79324 +#define WANDER_RECORD_MAGIC "LogMagc4"
79325 +
79326 +#define TX_HEADER_MAGIC_SIZE  (8)
79327 +#define WANDER_RECORD_MAGIC_SIZE (8)
79328 +
79329 +/* journal header block format (on-disk, little-endian) */
79330 +struct journal_header {
79331 +       /* last written transaction head location (a block number) */
79332 +       d64 last_committed_tx;
79333 +};
79334 +
79335 +typedef struct journal_location {
79336 +       reiser4_block_nr footer;        /* block number of journal footer */
79337 +       reiser4_block_nr header;        /* block number of journal header */
79338 +} journal_location;
79339 +
79340 +/* The wander.c head comment describes usage and semantics of all these structures */
79341 +/* journal footer block format */
79342 +struct journal_footer {
79343 +       /* last flushed transaction location. */
79344 +       /* This block number is no longer valid after the transaction it
79345 +          points to gets flushed; it is used only at journal replay time to
79346 +          detect the end of the on-disk list of committed transactions
79347 +          which were not flushed completely */
79348 +       d64 last_flushed_tx;
79349 +
79350 +       /* free block counter is written to the journal footer at transaction
79351 +          flush time, not to the super block, because the free blocks counter
79352 +          is logged in a different way than super block fields (root pointer,
79353 +          for example). */
79354 +       d64 free_blocks;
79355 +
79356 +       /* number of used OIDs and maximal used OID are logged separately from
79357 +          super block */
79358 +       d64 nr_files;
79359 +       d64 next_oid;
79360 +};
79361 +
79362 +/* Each wander record (except the first one) has a unified format: a wander
79363 +   record header followed by an array of log entries (struct wander_entry) */
79364 +struct wander_record_header {
79365 +       /* when there is no predefined location for wander records, this magic
79366 +          string should help reiser4fsck. */
79367 +       char magic[WANDER_RECORD_MAGIC_SIZE];
79368 +
79369 +       /* transaction id */
79370 +       d64 id;
79371 +
79372 +       /* total number of wander records in current transaction  */
79373 +       d32 total;
79374 +
79375 +       /* this block number in transaction */
79376 +       d32 serial;
79377 +
79378 +       /* number of previous block in commit; replay reads it as next record */
79379 +       d64 next_block;
79380 +};
79381 +
79382 +/* The first wander record (transaction head) of a written transaction has a
79383 +   special format */
79384 +struct tx_header {
79385 +       /* magic string makes first block in transaction different from other
79386 +          logged blocks, it should help fsck. */
79387 +       char magic[TX_HEADER_MAGIC_SIZE];
79388 +
79389 +       /* transaction id */
79390 +       d64 id;
79391 +
79392 +       /* total number of records (including this first tx head) in the
79393 +          transaction */
79394 +       d32 total;
79395 +
79396 +       /* align next field to 8-byte boundary; this field always is zero */
79397 +       d32 padding;
79398 +
79399 +       /* block number of previous transaction head; replay walks this chain */
79400 +       d64 prev_tx;
79401 +
79402 +       /* next wander record location; replay starts reading records here */
79403 +       d64 next_block;
79404 +
79405 +       /* committed version of the free blocks counter */
79406 +       d64 free_blocks;
79407 +
79408 +       /* number of used OIDs (nr_files) and maximal used OID are logged
79409 +          separately from super block */
79410 +       d64 nr_files;
79411 +       d64 next_oid;
79412 +};
79413 +
79414 +/* A transaction gets written to disk as a set of wander records (each wander
79415 +   record size is fs block) */
79416 +
79417 +/* As stated above, the rest of a wander record is filled with these log
79418 +   entries; unused space is zero-filled (replay stops at wandered == 0) */
79419 +struct wander_entry {
79420 +       d64 original;           /* block original location */
79421 +       d64 wandered;           /* block wandered location */
79422 +};
79423 +
79424 +/* REISER4 JOURNAL WRITER FUNCTIONS   */
79425 +
79426 +extern int reiser4_write_logs(long *);
79427 +extern int reiser4_journal_replay(struct super_block *);
79428 +extern int reiser4_journal_recover_sb_data(struct super_block *);
79429 +
79430 +extern int init_journal_info(struct super_block *);
79431 +extern void done_journal_info(struct super_block *);
79432 +
79433 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79434 +
79435 +#endif                         /* __FS_REISER4_WANDER_H__ */
79436 +
79437 +/* Make Linus happy.
79438 +   Local variables:
79439 +   c-indentation-style: "K&R"
79440 +   mode-name: "LC"
79441 +   c-basic-offset: 8
79442 +   tab-width: 8
79443 +   fill-column: 80
79444 +   scroll-step: 1
79445 +   End:
79446 +*/
79447 diff --git a/fs/reiser4/writeout.h b/fs/reiser4/writeout.h
79448 new file mode 100644
79449 index 0000000..29e9a8b
79450 --- /dev/null
79451 +++ b/fs/reiser4/writeout.h
79452 @@ -0,0 +1,21 @@
79453 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README  */
79454 +
79455 +#if !defined (__FS_REISER4_WRITEOUT_H__)
79456 +#define __FS_REISER4_WRITEOUT_H__
79457 +#define WRITEOUT_SINGLE_STREAM (0x1)
79458 +#define WRITEOUT_FOR_PAGE_RECLAIM  (0x2)
79459 +#define WRITEOUT_BARRIER (0x4)
79460 +/* NOTE(review): presumably returns a mask of the WRITEOUT_* bits above */
79461 +extern int get_writeout_flags(void);
79462 +
79463 +#endif                         /* __FS_REISER4_WRITEOUT_H__ */
79464 +
79465 +/* Make Linus happy.
79466 +   Local variables:
79467 +   c-indentation-style: "K&R"
79468 +   mode-name: "LC"
79469 +   c-basic-offset: 8
79470 +   tab-width: 8
79471 +   fill-column: 80
79472 +   End:
79473 +*/
79474 diff --git a/fs/reiser4/znode.c b/fs/reiser4/znode.c
79475 new file mode 100644
79476 index 0000000..f844e89
79477 --- /dev/null
79478 +++ b/fs/reiser4/znode.c
79479 @@ -0,0 +1,1028 @@
79480 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79481 + * reiser4/README */
79482 +/* Znode manipulation functions. */
79483 +/* Znode is the in-memory header for a tree node. It is stored
79484 +   separately from the node itself so that it does not get written to
79485 +   disk.  In this respect znode is like buffer head or page head. We
79486 +   also use znodes for additional reiser4 specific purposes:
79487 +
79488 +    . they are organized into tree structure which is a part of whole
79489 +      reiser4 tree.
79490 +    . they are used to implement node grained locking
79491 +    . they are used to keep additional state associated with a
79492 +      node
79493 +    . they contain links to lists used by the transaction manager
79494 +
79495 +   Znode is attached to some variable "block number" which is instance of
79496 +   fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79497 +   appropriate node being actually loaded in memory. Existence of znode itself
79498 +   is regulated by reference count (->x_count) in it. Each time thread
79499 +   acquires reference to znode through call to zget(), ->x_count is
79500 +   incremented and decremented on call to zput().  Data (content of node) are
79501 +   brought in memory through call to zload(), which also increments ->d_count
79502 +   reference counter.  zload can block waiting on IO.  Call to zrelse()
79503 +   decreases this counter. Also, ->c_count keeps track of number of child
79504 +   znodes and prevents parent znode from being recycled until all of its
79505 +   children are. ->c_count is decremented whenever child goes out of existence
79506 +   (being actually recycled in zdestroy()) which can be some time after last
79507 +   reference to this child dies if we support some form of LRU cache for
79508 +   znodes.
79509 +
79510 +*/
79511 +/* EVERY ZNODE'S STORY
79512 +
79513 +   1. His infancy.
79514 +
79515 +   Once upon a time, the znode was born deep inside of zget() by call to
79516 +   zalloc(). At the return from zget() znode had:
79517 +
79518 +    . reference counter (x_count) of 1
79519 +    . assigned block number, marked as used in bitmap
79520 +    . pointer to parent znode. Root znode parent pointer points
79521 +      to its father: "fake" znode. This, in turn, has NULL parent pointer.
79522 +    . hash table linkage
79523 +    . no data loaded from disk
79524 +    . no node plugin
79525 +    . no sibling linkage
79526 +
79527 +   2. His childhood
79528 +
79529 +   Each node is either brought into memory as a result of tree traversal, or
79530 +   created afresh, creation of the root being a special case of the latter. In
79531 +   either case it's inserted into sibling list. This will typically require
79532 +   some ancillary tree traversing, but ultimately both sibling pointers will
79533 +   exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79534 +   zjnode.state.
79535 +
79536 +   3. His youth.
79537 +
79538 +   If znode is bound to already existing node in a tree, its content is read
79539 +   from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79540 +   in zjnode.state and zdata() function starts to return non null for this
79541 +   znode. zload() further calls zparse() that determines which node layout
79542 +   this node is rendered in, and sets ->nplug on success.
79543 +
79544 +   If znode is for new node just created, memory for it is allocated and
79545 +   zinit_new() function is called to initialise data, according to selected
79546 +   node layout.
79547 +
79548 +   4. His maturity.
79549 +
79550 +   After this point, znode lingers in memory for some time. Threads can
79551 +   acquire references to znode either by blocknr through call to zget(), or by
79552 +   following a pointer to unallocated znode from internal item. Each time
79553 +   reference to znode is obtained, x_count is increased. Thread can read/write
79554 +   lock znode. Znode data can be loaded through calls to zload(), d_count will
79555 +   be increased appropriately. If all references to znode are released
79556 +   (x_count drops to 0), znode is not recycled immediately. Rather, it is
79557 +   still cached in the hash table in the hope that it will be accessed
79558 +   shortly.
79559 +
79560 +   There are two ways in which znode existence can be terminated:
79561 +
79562 +    . sudden death: node bound to this znode is removed from the tree
79563 +    . overpopulation: znode is purged out of memory due to memory pressure
79564 +
79565 +   5. His death.
79566 +
79567 +   Death is a complex process.
79568 +
79569 +   When we irrevocably commit ourselves to decision to remove node from the
79570 +   tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79571 +   znode. This is done either in ->kill_hook() of internal item or in
79572 +   kill_root() function when tree root is removed.
79573 +
79574 +   At this moment znode still has:
79575 +
79576 +    . locks held on it, necessarily write ones
79577 +    . references to it
79578 +    . disk block assigned to it
79579 +    . data loaded from the disk
79580 +    . pending requests for lock
79581 +
79582 +   But once the JNODE_HEARD_BANSHEE bit is set, the last call to
79583 +   unlock_znode() does node deletion. Node deletion includes two phases.
79584 +   First, all ways to get references to that znode (sibling and parent links
79585 +   and hash lookup using the block number stored in the parent node) should
79586 +   be deleted -- this is done through sibling_list_remove(); we also assume
79587 +   that nobody uses the down link from the parent node due to its nonexistence
79588 +   or proper parent node locking, and nobody uses parent pointers from
79589 +   children due to the absence of them. Second, we invalidate all pending lock
79590 +   requests which are still on the znode's lock request queue; this is done
79591 +   by invalidate_lock(). Another znode status bit, JNODE_IS_DYING, is used to
79592 +   invalidate pending lock requests. Once it is set, all requesters are
79593 +   forced to return -EINVAL from longterm_lock_znode(). Future locking
79594 +   attempts are not possible because all ways to get references to that znode
79595 +   are removed already. Last, the node is uncaptured from the transaction.
79596 +
79597 +   When last reference to the dying znode is just about to be released,
79598 +   block number for this lock is released and znode is removed from the
79599 +   hash table.
79600 +
79601 +   Now znode can be recycled.
79602 +
79603 +   [it's possible to free bitmap block and remove znode from the hash
79604 +   table when last lock is released. This will result in having
79605 +   referenced but completely orphaned znode]
79606 +
79607 +   6. Limbo
79608 +
79609 +   As has been mentioned above, znodes with reference counter 0 are
79610 +   still cached in a hash table. Once memory pressure increases they are
79611 +   purged out of there [this requires something like LRU list for
79612 +   efficient implementation. LRU list would also greatly simplify
79613 +   implementation of coord cache that would in this case morph to just
79614 +   scanning some initial segment of LRU list]. Data loaded into
79615 +   unreferenced znode are flushed back to the durable storage if
79616 +   necessary and memory is freed. Znodes themselves can be recycled at
79617 +   this point too.
79618 +
79619 +*/
79620 +
79621 +#include "debug.h"
79622 +#include "dformat.h"
79623 +#include "key.h"
79624 +#include "coord.h"
79625 +#include "plugin/plugin_header.h"
79626 +#include "plugin/node/node.h"
79627 +#include "plugin/plugin.h"
79628 +#include "txnmgr.h"
79629 +#include "jnode.h"
79630 +#include "znode.h"
79631 +#include "block_alloc.h"
79632 +#include "tree.h"
79633 +#include "tree_walk.h"
79634 +#include "super.h"
79635 +#include "reiser4.h"
79636 +
79637 +#include <linux/pagemap.h>
79638 +#include <linux/spinlock.h>
79639 +#include <linux/slab.h>
79640 +#include <linux/err.h>
79641 +
79642 +static z_hash_table *get_htable(reiser4_tree *,
79643 +                               const reiser4_block_nr * const blocknr);
79644 +static z_hash_table *znode_get_htable(const znode *);
79645 +static void zdrop(znode *);
79646 +
79647 +/* hash table support */
79648 +
79649 +/* return non-zero iff *b1 == *b2. Equality callback of the z hash table */
79650 +static inline int
79651 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79652 +{
79653 +       assert("nikita-534", b1 != NULL);
79654 +       assert("nikita-535", b2 != NULL);
79655 +       /* compares the pointed-to block numbers, not the pointers */
79656 +       return *b1 == *b2;
79657 +}
79658 +
79659 +/* Hash znode by block number: low bits of *b. @table is not used here. */
79660 +/* Audited by: umka (2002.06.11) */
79661 +static inline __u32
79662 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79663 +{
79664 +       assert("nikita-536", b != NULL);
79665 +       /* NOTE(review): masking assumes the table size is a power of two */
79666 +       return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79667 +}
79668 +
79669 +/* The hash table definition: znodes hashed by block number (helpers above) */
79670 +#define KMALLOC(size) kmalloc((size), GFP_KERNEL)
79671 +#define KFREE(ptr, size) kfree(ptr)    /* size is ignored */
79672 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79673 +                     blknrhashfn, blknreq);
79674 +#undef KFREE
79675 +#undef KMALLOC
79676 +
79677 +/* slab for znodes */
79678 +static kmem_cache_t *znode_cache;
79679 +
79680 +int znode_shift_order;
79681 +
79682 +/**
79683 + * init_znodes - create znode cache
79684 + * Creates the znode slab cache and computes znode_shift_order; part of reiser4
79685 + * module initialization. Returns 0, or -ENOMEM if the cache cannot be created.
79686 + */
79687 +int init_znodes(void)
79688 +{
79689 +       znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79690 +                                       SLAB_HWCACHE_ALIGN |
79691 +                                       SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79692 +       if (znode_cache == NULL)
79693 +               return RETERR(-ENOMEM);
79694 +       /* find the largest order with (1 << order) < sizeof(znode) */
79695 +       for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79696 +            ++znode_shift_order);
79697 +       --znode_shift_order;
79698 +       return 0;
79699 +}
79700 +
79701 +/**
79702 + * done_znodes - delete znode cache
79703 + * Passes the cache created by init_znodes() to destroy_reiser4_cache().
79704 + * This is called on reiser4 module unloading or system shutdown.
79705 + */
79706 +void done_znodes(void)
79707 +{
79708 +       destroy_reiser4_cache(&znode_cache);
79709 +}
79710 +
79711 +/* init dk_lock and both znode hash tables of @tree; 0 or z_hash_init() error */
79712 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79713 +{
79714 +       int result;
79715 +       assert("umka-050", tree != NULL);
79716 +
79717 +       rwlock_init(&tree->dk_lock);
79718 +       /* NOTE(review): zhash_table is not undone if zfake_table init fails */
79719 +       result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79720 +       if (result != 0)
79721 +               return result;
79722 +       result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79723 +       return result;
79724 +}
79725 +
79726 +/* return @node to the znode slab; it must be fully detached (see asserts) */
79727 +void zfree(znode * node /* znode to free */ )
79728 +{
79729 +       assert("nikita-465", node != NULL);
79730 +       assert("nikita-2120", znode_page(node) == NULL);
79731 +       assert("nikita-2301", list_empty_careful(&node->lock.owners));
79732 +       assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79733 +       assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79734 +                              NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79735 +       assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79736 +       assert("nikita-3293", !znode_is_right_connected(node));
79737 +       assert("nikita-3294", !znode_is_left_connected(node));
79738 +       assert("nikita-3295", node->left == NULL);
79739 +       assert("nikita-3296", node->right == NULL);
79740 +       /* checked above: no page, no lockers, not captured, no siblings */
79741 +       /* not yet phash_jnode_destroy(ZJNODE(node)); */
79742 +
79743 +       kmem_cache_free(znode_cache, node);
79744 +}
79745 +
79746 +/* drop every znode in both hash tables of @tree, then free the tables */
79747 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79748 +{
79749 +       znode *node;
79750 +       znode *next;
79751 +       z_hash_table *ztable;
79752 +
79753 +       /* scan znode hash-tables and kill all znodes, then free hash tables
79754 +        * themselves. */
79755 +
79756 +       assert("nikita-795", tree != NULL);
79757 +
79758 +       ztable = &tree->zhash_table;
79759 +       /* a table whose _table is NULL is skipped entirely */
79760 +       if (ztable->_table != NULL) {
79761 +               for_all_in_htable(ztable, z, node, next) {
79762 +                       node->c_count = 0;
79763 +                       node->in_parent.node = NULL;
79764 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79765 +                       zdrop(node);
79766 +               }
79767 +
79768 +               z_hash_done(&tree->zhash_table);
79769 +       }
79770 +       /* same teardown for the second table */
79771 +       ztable = &tree->zfake_table;
79772 +
79773 +       if (ztable->_table != NULL) {
79774 +               for_all_in_htable(ztable, z, node, next) {
79775 +                       node->c_count = 0;
79776 +                       node->in_parent.node = NULL;
79777 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79778 +                       zdrop(node);
79779 +               }
79780 +
79781 +               z_hash_done(&tree->zfake_table);
79782 +       }
79783 +}
79784 +
79785 +/* ZNODE STRUCTURES */
79786 +
79787 +/* allocate fresh znode from znode_cache; returns kmem_cache_alloc() result */
79788 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79789 +{
79790 +       znode *node;
79791 +       /* no fields are set here; zinit() does the memset and field setup */
79792 +       node = kmem_cache_alloc(znode_cache, gfp_flag);
79793 +       return node;
79794 +}
79795 +
79796 +/* Initialize fields of znode
79797 +   @node:    znode to initialize;
79798 +   @parent:  parent znode;
79799 +   @tree:    tree we are in. */
79800 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79801 +{
79802 +       assert("nikita-466", node != NULL);
79803 +       assert("umka-268", current_tree != NULL);
79804 +       /* zero the whole znode before the per-field initialisers below run */
79805 +       memset(node, 0, sizeof *node);
79806 +
79807 +       assert("umka-051", tree != NULL);
79808 +       /* set up the embedded jnode, the lock and the parent coord, in order */
79809 +       jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79810 +       reiser4_init_lock(&node->lock);
79811 +       init_parent_coord(&node->in_parent, parent);
79812 +}
79813 +
79814 +/*
79815 + * remove znode from indices. This is called by jput() when the last reference
79816 + * on znode is released.
79817 + */
79818 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79819 +{
79820 +       assert("nikita-2108", node != NULL);
79821 +       assert("nikita-470", node->c_count == 0);
79822 +       assert_rw_write_locked(&(tree->tree_lock));
79823 +
79824 +       /* remove reference to this znode from cbk cache */
79825 +       cbk_cache_invalidate(node, tree);
79826 +
79827 +       /* update c_count of parent */
79828 +       if (znode_parent(node) != NULL) {
79829 +               assert("nikita-472", znode_parent(node)->c_count > 0);
79830 +               /* father, onto your hands I forward my spirit... */
79831 +               znode_parent(node)->c_count--;
79832 +               node->in_parent.node = NULL;
79833 +       } else {
79834 +               /* orphaned znode?! Root? */
79835 +       }
79836 +
79837 +       /* remove znode from hash-table */
79838 +       z_hash_remove_rcu(znode_get_htable(node), node);
79839 +}
79840 +
79841 +/* zdrop() -- Remove znode from the tree.
79842 +
79843 +   This is called when znode is removed from the memory. */
79844 +static void zdrop(znode * node /* znode to finish with */ )
79845 +{
79846 +       jdrop(ZJNODE(node));
79847 +}
79848 +
79849 +/*
79850 + * put znode into right place in the hash table. This is called by relocate
79851 + * code.
79852 + */
79853 +int znode_rehash(znode * node /* node to rehash */ ,
79854 +                const reiser4_block_nr * new_block_nr /* new block number */ )
79855 +{
79856 +       z_hash_table *oldtable;
79857 +       z_hash_table *newtable;
79858 +       reiser4_tree *tree;
79859 +
79860 +       assert("nikita-2018", node != NULL);
79861 +
79862 +       tree = znode_get_tree(node);
79863 +       oldtable = znode_get_htable(node);
79864 +       newtable = get_htable(tree, new_block_nr);
79865 +
79866 +       write_lock_tree(tree);
79867 +       /* remove znode from hash-table */
79868 +       z_hash_remove_rcu(oldtable, node);
79869 +
79870 +       /* assertion no longer valid due to RCU */
79871 +       /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
79872 +
79873 +       /* update blocknr */
79874 +       znode_set_block(node, new_block_nr);
79875 +       node->zjnode.key.z = *new_block_nr;
79876 +
79877 +       /* insert it into hash */
79878 +       z_hash_insert_rcu(newtable, node);
79879 +       write_unlock_tree(tree);
79880 +       return 0;
79881 +}
79882 +
79883 +/* ZNODE LOOKUP, GET, PUT */
79884 +
79885 +/* zlook() - get znode with given block_nr in a hash table or return NULL
79886 +
79887 +   If result is non-NULL then the znode's x_count is incremented.  The hash
79888 +   index is computed here with blknrhashfn(), and the hash table is searched
79889 +   under rcu_read_lock() rather than under a lock held by the caller.
79890 +*/
79891 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
79892 +{
79893 +       znode *result;
79894 +       __u32 hash;
79895 +       z_hash_table *htable;
79896 +
79897 +       assert("jmacd-506", tree != NULL);
79898 +       assert("jmacd-507", blocknr != NULL);
79899 +
79900 +       htable = get_htable(tree, blocknr);
79901 +       hash = blknrhashfn(htable, blocknr);
79902 +
79903 +       rcu_read_lock();
79904 +       result = z_hash_find_index(htable, hash, blocknr);
79905 +
79906 +       if (result != NULL) {
79907 +               add_x_ref(ZJNODE(result));
79908 +               result = znode_rip_check(tree, result);
79909 +       }
79910 +       rcu_read_unlock();
79911 +
79912 +       return result;
79913 +}
79914 +
79915 +/* return hash table where znode with block @blocknr is (or should be)
79916 + * stored */
79917 +static z_hash_table *get_htable(reiser4_tree * tree,
79918 +                               const reiser4_block_nr * const blocknr)
79919 +{
79920 +       z_hash_table *table;
79921 +       if (is_disk_addr_unallocated(blocknr))
79922 +               table = &tree->zfake_table;
79923 +       else
79924 +               table = &tree->zhash_table;
79925 +       return table;
79926 +}
79927 +
79928 +/* return hash table where znode @node is (or should be) stored */
79929 +static z_hash_table *znode_get_htable(const znode * node)
79930 +{
79931 +       return get_htable(znode_get_tree(node), znode_get_block(node));
79932 +}
79933 +
79934 +/* zget() - get znode from hash table, allocating it if necessary.
79935 +
79936 +   First a call to zlook, locating a x-referenced znode if one
79937 +   exists.  If znode is not found, allocate new one and return.  Result
79938 +   is returned with x_count reference increased.
79939 +
79940 +   LOCKS TAKEN:   TREE_LOCK, ZNODE_LOCK
79941 +   LOCK ORDERING: NONE
79942 +*/
79943 +znode *zget(reiser4_tree * tree,
79944 +           const reiser4_block_nr * const blocknr,
79945 +           znode * parent, tree_level level, gfp_t gfp_flag)
79946 +{
79947 +       znode *result;
79948 +       __u32 hashi;
79949 +
79950 +       z_hash_table *zth;
79951 +
79952 +       assert("jmacd-512", tree != NULL);
79953 +       assert("jmacd-513", blocknr != NULL);
79954 +       assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
79955 +
79956 +       zth = get_htable(tree, blocknr);
79957 +       hashi = blknrhashfn(zth, blocknr);
79958 +
79959 +       /* NOTE-NIKITA address-as-unallocated-blocknr still is not
79960 +          implemented. */
79961 +
79962 +       z_hash_prefetch_bucket(zth, hashi);
79963 +
79964 +       rcu_read_lock();
79965 +       /* Find a matching BLOCKNR in the hash table.  If the znode is found,
79966 +          we obtain a reference (x_count) but the znode remains unlocked.
79967 +          Have to worry about race conditions later. */
79968 +       result = z_hash_find_index(zth, hashi, blocknr);
79969 +       /* According to the current design, the hash table lock protects new
79970 +          znode references. */
79971 +       if (result != NULL) {
79972 +               add_x_ref(ZJNODE(result));
79973 +               /* NOTE-NIKITA it should be so, but special case during
79974 +                  creation of new root makes such assertion highly
79975 +                  complicated.  */
79976 +               assert("nikita-2131", 1 || znode_parent(result) == parent ||
79977 +                      (ZF_ISSET(result, JNODE_ORPHAN)
79978 +                       && (znode_parent(result) == NULL)));
79979 +               result = znode_rip_check(tree, result);
79980 +       }
79981 +
79982 +       rcu_read_unlock();
79983 +
79984 +       if (!result) {
79985 +               znode *shadow;
79986 +
79987 +               result = zalloc(gfp_flag);
79988 +               if (!result) {
79989 +                       return ERR_PTR(RETERR(-ENOMEM));
79990 +               }
79991 +
79992 +               zinit(result, parent, tree);
79993 +               ZJNODE(result)->blocknr = *blocknr;
79994 +               ZJNODE(result)->key.z = *blocknr;
79995 +               result->level = level;
79996 +
79997 +               write_lock_tree(tree);
79998 +
79999 +               shadow = z_hash_find_index(zth, hashi, blocknr);
80000 +               if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
80001 +                       jnode_list_remove(ZJNODE(result));
80002 +                       zfree(result);
80003 +                       result = shadow;
80004 +               } else {
80005 +                       result->version = znode_build_version(tree);
80006 +                       z_hash_insert_index_rcu(zth, hashi, result);
80007 +
80008 +                       if (parent != NULL)
80009 +                               ++parent->c_count;
80010 +               }
80011 +
80012 +               add_x_ref(ZJNODE(result));
80013 +
80014 +               write_unlock_tree(tree);
80015 +       }
80016 +#if REISER4_DEBUG
80017 +       if (!blocknr_is_fake(blocknr) && *blocknr != 0)
80018 +               reiser4_check_block(blocknr, 1);
80019 +#endif
80020 +       /* Check for invalid tree level, return -EIO */
80021 +       if (unlikely(znode_get_level(result) != level)) {
80022 +               warning("jmacd-504",
80023 +                       "Wrong level for cached block %llu: %i expecting %i",
80024 +                       (unsigned long long)(*blocknr), znode_get_level(result),
80025 +                       level);
80026 +               zput(result);
80027 +               return ERR_PTR(RETERR(-EIO));
80028 +       }
80029 +
80030 +       assert("nikita-1227", znode_invariant(result));
80031 +
80032 +       return result;
80033 +}
80034 +
80035 +/* ZNODE PLUGINS/DATA */
80036 +
80037 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
80038 +   stored at the fixed offset from the beginning of the node. */
80039 +static node_plugin *znode_guess_plugin(const znode * node      /* znode to guess
80040 +                                                                * plugin of */ )
80041 +{
80042 +       reiser4_tree *tree;
80043 +
80044 +       assert("nikita-1053", node != NULL);
80045 +       assert("nikita-1055", zdata(node) != NULL);
80046 +
80047 +       tree = znode_get_tree(node);
80048 +       assert("umka-053", tree != NULL);
80049 +
80050 +       if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
80051 +               return tree->nplug;
80052 +       } else {
80053 +               return node_plugin_by_disk_id
80054 +                   (tree, &((common_node_header *) zdata(node))->plugin_id);
80055 +#ifdef GUESS_EXISTS
80056 +               reiser4_plugin *plugin;
80057 +
80058 +               /* NOTE-NIKITA add locking here when dynamic plugins will be
80059 +                * implemented */
80060 +               for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
80061 +                       if ((plugin->u.node.guess != NULL)
80062 +                           && plugin->u.node.guess(node))
80063 +                               return plugin;
80064 +               }
80065 +               warning("nikita-1057", "Cannot guess node plugin");
80066 +               print_znode("node", node);
80067 +               return NULL;
80068 +#endif
80069 +       }
80070 +}
80071 +
80072 +/* parse node header and install ->nplug */
80073 +int zparse(znode * node /* znode to parse */ )
80074 +{
80075 +       int result;
80076 +
80077 +       assert("nikita-1233", node != NULL);
80078 +       assert("nikita-2370", zdata(node) != NULL);
80079 +
80080 +       if (node->nplug == NULL) {
80081 +               node_plugin *nplug;
80082 +
80083 +               nplug = znode_guess_plugin(node);
80084 +               if (likely(nplug != NULL)) {
80085 +                       result = nplug->parse(node);
80086 +                       if (likely(result == 0))
80087 +                               node->nplug = nplug;
80088 +               } else {
80089 +                       result = RETERR(-EIO);
80090 +               }
80091 +       } else
80092 +               result = 0;
80093 +       return result;
80094 +}
80095 +
80096 +/* zload with readahead */
80097 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
80098 +{
80099 +       int result;
80100 +
80101 +       assert("nikita-484", node != NULL);
80102 +       assert("nikita-1377", znode_invariant(node));
80103 +       assert("jmacd-7771", !znode_above_root(node));
80104 +       assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
80105 +       assert("nikita-3016", schedulable());
80106 +
80107 +       if (info)
80108 +               formatted_readahead(node, info);
80109 +
80110 +       result = jload(ZJNODE(node));
80111 +       assert("nikita-1378", znode_invariant(node));
80112 +       return result;
80113 +}
80114 +
80115 +/* load content of node into memory */
80116 +int zload(znode * node)
80117 +{
80118 +       return zload_ra(node, NULL);
80119 +}
80120 +
80121 +/* call node plugin to initialise newly allocated node. */
80122 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
80123 +{
80124 +       return jinit_new(ZJNODE(node), gfp_flags);
80125 +}
80126 +
80127 +/* drop reference to node data. When last reference is dropped, data are
80128 +   unloaded. */
80129 +void zrelse(znode * node /* znode to release references to */ )
80130 +{
80131 +       assert("nikita-1381", znode_invariant(node));
80132 +
80133 +       jrelse(ZJNODE(node));
80134 +}
80135 +
80136 +/* returns free space in node */
80137 +unsigned znode_free_space(znode * node /* znode to query */ )
80138 +{
80139 +       assert("nikita-852", node != NULL);
80140 +       return node_plugin_by_node(node)->free_space(node);
80141 +}
80142 +
80143 +/* right delimiting key of znode */
80144 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
80145 +{
80146 +       assert("nikita-958", node != NULL);
80147 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
80148 +       assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
80149 +       assert("nikita-30671", node->rd_key_version != 0);
80150 +       return &node->rd_key;
80151 +}
80152 +
80153 +/* left delimiting key of znode */
80154 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
80155 +{
80156 +       assert("nikita-974", node != NULL);
80157 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
80158 +       assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
80159 +       assert("nikita-30681", node->ld_key_version != 0);
80160 +       return &node->ld_key;
80161 +}
80162 +
80163 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
80164 +    )
80165 +
80166 +/* update right-delimiting key of @node */
80167 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
80168 +{
80169 +       assert("nikita-2937", node != NULL);
80170 +       assert("nikita-2939", key != NULL);
80171 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
80172 +       assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
80173 +       assert("nikita-2944",
80174 +              znode_is_any_locked(node) ||
80175 +              znode_get_level(node) != LEAF_LEVEL ||
80176 +              keyge(key, &node->rd_key) ||
80177 +              keyeq(&node->rd_key, min_key()) ||
80178 +              ZF_ISSET(node, JNODE_HEARD_BANSHEE));
80179 +
80180 +       node->rd_key = *key;
80181 +       ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
80182 +       return &node->rd_key;
80183 +}
80184 +
80185 +/* update left-delimiting key of @node */
80186 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
80187 +{
80188 +       assert("nikita-2940", node != NULL);
80189 +       assert("nikita-2941", key != NULL);
80190 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
80191 +       assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
80192 +       assert("nikita-2943",
80193 +              znode_is_any_locked(node) || keyeq(&node->ld_key, min_key()));
80194 +
80195 +       node->ld_key = *key;
80196 +       ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
80197 +       return &node->ld_key;
80198 +}
80199 +
80200 +/* true if @key is inside key range for @node */
80201 +int znode_contains_key(znode * node /* znode to look in */ ,
80202 +                      const reiser4_key * key /* key to look for */ )
80203 +{
80204 +       assert("nikita-1237", node != NULL);
80205 +       assert("nikita-1238", key != NULL);
80206 +
80207 +       /* left_delimiting_key <= key <= right_delimiting_key */
80208 +       return keyle(znode_get_ld_key(node), key)
80209 +           && keyle(key, znode_get_rd_key(node));
80210 +}
80211 +
80212 +/* same as znode_contains_key(), but lock dk lock */
80213 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
80214 +                           const reiser4_key * key /* key to look for */ )
80215 +{
80216 +       int result;
80217 +
80218 +       assert("umka-056", node != NULL);
80219 +       assert("umka-057", key != NULL);
80220 +
80221 +       read_lock_dk(znode_get_tree(node));
80222 +       result = znode_contains_key(node, key);
80223 +       read_unlock_dk(znode_get_tree(node));
80224 +       return result;
80225 +}
80226 +
80227 +/* get parent pointer, assuming tree is not locked */
80228 +znode *znode_parent_nolock(const znode * node /* child znode */ )
80229 +{
80230 +       assert("nikita-1444", node != NULL);
80231 +       return node->in_parent.node;
80232 +}
80233 +
80234 +/* get parent pointer of znode */
80235 +znode *znode_parent(const znode * node /* child znode */ )
80236 +{
80237 +       assert("nikita-1226", node != NULL);
80238 +       assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
80239 +       return znode_parent_nolock(node);
80240 +}
80241 +
80242 +/* detect uber znode used to protect in-superblock tree root pointer */
80243 +int znode_above_root(const znode * node /* znode to query */ )
80244 +{
80245 +       assert("umka-059", node != NULL);
80246 +
80247 +       return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
80248 +}
80249 +
80250 +/* check that @node is root---that its block number is recorded in the tree as
80251 +   that of root node */
80252 +#if REISER4_DEBUG
80253 +static int znode_is_true_root(const znode * node /* znode to query */ )
80254 +{
80255 +       assert("umka-060", node != NULL);
80256 +       assert("umka-061", current_tree != NULL);
80257 +
80258 +       return disk_addr_eq(znode_get_block(node),
80259 +                           &znode_get_tree(node)->root_block);
80260 +}
80261 +#endif
80262 +
80263 +/* check that @node is root */
80264 +int znode_is_root(const znode * node /* znode to query */ )
80265 +{
80266 +       assert("nikita-1206", node != NULL);
80267 +
80268 +       return znode_get_level(node) == znode_get_tree(node)->height;
80269 +}
80270 +
80271 +/* Returns true if @node was just created by zget() and wasn't ever loaded
80272 +   into memory. */
80273 +/* NIKITA-HANS: yes */
80274 +int znode_just_created(const znode * node)
80275 +{
80276 +       assert("nikita-2188", node != NULL);
80277 +       return (znode_page(node) == NULL);
80278 +}
80279 +
80280 +/* obtain updated ->znode_epoch. See seal.c for description. */
80281 +__u64 znode_build_version(reiser4_tree * tree)
80282 +{
80283 +       __u64 result;
80284 +
80285 +       spin_lock(&tree->epoch_lock);
80286 +       result = ++tree->znode_epoch;
80287 +       spin_unlock(&tree->epoch_lock);
80288 +       return result;
80289 +}
80290 +
80291 +void init_load_count(load_count * dh)
80292 +{
80293 +       assert("nikita-2105", dh != NULL);
80294 +       memset(dh, 0, sizeof *dh);
80295 +}
80296 +
80297 +void done_load_count(load_count * dh)
80298 +{
80299 +       assert("nikita-2106", dh != NULL);
80300 +       if (dh->node != NULL) {
80301 +               for (; dh->d_ref > 0; --dh->d_ref)
80302 +                       zrelse(dh->node);
80303 +               dh->node = NULL;
80304 +       }
80305 +}
80306 +
80307 +static int incr_load_count(load_count * dh)
80308 +{
80309 +       int result;
80310 +
80311 +       assert("nikita-2110", dh != NULL);
80312 +       assert("nikita-2111", dh->node != NULL);
80313 +
80314 +       result = zload(dh->node);
80315 +       if (result == 0)
80316 +               ++dh->d_ref;
80317 +       return result;
80318 +}
80319 +
80320 +int incr_load_count_znode(load_count * dh, znode * node)
80321 +{
80322 +       assert("nikita-2107", dh != NULL);
80323 +       assert("nikita-2158", node != NULL);
80324 +       assert("nikita-2109",
80325 +              ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
80326 +
80327 +       dh->node = node;
80328 +       return incr_load_count(dh);
80329 +}
80330 +
80331 +int incr_load_count_jnode(load_count * dh, jnode * node)
80332 +{
80333 +       if (jnode_is_znode(node)) {
80334 +               return incr_load_count_znode(dh, JZNODE(node));
80335 +       }
80336 +       return 0;
80337 +}
80338 +
80339 +void copy_load_count(load_count * new, load_count * old)
80340 +{
80341 +       int ret = 0;
80342 +       done_load_count(new);
80343 +       new->node = old->node;
80344 +       new->d_ref = 0;
80345 +
80346 +       while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
80347 +       }
80348 +
80349 +       assert("jmacd-87589", ret == 0);
80350 +}
80351 +
80352 +void move_load_count(load_count * new, load_count * old)
80353 +{
80354 +       done_load_count(new);
80355 +       new->node = old->node;
80356 +       new->d_ref = old->d_ref;
80357 +       old->node = NULL;
80358 +       old->d_ref = 0;
80359 +}
80360 +
80361 +/* convert parent pointer into coord */
80362 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
80363 +{
80364 +       assert("nikita-3204", pcoord != NULL);
80365 +       assert("nikita-3205", coord != NULL);
80366 +
80367 +       coord_init_first_unit_nocheck(coord, pcoord->node);
80368 +       coord_set_item_pos(coord, pcoord->item_pos);
80369 +       coord->between = AT_UNIT;
80370 +}
80371 +
80372 +/* pack coord into parent_coord_t */
80373 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80374 +{
80375 +       assert("nikita-3206", pcoord != NULL);
80376 +       assert("nikita-3207", coord != NULL);
80377 +
80378 +       pcoord->node = coord->node;
80379 +       pcoord->item_pos = coord->item_pos;
80380 +}
80381 +
80382 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
80383 +   look for comments there) */
80384 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80385 +{
80386 +       pcoord->node = (znode *) node;
80387 +       pcoord->item_pos = (unsigned short)~0;
80388 +}
80389 +
80390 +#if REISER4_DEBUG
80391 +
80392 +/* debugging aid: znode invariant */
80393 +static int znode_invariant_f(const znode * node /* znode to check */ ,
80394 +                            char const **msg   /* where to store error
80395 +                                                * message, if any */ )
80396 +{
80397 +#define _ergo(ant, con)                                                \
80398 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80399 +
80400 +#define _equi(e1, e2)                                          \
80401 +       ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80402 +
80403 +#define _check(exp) ((*msg) = #exp, (exp))
80404 +
80405 +       return jnode_invariant_f(ZJNODE(node), msg) &&
80406 +           /* [znode-fake] invariant */
80407 +           /* fake znode doesn't have a parent, and */
80408 +           _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80409 +           /* there is another way to express this very check, and */
80410 +           _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80411 +           /* it has special block number, and */
80412 +           _ergo(znode_get_level(node) == 0,
80413 +                 disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80414 +           /* it is the only znode with such block number, and */
80415 +           _ergo(!znode_above_root(node) && znode_is_loaded(node),
80416 +                 !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80417 +           /* it is parent of the tree root node */
80418 +           _ergo(znode_is_true_root(node),
80419 +                 znode_above_root(znode_parent(node))) &&
80420 +           /* [znode-level] invariant */
80421 +           /* level of parent znode is one larger than that of child,
80422 +              except for the fake znode, and */
80423 +           _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80424 +                 znode_get_level(znode_parent(node)) ==
80425 +                 znode_get_level(node) + 1) &&
80426 +           /* left neighbor is at the same level, and */
80427 +           _ergo(znode_is_left_connected(node) && node->left != NULL,
80428 +                 znode_get_level(node) == znode_get_level(node->left)) &&
80429 +           /* right neighbor is at the same level */
80430 +           _ergo(znode_is_right_connected(node) && node->right != NULL,
80431 +                 znode_get_level(node) == znode_get_level(node->right)) &&
80432 +           /* [znode-connected] invariant */
80433 +           _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80434 +           _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80435 +           _ergo(!znode_is_root(node) && node->left != NULL,
80436 +                 znode_is_right_connected(node->left) &&
80437 +                 node->left->right == node) &&
80438 +           _ergo(!znode_is_root(node) && node->right != NULL,
80439 +                 znode_is_left_connected(node->right) &&
80440 +                 node->right->left == node) &&
80441 +           /* [znode-c_count] invariant */
80442 +           /* for any znode, c_count of its parent is greater than 0 */
80443 +           _ergo(znode_parent(node) != NULL &&
80444 +                 !znode_above_root(znode_parent(node)),
80445 +                 znode_parent(node)->c_count > 0) &&
80446 +           /* leaves don't have children */
80447 +           _ergo(znode_get_level(node) == LEAF_LEVEL,
80448 +                 node->c_count == 0) &&
80449 +           _check(node->zjnode.jnodes.prev != NULL) &&
80450 +           _check(node->zjnode.jnodes.next != NULL) &&
80451 +           /* orphan doesn't have a parent */
80452 +           _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
80453 +           /* [znode-modify] invariant */
80454 +           /* if znode is not write-locked, its checksum remains
80455 +            * invariant */
80456 +           /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80457 +            * cannot check this. */
80458 +           /* [znode-refs] invariant */
80459 +           /* only referenced znode can be long-term locked */
80460 +           _ergo(znode_is_locked(node),
80461 +                 atomic_read(&ZJNODE(node)->x_count) != 0);
80462 +}
80463 +
80464 +/* debugging aid: check znode invariant and panic if it doesn't hold */
80465 +int znode_invariant(znode * node /* znode to check */ )
80466 +{
80467 +       char const *failed_msg;
80468 +       int result;
80469 +
80470 +       assert("umka-063", node != NULL);
80471 +       assert("umka-064", current_tree != NULL);
80472 +
80473 +       spin_lock_znode(node);
80474 +       read_lock_tree(znode_get_tree(node));
80475 +       result = znode_invariant_f(node, &failed_msg);
80476 +       if (!result) {
80477 +               /* print_znode("corrupted node", node); */
80478 +               warning("jmacd-555", "Condition %s failed", failed_msg);
80479 +       }
80480 +       read_unlock_tree(znode_get_tree(node));
80481 +       spin_unlock_znode(node);
80482 +       return result;
80483 +}
80484 +
80485 +/* return non-0 iff data are loaded into znode */
80486 +int znode_is_loaded(const znode * node /* znode to query */ )
80487 +{
80488 +       assert("nikita-497", node != NULL);
80489 +       return jnode_is_loaded(ZJNODE(node));
80490 +}
80491 +
80492 +unsigned long znode_times_locked(const znode * z)
80493 +{
80494 +       return z->times_locked;
80495 +}
80496 +
80497 +#endif                         /* REISER4_DEBUG */
80498 +
80499 +/* Make Linus happy.
80500 +   Local variables:
80501 +   c-indentation-style: "K&R"
80502 +   mode-name: "LC"
80503 +   c-basic-offset: 8
80504 +   tab-width: 8
80505 +   fill-column: 120
80506 +   End:
80507 +*/
80508 diff --git a/fs/reiser4/znode.h b/fs/reiser4/znode.h
80509 new file mode 100644
80510 index 0000000..a49d2ae
80511 --- /dev/null
80512 +++ b/fs/reiser4/znode.h
80513 @@ -0,0 +1,434 @@
80514 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80515 + * reiser4/README */
80516 +
80517 +/* Declaration of znode (Zam's node). See znode.c for more details. */
80518 +
80519 +#ifndef __ZNODE_H__
80520 +#define __ZNODE_H__
80521 +
80522 +#include "forward.h"
80523 +#include "debug.h"
80524 +#include "dformat.h"
80525 +#include "key.h"
80526 +#include "coord.h"
80527 +#include "plugin/node/node.h"
80528 +#include "jnode.h"
80529 +#include "lock.h"
80530 +#include "readahead.h"
80531 +
80532 +#include <linux/types.h>
80533 +#include <linux/spinlock.h>
80534 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
80535 +#include <asm/atomic.h>
80536 +#include <asm/semaphore.h>
80537 +
80538 +/* znode tracks its position within parent (internal item in a parent node,
80539 + * that contains znode's block number). */
80540 +typedef struct parent_coord {
80541 +       znode *node;
80542 +       pos_in_node_t item_pos;
80543 +} parent_coord_t;
80544 +
80545 +/* &znode - node in a reiser4 tree.
80546 +
80547 +   NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80548 +   cacheline pressure.
80549 +
80550 +   Locking:
80551 +
80552 +   Long term: data in a disk node attached to this znode are protected
80553 +   by long term, deadlock aware lock ->lock;
80554 +
80555 +   Spin lock: the following fields are protected by the spin lock:
80556 +
80557 +    ->lock
80558 +
80559 +   Following fields are protected by the global tree lock:
80560 +
80561 +    ->left
80562 +    ->right
80563 +    ->in_parent
80564 +    ->c_count
80565 +
80566 +   Following fields are protected by the global delimiting key lock (dk_lock):
80567 +
80568 +    ->ld_key (to update ->ld_key long-term lock on the node is also required)
80569 +    ->rd_key
80570 +
80571 +   Following fields are protected by the long term lock:
80572 +
80573 +    ->nr_items
80574 +
80575 +   ->nplug is never changed once set. This means that after code made
80576 +   itself sure that field is valid it can be accessed without any additional
80577 +   locking.
80578 +
80579 +   ->level is immutable.
80580 +
80581 +   Invariants involving this data-type:
80582 +
80583 +      [znode-fake]
80584 +      [znode-level]
80585 +      [znode-connected]
80586 +      [znode-c_count]
80587 +      [znode-refs]
80588 +      [jnode-refs]
80589 +      [jnode-queued]
80590 +      [znode-modify]
80591 +
80592 +    For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80593 +    Suggestions for how to do that are desired.*/
80594 +struct znode {
80595 +       /* Embedded jnode. */
80596 +       jnode zjnode;
80597 +
80598 +       /* contains two subfields, node and item_pos.
80599 +
80600 +          item_pos is only a hint that is cached to speed up lookups
80601 +          during balancing. It is not required to be up to date. Synched
80602 +          in find_child_ptr().
80603 +
80604 +          This value allows us to avoid expensive binary searches.
80605 +
80606 +          in_parent->node points to the parent of this node, and is NOT a
80607 +          hint.
80608 +        */
80609 +       parent_coord_t in_parent;
80610 +
80611 +       /*
80612 +        * sibling list pointers
80613 +        */
80614 +
80615 +       /* left-neighbor */
80616 +       znode *left;
80617 +       /* right-neighbor */
80618 +       znode *right;
80619 +
80620 +       /* long term lock on node content. This lock supports deadlock
80621 +          detection. See lock.c
80622 +        */
80623 +       zlock lock;
80624 +
80625 +       /* You cannot remove from memory a node that has children in
80626 +          memory. This is because we rely on the fact that parent of given
80627 +          node can always be reached without blocking for io. When reading a
80628 +          node into memory you must increase the c_count of its parent, when
80629 +          removing it from memory you must decrease the c_count.  This makes
80630 +          the code simpler, and the cases where it is suboptimal are truly
80631 +          obscure.
80632 +        */
80633 +       int c_count;
80634 +
80635 +       /* plugin of node attached to this znode. NULL if znode is not
80636 +          loaded. */
80637 +       node_plugin *nplug;
80638 +
80639 +       /* version of znode data. This is increased on each modification. This
80640 +        * is necessary to implement seals (see seal.[ch]) efficiently. */
80641 +       __u64 version;
80642 +
80643 +       /* left delimiting key. Necessary to efficiently perform
80644 +          balancing with node-level locking. Kept in memory only. */
80645 +       reiser4_key ld_key;
80646 +       /* right delimiting key. */
80647 +       reiser4_key rd_key;
80648 +
80649 +       /* znode's tree level */
80650 +       __u16 level;
80651 +       /* number of items in this node. This field is modified by node
80652 +        * plugin. */
80653 +       __u16 nr_items;
80654 +
80655 +#if REISER4_DEBUG
80656 +       void *creator;
80657 +       reiser4_key first_key;
80658 +       unsigned long times_locked;
80659 +       int left_version;       /* when node->left was updated */
80660 +       int right_version;      /* when node->right was updated */
80661 +       int ld_key_version;     /* when node->ld_key was updated */
80662 +       int rd_key_version;     /* when node->rd_key was updated */
80663 +#endif
80664 +
80665 +} __attribute__ ((aligned(16)));
80666 +
80667 +ON_DEBUG(extern atomic_t delim_key_version;
80668 +    )
80669 +
80670 +/* In general I think these macros should not be exposed. */
80671 +#define znode_is_locked(node)          (lock_is_locked(&node->lock))
80672 +#define znode_is_rlocked(node)         (lock_is_rlocked(&node->lock))
80673 +#define znode_is_wlocked(node)         (lock_is_wlocked(&node->lock))
80674 +#define znode_is_wlocked_once(node)    (lock_is_wlocked_once(&node->lock))
80675 +#define znode_can_be_rlocked(node)     (lock_can_be_rlocked(&node->lock))
80676 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80677 +/* Macros for accessing the znode state. */
80678 +#define        ZF_CLR(p,f)             JF_CLR  (ZJNODE(p), (f))
80679 +#define        ZF_ISSET(p,f)           JF_ISSET(ZJNODE(p), (f))
80680 +#define        ZF_SET(p,f)             JF_SET  (ZJNODE(p), (f))
80681 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80682 +                  znode * parent, tree_level level, gfp_t gfp_flag);
80683 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80684 +extern int zload(znode * node);
80685 +extern int zload_ra(znode * node, ra_info_t * info);
80686 +extern int zinit_new(znode * node, gfp_t gfp_flags);
80687 +extern void zrelse(znode * node);
80688 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80689 +
80690 +/* size of data in znode */
80691 +static inline unsigned
80692 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80693 +{
80694 +       assert("nikita-1416", node != NULL);
80695 +       return PAGE_CACHE_SIZE;
80696 +}
80697 +
80698 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80699 +                                 coord_t * coord);
80700 +extern void coord_to_parent_coord(const coord_t * coord,
80701 +                                 parent_coord_t * pcoord);
80702 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80703 +
80704 +extern unsigned znode_free_space(znode * node);
80705 +
80706 +extern reiser4_key *znode_get_rd_key(znode * node);
80707 +extern reiser4_key *znode_get_ld_key(znode * node);
80708 +
80709 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80710 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80711 +
80712 +/* `connected' state checks */
80713 +static inline int znode_is_right_connected(const znode * node)
80714 +{
80715 +       return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80716 +}
80717 +
80718 +static inline int znode_is_left_connected(const znode * node)
80719 +{
80720 +       return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80721 +}
80722 +
80723 +static inline int znode_is_connected(const znode * node)
80724 +{
80725 +       return znode_is_right_connected(node) && znode_is_left_connected(node);
80726 +}
80727 +
80728 +extern int znode_shift_order;
80729 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80730 +extern void znode_remove(znode *, reiser4_tree *);
80731 +extern znode *znode_parent(const znode * node);
80732 +extern znode *znode_parent_nolock(const znode * node);
80733 +extern int znode_above_root(const znode * node);
80734 +extern int init_znodes(void);
80735 +extern void done_znodes(void);
80736 +extern int znodes_tree_init(reiser4_tree * ztree);
80737 +extern void znodes_tree_done(reiser4_tree * ztree);
80738 +extern int znode_contains_key(znode * node, const reiser4_key * key);
80739 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80740 +extern unsigned znode_save_free_space(znode * node);
80741 +extern unsigned znode_recover_free_space(znode * node);
80742 +extern znode *zalloc(gfp_t gfp_flag);
80743 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
80744 +extern int zparse(znode * node);
80745 +
80746 +
80747 +extern int znode_just_created(const znode * node);
80748 +
80749 +extern void zfree(znode * node);
80750 +
80751 +#if REISER4_DEBUG
80752 +extern void print_znode(const char *prefix, const znode * node);
80753 +#else
80754 +#define print_znode( p, n ) noop
80755 +#endif
80756 +
80757 +/* Make it look like various znode functions exist instead of treating znodes as
80758 +   jnodes in znode-specific code. */
80759 +#define znode_page(x)               jnode_page ( ZJNODE(x) )
80760 +#define zdata(x)                    jdata ( ZJNODE(x) )
80761 +#define znode_get_block(x)          jnode_get_block ( ZJNODE(x) )
80762 +#define znode_created(x)            jnode_created ( ZJNODE(x) )
80763 +#define znode_set_created(x)        jnode_set_created ( ZJNODE(x) )
80764 +#define znode_convertible(x)        jnode_convertible (ZJNODE(x))
80765 +#define znode_set_convertible(x)    jnode_set_convertible (ZJNODE(x))
80766 +
80767 +#define znode_is_dirty(x)           jnode_is_dirty    ( ZJNODE(x) )
80768 +#define znode_check_dirty(x)        jnode_check_dirty ( ZJNODE(x) )
80769 +#define znode_make_clean(x)         jnode_make_clean   ( ZJNODE(x) )
80770 +#define znode_set_block(x, b)       jnode_set_block ( ZJNODE(x), (b) )
80771 +
80772 +#define spin_lock_znode(x)          spin_lock_jnode ( ZJNODE(x) )
80773 +#define spin_unlock_znode(x)        spin_unlock_jnode ( ZJNODE(x) )
80774 +#define spin_trylock_znode(x)       spin_trylock_jnode ( ZJNODE(x) )
80775 +#define spin_znode_is_locked(x)     spin_jnode_is_locked ( ZJNODE(x) )
80776 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80777 +
80778 +#if REISER4_DEBUG
80779 +extern int znode_x_count_is_protected(const znode * node);
80780 +extern int znode_invariant(znode * node);
80781 +#endif
80782 +
80783 +/* acquire reference to @node */
80784 +static inline znode *zref(znode * node)
80785 +{
80786 +       /* change of x_count from 0 to 1 is protected by tree spin-lock */
80787 +       return JZNODE(jref(ZJNODE(node)));
80788 +}
80789 +
80790 +/* release reference to @node */
80791 +static inline void zput(znode * node)
80792 +{
80793 +       assert("nikita-3564", znode_invariant(node));
80794 +       jput(ZJNODE(node));
80795 +}
80796 +
80797 +/* get the level field for a znode */
80798 +static inline tree_level znode_get_level(const znode * node)
80799 +{
80800 +       return node->level;
80801 +}
80802 +
80803 +/* get the tree level of a jnode: a znode reports its level field, anything else LEAF_LEVEL */
80804 +static inline tree_level jnode_get_level(const jnode * node)
80805 +{
80806 +       /* unformatted nodes are all at the LEAF_LEVEL and for
80807 +          "semi-formatted" nodes like bitmaps, level doesn't matter. */
80808 +       if (!jnode_is_znode(node))
80809 +               return LEAF_LEVEL;
80810 +
80811 +       return znode_get_level(JZNODE(node));
80812 +}
80813 +
80814 +/* non-zero iff @node is on the leaf level */
80815 +static inline int jnode_is_leaf(const jnode * node)
80816 +{
80817 +       /* a formatted node is a leaf only when its level says so */
80818 +       if (jnode_is_znode(node))
80819 +               return znode_get_level(JZNODE(node)) == LEAF_LEVEL;
80820 +       /* otherwise only unformatted blocks count as leaves */
80821 +       return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
80822 +}
80823 +
80824 +/* return znode's tree */
80825 +static inline reiser4_tree *znode_get_tree(const znode * node)
80826 +{
80827 +       assert("nikita-2692", node != NULL);
80828 +       return jnode_get_tree(ZJNODE(node));
80829 +}
80830 +
80831 +/* resolve race with zput: hand @node to jnode_rip_sync() and return the
80832 +   znode it yields, or NULL when it yields nothing */
80833 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80834 +{
80835 +       jnode *synced;
80836 +
80837 +       synced = jnode_rip_sync(tree, ZJNODE(node));
80838 +       /* the expected case: a jnode came back, view it as a znode */
80839 +       if (likely(synced != NULL))
80840 +               return JZNODE(synced);
80841 +       return NULL;
80842 +}
80843 +
80844 +#if defined(REISER4_DEBUG)
80845 +int znode_is_loaded(const znode * node /* znode to query */ );
80846 +#endif
80847 +
80848 +extern __u64 znode_build_version(reiser4_tree * tree);
80849 +
80850 +/* Data-handles.  A data handle object manages pairing calls to zload() and zrelse().  We
80851 +   must load the data for a node in many places.  We could do this by simply calling
80852 +   zload() everywhere; the difficulty arises when we must release the loaded data by
80853 +   calling zrelse.  In a function with many possible error/return paths, it requires extra
80854 +   work to figure out which exit paths must call zrelse and those which do not.  The data
80855 +   handle automatically calls zrelse for every zload that it is responsible for.  In that
80856 +   sense, it acts much like a lock_handle.
80857 +*/
80858 +typedef struct load_count {
80859 +       znode *node;
80860 +       int d_ref;
80861 +} load_count;
80862 +
80863 +extern void init_load_count(load_count * lc);  /* Initialize a load_count: set the current node to NULL. */
80864 +extern void done_load_count(load_count * dh);  /* Finalize a load_count: call zrelse() if necessary. */
80865 +extern int incr_load_count_znode(load_count * dh, znode * node);       /* Make the argument znode the current node and call zload(). */
80866 +extern int incr_load_count_jnode(load_count * dh, jnode * node);       /* If the argument jnode is formatted, do the same as
80867 +                                                                        * incr_load_count_znode(); otherwise do nothing (unformatted nodes
80868 +                                                                        * don't require zload/zrelse treatment). */
80869 +extern void move_load_count(load_count * new, load_count * old);       /* Move the contents of a load_count.  Old handle is released. */
80870 +extern void copy_load_count(load_count * new, load_count * old);       /* Copy the contents of a load_count.  Old handle remains held. */
80871 +
80872 +/* Variable initializers for load_count: an empty handle, and a handle naming node n with nothing loaded yet (d_ref == 0). */
80873 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
80874 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
80875 +/* A convenience macro for use in assertions or debug-only code, where loaded
80876 +   data is only required to perform the debugging check.  This macro
80877 +   encapsulates an expression inside a pair of calls to zload()/zrelse(). */
80878 +#define WITH_DATA( node, exp )                         \
80879 +({                                                     \
80880 +       long __with_dh_result;                          \
80881 +       znode *__with_dh_node;                          \
80882 +                                                       \
80883 +       __with_dh_node = ( node );                      \
80884 +       __with_dh_result = zload( __with_dh_node );     \
80885 +       if( __with_dh_result == 0 ) {                   \
80886 +               __with_dh_result = ( long )( exp );     \
80887 +               zrelse( __with_dh_node );               \
80888 +       }                                               \
80889 +       __with_dh_result;                               \
80890 +})
80891 +
80892 +/* Same as WITH_DATA, but evaluates to @ret (instead of the zload() error) when zload() fails. */
80893 +#define WITH_DATA_RET( node, ret, exp )                        \
80894 +({                                                     \
80895 +       int __with_dh_result;                           \
80896 +       znode *__with_dh_node;                          \
80897 +                                                       \
80898 +       __with_dh_node = ( node );                      \
80899 +       __with_dh_result = zload( __with_dh_node );     \
80900 +       if( __with_dh_result == 0 ) {                   \
80901 +               __with_dh_result = ( int )( exp );      \
80902 +               zrelse( __with_dh_node );               \
80903 +       } else                                          \
80904 +               __with_dh_result = ( ret );             \
80905 +       __with_dh_result;                               \
80906 +})
80907 +
80908 +#define WITH_COORD(coord, exp)                 \
80909 +({                                             \
80910 +       coord_t *__coord;                       \
80911 +                                               \
80912 +       __coord = (coord);                      \
80913 +       coord_clear_iplug(__coord);             \
80914 +       WITH_DATA(__coord->node, exp);          \
80915 +})
80916 +
80917 +#if REISER4_DEBUG
80918 +#define STORE_COUNTERS                                         \
80919 +       lock_counters_info __entry_counters = *lock_counters()
80920 +#define CHECK_COUNTERS                                         \
80921 +ON_DEBUG_CONTEXT(                                              \
80922 +({                                                             \
80923 +       __entry_counters.x_refs = lock_counters() -> x_refs;    \
80924 +       __entry_counters.t_refs = lock_counters() -> t_refs;    \
80925 +       __entry_counters.d_refs = lock_counters() -> d_refs;    \
80926 +       assert("nikita-2159",                                   \
80927 +              !memcmp(&__entry_counters, lock_counters(),      \
80928 +                      sizeof __entry_counters));               \
80929 +}) )
80930 +
80931 +#else
80932 +#define STORE_COUNTERS
80933 +#define CHECK_COUNTERS noop
80934 +#endif
80935 +
80936 +/* __ZNODE_H__ */
80937 +#endif
80938 +
80939 +/* Make Linus happy.
80940 +   Local variables:
80941 +   c-indentation-style: "K&R"
80942 +   mode-name: "LC"
80943 +   c-basic-offset: 8
80944 +   tab-width: 8
80945 +   fill-column: 120
80946 +   End:
80947 +*/
80948 diff --git a/include/linux/fs.h b/include/linux/fs.h
80949 index 2fe6e3f..7071851 100644
80950 --- a/include/linux/fs.h
80951 +++ b/include/linux/fs.h
80952 @@ -1199,6 +1199,8 @@ struct super_operations {
80953         void (*clear_inode) (struct inode *);
80954         void (*umount_begin) (struct vfsmount *, int);
80955
80956 +       void (*sync_inodes) (struct super_block *sb,
80957 +                               struct writeback_control *wbc);
80958         int (*show_options)(struct seq_file *, struct vfsmount *);
80959         int (*show_stats)(struct seq_file *, struct vfsmount *);
80960  #ifdef CONFIG_QUOTA
80961 @@ -1617,6 +1619,7 @@ extern int invalidate_inode_pages2(struct address_space *mapping);
80962  extern int invalidate_inode_pages2_range(struct address_space *mapping,
80963                                          pgoff_t start, pgoff_t end);
80964  extern int write_inode_now(struct inode *, int);
80965 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
80966  extern int filemap_fdatawrite(struct address_space *);
80967  extern int filemap_flush(struct address_space *);
80968  extern int filemap_fdatawait(struct address_space *);
80969 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
80970 index aa9bfd0..c989f9c 100644
80971 --- a/lib/radix-tree.c
80972 +++ b/lib/radix-tree.c
80973 @@ -139,6 +139,7 @@ int radix_tree_preload(gfp_t gfp_mask)
80974  out:
80975         return ret;
80976  }
80977 +EXPORT_SYMBOL(radix_tree_preload);
80978
80979  static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
80980                 int offset)
80981 diff --git a/mm/filemap.c b/mm/filemap.c
80982 index 7b84dc8..c117b52 100644
80983 --- a/mm/filemap.c
80984 +++ b/mm/filemap.c
80985 @@ -121,6 +121,7 @@ void __remove_from_page_cache(struct page *page)
80986         mapping->nrpages--;
80987         __dec_zone_page_state(page, NR_FILE_PAGES);
80988  }
80989 +EXPORT_SYMBOL(__remove_from_page_cache);
80990
80991  void remove_from_page_cache(struct page *page)
80992  {
80993 @@ -132,6 +133,7 @@ void remove_from_page_cache(struct page *page)
80994         __remove_from_page_cache(page);
80995         write_unlock_irq(&mapping->tree_lock);
80996  }
80997 +EXPORT_SYMBOL(remove_from_page_cache);
80998
80999  static int sync_page(void *word)
81000  {
81001 @@ -465,6 +467,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
81002                 lru_cache_add(page);
81003         return ret;
81004  }
81005 +EXPORT_SYMBOL(add_to_page_cache_lru);
81006
81007  #ifdef CONFIG_NUMA
81008  struct page *__page_cache_alloc(gfp_t gfp)
81009 @@ -738,6 +741,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
81010         read_unlock_irq(&mapping->tree_lock);
81011         return ret;
81012  }
81013 +EXPORT_SYMBOL(find_get_pages);
81014
81015  /**
81016   * find_get_pages_contig - gang contiguous pagecache lookup
81017 @@ -798,6 +802,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
81018         read_unlock_irq(&mapping->tree_lock);
81019         return ret;
81020  }
81021 +EXPORT_SYMBOL(find_get_pages_tag);
81022
81023  /**
81024   * grab_cache_page_nowait - returns locked page at given index in given cache
81025 diff --git a/mm/readahead.c b/mm/readahead.c
81026 index 23cb61a..649bd43 100644
81027 --- a/mm/readahead.c
81028 +++ b/mm/readahead.c
81029 @@ -572,6 +572,7 @@ void handle_ra_miss(struct address_space *mapping,
81030         ra->flags &= ~RA_FLAG_INCACHE;
81031         ra->cache_hit = 0;
81032  }
81033 +EXPORT_SYMBOL_GPL(handle_ra_miss);
81034
81035  /*
81036   * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a