kernel-reiser4.patch

   1 From: Artem Bityutskiy <dedekind@yandex.ru>
   2
   3 This patch adds a new operation to struct super_operations - sync_inodes,
   4 generic implementation and changes fs-writeback.c:sync_sb_inodes() to call
   5 filesystem's sync_inodes if it is defined or generic implementation otherwise.
   6 This new operation allows filesystem to decide itself what to flush.
   7
   8 Reiser4 flushes dirty pages on the basis of atoms, not of inodes.  sync_sb_inodes
   9 used to call address space flushing method (writepages) for every dirty inode.
  10  For reiser4 it caused having to commit atoms unnecessarily often.  This
  11 turned into substantial slowdown.  Having this method helped to fix that
  12 problem.
  13
  14 akpm: this patch needs to be changed to remove the `sb' arg.
  15
  16 Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
  17 Cc: Edward Shishkin <edward.shishkin@gmail.com>
  18 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  19 ---
  20
  21  fs/fs-writeback.c  |    5 ++++-
  22  include/linux/fs.h |    2 ++
  23  2 files changed, 6 insertions(+), 1 deletion(-)
  24
  25 diff -puN fs/fs-writeback.c~reiser4-vfs-add-super_operationssync_inodes-2 fs/fs-writeback.c
  26 --- a/fs/fs-writeback.c~reiser4-vfs-add-super_operationssync_inodes-2
  27 +++ a/fs/fs-writeback.c
  28 @@ -1221,7 +1221,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  29   */
  30  void sync_inodes_sb(struct super_block *sb)
  31  {
  32 -       bdi_sync_writeback(sb->s_bdi, sb);
  33 +       if (sb->s_op->sync_inodes)
  34 +               sb->s_op->sync_inodes(sb, NULL);
  35 +       else
  36 +               bdi_sync_writeback(sb->s_bdi, sb);
  37         wait_sb_inodes(sb);
  38  }
  39  EXPORT_SYMBOL(sync_inodes_sb);
  40 diff -puN include/linux/fs.h~reiser4-vfs-add-super_operationssync_inodes-2 include/linux/fs.h
  41 --- a/include/linux/fs.h~reiser4-vfs-add-super_operationssync_inodes-2
  42 +++ a/include/linux/fs.h
  43 @@ -1568,6 +1568,8 @@ struct super_operations {
  44         void (*clear_inode) (struct inode *);
  45         void (*umount_begin) (struct super_block *);
  46
  47 +       void (*sync_inodes)(struct super_block *sb,
  48 +                               struct writeback_control *wbc);
  49         int (*show_options)(struct seq_file *, struct vfsmount *);
  50         int (*show_stats)(struct seq_file *, struct vfsmount *);
  51  #ifdef CONFIG_QUOTA
  52 _
  53 From: Hans Reiser <reiser@namesys.com>
  54
  55 Reiser4 is trying to add/remove pages to/from address space, so it needs
  56 add_to_page_cache_lru to be EXPORT_SYMBOL-ed.
  57
  58 [bunk@stusta.de: unexport {,__}remove_from_page_cache]
  59 Signed-off-by: Adrian Bunk <bunk@stusta.de>
  60 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  61 ---
  62
  63  mm/filemap.c |    1 +
  64  1 file changed, 1 insertion(+)
  65
  66 diff -puN mm/filemap.c~reiser4-export-remove_from_page_cache mm/filemap.c
  67 --- a/mm/filemap.c~reiser4-export-remove_from_page_cache
  68 +++ a/mm/filemap.c
  69 @@ -779,6 +779,7 @@ repeat:
  70         rcu_read_unlock();
  71         return ret;
  72  }
  73 +EXPORT_SYMBOL(add_to_page_cache_lru);
  74
  75  /**
  76   * find_get_pages_contig - gang contiguous pagecache lookup
  77 _
  78 From: Andrew Morton <akpm@linux-foundation.org>
  79
  80 merge glitch
  81
  82 Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
  83 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  84 ---
  85
  86  mm/filemap.c |    1 -
  87  1 file changed, 1 deletion(-)
  88
  89 diff -puN mm/filemap.c~reiser4-export-remove_from_page_cache-fix mm/filemap.c
  90 --- a/mm/filemap.c~reiser4-export-remove_from_page_cache-fix
  91 +++ a/mm/filemap.c
  92 @@ -779,7 +779,6 @@ repeat:
  93         rcu_read_unlock();
  94         return ret;
  95  }
  96 -EXPORT_SYMBOL(add_to_page_cache_lru);
  97
  98  /**
  99   * find_get_pages_contig - gang contiguous pagecache lookup
 100 _
 101 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 102 ---
 103
 104  mm/filemap.c |    1 +
 105  1 file changed, 1 insertion(+)
 106
 107 diff -puN mm/filemap.c~reiser4-export-find_get_pages mm/filemap.c
 108 --- a/mm/filemap.c~reiser4-export-find_get_pages
 109 +++ a/mm/filemap.c
 110 @@ -779,6 +779,7 @@ repeat:
 111         rcu_read_unlock();
 112         return ret;
 113  }
 114 +EXPORT_SYMBOL(find_get_pages);
 115
 116  /**
 117   * find_get_pages_contig - gang contiguous pagecache lookup
 118 _
 119 From: Vladimir Saveliev <vs@namesys.com>
 120
 121 This is the main reiserfs4 filesystem.
 122
 123 Q&A wrt this patch:
 124
 125 - A really short guide to how to get up and running with this filesystem.
 126
 127         Reiser4 is a file system based on dancing tree algorithms, and
 128         is described at http://www.namesys.com.  One should be able to get it
 129         up and running just like any of the other filesystems supported by
 130         Linux.  Configure it to be compiled either builtin or as a module.
 131         Create reiser4 filesystem with mkfs.reiser4, mount and use it.  More
 132         detailed info can be found at
 133         http://thebsh.namesys.com/snapshots/LATEST/READ.ME.
 134
 135 - The direct URL which people use to obtain the mkfs tool for this
 136   filesystem.  Also fsck and anything else.
 137
 138         Reiser4 userland tools can be obtained at
 139         ftp://ftp.namesys.com/pub/reiser4progs.
 140         ftp://ftp.namesys.com/pub/reiser4progs/README contains detailed
 141         instructions on how to compile and install these tools.  Also all
 142         reiser4 progs have man pages.
 143
 144 - Any known shortcomings, caveats, etc.
 145
 146         Reiser4 has so far been tested on i386 only.  Quota support is
 147         not ready yet.  Should be ready soon.  Reiser4 was tested extensively,
 148         and we got to where the mailing list was not able to hit any bugs, but
 149         then we told people that, got an order of magnitude increase in users,
 150         and they are able to hit bugs that we are working on now.
 151
 152         Reiser's Law of Software Engineering: Each order of magnitude
 153         increase in users finds more bugs, in a quantity equal to the previous
 154         order of magnitude increase in users.  Success for software developers
 155         is measured by how long the frustration lasts.
 156
 157         Only the very core functionality is working.  Exotic plugins,
 158         an API for multiple operation transactions and accessing multiple
 159         small files in one syscall, compression, inheritance, all have been
 160         postponed until after the core functionality is shipped.  The
 161         compression plugin needs a code review before anyone should use it.
 162
 163 - A statement on compatibility with reiserfs3 filesystems.
 164
 165         To upgrade from reiserfs V3 to V4, use tar, or sponsor us to
 166         write a convertfs.
 167
 168 - Bear in mind that people will immediately benchmark this filesystem, and
 169   first impressions count.  Now is your chance to communicate any tuning
 170   guidelines, mount options or whatever which you'd like people to understand
 171   BEFORE they start publishing benchmark info.
 172
 173         Reiser4 is not tuned for fsync/sync/O_SYNC performance yet.
 174
 175         If you see results that are much different from those at
 176         www.namesys.com/benchmarks.html, let us know.  If you see performance
 177         characteristics that don't quite make sense, email
 178         reiserfs-list@namesys.com, such things are always of interest.
 179
 180         reiser4 is not tuned for mmaping and dirtying more than
 181         physical ram like IOzone does.  This is quite different in its code
 182         path from writing and dirtying more than physical ram.  There are
 183         those who think that what IOZone does is rarely done by real programs,
 184         and therefore we should not bother to optimize what it does.  All I
 185         know is, this month we are not optimized for it.
 186
 187         Please consider its space savings when you benchmark it also.
 188
 189 [michal.k.k.piotrowski@gmail.com: kill #include "linux/config.h"]
 190 [akpm@linux-foundation.org: reiser4_drop_page: don't call remove_from_page_cache]
 191 [bunk@stusta.de: fs/reiser4/: possible cleanups]
 192 Signed-off-by: Vladimir Saveliev <vs@namesys.com>
 193 Signed-off-by: Hans Reiser <reiser@namesys.com>
 194 Signed-off-by: Edward Shishkin <edward@namesys.com>
 195 DESC
 196 reiser4: fix for drop-unused-semaphores.patch
 197 EDESC
 198 From: Edward Shishkin <edward@namesys.com>
 199
 200 Wait for tail conversion completion when acquiring exclusive access by
 201 . mmap_unix_file()
 202 . setattr_unix_file()
 203 . release_unix_file()
 204 Update comments.
 205
 206 Signed-off-by: Edward Shishkin <edward@namesys.com>
 207 Cc: Jonathan Briggs <jbriggs@esoft.com>
 208 DESC
 209 reiser4-slab-allocators-remove-slab_debug_initial-flag
 210 EDESC
 211 From: Andrew Morton <akpm@linux-foundation.org>
 212
 213 Cc: Christoph Lameter <cl@linux-foundation.org>
 214 DESC
 215 reiser4: use simple_prepare_write to zero page data
 216 EDESC
 217 From: Nate Diller <nate.diller@gmail.com>
 218
 219 It's common for file systems to need to zero data on either side of a
 220 write, if a page is not Uptodate during prepare_write.  It just so happens
 221 that simple_prepare_write() in libfs.c does exactly that, so we can avoid
 222 duplication and just call that function to zero page data.
 223
 224 Signed-off-by: Nate Diller <nate.diller@gmail.com>
 225 Cc: Vladimir Saveliev <vs@namesys.com>
 226 Cc: Edward Shishkin <edward@namesys.com>
 227 DESC
 228 reiser4-fix
 229 EDESC
 230 From: Andrew Morton <akpm@linux-foundation.org>
 231
 232
 233 DESC
 234 reiser4: use zero_user_page
 235 EDESC
 236 From: Nate Diller <nate.diller@gmail.com>
 237
 238 Use zero_user_page() instead of open-coding it.
 239
 240 Signed-off-by: Nate Diller <nate.diller@gmail.com>
 241 Cc: Vladimir Saveliev <vs@namesys.com>
 242 Cc: Edward Shishkin <edward@namesys.com>
 243 DESC
 244 reiser4: remove typedefs
 245 EDESC
 246 From: Edward Shishkin <edward@namesys.com>
 247
 248 . Reduce number of typedefs from 289 to 248
 249 . Remove unused file plugin/file/invert.c
 250 . Update comments
 251
 252 DESC
 253 reiser4: fix write_extent
 254 EDESC
 255 From: Edward Shishkin <edward@namesys.com>
 256
 257 Prepared-by: Ignatich <ignatich@gmail.com>
 258
 259 Fix reiser4_write_extent():
 260    1) handling incomplete writes missed in reiser4-temp-fix.patch
 261    2) bugs in the case of returned errors
 262
 263 Signed-off-by: Edward Shishkin <edward@namesys.com>
 264 DESC
 265 reiser4 make sync_inodes non-void
 266 EDESC
 267 From: Edward Shishkin <edward@namesys.com>
 268
 269 Make reiser4_sync_inodes non-void
 270
 271 Signed-off-by: Edward Shishkin <edward@namesys.com>
 272 DESC
 273 Reiser4: Drop 'size' argument from bio_endio and bi_end_io
 274 EDESC
 275 From: Laurent Riffard <laurent.riffard@free.fr>
 276
 277 Reiser4: Drop 'size' argument from bio_endio and bi_end_io
 278
 279 This patch pushes into Reiser4 the changes introduced by
 280 commit 6712ecf8f648118c3363c142196418f89a510b90:
 281
 282         As bi_end_io is only called once when the request is complete,
 283         the 'size' argument is now redundant.  Remove it.
 284
 285         Now there is no need for bio_endio to subtract the size completed
 286         from bi_size.  So don't do that either.
 287
 288         While we are at it, change bi_end_io to return void.
 289
 290 Signed-off-by: Laurent Riffard <laurent.riffard@free.fr>
 291 Acked-by: Jens Axboe <jens.axboe@oracle.com>
 292 Acked-by: Edward Shishkin <edward@namesys.com>
 293 DESC
 294 mm: clean up and kernelify shrinker registration
 295 EDESC
 296 From: Rusty Russell <rusty@rustcorp.com.au>
 297
 298 I can never remember what the function to register to receive VM pressure
 299 is called.  I have to trace down from __alloc_pages() to find it.
 300
 301 It's called "set_shrinker()", and it needs Your Help.
 302
 303 1) Don't hide struct shrinker.  It contains no magic.
 304 2) Don't allocate "struct shrinker".  It's not helpful.
 305 3) Call them "register_shrinker" and "unregister_shrinker".
 306 4) Call the function "shrink" not "shrinker".
 307 5) Reduce the 17 lines of waffly comments to 13, but document it properly.
 308
 309 The comment in reiser4 makes me a little queasy.
 310
 311 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
 312 Cc: Vladimir Saveliev <vs@namesys.com>
 313 Acked-by: Edward Shishkin <edward@namesys.com>
 314 DESC
 315 reiser4: fix NULL dereference in __mnt_is_readonly in ftruncate()
 316 EDESC
 317 From: Dave Hansen <haveblue@us.ibm.com>
 318
 319 Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
 320 Cc: Edward Shishkin <edward@namesys.com>
 321 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 322 DESC
 323 reiser4: fix extent2tail
 324 EDESC
 325 From: Edward Shishkin <edward@namesys.com>
 326
 327 Fixed bug in extent2tail conversion.
 328
 329 Bug description:
 330 when converting partially converted file
 331 (with flag REISER4_PART_MIXED installed)
 332 reiser4_cut_tree() starts to cut old metadata
 333 from wrong offset. Result is data corruption.
 334
 335 Signed-off-by: Edward Shishkin <edward@namesys.com>
 336 DESC
 337 reiser4: fix read_tail
 338 EDESC
 339 From: Edward Shishkin <edward@namesys.com>
 340
 341 Update hint when reading tails
 342
 343 Signed-off-by: Edward Shishkin <edward@namesys.com>
 344 DESC
 345 reiser4: fix unix-file readpages filler
 346 EDESC
 347 From: Edward Shishkin <edward@namesys.com>
 348
 349 Protect page (via incrementing page count) from being reclaimed when looking
 350 for extent pointer in unix-file specific readpages filler.
 351
 352 Signed-off-by: Edward Shishkin <edward@namesys.com>
 353 DESC
 354 reiser4: fix readpage_unix_file
 355 EDESC
 356 From: Edward Shishkin <edward@namesys.com>
 357
 358 . If nominated (by VFS) page is out of file size, then fill it
 359   with zeros instead of returning -EINVAL (this prevents an
 360   unexpected error (-EINVAL) from being returned to some apps that
 361   don't check file size).
 362
 363 . Check if the page became uptodate while it was being unlocked.
 364
 365 Signed-off-by: Edward Shishkin <edward@namesys.com>
 366 Cc: Zan Lynx <zlynx@acm.org>
 367 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 368 DESC
 369 reiser4: fix for new aops patches
 370 EDESC
 371 From: Nick Piggin <npiggin@suse.de>
 372
 373 Cc: Vladimir Saveliev <vs@namesys.com>
 374 Cc: Edward Shishkin <edward@namesys.com>
 375 DESC
 376 reiser4: do not allocate struct file on stack
 377 EDESC
 378 From: Edward Shishkin <edward@namesys.com>
 379
 380 Do not allocate struct file on stack, pass the persistent one instead.
 381
 382 Signed-off-by: Edward Shishkin <edward@namesys.com>
 383 Tested-by: Zan Lynx <zlynx@acm.org>
 384 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 385 DESC
 386 git-block-vs-reiser4
 387 EDESC
 388 From: Andrew Morton <akpm@linux-foundation.org>
 389
 390 Hope this is right.
 391
 392 Hope you know what you're doing ;)
 393
 394 Cc: Vladimir Saveliev <vs@namesys.com>
 395 Cc: Edward Shishkin <edward@namesys.com>
 396 Cc: Jens Axboe <jens.axboe@oracle.com>
 397 DESC
 398 reiser4: cryptcompress misc fixups
 399 EDESC
 400 From: Edward Shishkin <edward@namesys.com>
 401
 402 .  Fix a race (reproducible by fsx + sync (1)) between
 403   checkin_page_cluster operations: serialize them via special per-inode
 404   checkin_mutex (usual i_mutex is not suitable for this purpose, as
 405   ->writepages() also calls checkin_page_cluster());
 406
 407 .  Add comments for checkin/checkout technique for synchronization of
 408   primary and secondary caches with proof of correctness;
 409
 410 .  Fix missed right neighbor when updating disk clusters by
 411   handle_pos_on_leaf() during squalloc (should use upper levels to get
 412   expected non-connected neighbor);
 413
 414 .  Resolve a race between read and truncate (when read finds partially
 415   truncated and, hence, unrecoverable disk cluster) via keeping a track of
 416   leftmost truncated disk clusters in cryptcompress-specific part of inode;
 417
 418 . Introduce size translators and size modulators for
 419   common needs;
 420
 421 . Update comments;
 422
 423 . Rename badly sounding function names;
 424
 425 . Fix coding style;
 426
 427 . Add my part of credits.
 428
 429 Signed-off-by: Edward Shishkin <edward@namesys.com>
 430 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 431 DESC
 432 reiser4: cryptcompress misc fixups-2
 433 EDESC
 434 From: Edward Shishkin <edward@namesys.com>
 435
 436 Check a file plugin id before manipulating the plugin-specific counter.
 437
 438 Signed-off-by: Edward Shishkin <edward@namesys.com>
 439 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 440 DESC
 441 fs/reiser4/plugin/: make 3 functions static
 442 EDESC
 443 From: Adrian Bunk <bunk@kernel.org>
 444
 445 This patch makes the following needlessly global functions static:
 446 - file/cryptcompress.c: __put_page_cluster()
 447 - file/cryptcompress.c: put_hint_cluster()
 448 - item/ctail.c: ctail_read_disk_cluster()
 449
 450 Signed-off-by: Adrian Bunk <bunk@kernel.org>
 451 Cc: Edward Shishkin <edward@namesys.com>
 452 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 453 DESC
 454 reiser4: change error code base
 455 EDESC
 456 From: Edward Shishkin <edward@namesys.com>
 457
 458 Change REISER4_ERROR_CODE_BASE to 10000 to not overlap real errnos
 459
 460 Signed-off-by: Edward Shishkin <edward@namesys.com>
 461 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 462 DESC
 463 reiser4: use lzo library functions
 464 EDESC
 465 From: Edward Shishkin <edward@namesys.com>
 466
 467 . Convert Reiser4 to use lzo implementation in lib/lzo/ instead of
 468   including its own copy of minilzo;
 469 . Do not set zeros to workmem region.
 470
 471 Signed-off-by: Edward Shishkin <edward@namesys.com>
 472 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 473 DESC
 474 fs/reiser4/plugin/file/cryptcompress.c: kmalloc + memset conversion to kzalloc
 475 EDESC
 476 From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 477
 478  fs/reiser4/plugin/file/cryptcompress.c | 101386 -> 101352 (-34 bytes)
 479  fs/reiser4/plugin/file/cryptcompress.o | 456784 -> 456644 (-140 bytes)
 480
 481 Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 482 Cc: Edward Shishkin <edward@namesys.com>
 483 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 484 DESC
 485 reiser4: kmalloc + memset conversion to kzalloc
 486 EDESC
 487 From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 488
 489 Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 490 Cc: Edward Shishkin <edward@namesys.com>
 491 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 492 DESC
 493 fs/reiser4/init_super.c: kmalloc + memset conversion to kzalloc
 494 EDESC
 495 From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 496
 497  fs/reiser4/init_super.c | 19283 -> 19246 (-37 bytes)
 498  fs/reiser4/init_super.o | 155348 -> 155152 (-196 bytes)
 499
 500 Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 501 Cc: Edward Shishkin <edward@namesys.com>
 502 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 503 DESC
 504 fs/reiser4/plugin/inode_ops_rename.c: kmalloc + memset conversion to kzalloc
 505 EDESC
 506 From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 507
 508  fs/reiser4/plugin/inode_ops_rename.c | 28474 -> 28344 (-130 bytes)
 509  fs/reiser4/plugin/inode_ops_rename.o | 142600 -> 142476 (-124 bytes)
 510
 511 Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 512 Cc: Edward Shishkin <edward@namesys.com>
 513 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 514 DESC
 515 fs/reiser4/ktxnmgrd.c: kmalloc + memset conversion to kzalloc
 516 EDESC
 517 From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 518
 519  fs/reiser4/ktxnmgrd.c | 5314 -> 5277 (-37 bytes)
 520  fs/reiser4/ktxnmgrd.o | 131624 -> 131496 (-128 bytes)
 521
 522 Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
 523 Cc: Edward Shishkin <edward@namesys.com>
 524 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 525 DESC
 526 Use helpers to obtain task pid in printks
 527 EDESC
 528 From: Pavel Emelyanov <xemul@openvz.org>
 529
 530 The task_struct->pid member is going to be deprecated, so start
 531 using the helpers (task_pid_nr/task_pid_vnr/task_pid_nr_ns) in
 532 the kernel.
 533
 534 The first thing to start with is the pid, printed to dmesg - in
 535 this case we may safely use task_pid_nr(). Besides, printks produce
 536 more (much more) than a half of all the explicit pid usage.
 537
 538 Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
 539 DESC
 540 Subject: [PATCH 1/2] remove asm/bitops.h includes
 541 EDESC
 542 From: Jiri Slaby <jirislaby@gmail.com>
 543
 544 remove asm/bitops.h includes
 545
 546 Including asm/bitops.h directly may cause compile errors.  Don't include it;
 547 include linux/bitops.h instead.  The next patch will deny including the asm
 548 header directly.
 549
 550 Cc: Adrian Bunk <bunk@kernel.org>
 551 Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
 552
 553 ---
 554 commit 3c05eef3d0a98065323d7d6d9a78e0985eba4b10
 555 tree cb9691832992f570b0363dd568f6fa3d2c81e3f5
 556 parent 132bb039c741d00f066e7501e3613d2d20bf0595
 557 author Jiri Slaby <jirislaby@gmail.com> Tue, 04 Sep 2007 21:01:35 +0200
 558 committer Jiri Slaby <jirislaby@gmail.com> Tue, 04 Sep 2007 21:01:35 +0200
 559
 560 index 7ad84ea..32afaa3 100644
 561 DESC
 562 git-nfsd-broke-reiser4
 563 EDESC
 564 From: Andrew Morton <akpm@linux-foundation.org>
 565
 566 fs/reiser4/export_ops.c: In function 'reiser4_decode_fh':
 567 fs/reiser4/export_ops.c:96: error: 'const struct export_operations' has no member named 'find_exported_dentry'
 568 fs/reiser4/export_ops.c:96: warning: type defaults to 'int' in declaration of 'fn'
 569 fs/reiser4/export_ops.c:98: error: 'const struct export_operations' has no member named 'find_exported_dentry'
 570 fs/reiser4/export_ops.c:99: warning: comparison between pointer and integer
 571 fs/reiser4/export_ops.c:101: error: called object 'fn' is not a function
 572 fs/reiser4/export_ops.c: At top level:
 573 fs/reiser4/export_ops.c:282: error: unknown field 'decode_fh' specified in initializer
 574 fs/reiser4/export_ops.c:282: warning: initialization from incompatible pointer type
 575 fs/reiser4/export_ops.c:284: error: unknown field 'get_dentry' specified in initializer
 576 fs/reiser4/export_ops.c:285: warning: excess elements in struct initializer
 577 fs/reiser4/export_ops.c:285: warning: (near initialization for 'reiser4_export_operations')
 578
 579 help!
 580
 581 Cc: J. Bruce Fields <bfields@citi.umich.edu>
 582 Cc: Edward Shishkin <edward@namesys.com>
 583 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 584 DESC
 585 slab-api-remove-useless-ctor-parameter-and-reorder-parameters-vs-reiser4
 586 EDESC
 587 From: Andrew Morton <akpm@linux-foundation.org>
 588
 589 Cc: Christoph Lameter <cl@linux-foundation.org>
 590 DESC
 591 reiser4 kgdb fix
 592 EDESC
 593 From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
 594
 595 fs/built-in.o: In function `reiser4_debugtrap':
 596 /root/kernels/linux-2.6.25-rc5/fs/reiser4/debug.c:295: undefined reference to `breakpoint'
 597 make: *** [.tmp_vmlinux1] Error 1
 598
 599 This build failure was introduced by reiser4.patch; I think
 600 breakpoint() was used instead of kgdb_breakpoint().
 601
 602 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 603 DESC
 604 Reiser4 portion of zero_user cleanup patch
 605 EDESC
 606 From: Christoph Lameter <cl@linux-foundation.org>
 607
 608 Reiser4 only exists in mm. So split this off.
 609
 610 Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
 611 DESC
 612 reiser4: replace uid==0 check with capability
 613 EDESC
 614 From: "Serge E. Hallyn" <serue@us.ibm.com>
 615
 616 Reiser4 gives root some reserved blocks.  Replace the uid==0 check, which
 617 is not safe in the face of user namespaces, with a CAP_SYS_RESOURCE check,
 618 which seems appropriate.
 619
 620 The per-uid and per-gid reservations appear unimplemented so I'm ignoring
 621 them.
 622
 623 Signed-off-by: Serge Hallyn <serue@us.ibm.com>
 624 Acked-by: Edward Shishkin <edward.shishkin@gmail.com>
 625 Cc: "Vladimir V. Saveliev" <vs@namesys.com>
 626 DESC
 627 jens-fixed-reiser4
 628 EDESC
 629 From: Jens Axboe <jens.axboe@oracle.com>
 630
 631 On Tue, Nov 06 2007, akpm@linux-foundation.org wrote:
 632 >
 633 > The patch titled
 634 >      jens-broke-reiser4
 635 > has been added to the -mm tree.  Its filename is
 636 >      jens-broke-reiser4.patch
 637 >
 638 > *** Remember to use Documentation/SubmitChecklist when testing your code ***
 639 >
 640 > See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
 641 > out what to do about this
 642 >
 643 > ------------------------------------------------------
 644 > Subject: jens-broke-reiser4
 645 > From: Andrew Morton <akpm@linux-foundation.org>
 646 >
 647 > fs/reiser4/plugin/file/cryptcompress.c: In function 'reiser4_deflate_cluster':
 648 > fs/reiser4/plugin/file/cryptcompress.c:1120: error: 'struct scatterlist' has no member named 'page'
 649 > fs/reiser4/plugin/file/cryptcompress.c:1124: error: 'struct scatterlist' has no member named 'page'
 650 > fs/reiser4/plugin/file/cryptcompress.c: In function 'reiser4_inflate_cluster':
 651 > fs/reiser4/plugin/file/cryptcompress.c:1184: error: 'struct scatterlist' has no member named 'page'
 652 > fs/reiser4/plugin/file/cryptcompress.c:1188: error: 'struct scatterlist' has no member named 'page'
 653 >
 654 > Please send a fix against next -mm.
 655
 656 Here's one for 2.6.23-mm1, should apply fine for you now.
 657
 658 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 659 DESC
 660 reiser4: make methods supplied to vfs invariant
 661 EDESC
 662 From: Edward Shishkin <edward.shishkin@gmail.com>
 663
 664 Problem:
 665 hangs when writing to reiser4 partition exported via nfs
 666 (found by Roc Vallès Domènech <rvalles@es.gnu.org>).
 667
 668 Bug:
 669 vfs (which is not aware of reiser4 plugin conversion) uses
 670 obsolete copy of @file->f_op, whereas old methods are not
 671 aware about new structures. It leads to memory corruption.
 672
 673 Fixup:
 674 Prevent collisions with vfs:
 675 Make inode_ops, file_ops and a_ops supplied to vfs
 676 invariant with respect to plugin conversion.
 677
 678 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 679 DESC
 680 reiser4: new export ops
 681 EDESC
 682 From: Edward Shishkin <edward.shishkin@gmail.com>
 683
 684 Adjust reiser4 for the new export ops.
 685
 686 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 687 DESC
 688 reiser4: new export ops (update)
 689 EDESC
 690 From: Edward Shishkin <edward.shishkin@gmail.com>
 691
 692 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 693 DESC
 694 reiser4: new export_ops fixup
 695 EDESC
 696 From: Edward Shishkin <edward.shishkin@gmail.com>
 697
 698 Added missed reiser4_{init, exit}_context()
 699
 700 Cc: Sven Muller <musv@gmx.de>
 701 DESC
 702 reiser4: specify splice file operations
 703 EDESC
 704 From: Edward Shishkin <edward.shishkin@gmail.com>
 705
 706 Specify splice_read, splice_write file operations for loopback
 707 functionality.
 708
 709 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 710 Cc: Jens Axboe <jens.axboe@oracle.com>
 711 DESC
 712 reiser4: fix dummy ioctl_cryptcompress
 713 EDESC
 714 From: Edward Shishkin <edward.shishkin@gmail.com>
 715
 716 . Problem: unexpected resolving to prompt when merging/updating
 717   stuff with python-based Gentoo manager (reported by
 718   rvalles <rvalles@es.gnu.org> and
 719   Dushan Tcholich <dusanc@gmail.com>).
 720
 721   Bug: User application made wrong decision about file's nature
 722   based on returned value of ->ioctl() method for cryptcompress
 723   file plugin.
 724
 725   Fix: make dummy ->ioctl() method for cryptcompress file plugin
 726   to return -EINVAL instead of zero.
 727
 728 . Drop some redundant ifs.
 729
 730 . Update comments. Add precise definition of FCS (file conversion set)
 731   that should be protected during file plugin conversion.
 732
 733 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 734 DESC
 735 reiser4: granulate rw-serialization when accessing file conversion set
 736 EDESC
 737 From: Edward Shishkin <edward.shishkin@gmail.com>
 738
 739 Split common file plugin conversion procedure into
 740 . plugin scheduling part
 741 . plugin conversion part
 742 Move the last one to the plugin-independent file operation
 743 (reiser4_write_careful) with active protection of file
 744 conversion set (conv_sem held).
 745
 746 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 747 DESC
 748 reiser4: fix disk cluster synchronization
 749 EDESC
 750 From: Edward Shishkin <edward.shishkin@gmail.com>
 751
 752 Problem:
 753 (2.6.24-rc3-mm2) BUG: unable to handle kernel NULL
 754 pointer dereference at virtual address 00000024
 755 EIP is at convert_ctail+0x14e/0x166
 756
 757 Bug: When updating disk clusters convert_ctail()
 758 looks at inode which is already evicted from memory
 759 or reused for other needs.
 760
 761 Fixup: Keep all needed file-specific info in
 762 convert_item_info before disk cluster update
 763 (when inode is pinned), then forget about inode.
 764
 765 Cleanups in plugin/file/file_conversion.c
 766
 767 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 768 DESC
 769 reiser4: use balance_dirty_pages_ratelimited_nr
 770 EDESC
 771 From: Edward Shishkin <edward.shishkin@gmail.com>
 772
 773 Use balance_dirty_pages_ratelimited_nr() instead of
 774 balance_dirty_pages_ratelimited() for all reiser4 page cluster operations.
 775
 776 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 777 DESC
 778 reiser4: correct references to filemap_nopage()
 779 EDESC
 780 From: Nick Piggin <npiggin@suse.de>
 781
 782 Correct old reiser4 references to filemap_nopage. No code change.
 783
 784 Signed-off-by: Nick Piggin <npiggin@suse.de>
 785 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 786 DESC
 787 reiser4: fix null pointer dereference in reiser4_write_extent
 788 EDESC
 789 From: Edward Shishkin <edward.shishkin@gmail.com>
 790
 791 Problem:
 792 Oops when starting kde4:
 793 BUG: unable to handle kernel NULL pointer dereference
 794 at virtual address 0000000c
 795 printing eip: c025eba5 *pde = 00000000
 796 Oops: 0000 [#1] SMP
 797 last sysfs file: /devices/pci0000:00/0000:00:1e.0/0000:04:04.0/resource
 798 Modules linked in: thermal processor fan button
 799
 800 Pid: 3705, comm: kwrite Not tainted (2.6.23-mm1 #8)
 801 EIP: 0060:[<c025eba5>] EFLAGS: 00010246 CPU: 0
 802 EIP is at update_extents+0x44/0x2e7
 803
 804 Bug:
 805 Trying to look at not persistent struct file in the
 806 case of expanded truncate via sys_truncate64(path, length).
 807
 808 The fixup:
 809 . Don't look at struct file at truncate_file_body();
 810 . Add an inode *inode argument to the following
 811   functions to handle the case of not persistent
 812   struct file.
 813   . reiser4_write_extent();
 814   . reiser4_write_tail();
 815   . reiser4_update_extents();
 816 Other changes:
 817 . Add missed identifier in some asserts.
 818 . Comments cleanups.
 819
 820 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
 821 DESC
 822 reiser4: code cleanups
 823 EDESC
 824 From: Edward Shishkin <edward.shishkin@gmail.com>
 825
 826 Coding style fixups.
 827
 828 Signed-off-by: Dushan Tcholich <dusanc@gmail.com>
 829 Signed-off-by: Bartosz Szreder <cfiend@talent.edu.pl>
 830 Acked-by: Edward Shishkin <edward.shishkin@gmail.com>
 831 DESC
 832 reiser4-tree_lock-fixes
 833 EDESC
 834 From: Andrew Morton <akpm@linux-foundation.org>
 835
 836 Cc: Nick Piggin <nickpiggin@yahoo.com.au>
 837 DESC
 838 reiser4-tree_lock-fixes-fix
 839 EDESC
 840 From: Andrew Morton <akpm@linux-foundation.org>
 841
 842 Cc: Nick Piggin <nickpiggin@yahoo.com.au>
 843 DESC
 844 reiser4: fix handling ENOSPC cryptcompress
 845 EDESC
 846 From: Edward Shishkin <edward.shishkin@gmail.com>
 847
 848 Problem:
 849 Processes fall into infinite loop
 850 when running in no-space-left-on-device situation.
 851
 852 Fixup:
 853 Fixed leak of checkin_mutex in do_write_cryptcompress();
 854
 855 DESC
 856 reiser4: fix handling ENOSPC unix_file
 857 EDESC
 858 From: Edward Shishkin <edward.shishkin@gmail.com>
 859
 860 Problem:
 861 Processes fall into infinite loop
 862 when running in no-space-left-on-device situation.
 863
 864 Fixups:
 865 1. fixed leak of exclusive access in write_unix_file();
 866 2. fixed leak of inode's flag REISER4_PART_IN_CONV in tail2extent();
 867 DESC
 868 reiser4: fix kill_hook_internal
 869 EDESC
 870 From: Edward Shishkin <edward.shishkin@gmail.com>
 871
 872 Problem:
 873 Failed assertion (nikita-2754): child znode is not loaded
 874 when looking at its number of items in kill_hook_internal().
 875 Nobody cares about its loading.
 876
 877 Fixup:
 878 Added missed zload/zrelse of the child.
 879 DESC
 880 reiser4-semaphore-fix
 881 EDESC
 882 From: Andrew Morton <akpm@linux-foundation.org>
 883
 884 In file included from fs/reiser4/znode.h:23,
 885                  from fs/reiser4/tree.h:14,
 886                  from fs/reiser4/super.h:11,
 887                  from fs/reiser4/debug.c:28:
 888 include/asm/semaphore.h:1:2: warning: #warning Use linux/semaphore.h, not asm/semaphore.h
 889
 890 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 891 DESC
 892 slb-drop-kmem-cache-argument-from-constructor-reiser4
 893 EDESC
 894 From: Andrew Morton <akpm@linux-foundation.org>
 895 DESC
 896 reiser4-suid
 897 EDESC
 898 From: Andrew Morton <akpm@linux-foundation.org>
 899 DESC
 900 reiser4: compile warning cleanups
 901 EDESC
 902 From: "Ryan Hope" <rmh3093@gmail.com>
 903
 904 Clean up some reiser4 compile warnings:
 905
 906 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 907 DESC
 908 reiser4: use wake_up_process() instead of wake_up() when possible
 909 EDESC
 910 From: Ryan Hope <rmh3093@gmail.com>
 911
 912 This was item #6 on the todo list for reiser4 inclusion in mainline:
 913
 914 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 915 DESC
 916 reiser4: track upstream changes
 917 EDESC
 918 From: Andrew Morton <akpm@linux-foundation.org>
 919
 920 Hope it still works..
 921
 922 Cc: Edward Shishkin <edward.shishkin@gmail.com>
 923 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 924 ---
 925
 926  Documentation/Changes                         |   12
 927  Documentation/filesystems/reiser4.txt         |   75
 928  fs/Kconfig                                    |    1
 929  fs/Makefile                                   |    1
 930  fs/reiser4/Kconfig                            |   34
 931  fs/reiser4/Makefile                           |   98
 932  fs/reiser4/README                             |  128
 933  fs/reiser4/as_ops.c                           |  376 +
 934  fs/reiser4/block_alloc.c                      | 1142 ++++
 935  fs/reiser4/block_alloc.h                      |  177
 936  fs/reiser4/blocknrset.c                       |  371 +
 937  fs/reiser4/carry.c                            | 1398 +++++
 938  fs/reiser4/carry.h                            |  445 +
 939  fs/reiser4/carry_ops.c                        | 2132 +++++++++
 940  fs/reiser4/carry_ops.h                        |   43
 941  fs/reiser4/context.c                          |  289 +
 942  fs/reiser4/context.h                          |  228
 943  fs/reiser4/coord.c                            |  928 +++
 944  fs/reiser4/coord.h                            |  399 +
 945  fs/reiser4/debug.c                            |  308 +
 946  fs/reiser4/debug.h                            |  351 +
 947  fs/reiser4/dformat.h                          |   71
 948  fs/reiser4/dscale.c                           |  192
 949  fs/reiser4/dscale.h                           |   28
 950  fs/reiser4/entd.c                             |  335 +
 951  fs/reiser4/entd.h                             |   90
 952  fs/reiser4/eottl.c                            |  510 ++
 953  fs/reiser4/estimate.c                         |  129
 954  fs/reiser4/export_ops.c                       |  328 +
 955  fs/reiser4/flush.c                            | 3703 +++++++++++++++
 956  fs/reiser4/flush.h                            |  300 +
 957  fs/reiser4/flush_queue.c                      |  678 ++
 958  fs/reiser4/forward.h                          |  256 +
 959  fs/reiser4/fsdata.c                           |  804 +++
 960  fs/reiser4/fsdata.h                           |  205
 961  fs/reiser4/init_super.c                       |  751 +++
 962  fs/reiser4/inode.c                            |  711 +++
 963  fs/reiser4/inode.h                            |  453 +
 964  fs/reiser4/ioctl.h                            |   41
 965  fs/reiser4/jnode.c                            | 1923 ++++++++
 966  fs/reiser4/jnode.h                            |  704 ++
 967  fs/reiser4/kassign.c                          |  677 ++
 968  fs/reiser4/kassign.h                          |  111
 969  fs/reiser4/key.c                              |  138
 970  fs/reiser4/key.h                              |  392 +
 971  fs/reiser4/ktxnmgrd.c                         |  215
 972  fs/reiser4/ktxnmgrd.h                         |   52
 973  fs/reiser4/lock.c                             | 1237 +++++
 974  fs/reiser4/lock.h                             |  250 +
 975  fs/reiser4/oid.c                              |  141
 976  fs/reiser4/page_cache.c                       |  714 +++
 977  fs/reiser4/page_cache.h                       |   68
 978  fs/reiser4/plugin/Makefile                    |   26
 979  fs/reiser4/plugin/cluster.c                   |   72
 980  fs/reiser4/plugin/cluster.h                   |  410 +
 981  fs/reiser4/plugin/compress/Makefile           |    5
 982  fs/reiser4/plugin/compress/compress.c         |  367 +
 983  fs/reiser4/plugin/compress/compress.h         |   43
 984  fs/reiser4/plugin/compress/compress_mode.c    |  162
 985  fs/reiser4/plugin/crypto/cipher.c             |   37
 986  fs/reiser4/plugin/crypto/cipher.h             |   55
 987  fs/reiser4/plugin/crypto/digest.c             |   58
 988  fs/reiser4/plugin/dir/Makefile                |    5
 989  fs/reiser4/plugin/dir/dir.h                   |   36
 990  fs/reiser4/plugin/dir/hashed_dir.c            |   81
 991  fs/reiser4/plugin/dir/seekable_dir.c          |   46
 992  fs/reiser4/plugin/dir_plugin_common.c         |  873 +++
 993  fs/reiser4/plugin/disk_format/Makefile        |    5
 994  fs/reiser4/plugin/disk_format/disk_format.c   |   38
 995  fs/reiser4/plugin/disk_format/disk_format.h   |   27
 996  fs/reiser4/plugin/disk_format/disk_format40.c |  655 ++
 997  fs/reiser4/plugin/disk_format/disk_format40.h |  109
 998  fs/reiser4/plugin/fibration.c                 |  175
 999  fs/reiser4/plugin/fibration.h                 |   37
1000  fs/reiser4/plugin/file/Makefile               |    7
1001  fs/reiser4/plugin/file/cryptcompress.c        | 3775 ++++++++++++++++
1002  fs/reiser4/plugin/file/cryptcompress.h        |  616 ++
1003  fs/reiser4/plugin/file/file.c                 | 2728 +++++++++++
1004  fs/reiser4/plugin/file/file.h                 |  331 +
1005  fs/reiser4/plugin/file/file_conversion.c      |  689 ++
1006  fs/reiser4/plugin/file/symfile.c              |   87
1007  fs/reiser4/plugin/file/symlink.c              |   95
1008  fs/reiser4/plugin/file/tail_conversion.c      |  737 +++
1009  fs/reiser4/plugin/file_ops.c                  |  205
1010  fs/reiser4/plugin/file_ops_readdir.c          |  658 ++
1011  fs/reiser4/plugin/file_plugin_common.c        | 1011 ++++
1012  fs/reiser4/plugin/hash.c                      |  352 +
1013  fs/reiser4/plugin/inode_ops.c                 |  906 +++
1014  fs/reiser4/plugin/inode_ops_rename.c          |  925 +++
1015  fs/reiser4/plugin/item/Makefile               |   18
1016  fs/reiser4/plugin/item/acl.h                  |   66
1017  fs/reiser4/plugin/item/blackbox.c             |  142
1018  fs/reiser4/plugin/item/blackbox.h             |   33
1019  fs/reiser4/plugin/item/cde.c                  | 1008 ++++
1020  fs/reiser4/plugin/item/cde.h                  |   87
1021  fs/reiser4/plugin/item/ctail.c                | 1613 ++++++
1022  fs/reiser4/plugin/item/ctail.h                |  102
1023  fs/reiser4/plugin/item/extent.c               |  197
1024  fs/reiser4/plugin/item/extent.h               |  231
1025  fs/reiser4/plugin/item/extent_file_ops.c      | 1450 ++++++
1026  fs/reiser4/plugin/item/extent_flush_ops.c     | 1028 ++++
1027  fs/reiser4/plugin/item/extent_item_ops.c      |  889 +++
1028  fs/reiser4/plugin/item/internal.c             |  404 +
1029  fs/reiser4/plugin/item/internal.h             |   57
1030  fs/reiser4/plugin/item/item.c                 |  719 +++
1031  fs/reiser4/plugin/item/item.h                 |  398 +
1032  fs/reiser4/plugin/item/sde.c                  |  190
1033  fs/reiser4/plugin/item/sde.h                  |   66
1034  fs/reiser4/plugin/item/static_stat.c          | 1107 ++++
1035  fs/reiser4/plugin/item/static_stat.h          |  224
1036  fs/reiser4/plugin/item/tail.c                 |  807 +++
1037  fs/reiser4/plugin/item/tail.h                 |   58
1038  fs/reiser4/plugin/node/Makefile               |    5
1039  fs/reiser4/plugin/node/node.c                 |  131
1040  fs/reiser4/plugin/node/node.h                 |  272 +
1041  fs/reiser4/plugin/node/node40.c               | 2924 ++++++++++++
1042  fs/reiser4/plugin/node/node40.h               |  125
1043  fs/reiser4/plugin/object.c                    |  531 ++
1044  fs/reiser4/plugin/object.h                    |  120
1045  fs/reiser4/plugin/plugin.c                    |  560 ++
1046  fs/reiser4/plugin/plugin.h                    |  942 +++
1047  fs/reiser4/plugin/plugin_header.h             |  157
1048  fs/reiser4/plugin/plugin_set.c                |  380 +
1049  fs/reiser4/plugin/plugin_set.h                |   78
1050  fs/reiser4/plugin/security/Makefile           |    4
1051  fs/reiser4/plugin/security/perm.c             |   33
1052  fs/reiser4/plugin/security/perm.h             |   38
1053  fs/reiser4/plugin/space/Makefile              |    4
1054  fs/reiser4/plugin/space/bitmap.c              | 1585 ++++++
1055  fs/reiser4/plugin/space/bitmap.h              |   47
1056  fs/reiser4/plugin/space/space_allocator.h     |   80
1057  fs/reiser4/plugin/tail_policy.c               |  113
1058  fs/reiser4/pool.c                             |  231
1059  fs/reiser4/pool.h                             |   57
1060  fs/reiser4/readahead.c                        |  140
1061  fs/reiser4/readahead.h                        |   52
1062  fs/reiser4/reiser4.h                          |  270 +
1063  fs/reiser4/safe_link.c                        |  354 +
1064  fs/reiser4/safe_link.h                        |   29
1065  fs/reiser4/seal.c                             |  218
1066  fs/reiser4/seal.h                             |   49
1067  fs/reiser4/search.c                           | 1612 ++++++
1068  fs/reiser4/status_flags.c                     |  174
1069  fs/reiser4/status_flags.h                     |   47
1070  fs/reiser4/super.c                            |  306 +
1071  fs/reiser4/super.h                            |  466 +
1072  fs/reiser4/super_ops.c                        |  725 +++
1073  fs/reiser4/tap.c                              |  376 +
1074  fs/reiser4/tap.h                              |   70
1075  fs/reiser4/tree.c                             | 1878 +++++++
1076  fs/reiser4/tree.h                             |  577 ++
1077  fs/reiser4/tree_mod.c                         |  386 +
1078  fs/reiser4/tree_mod.h                         |   29
1079  fs/reiser4/tree_walk.c                        |  927 +++
1080  fs/reiser4/tree_walk.h                        |  125
1081  fs/reiser4/txnmgr.c                           | 3164 +++++++++++++
1082  fs/reiser4/txnmgr.h                           |  701 ++
1083  fs/reiser4/type_safe_hash.h                   |  320 +
1084  fs/reiser4/vfs_ops.c                          |  259 +
1085  fs/reiser4/vfs_ops.h                          |   53
1086  fs/reiser4/wander.c                           | 1797 +++++++
1087  fs/reiser4/wander.h                           |  135
1088  fs/reiser4/writeout.h                         |   21
1089  fs/reiser4/znode.c                            | 1029 ++++
1090  fs/reiser4/znode.h                            |  434 +
1091  165 files changed, 77621 insertions(+)
1092
1093 diff -puN Documentation/Changes~reiser4 Documentation/Changes
1094 --- a/Documentation/Changes~reiser4
1095 +++ a/Documentation/Changes
1096 @@ -36,6 +36,7 @@ o  module-init-tools      0.9.10
1097  o  e2fsprogs              1.41.4                  # e2fsck -V
1098  o  jfsutils               1.1.3                   # fsck.jfs -V
1099  o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
1100 +o  reiser4progs           1.0.0                   # fsck.reiser4 -V
1101  o  xfsprogs               2.6.0                   # xfs_db -V
1102  o  squashfs-tools         4.0                     # mksquashfs -version
1103  o  btrfs-progs            0.18                    # btrfsck
1104 @@ -155,6 +156,13 @@ The reiserfsprogs package should be used
1105  versions of mkreiserfs, resize_reiserfs, debugreiserfs and
1106  reiserfsck. These utils work on both i386 and alpha platforms.
1107
1108 +Reiser4progs
1109 +------------
1110 +
1111 +The reiser4progs package contains utilities for the reiser4 file system.
1112 +Detailed instructions are provided in the README file located at:
1113 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
1114 +
1115  Xfsprogs
1116  --------
1117
1118 @@ -343,6 +351,10 @@ Reiserfsprogs
1119  -------------
1120  o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
1121
1122 +Reiser4progs
1123 +------------
1124 +o  <ftp://ftp.namesys.com/pub/reiser4progs/>
1125 +
1126  Xfsprogs
1127  --------
1128  o  <ftp://oss.sgi.com/projects/xfs/download/>
1129 diff -puN /dev/null Documentation/filesystems/reiser4.txt
1130 --- /dev/null
1131 +++ a/Documentation/filesystems/reiser4.txt
1132 @@ -0,0 +1,75 @@
1133 +Reiser4 filesystem
1134 +==================
1135 +Reiser4 is a file system based on dancing tree algorithms, and is
1136 +described at http://www.namesys.com
1137 +
1138 +
1139 +References
1140 +==========
1141 +web page               http://namesys.com/v4/v4.html
1142 +source code            ftp://ftp.namesys.com/pub/reiser4-for-2.6/
1143 +userland tools         ftp://ftp.namesys.com/pub/reiser4progs/
1144 +install page           http://www.namesys.com/install_v4.html
1145 +
1146 +Compile options
1147 +===============
1148 +Enable reiser4 debug mode
1149 +       This checks everything imaginable while reiser4
1150 +       runs
1151 +
1152 +Mount options
1153 +=============
1154 +tmgr.atom_max_size=N
1155 +       Atoms containing more than N blocks will be forced to commit.
1156 +       N is decimal.
1157 +       Default is nr_free_pagecache_pages() / 2 at mount time.
1158 +
1159 +tmgr.atom_max_age=N
1160 +       Atoms older than N seconds will be forced to commit. N is decimal.
1161 +       Default is 600.
1162 +
1163 +tmgr.atom_max_flushers=N
1164 +       Limit of concurrent flushers for one atom. 0 means no limit.
1165 +       Default is 0.
1166 +
1167 +tree.cbk_cache.nr_slots=N
1168 +       Number of slots in the cbk cache.
1169 +
1170 +flush.relocate_threshold=N
1171 +       If flush finds more than N adjacent dirty leaf-level blocks it
1172 +       will force them to be relocated.
1173 +       Default is 64.
1174 +
1175 +flush.relocate_distance=N
1176 +       If flush can find a block allocation no farther than N from
1177 +       the preceder, it will relocate to that position.
1178 +       Default is 64.
1179 +
1180 +flush.scan_maxnodes=N
1181 +       The maximum number of nodes to scan left on a level during
1182 +       flush.
1183 +       Default is 10000.
1184 +
1185 +optimal_io_size=N
1186 +       Preferred IO size. This value is used to set st_blksize of
1187 +       struct stat.
1188 +       Default is 65536.
1189 +
1190 +bsdgroups
1191 +       Turn on BSD-style gid assignment.
1192 +
1193 +32bittimes
1194 +       By default files in reiser4 have 64 bit timestamps. Files
1195 +       created when filesystem is mounted with 32bittimes mount
1196 +       option will get 32 bit timestamps.
1197 +
1198 +mtflush
1199 +       Turn off concurrent flushing.
1200 +
1201 +nopseudo
1202 +       Disable pseudo files support. See
1203 +       http://namesys.com/v4/pseudo.html for more about pseudo files.
1204 +
1205 +dont_load_bitmap
1206 +       Don't load all bitmap blocks at mount time. This is useful for
1207 +       machines with tiny RAM and large disks.
1208 diff -puN fs/Kconfig~reiser4 fs/Kconfig
1209 --- a/fs/Kconfig~reiser4
1210 +++ a/fs/Kconfig
1211 @@ -28,6 +28,7 @@ config FS_MBCACHE
1212         default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
1213
1214  source "fs/reiserfs/Kconfig"
1215 +source "fs/reiser4/Kconfig"
1216  source "fs/jfs/Kconfig"
1217
1218  config FS_POSIX_ACL
1219 diff -puN fs/Makefile~reiser4 fs/Makefile
1220 --- a/fs/Makefile~reiser4
1221 +++ a/fs/Makefile
1222 @@ -65,6 +65,7 @@ obj-$(CONFIG_DLM)             += dlm/
1223  # Do not add any filesystems before this line
1224  obj-$(CONFIG_FSCACHE)          += fscache/
1225  obj-$(CONFIG_REISERFS_FS)      += reiserfs/
1226 +obj-$(CONFIG_REISER4_FS)       += reiser4/
1227  obj-$(CONFIG_EXT3_FS)          += ext3/ # Before ext2 so root fs can be ext3
1228  obj-$(CONFIG_EXT2_FS)          += ext2/
1229  # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
1230 diff -puN /dev/null fs/reiser4/Kconfig
1231 --- /dev/null
1232 +++ a/fs/reiser4/Kconfig
1233 @@ -0,0 +1,34 @@
1234 +config REISER4_FS
1235 +       tristate "Reiser4 (EXPERIMENTAL)"
1236 +       depends on EXPERIMENTAL
1237 +       select ZLIB_INFLATE
1238 +       select ZLIB_DEFLATE
1239 +       select LZO_COMPRESS
1240 +       select LZO_DECOMPRESS
1241 +       select CRYPTO
1242 +       help
1243 +         Reiser4 is a filesystem that performs all filesystem operations
1244 +         as atomic transactions, which means that it either performs a
1245 +         write, or it does not, and in the event of a crash it does not
1246 +         partially perform it or corrupt it.
1247 +
1248 +         It stores files in dancing trees, which are like balanced trees but
1249 +         faster.  It packs small files together so that they share blocks
1250 +         without wasting space.  This means you can use it to store really
1251 +         small files.  It also means that it saves you disk space.  It avoids
1252 +         hassling you with anachronisms like having a maximum number of
1253 +         inodes, and wasting space if you use less than that number.
1254 +
1255 +         Reiser4 is a distinct filesystem type from reiserfs (V3).
1256 +         It's therefore not possible to use reiserfs file systems
1257 +         with reiser4.
1258 +
1259 +         To learn more about reiser4, go to http://www.namesys.com
1260 +
1261 +config REISER4_DEBUG
1262 +       bool "Enable reiser4 debug mode"
1263 +       depends on REISER4_FS
1264 +       help
1265 +         Don't use this unless you are debugging reiser4.
1266 +
1267 +         If unsure, say N.
1268 diff -puN /dev/null fs/reiser4/Makefile
1269 --- /dev/null
1270 +++ a/fs/reiser4/Makefile
1271 @@ -0,0 +1,98 @@
1272 +#
1273 +# reiser4/Makefile
1274 +#
1275 +
1276 +obj-$(CONFIG_REISER4_FS) += reiser4.o
1277 +
1278 +reiser4-y := \
1279 +                  debug.o \
1280 +                  jnode.o \
1281 +                  znode.o \
1282 +                  key.o \
1283 +                  pool.o \
1284 +                  tree_mod.o \
1285 +                  estimate.o \
1286 +                  carry.o \
1287 +                  carry_ops.o \
1288 +                  lock.o \
1289 +                  tree.o \
1290 +                  context.o \
1291 +                  tap.o \
1292 +                  coord.o \
1293 +                  block_alloc.o \
1294 +                  txnmgr.o \
1295 +                  kassign.o \
1296 +                  flush.o \
1297 +                  wander.o \
1298 +                  eottl.o \
1299 +                  search.o \
1300 +                  page_cache.o \
1301 +                  seal.o \
1302 +                  dscale.o \
1303 +                  flush_queue.o \
1304 +                  ktxnmgrd.o \
1305 +                  blocknrset.o \
1306 +                  super.o \
1307 +                  super_ops.o \
1308 +                  fsdata.o \
1309 +                  export_ops.o \
1310 +                  oid.o \
1311 +                  tree_walk.o \
1312 +                  inode.o \
1313 +                  vfs_ops.o \
1314 +                  as_ops.o \
1315 +                  entd.o\
1316 +                  readahead.o \
1317 +                  status_flags.o \
1318 +                  init_super.o \
1319 +                  safe_link.o \
1320 +           \
1321 +                  plugin/plugin.o \
1322 +                  plugin/plugin_set.o \
1323 +                  plugin/node/node.o \
1324 +                  plugin/object.o \
1325 +                  plugin/cluster.o \
1326 +                  plugin/inode_ops.o \
1327 +                  plugin/inode_ops_rename.o \
1328 +                  plugin/file_ops.o \
1329 +                  plugin/file_ops_readdir.o \
1330 +                  plugin/file_plugin_common.o \
1331 +                  plugin/file/file.o \
1332 +                  plugin/file/tail_conversion.o \
1333 +                  plugin/file/file_conversion.o \
1334 +                  plugin/file/symlink.o \
1335 +                  plugin/file/cryptcompress.o \
1336 +                  plugin/dir_plugin_common.o \
1337 +                  plugin/dir/hashed_dir.o \
1338 +                  plugin/dir/seekable_dir.o \
1339 +                  plugin/node/node40.o \
1340 +           \
1341 +                  plugin/crypto/cipher.o \
1342 +                  plugin/crypto/digest.o \
1343 +           \
1344 +                  plugin/compress/compress.o \
1345 +                  plugin/compress/compress_mode.o \
1346 +           \
1347 +                  plugin/item/static_stat.o \
1348 +                  plugin/item/sde.o \
1349 +                  plugin/item/cde.o \
1350 +                  plugin/item/blackbox.o \
1351 +                  plugin/item/internal.o \
1352 +                  plugin/item/tail.o \
1353 +                  plugin/item/ctail.o \
1354 +                  plugin/item/extent.o \
1355 +                  plugin/item/extent_item_ops.o \
1356 +                  plugin/item/extent_file_ops.o \
1357 +                  plugin/item/extent_flush_ops.o \
1358 +           \
1359 +                  plugin/hash.o \
1360 +                  plugin/fibration.o \
1361 +                  plugin/tail_policy.o \
1362 +                  plugin/item/item.o \
1363 +           \
1364 +                  plugin/security/perm.o \
1365 +                  plugin/space/bitmap.o \
1366 +           \
1367 +                  plugin/disk_format/disk_format40.o \
1368 +                  plugin/disk_format/disk_format.o
1369 +
1370 diff -puN /dev/null fs/reiser4/README
1371 --- /dev/null
1372 +++ a/fs/reiser4/README
1373 @@ -0,0 +1,128 @@
1374 +[LICENSING]
1375 +
1376 +Reiser4 is hereby licensed under the GNU General
1377 +Public License version 2.
1378 +
1379 +Source code files that contain the phrase "licensing governed by
1380 +reiser4/README" are "governed files" throughout this file.  Governed
1381 +files are licensed under the GPL.  The portions of them owned by Hans
1382 +Reiser, or authorized to be licensed by him, have been in the past,
1383 +and likely will be in the future, licensed to other parties under
1384 +other licenses.  If you add your code to governed files, and don't
1385 +want it to be owned by Hans Reiser, put your copyright label on that
1386 +code so the poor blight and his customers can keep things straight.
1387 +All portions of governed files not labeled otherwise are owned by Hans
1388 +Reiser, and by adding your code to it, widely distributing it to
1389 +others or sending us a patch, and leaving the sentence in stating that
1390 +licensing is governed by the statement in this file, you accept this.
1391 +It will be a kindness if you identify whether Hans Reiser is allowed
1392 +to license code labeled as owned by you on your behalf other than
1393 +under the GPL, because he wants to know if it is okay to do so and put
1394 +a check in the mail to you (for non-trivial improvements) when he
1395 +makes his next sale.  He makes no guarantees as to the amount if any,
1396 +though he feels motivated to motivate contributors, and you can surely
1397 +discuss this with him before or after contributing.  You have the
1398 +right to decline to allow him to license your code contribution other
1399 +than under the GPL.
1400 +
1401 +Further licensing options are available for commercial and/or other
1402 +interests directly from Hans Reiser: reiser@namesys.com.  If you interpret
1403 +the GPL as not allowing those additional licensing options, you read
1404 +it wrongly, and Richard Stallman agrees with me, when carefully read
1405 +you can see that those restrictions on additional terms do not apply
1406 +to the owner of the copyright, and my interpretation of this shall
1407 +govern for this license.
1408 +
1409 +[END LICENSING]
1410 +
1411 +Reiser4 is a file system based on dancing tree algorithms, and is
1412 +described at http://www.namesys.com
1413 +
1414 +mkfs.reiser4 and other utilities are on our webpage or wherever your
1415 +Linux provider put them.  You really want to be running the latest
1416 +version off the website if you use fsck.
1417 +
1418 +Yes, if you update your reiser4 kernel module you do have to
1419 +recompile your kernel, most of the time.  The errors you get will be
1420 +quite cryptic if you forget to do so.
1421 +
1422 +Hideous Commercial Pitch: Spread your development costs across other OS
1423 +vendors.  Select from the best in the world, not the best in your
1424 +building, by buying from third party OS component suppliers.  Leverage
1425 +the software component development power of the internet.  Be the most
1426 +aggressive in taking advantage of the commercial possibilities of
1427 +decentralized internet development, and add value through your branded
1428 +integration that you sell as an operating system.  Let your competitors
1429 +be the ones to compete against the entire internet by themselves.  Be
1430 +hip, get with the new economic trend, before your competitors do.  Send
1431 +email to reiser@namesys.com
1432 +
1433 +Hans Reiser was the primary architect of Reiser4, but a whole team
1434 +chipped their ideas in.  He invested everything he had into Namesys
1435 +for 5.5 dark years of no money before Reiser3 finally started to work well
1436 +enough to bring in money.  He owns the copyright.
1437 +
1438 +DARPA was the primary sponsor of Reiser4.  DARPA does not endorse
1439 +Reiser4, it merely sponsors it.  DARPA is, in solely Hans's personal
1440 +opinion, unique in its willingness to invest into things more
1441 +theoretical than the VC community can readily understand, and more
1442 +longterm than allows them to be sure that they will be the ones to
1443 +extract the economic benefits from.  DARPA also integrated us into a
1444 +security community that transformed our security worldview.
1445 +
1446 +Vladimir Saveliev is our lead programmer, with us from the beginning,
1447 +and he worked long hours writing the cleanest code.  This is why he is
1448 +now the lead programmer after years of commitment to our work.  He
1449 +always made the effort to be the best he could be, and to make his
1450 +code the best that it could be.  What resulted was quite remarkable. I
1451 +don't think that money can ever motivate someone to work the way he
1452 +did, he is one of the most selfless men I know.
1453 +
1454 +Alexander Lyamin was our sysadmin, and helped to educate us in
1455 +security issues.  Moscow State University and IMT were very generous
1456 +in the internet access they provided us, and in lots of other little
1457 +ways that a generous institution can be.
1458 +
1459 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
1460 +locking code, the block allocator, and finished the flushing code.
1461 +His code is always crystal clean and well structured.
1462 +
1463 +Nikita Danilov wrote the core of the balancing code, the core of the
1464 +plugins code, and the directory code.  He worked a steady pace of long
1465 +hours that produced a whole lot of well abstracted code.  He is our
1466 +senior computer scientist.
1467 +
1468 +Vladimir Demidov wrote the parser.  Writing an in kernel parser is
1469 +something very few persons have the skills for, and it is thanks to
1470 +him that we can say that the parser is really not so big compared to
1471 +various bits of our other code, and making a parser work in the kernel
1472 +was not so complicated as everyone would imagine mainly because it was
1473 +him doing it...
1474 +
1475 +Joshua McDonald wrote the transaction manager, and the flush code.
1476 +The flush code unexpectedly turned out to be extremely hairy for reasons
1477 +you can read about on our web page, and he did a great job on an
1478 +extremely difficult task.
1479 +
1480 +Nina Reiser handled our accounting, government relations, and much
1481 +more.
1482 +
1483 +Ramon Reiser developed our website.
1484 +
1485 +Beverly Palmer drew our graphics.
1486 +
1487 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
1488 +and worked with Umka on developing libreiser4 and userspace plugins.
1489 +
1490 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
1491 +userspace tools (reiser4progs).
1492 +
1493 +Oleg Drokin (aka Green) is the release manager who fixes everything.
1494 +It is so nice to have someone like that on the team.  He (plus Chris
1495 +and Jeff) make it possible for the entire rest of the Namesys team to
1496 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also.  It
1497 +is just amazing to watch his talent for spotting bugs in action.
1498 +
1499 +Edward Shishkin wrote the cryptcompress file plugin (which manages files
1500 +built of encrypted and(or) compressed bodies) and other plugins related
1501 +to transparent encryption and compression support.
1502 diff -puN /dev/null fs/reiser4/as_ops.c
1503 --- /dev/null
1504 +++ a/fs/reiser4/as_ops.c
1505 @@ -0,0 +1,376 @@
1506 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
1507 +
1508 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
1509 +
1510 +#include "forward.h"
1511 +#include "debug.h"
1512 +#include "dformat.h"
1513 +#include "coord.h"
1514 +#include "plugin/item/item.h"
1515 +#include "plugin/file/file.h"
1516 +#include "plugin/security/perm.h"
1517 +#include "plugin/disk_format/disk_format.h"
1518 +#include "plugin/plugin.h"
1519 +#include "plugin/plugin_set.h"
1520 +#include "plugin/object.h"
1521 +#include "txnmgr.h"
1522 +#include "jnode.h"
1523 +#include "znode.h"
1524 +#include "block_alloc.h"
1525 +#include "tree.h"
1526 +#include "vfs_ops.h"
1527 +#include "inode.h"
1528 +#include "page_cache.h"
1529 +#include "ktxnmgrd.h"
1530 +#include "super.h"
1531 +#include "reiser4.h"
1532 +#include "entd.h"
1533 +
1534 +#include <linux/profile.h>
1535 +#include <linux/types.h>
1536 +#include <linux/mount.h>
1537 +#include <linux/vfs.h>
1538 +#include <linux/mm.h>
1539 +#include <linux/buffer_head.h>
1540 +#include <linux/dcache.h>
1541 +#include <linux/list.h>
1542 +#include <linux/pagemap.h>
1543 +#include <linux/slab.h>
1544 +#include <linux/seq_file.h>
1545 +#include <linux/init.h>
1546 +#include <linux/module.h>
1547 +#include <linux/writeback.h>
1548 +#include <linux/backing-dev.h>
1549 +#include <linux/quotaops.h>
1550 +#include <linux/security.h>
1551 +
1552 +/* address space operations */
1553 +
1554 +/**
1555 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
1556 + * @page: page to be dirtied
1557 + *
1558 + * Operation of struct address_space_operations. This implementation is used by
1559 + * unix and cryptcompress file plugins.
1560 + *
1561 + * This is called when reiser4 page gets dirtied outside of reiser4, for
1562 + * example, when dirty bit is moved from pte to physical page.
1563 + *
1564 + * Tags page in the mapping's page tree with special tag so that it is possible
1565 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
1566 + * capturing by an atom) later because it can not be done in the contexts where
1567 + * set_page_dirty is called.
1568 + */
1569 +int reiser4_set_page_dirty(struct page *page)
1570 +{
1571 +       /* this page can be unformatted only */
1572 +       assert("vs-1734", (page->mapping &&
1573 +                          page->mapping->host &&
1574 +                          reiser4_get_super_fake(page->mapping->host->i_sb) !=
1575 +                          page->mapping->host
1576 +                          && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
1577 +                          page->mapping->host
1578 +                          && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
1579 +                          page->mapping->host));
1580 +
1581 +       if (!TestSetPageDirty(page)) {
1582 +               struct address_space *mapping = page->mapping;
1583 +
1584 +               if (mapping) {
1585 +                       spin_lock_irq(&mapping->tree_lock);
1586 +
1587 +                       /* check for race with truncate */
1588 +                       if (page->mapping) {
1589 +                               assert("vs-1652", page->mapping == mapping);
1590 +                               if (mapping_cap_account_dirty(mapping))
1591 +                                       inc_zone_page_state(page,
1592 +                                                       NR_FILE_DIRTY);
1593 +                               radix_tree_tag_set(&mapping->page_tree,
1594 +                                                  page->index,
1595 +                                                  PAGECACHE_TAG_REISER4_MOVED);
1596 +                       }
1597 +                       spin_unlock_irq(&mapping->tree_lock);
1598 +                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1599 +               }
1600 +       }
1601 +       return 0;
1602 +}
1603 +
1604 +/* ->invalidatepage method for reiser4 */
1605 +
1606 +/*
1607 + * this is called for each truncated page from
1608 + * truncate_inode_pages()->truncate_{complete,partial}_page().
1609 + *
1610 + * At the moment of call, page is under lock, and outstanding io (if any) has
1611 + * completed.
1612 + */
1613 +
1614 +/**
1615 + * reiser4_invalidatepage - ->invalidatepage method for reiser4
1616 + * @page: page to invalidate
1617 + * @offset: starting offset for partial invalidation
1618 + * Does nothing for pages of the fake (super, cc, bitmap) inodes.
1619 + */
1620 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
1621 +{
1622 +       int ret = 0;
1623 +       reiser4_context *ctx;
1624 +       struct inode *inode;
1625 +       jnode *node;
1626 +
1627 +       /*
1628 +        * This is called to truncate file's page.
1629 +        *
1630 +        * Originally, reiser4 implemented truncate in a standard way
1631 +        * (vmtruncate() calls ->invalidatepage() on all truncated pages
1632 +        * first, then file system ->truncate() call-back is invoked).
1633 +        *
1634 +        * This led to the problem when ->invalidatepage() was called on a
1635 +        * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
1636 +        * process. That is, truncate was bypassing transactions. To avoid
1637 +        * this, try_capture_page_to_invalidate() call was added here.
1638 +        *
1639 +        * After many troubles with vmtruncate() based truncate (including
1640 +        * races with flush, tail conversion, etc.) it was re-written in the
1641 +        * top-to-bottom style: items are killed in reiser4_cut_tree_object()
1642 +        * and pages belonging to extent are invalidated in kill_hook_extent().
1643 +        * So probably now additional call to capture is not needed here.
1644 +        */
1645 +
1646 +       assert("nikita-3137", PageLocked(page));
1647 +       assert("nikita-3138", !PageWriteback(page));
1648 +       inode = page->mapping->host;
1649 +
1650 +       /*
1651 +        * ->invalidatepage() should only be called for the unformatted
1652 +        * jnodes. Destruction of all other types of jnodes is performed
1653 +        * separately. But, during some corner cases (like handling errors
1654 +        * during mount) it is simpler to let ->invalidatepage to be called on
1655 +        * them. Check for this, and do nothing.
1656 +        */
1657 +       if (reiser4_get_super_fake(inode->i_sb) == inode)
1658 +               return;
1659 +       if (reiser4_get_cc_fake(inode->i_sb) == inode)
1660 +               return;
1661 +       if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
1662 +               return;
1663 +       assert("vs-1426", PagePrivate(page));
1664 +       assert("vs-1427",
1665 +              page->mapping == jnode_get_mapping(jnode_by_page(page)));
1666 +       assert("", jprivate(page) != NULL);
1667 +       assert("", ergo(inode_file_plugin(inode) !=
1668 +                       file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
1669 +                       offset == 0));
1670 +
1671 +       ctx = reiser4_init_context(inode->i_sb);
1672 +       if (IS_ERR(ctx))
1673 +               return;
1674 +
1675 +       node = jprivate(page);
1676 +       spin_lock_jnode(node);
1677 +       if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
1678 +                         (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
1679 +               /* there is no need to capture */
1680 +               jref(node);
1681 +               JF_SET(node, JNODE_HEARD_BANSHEE);
1682 +               page_clear_jnode(page, node);
1683 +               reiser4_uncapture_jnode(node);
1684 +               unhash_unformatted_jnode(node);
1685 +               jput(node);
1686 +               reiser4_exit_context(ctx);
1687 +               return;
1688 +       }
1689 +       spin_unlock_jnode(node);
1690 +
1691 +       /* capture page being truncated. */
1692 +       ret = try_capture_page_to_invalidate(page);
1693 +       if (ret != 0)
1694 +               warning("nikita-3141", "Cannot capture: %i", ret);
1695 +
1696 +       if (offset == 0) {
1697 +               /* remove jnode from transaction and detach it from page. */
1698 +               jref(node);
1699 +               JF_SET(node, JNODE_HEARD_BANSHEE);
1700 +               /* page cannot be detached from jnode concurrently, because it
1701 +                * is locked */
1702 +               reiser4_uncapture_page(page);
1703 +
1704 +               /* this detaches page from jnode, so that jdelete will not try
1705 +                * to lock page which is already locked */
1706 +               spin_lock_jnode(node);
1707 +               page_clear_jnode(page, node);
1708 +               spin_unlock_jnode(node);
1709 +               unhash_unformatted_jnode(node);
1710 +
1711 +               jput(node);
1712 +       }
1713 +
1714 +       reiser4_exit_context(ctx);
1715 +}
1716 +
1717 +/* Helper for reiser4_releasepage(), called with ->guard and ->load held.
1718 + * Returns true if jnode can be detached from its page and page released. */
1719 +int jnode_is_releasable(jnode * node/* node to check */)
1720 +{
1721 +       assert("nikita-2781", node != NULL);
1722 +       assert_spin_locked(&(node->guard));
1723 +       assert_spin_locked(&(node->load));
1724 +
1725 +       /* if some thread is currently using the jnode page, the latter cannot
1726 +        * be detached */
1727 +       if (atomic_read(&node->d_count) != 0)
1728 +               return 0;
1729 +
1730 +       assert("vs-1214", !jnode_is_loaded(node));
1731 +
1732 +       /*
1733 +        * can only release page if real block number is assigned to it. Simple
1734 +        * check for ->atom wouldn't do, because it is possible for node to be
1735 +        * clean, not in atom yet, and still having fake block number. For
1736 +        * example, node just created in jinit_new().
1737 +        */
1738 +       if (reiser4_blocknr_is_fake(jnode_get_block(node)))
1739 +               return 0;
1740 +
1741 +       /*
1742 +        * pages prepared for write can not be released anyway, so avoid
1743 +        * detaching jnode from the page
1744 +        */
1745 +       if (JF_ISSET(node, JNODE_WRITE_PREPARED))
1746 +               return 0;
1747 +
1748 +       /*
1749 +        * dirty jnode cannot be released. It can however be submitted to disk
1750 +        * as part of early flushing, but only after getting flush-prepped.
1751 +        */
1752 +       if (JF_ISSET(node, JNODE_DIRTY))
1753 +               return 0;
1754 +
1755 +       /* overwrite set is only written by log writer. */
1756 +       if (JF_ISSET(node, JNODE_OVRWR))
1757 +               return 0;
1758 +
1759 +       /* jnode is already under writeback */
1760 +       if (JF_ISSET(node, JNODE_WRITEBACK))
1761 +               return 0;
1762 +
1763 +       /* don't flush bitmaps or journal records */
1764 +       if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
1765 +               return 0;
1766 +
1767 +       return 1;
1768 +}
1769 +
1770 +/*
1771 + * ->releasepage method for reiser4
1772 + *
1773 + * This is called by VM scanner when it comes across clean page.  What we have
1774 + * to do here is to check whether page can really be released (freed that is)
1775 + * and if so, detach jnode from it and remove page from the page cache.
1776 + * Check for releasability is done by jnode_is_releasable() function.
1777 + * Returns 1 if jnode was detached from the page, 0 if the page is kept.
1778 + */
1779 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
1780 +{
1781 +       jnode *node;
1782 +
1783 +       assert("nikita-2257", PagePrivate(page));
1784 +       assert("nikita-2259", PageLocked(page));
1785 +       assert("nikita-2892", !PageWriteback(page));
1786 +       assert("nikita-3019", reiser4_schedulable());
1787 +
1788 +       /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
1789 +          is not clear what to do in this case. A lot of deadlocks seems be
1790 +          possible. */
1791 +
1792 +       node = jnode_by_page(page);
1793 +       assert("nikita-2258", node != NULL);
1794 +       assert("reiser4-4", page->mapping != NULL);
1795 +       assert("reiser4-5", page->mapping->host != NULL);
1796 +
1797 +       if (PageDirty(page))
1798 +               return 0;
1799 +
1800 +       /* extra page reference is used by reiser4 to protect
1801 +        * jnode<->page link from this ->releasepage(). */
1802 +       if (page_count(page) > 3)
1803 +               return 0;
1804 +
1805 +       /* jnode_is_releasable() needs jnode lock, because it looks at the jnode
1806 +        * fields, and we need jload_lock here to avoid races with jload(). */
1807 +       spin_lock_jnode(node);
1808 +       spin_lock(&(node->load));
1809 +       if (jnode_is_releasable(node)) {
1810 +               struct address_space *mapping;
1811 +
1812 +               mapping = page->mapping;
1813 +               jref(node);
1814 +               /* there is no need to synchronize against
1815 +                * jnode_extent_write() here, because pages seen by
1816 +                * jnode_extent_write() are not releasable. */
1817 +               page_clear_jnode(page, node);
1818 +               spin_unlock(&(node->load));
1819 +               spin_unlock_jnode(node);
1820 +
1821 +               /* we are under memory pressure so release jnode also. */
1822 +               jput(node);
1823 +
1824 +               return 1;
1825 +       } else {
1826 +               spin_unlock(&(node->load));
1827 +               spin_unlock_jnode(node);
1828 +               assert("nikita-3020", reiser4_schedulable());
1829 +               return 0;
1830 +       }
1831 +}
1832 +
1833 +int reiser4_readpage(struct file *file, struct page *page)
1834 +{
1835 +       assert("edward-1533", PageLocked(page));
1836 +       assert("edward-1534", !PageUptodate(page));
1837 +       assert("edward-1535", page->mapping && page->mapping->host);
1838 +
1839 +       return inode_file_plugin(page->mapping->host)->readpage(file, page);
1840 +}
1841 +
1842 +int reiser4_readpages(struct file *file, struct address_space *mapping,
1843 +                     struct list_head *pages, unsigned nr_pages)
1844 +{
1845 +       return inode_file_plugin(mapping->host)->readpages(file, mapping,
1846 +                                                          pages, nr_pages);
1847 +}
1848 +
1849 +int reiser4_writepages(struct address_space *mapping,
1850 +                      struct writeback_control *wbc)
1851 +{
1852 +       return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
1853 +}
1854 +
1855 +int reiser4_prepare_write(struct file *file, struct page *page,
1856 +                         unsigned from, unsigned to)
1857 +{
1858 +       return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file,
1859 +                                                                        page,
1860 +                                                                        from,
1861 +                                                                        to);
1862 +}
1863 +
1864 +int reiser4_commit_write(struct file *file, struct page *page,
1865 +                        unsigned from, unsigned to)
1866 +{
1867 +       return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file,
1868 +                                                                       page,
1869 +                                                                       from,
1870 +                                                                       to);
1871 +}
1872 +
1873 +/* Make Linus happy.
1874 +   Local variables:
1875 +   c-indentation-style: "K&R"
1876 +   mode-name: "LC"
1877 +   c-basic-offset: 8
1878 +   tab-width: 8
1879 +   fill-column: 120
1880 +   End:
1881 +*/
1882 diff -puN /dev/null fs/reiser4/block_alloc.c
1883 --- /dev/null
1884 +++ a/fs/reiser4/block_alloc.c
1885 @@ -0,0 +1,1142 @@
1886 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
1887 +reiser4/README */
1888 +
1889 +#include "debug.h"
1890 +#include "dformat.h"
1891 +#include "plugin/plugin.h"
1892 +#include "txnmgr.h"
1893 +#include "znode.h"
1894 +#include "block_alloc.h"
1895 +#include "tree.h"
1896 +#include "super.h"
1897 +
1898 +#include <linux/types.h>       /* for __u??  */
1899 +#include <linux/fs.h>          /* for struct super_block  */
1900 +#include <linux/spinlock.h>
1901 +
1902 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
1903 +
1904 +/* We need to be able to reserve enough disk space to ensure that an atomic
1905 +   operation will have enough disk space to flush (see flush.c and
1906 +   http://namesys.com/v4/v4.html) and commit it once it is started.
1907 +
1908 +   In our design a call for reserving disk space may fail but not an actual
1909 +   block allocation.
1910 +
1911 +   All free blocks, already allocated blocks, and all kinds of reserved blocks
1912 +   are counted in different per-fs block counters.
1913 +
1914 +   A reiser4 super block's set of block counters currently is:
1915 +
1916 +   free -- free blocks,
1917 +   used -- already allocated blocks,
1918 +
1919 +   grabbed -- initially reserved for performing an fs operation, those blocks
1920 +        are taken from free blocks, then grabbed disk space leaks from grabbed
1921 +        blocks counter to other counters like "fake allocated", "flush
1922 +        reserved", "used", the rest of not used grabbed space is returned to
1923 +        free space at the end of fs operation;
1924 +
1925 +   fake allocated -- counts all nodes without real disk block numbers assigned,
1926 +                   we have separate accounting for formatted and unformatted
1927 +                   nodes (for easier debugging);
1928 +
1929 +   flush reserved -- disk space needed for flushing and committing an atom.
1930 +                   Each dirty already allocated block could be written as a
1931 +                   part of atom's overwrite set or as a part of atom's
1932 +                   relocate set.  In both cases one additional block is needed,
1933 +                   it is used as a wandered block if we do overwrite or as a
1934 +                   new location for a relocated block.
1935 +
1936 +   In addition, blocks in some states are counted on per-thread and per-atom
1937 +   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
1938 +   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
1939 +   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
1940 +   blocks, which are reserved for flush processing and atom commit. */
1941 +
1942 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
1943 +   number of blocks to grab for most expensive case of balancing when the leaf
1944 +   node we insert new item to gets split and new leaf node is allocated.
1945 +
1946 +   So, we need to grab blocks for
1947 +
1948 +   1) one block for possible dirtying the node we insert an item to. That block
1949 +      would be used for node relocation at flush time or for allocating of a
1950 +      wandered one, it depends what will be a result (what set, relocate or
1951 +      overwrite the node gets assigned to) of the node processing by the flush
1952 +      algorithm.
1953 +
1954 +   2) one block for either allocating a new node, or dirtying of right or left
1955 +      clean neighbor, only one case may happen.
1956 +
1957 +   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
1958 +   of left neighbor, right neighbor, current node, and creation of new node.
1959 +   Have I forgotten something?  email me.
1960 +
1961 +   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
1962 +   counter and in the fs-wide one (both ctx->grabbed_blocks and
1963 +   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
1964 +   decremented by 2.
1965 +
1966 +   Suppose both two blocks were spent for dirtying of an already allocated clean
1967 +   node (one block went from "grabbed" to "flush reserved") and for new block
1968 +   allocating (one block went from "grabbed" to "fake allocated formatted").
1969 +
1970 +   Inserting of a child pointer to the parent node caused parent node to be
1971 +   split, the balancing code takes care about this grabbing necessary space
1972 +   immediately by calling reiser4_grab with BA_RESERVED flag set which means
1973 +   "can use the 5% reserved disk space".
1974 +
1975 +   At this moment insertion completes and grabbed blocks (if they were not used)
1976 +   should be returned to the free space counter.
1977 +
1978 +   However the atom life-cycle is not completed.  The atom had one "flush
1979 +   reserved" block added by our insertion and the new fake allocated node is
1980 +   counted as a "fake allocated formatted" one.  The atom has to be fully
1981 +   processed by flush before commit.  Suppose that the flush moved the first,
1982 +   already allocated node to the atom's overwrite list, the new fake allocated
1983 +   node, obviously, went into the atom relocate set.  The reiser4 flush
1984 +   allocates the new node using one unit from "fake allocated formatted"
1985 +   counter, the log writer uses one from "flush reserved" for wandered block
1986 +   allocation.
1987 +
1988 +   And, it is not the end.  When the wandered block is deallocated after the
1989 +   atom gets fully played (see wander.c for term description), the disk space
1990 +   occupied for it is returned to free blocks. */
1991 +
1992 +/* BLOCK NUMBERS */
1993 +
1994 +/* Any reiser4 node has a block number assigned to it.  We use these numbers for
1995 +   indexing in hash tables, so if a block has not yet been assigned a location
1996 +   on disk we need to give it a temporary fake block number.
1997 +
1998 +   Current implementation of reiser4 uses 64-bit integers for block numbers. We
1999 +   use highest bit in 64-bit block number to distinguish fake and real block
2000 +   numbers. So, only 63 bits may be used for addressing of real device
2001 +   blocks. That "fake" block numbers space is divided into subspaces of fake
2002 +   block numbers for data blocks and for shadow (working) bitmap blocks.
2003 +
2004 +   Fake block numbers for data blocks are generated by a cyclic counter, which
2005 +   gets incremented after each real block allocation. We assume that it is
2006 +   impossible to overflow this counter during one transaction life. */
2007 +
2008 +/* Initialize a blocknr hint. */
2009 +void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
2010 +{
2011 +       memset(hint, 0, sizeof(reiser4_blocknr_hint));
2012 +}
2013 +
2014 +/* Release any resources of a blocknr hint. */
2015 +void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
2016 +{
2017 +/* No resources should be freed in current blocknr_hint implementation. */
2018 +}
2019 +
2020 +/* see above for explanation of fake block number.  */
2021 +/* Audited by: green(2002.06.11) */
2022 +int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
2023 +{
2024 +       /* The reason for not simply returning result of '&' operation is that
2025 +          while return value is (possibly 32bit) int,  the reiser4_block_nr is
2026 +          at least 64 bits long, and high bit (which is the only possible
2027 +          non zero bit after the masking) would be stripped off */
2028 +       return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
2029 +}
2030 +
2031 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
2032 +   arithmetic. Mostly, they are isolated to avoid coding the same assertions
2033 +   in several places. */
2034 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
2035 +{
2036 +       BUG_ON(ctx->grabbed_blocks < count);
2037 +       assert("zam-527", ctx->grabbed_blocks >= count);
2038 +       ctx->grabbed_blocks -= count;
2039 +}
2040 +
2041 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
2042 +{
2043 +       ctx->grabbed_blocks += count;
2044 +}
2045 +
2046 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
2047 +{
2048 +       assert("zam-525", sbinfo->blocks_grabbed >= count);
2049 +       sbinfo->blocks_grabbed -= count;
2050 +}
2051 +
2052 +/* Decrease the counter of block reserved for flush in super block. */
2053 +static void
2054 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
2055 +{
2056 +       assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
2057 +       sbinfo->blocks_flush_reserved -= count;
2058 +}
2059 +
2060 +static void
2061 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
2062 +                          reiser4_ba_flags_t flags)
2063 +{
2064 +       if (flags & BA_FORMATTED) {
2065 +               assert("zam-806", sbinfo->blocks_fake_allocated >= count);
2066 +               sbinfo->blocks_fake_allocated -= count;
2067 +       } else {
2068 +               assert("zam-528",
2069 +                      sbinfo->blocks_fake_allocated_unformatted >= count);
2070 +               sbinfo->blocks_fake_allocated_unformatted -= count;
2071 +       }
2072 +}
2073 +
2074 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
2075 +{
2076 +       assert("zam-530",
2077 +              sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
2078 +       sbinfo->blocks_used -= count;
2079 +}
2080 +
2081 +static void
2082 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
2083 +{
2084 +       assert("edward-501", sbinfo->blocks_clustered >= count);
2085 +       sbinfo->blocks_clustered -= count;
2086 +}
2087 +
2088 +/* Increase the counter of block reserved for flush in atom. */
2089 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
2090 +{
2091 +       assert("zam-772", atom != NULL);
2092 +       assert_spin_locked(&(atom->alock));
2093 +       atom->flush_reserved += count;
2094 +}
2095 +
2096 +/* Decrease the counter of block reserved for flush in atom. */
2097 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
2098 +{
2099 +       assert("zam-774", atom != NULL);
2100 +       assert_spin_locked(&(atom->alock));
2101 +       assert("nikita-2790", atom->flush_reserved >= count);
2102 +       atom->flush_reserved -= count;
2103 +}
2104 +
2105 +/* The super block counters (free, used, grabbed, fake allocated formatted and
2106 +   unformatted, flush reserved, clustered) must sum to the device block count.
2107 +   Returns 1 if they do; otherwise prints all counters and returns 0. */
2108 +int reiser4_check_block_counters(const struct super_block *super)
2109 +{
2110 +       __u64 sum;
2111 +
2112 +       sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
2113 +           reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
2114 +           reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
2115 +           reiser4_clustered_blocks(super);
2116 +       if (reiser4_block_count(super) != sum) {
2117 +               printk("super block counters: "
2118 +                      "used %llu, free %llu, "
2119 +                      "grabbed %llu, fake allocated (formatetd %llu, unformatted %llu), "
2120 +                      "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
2121 +                      (unsigned long long)reiser4_data_blocks(super),
2122 +                      (unsigned long long)reiser4_free_blocks(super),
2123 +                      (unsigned long long)reiser4_grabbed_blocks(super),
2124 +                      (unsigned long long)reiser4_fake_allocated(super),
2125 +                      (unsigned long long)
2126 +                      reiser4_fake_allocated_unformatted(super),
2127 +                      (unsigned long long)reiser4_flush_reserved(super),
2128 +                      (unsigned long long)reiser4_clustered_blocks(super),
2129 +                      (unsigned long long)sum,
2130 +                      (unsigned long long)reiser4_block_count(super));
2131 +               return 0;
2132 +       }
2133 +       return 1;
2134 +}
2135 +
2136 +/* Adjust "working" free blocks counter for number of blocks we are going to
2137 +   allocate.  Record number of grabbed blocks in fs-wide and per-thread
2138 +   counters.  This function should be called before bitmap scanning or
2139 +   allocating fake block numbers
2140 +
2141 +   @ctx             -- current reiser4 context, its ->super is the fs used;
2142 +   @count           -- number of blocks we reserve;
2143 +   @flags           -- BA_RESERVED: do not hold back sbinfo->blocks_reserved;
2144 +   @return          -- 0 if success,  -ENOSPC, if all
2145 +                       free blocks are reserved or already allocated.
2146 +*/
2147 +
2148 +static int
2149 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
2150 +{
2151 +       __u64 free_blocks;
2152 +       int ret = 0, use_reserved = flags & BA_RESERVED;
2153 +       reiser4_super_info_data *sbinfo;
2154 +
2155 +       assert("vs-1276", ctx == get_current_context());
2156 +
2157 +       /* Do not grab anything on ro-mounted fs. */
2158 +       if (rofs_super(ctx->super)) {
2159 +               ctx->grab_enabled = 0;
2160 +               return 0;
2161 +       }
2162 +
2163 +       sbinfo = get_super_private(ctx->super);
2164 +
2165 +       spin_lock_reiser4_super(sbinfo);
2166 +
2167 +       free_blocks = sbinfo->blocks_free;
2168 +
2169 +       if ((use_reserved && free_blocks < count) ||
2170 +           (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
2171 +               ret = RETERR(-ENOSPC);
2172 +               goto unlock_and_ret;
2173 +       }
2174 +
2175 +       add_to_ctx_grabbed(ctx, count);
2176 +
2177 +       sbinfo->blocks_grabbed += count;
2178 +       sbinfo->blocks_free -= count;
2179 +
2180 +#if REISER4_DEBUG
2181 +       if (ctx->grabbed_initially == 0)
2182 +               ctx->grabbed_initially = count;
2183 +#endif
2184 +
2185 +       assert("nikita-2986", reiser4_check_block_counters(ctx->super));
2186 +
2187 +       /* disable grab space in current context */
2188 +       ctx->grab_enabled = 0;
2189 +
2190 +unlock_and_ret:
2191 +       spin_unlock_reiser4_super(sbinfo);
2192 +
2193 +       return ret;
2194 +}
2195 +
2196 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
2197 +{
2198 +       int ret;
2199 +       reiser4_context *ctx;
2200 +
2201 +       assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
2202 +                                  lock_stack_isclean(get_current_lock_stack
2203 +                                                     ())));
2204 +       ctx = get_current_context();
2205 +       if (!(flags & BA_FORCE) && !is_grab_enabled(ctx))
2206 +               return 0;
2207 +
2208 +       ret = reiser4_grab(ctx, count, flags);
2209 +       if (ret == -ENOSPC) {
2210 +
2211 +               /* Try to commit all transactions and grab again if the
2212 +                  BA_CAN_COMMIT flag is present */
2213 +               if (flags & BA_CAN_COMMIT) {
2214 +                       txnmgr_force_commit_all(ctx->super, 0);
2215 +                       ctx->grab_enabled = 1;
2216 +                       ret = reiser4_grab(ctx, count, flags);
2217 +               }
2218 +       }
2219 +       /*
2220 +        * allocation from reserved pool cannot fail. This is severe error.
2221 +        */
2222 +       assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
2223 +       return ret;
2224 +}
2225 +
2226 +/*
2227 + * SPACE RESERVED FOR UNLINK/TRUNCATE
2228 + *
2229 + * Unlink and truncate require space in transaction (to update stat data, at
2230 + * least). But we don't want rm(1) to fail with "No space on device" error.
2231 + *
2232 + * Solution is to reserve 5% of disk space for truncates and
2233 + * unlinks. Specifically, normal space grabbing requests don't grab space from
2234 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
2235 + * drain it. Per super block delete mutex is used to allow only one
2236 + * thread at a time to grab from reserved area.
2237 + *
2238 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
2239 + * flag.
2240 + *
2241 + */
2242 +
2243 +int reiser4_grab_reserved(struct super_block *super,
2244 +                         __u64 count, reiser4_ba_flags_t flags)
2245 +{
2246 +       reiser4_super_info_data *sbinfo = get_super_private(super);
2247 +
2248 +       assert("nikita-3175", flags & BA_CAN_COMMIT);
2249 +
2250 +       /* Nested call: current task already owns the delete mutex. The owner
2251 +        * field is read unlocked; reading a machine word is assumed atomic. */
2252 +       if (sbinfo->delete_mutex_owner == current) {
2253 +               if (reiser4_grab_space
2254 +                   (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
2255 +                       warning("zam-1003",
2256 +                               "nested call of grab_reserved fails count=(%llu)",
2257 +                               (unsigned long long)count);
2258 +                       reiser4_release_reserved(super);
2259 +                       return RETERR(-ENOSPC);
2260 +               }
2261 +               return 0;
2262 +       }
2263 +
2264 +       if (reiser4_grab_space(count, flags)) { /* ordinary grab failed */
2265 +               mutex_lock(&sbinfo->delete_mutex);
2266 +               assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
2267 +               sbinfo->delete_mutex_owner = current;
2268 +
2269 +               if (reiser4_grab_space(count, flags | BA_RESERVED)) {
2270 +                       warning("zam-833",
2271 +                               "reserved space is not enough (%llu)",
2272 +                               (unsigned long long)count);
2273 +                       reiser4_release_reserved(super);
2274 +                       return RETERR(-ENOSPC);
2275 +               }
2276 +       }
2277 +       return 0; /* delete_mutex stays held if the reserved area was used */
2278 +}
2279 +
2280 +void reiser4_release_reserved(struct super_block *super)
2281 +{
2282 +       reiser4_super_info_data *info;
2283 +
2284 +       info = get_super_private(super);
2285 +       if (info->delete_mutex_owner == current) { /* else nothing to drop */
2286 +               info->delete_mutex_owner = NULL;
2287 +               mutex_unlock(&info->delete_mutex);
2288 +       }
2289 +}
2290 +
2291 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
2292 +{
2293 +       reiser4_context *ctx;
2294 +       reiser4_super_info_data *sbinfo;
2295 +
2296 +       ctx = get_current_context();
2297 +       sub_from_ctx_grabbed(ctx, count);
2298 +
2299 +       sbinfo = get_super_private(ctx->super);
2300 +       spin_lock_reiser4_super(sbinfo);
2301 +
2302 +       sub_from_sb_grabbed(sbinfo, count);
2303 +       /* sbinfo spinlock stays held here; the caller has to unlock it */
2304 +       return sbinfo;
2305 +}
2306 +
2307 +/* accounts one grabbed block as fake allocated; assign_fake_blocknr_formatted()
2308 +   calls this right after it has taken a fake block number. */
2309 +static void grabbed2fake_allocated_formatted(void)
2310 +{
2311 +       reiser4_super_info_data *sbinfo;
2312 +
2313 +       sbinfo = grabbed2fake_allocated_head(1);
2314 +       sbinfo->blocks_fake_allocated++;
2315 +
2316 +       assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
2317 +
2318 +       spin_unlock_reiser4_super(sbinfo); /* locked by the _head helper */
2319 +}
2320 +
2321 +/**
2322 + * grabbed2fake_allocated_unformatted - grabbed -> fake allocated (unformatted)
2323 + * @count: number of blocks to move from one counter to the other
2324 + *
2325 + */
2326 +static void grabbed2fake_allocated_unformatted(int count)
2327 +{
2328 +       reiser4_super_info_data *sbinfo;
2329 +
2330 +       sbinfo = grabbed2fake_allocated_head(count);
2331 +       sbinfo->blocks_fake_allocated_unformatted += count;
2332 +
2333 +       assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
2334 +
2335 +       spin_unlock_reiser4_super(sbinfo); /* locked by the _head helper */
2336 +}
2337 +
2338 +void grabbed2cluster_reserved(int count)
2339 +{
2340 +       reiser4_context *ctx;
2341 +       reiser4_super_info_data *sbinfo;
2342 +
2343 +       ctx = get_current_context();
2344 +       sub_from_ctx_grabbed(ctx, count); /* per-context counter, no sb lock */
2345 +
2346 +       sbinfo = get_super_private(ctx->super);
2347 +       spin_lock_reiser4_super(sbinfo);
2348 +
2349 +       sub_from_sb_grabbed(sbinfo, count);
2350 +       sbinfo->blocks_clustered += count; /* grabbed -> cluster reserved */
2351 +
2352 +       assert("edward-504", reiser4_check_block_counters(ctx->super));
2353 +
2354 +       spin_unlock_reiser4_super(sbinfo);
2355 +}
2356 +
2357 +void cluster_reserved2grabbed(int count)
2358 +{
2359 +       reiser4_context *ctx;
2360 +       reiser4_super_info_data *sbinfo;
2361 +
2362 +       ctx = get_current_context();
2363 +
2364 +       sbinfo = get_super_private(ctx->super);
2365 +       spin_lock_reiser4_super(sbinfo);
2366 +
2367 +       sub_from_cluster_reserved(sbinfo, count);
2368 +       sbinfo->blocks_grabbed += count; /* cluster reserved -> grabbed */
2369 +
2370 +       assert("edward-505", reiser4_check_block_counters(ctx->super));
2371 +
2372 +       spin_unlock_reiser4_super(sbinfo);
2373 +       add_to_ctx_grabbed(ctx, count); /* per-context counter, no sb lock */
2374 +}
2375 +
2376 +void cluster_reserved2free(int count)
2377 +{
2378 +       reiser4_context *ctx;
2379 +       reiser4_super_info_data *sbinfo;
2380 +
2381 +       ctx = get_current_context();
2382 +       sbinfo = get_super_private(ctx->super);
2383 +
2384 +       cluster_reserved2grabbed(count); /* step 1: cluster reserved -> grabbed */
2385 +       grabbed2free(ctx, sbinfo, count); /* step 2: grabbed -> free */
2386 +}
2387 +
2388 +static DEFINE_SPINLOCK(fake_lock);
2389 +static reiser4_block_nr fake_gen = 0; /* next fake number; under fake_lock */
2390 +
2391 +/**
2392 + * assign_fake_blocknr
2393 + * @blocknr: where the fake block number is stored
2394 + * @count: how many consecutive numbers to take from the generator
2395 + *
2396 + * Obtain a fake block number for new node which will be used to refer to
2397 + * this newly allocated node until real allocation is done.
2398 + */
2399 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
2400 +{
2401 +       spin_lock(&fake_lock);
2402 +       *blocknr = fake_gen;
2403 +       fake_gen += count;
2404 +       spin_unlock(&fake_lock);
2405 +
2406 +       BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
2407 +       /* status bits are clear at this point, so OR-ing just sets them */
2408 +       *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
2409 +       assert("zam-394", zlook(current_tree, blocknr) == NULL);
2410 +}
2411 +
2412 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
2413 +{
2414 +       assign_fake_blocknr(blocknr, 1);
2415 +       grabbed2fake_allocated_formatted(); /* account the block */
2416 +       return 0; /* the only return value */
2417 +}
2418 +
2419 +/**
2420 + * fake_blocknr_unformatted
2421 + * @count: number of fake numbers to get
2422 + *
2423 + * Allocates @count fake block numbers for jnodes, returns the first of them
2424 + */
2425 +reiser4_block_nr fake_blocknr_unformatted(int count)
2426 +{
2427 +       reiser4_block_nr blocknr;
2428 +
2429 +       assign_fake_blocknr(&blocknr, count);
2430 +       grabbed2fake_allocated_unformatted(count);
2431 +
2432 +       return blocknr;
2433 +}
2434 +
2435 +/* adjust context and sb block counters, if real (on-disk) block allocation
2436 +   immediately follows grabbing of free disk space: grabbed -> used. */
2437 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
2438 +                        __u64 count)
2439 +{
2440 +       sub_from_ctx_grabbed(ctx, count);
2441 +
2442 +       spin_lock_reiser4_super(sbinfo);
2443 +
2444 +       sub_from_sb_grabbed(sbinfo, count);
2445 +       sbinfo->blocks_used += count;
2446 +
2447 +       assert("nikita-2679", reiser4_check_block_counters(ctx->super));
2448 +
2449 +       spin_unlock_reiser4_super(sbinfo);
2450 +}
2451 +
2452 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
2453 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
2454 +                               reiser4_ba_flags_t flags)
2455 +{
2456 +       spin_lock_reiser4_super(sbinfo);
2457 +
2458 +       /* NOTE(review): flags go unmasked; fake_allocated2grabbed() masks them */
2459 +       sub_from_sb_fake_allocated(sbinfo, count, flags);
2460 +       sbinfo->blocks_used += count;
2461 +
2462 +       assert("nikita-2680",
2463 +              reiser4_check_block_counters(reiser4_get_current_sb()));
2464 +
2465 +       spin_unlock_reiser4_super(sbinfo);
2466 +}
2466 +
2467 +static void flush_reserved2used(txn_atom * atom, __u64 count)
2468 +{
2469 +       reiser4_super_info_data *sbinfo;
2470 +
2471 +       assert("zam-787", atom != NULL);
2472 +       assert_spin_locked(&(atom->alock)); /* caller holds the atom lock */
2473 +
2474 +       /* NOTE: @count is narrowed to 32 bits for the per-atom counter */
2475 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
2476 +
2477 +       sbinfo = get_current_super_private();
2478 +       spin_lock_reiser4_super(sbinfo);
2479 +
2480 +       sub_from_sb_flush_reserved(sbinfo, count);
2481 +       sbinfo->blocks_used += count; /* flush reserved -> used */
2482 +
2483 +       assert("zam-789",
2484 +              reiser4_check_block_counters(reiser4_get_current_sb()));
2485 +
2486 +       spin_unlock_reiser4_super(sbinfo);
2487 +}
2487 +
2488 +/* update the per-fs default blocknr hint; out-of-range values are not stored */
2489 +void
2490 +update_blocknr_hint_default(const struct super_block *s,
2491 +                           const reiser4_block_nr * block)
2492 +{
2493 +       reiser4_super_info_data *sbinfo = get_super_private(s);
2494 +
2495 +       assert("nikita-3342", !reiser4_blocknr_is_fake(block));
2496 +
2497 +       spin_lock_reiser4_super(sbinfo);
2498 +       if (*block < sbinfo->block_count) {
2499 +               sbinfo->blocknr_hint_default = *block;
2500 +       } else {
2501 +               warning("zam-676",
2502 +                       "block number %llu is too large to be used in a blocknr hint\n",
2503 +                       (unsigned long long)*block);
2504 +               dump_stack();
2505 +               DEBUGON(1);
2506 +       }
2507 +       spin_unlock_reiser4_super(sbinfo);
2508 +}
2509 +
2510 +/* store the current per-fs default blocknr hint in *@result. */
2511 +void get_blocknr_hint_default(reiser4_block_nr * result)
2512 +{
2513 +       reiser4_super_info_data *sbinfo = get_current_super_private();
2514 +
2515 +       spin_lock_reiser4_super(sbinfo);
2516 +       *result = sbinfo->blocknr_hint_default;
2517 +       assert("zam-677", *result < sbinfo->block_count);
2518 +       spin_unlock_reiser4_super(sbinfo);
2519 +}
2520 +
2521 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
2522 + * method. Blocks are allocated in one contiguous disk region. The plugin
2523 + * independent part accounts blocks by subtracting allocated amount from grabbed
2524 + * or fake block counter and adds the same amount to the counter of allocated
2525 + * blocks.
2526 + *
2527 + * @hint -- a reiser4 blocknr hint object which contains further block
2528 + *          allocation hints and parameters (search start, a stage of block
2529 + *          which will be mapped to disk, etc.),
2530 + * @blk  -- an out parameter for the beginning of the allocated region,
2531 + * @len  -- in/out parameter, it should contain the maximum number of allocated
2532 + *          blocks, after block allocation completes, it contains the length of
2533 + *          allocated disk region.
2534 + * @flags -- see reiser4_ba_flags_t description.
2535 + *
2536 + * @return -- 0 if success, error code otherwise.
2537 + */
2538 +int
2539 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
2540 +                    reiser4_block_nr * len, reiser4_ba_flags_t flags)
2541 +{
2542 +       __u64 needed = *len; /* requested length, kept since *len is overwritten */
2543 +       reiser4_context *ctx;
2544 +       reiser4_super_info_data *sbinfo;
2545 +       int ret;
2546 +
2547 +       assert("zam-986", hint != NULL);
2548 +
2549 +       ctx = get_current_context();
2550 +       sbinfo = get_super_private(ctx->super);
2551 +
2552 +       /* For write-optimized data we use default search start value, which is
2553 +        * close to last write location. */
2554 +       if (flags & BA_USE_DEFAULT_SEARCH_START)
2555 +               get_blocknr_hint_default(&hint->blk);
2556 +
2557 +       /* VITALY: allocator should grab this for internal/tx-lists/similar
2558 +          only. */
2559 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)?*/
2560 +       if (hint->block_stage == BLOCK_NOT_COUNTED) {
2561 +               ret = reiser4_grab_space_force(*len, flags);
2562 +               if (ret != 0)
2563 +                       return ret;
2564 +       }
2565 +
2566 +       ret =
2567 +           sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
2568 +                           hint, (int)needed, blk, len); /* NOTE: narrowed to int */
2569 +
2570 +       if (!ret) {
2571 +               assert("zam-680", *blk < reiser4_block_count(ctx->super));
2572 +               assert("zam-681",
2573 +                      *blk + *len <= reiser4_block_count(ctx->super));
2574 +
2575 +               if (flags & BA_PERMANENT) {
2576 +                       /* we assume that current atom exists at this moment */
2577 +                       txn_atom *atom = get_current_atom_locked();
2578 +                       atom->nr_blocks_allocated += *len;
2579 +                       spin_unlock_atom(atom);
2580 +               }
2581 +
2582 +               switch (hint->block_stage) {
2583 +               case BLOCK_NOT_COUNTED:
2584 +               case BLOCK_GRABBED:
2585 +                       grabbed2used(ctx, sbinfo, *len);
2586 +                       break;
2587 +               case BLOCK_UNALLOCATED:
2588 +                       fake_allocated2used(sbinfo, *len, flags);
2589 +                       break;
2590 +               case BLOCK_FLUSH_RESERVED:
2591 +                       {
2592 +                               txn_atom *atom = get_current_atom_locked();
2593 +                               flush_reserved2used(atom, *len);
2594 +                               spin_unlock_atom(atom);
2595 +                       }
2596 +                       break;
2597 +               default:
2598 +                       impossible("zam-531", "wrong block stage");
2599 +               }
2600 +       } else {
2601 +               assert("zam-821",
2602 +                      ergo(hint->max_dist == 0
2603 +                           && !hint->backward, ret != -ENOSPC));
2604 +               if (hint->block_stage == BLOCK_NOT_COUNTED)
2605 +                       grabbed2free(ctx, sbinfo, needed); /* undo forced grab */
2606 +       }
2607 +
2608 +       return ret;
2609 +}
2610 +
2611 +/* used -> fake_allocated -> grabbed -> free */
2612 +
2613 +/* adjust sb block counters when @count blocks get unmapped from disk;
2614 +   non-zero @formatted selects the formatted fake-allocated counter */
2615 +static void
2616 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
2617 +                   int formatted)
2618 +{
2619 +       spin_lock_reiser4_super(sbinfo);
2620 +
2621 +       if (formatted)
2622 +               sbinfo->blocks_fake_allocated += count;
2623 +       else
2624 +               sbinfo->blocks_fake_allocated_unformatted += count;
2625 +
2626 +       sub_from_sb_used(sbinfo, count);
2627 +
2628 +       assert("nikita-2681",
2629 +              reiser4_check_block_counters(reiser4_get_current_sb()));
2630 +
2631 +       spin_unlock_reiser4_super(sbinfo);
2632 +}
2633 +
2634 +static void
2635 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
2636 +                   __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
2637 +{
2638 +       assert("nikita-2791", atom != NULL);
2639 +       assert_spin_locked(&(atom->alock)); /* caller holds the atom lock */
2640 +
2641 +       /* NOTE: @count is narrowed to 32 bits for the per-atom counter */
2642 +       add_to_atom_flush_reserved_nolock(atom, (__u32) count);
2643 +
2644 +       spin_lock_reiser4_super(sbinfo);
2645 +
2646 +       sbinfo->blocks_flush_reserved += count; /* used -> flush reserved */
2647 +       /*add_to_sb_flush_reserved(sbinfo, count); */
2648 +       sub_from_sb_used(sbinfo, count);
2649 +
2650 +       assert("nikita-2681",
2651 +              reiser4_check_block_counters(reiser4_get_current_sb()));
2652 +
2653 +       spin_unlock_reiser4_super(sbinfo);
2654 +}
2654 +
2655 +/* disk space virtually used by fake block numbers is counted as "grabbed"
2656 +   again; only the BA_FORMATTED bit of @flags is forwarded. */
2657 +static void
2658 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
2659 +                      __u64 count, reiser4_ba_flags_t flags)
2660 +{
2661 +       add_to_ctx_grabbed(ctx, count);
2662 +
2663 +       spin_lock_reiser4_super(sbinfo);
2664 +
2665 +       assert("nikita-2682", reiser4_check_block_counters(ctx->super));
2666 +
2667 +       sbinfo->blocks_grabbed += count;
2668 +       sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
2669 +
2670 +       assert("nikita-2683", reiser4_check_block_counters(ctx->super));
2671 +
2672 +       spin_unlock_reiser4_super(sbinfo);
2673 +}
2674 +
2675 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
2676 +{
2677 +       reiser4_context *ctx;
2678 +       reiser4_super_info_data *sbinfo;
2679 +
2680 +       ctx = get_current_context();
2681 +       sbinfo = get_super_private(ctx->super);
2682 +
2683 +       fake_allocated2grabbed(ctx, sbinfo, count, flags); /* step 1 */
2684 +       grabbed2free(ctx, sbinfo, count); /* step 2: grabbed -> free */
2685 +}
2686 +
2687 +void grabbed2free_mark(__u64 mark)
2688 +{
2689 +       reiser4_context *ctx;
2690 +       reiser4_super_info_data *sbinfo;
2691 +
2692 +       ctx = get_current_context();
2693 +       sbinfo = get_super_private(ctx->super);
2694 +
2695 +       assert("nikita-3007", (__s64) mark >= 0);
2696 +       assert("nikita-3006", ctx->grabbed_blocks >= mark);
2697 +       grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark); /* excess over @mark */
2698 +}
2699 +
2700 +/**
2701 + * grabbed2free - adjust grabbed and free block counters
2702 + * @ctx: context to update grabbed block counter of
2703 + * @sbinfo: super block to update grabbed and free block counters of
2704 + * @count: number of blocks to adjust counters by
2705 + *
2706 + * Decreases context's and per filesystem's counters of grabbed
2707 + * blocks. Increases per filesystem's counter of free blocks.
2708 + */
2709 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
2710 +                 __u64 count)
2711 +{
2712 +       sub_from_ctx_grabbed(ctx, count); /* done before taking the sb lock */
2713 +
2714 +       spin_lock_reiser4_super(sbinfo);
2715 +
2716 +       sub_from_sb_grabbed(sbinfo, count);
2717 +       sbinfo->blocks_free += count;
2718 +       assert("nikita-2684", reiser4_check_block_counters(ctx->super));
2719 +
2720 +       spin_unlock_reiser4_super(sbinfo);
2721 +}
2722 +
2723 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
2724 +{
2725 +       reiser4_context *ctx;
2726 +       reiser4_super_info_data *sbinfo;
2727 +
2728 +       assert("vs-1095", atom);
2729 +
2730 +       ctx = get_current_context();
2731 +       sbinfo = get_super_private(ctx->super);
2732 +
2733 +       sub_from_ctx_grabbed(ctx, count);
2734 +
2735 +       /* presumably the atom is locked by the caller - TODO confirm */
2736 +       add_to_atom_flush_reserved_nolock(atom, count);
2737 +
2738 +       spin_lock_reiser4_super(sbinfo);
2739 +
2740 +       sbinfo->blocks_flush_reserved += count; /* grabbed -> flush reserved */
2741 +       sub_from_sb_grabbed(sbinfo, count);
2742 +
2743 +       assert("vpf-292", reiser4_check_block_counters(ctx->super));
2744 +
2745 +       spin_unlock_reiser4_super(sbinfo);
2746 +}
2746 +
2747 +void grabbed2flush_reserved(__u64 count)
2748 +{
2749 +       txn_atom *atom = get_current_atom_locked();
2750 +
2751 +       grabbed2flush_reserved_nolock(atom, count); /* runs under the atom lock */
2752 +
2753 +       spin_unlock_atom(atom);
2754 +}
2755 +
2756 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
2757 +{
2758 +       reiser4_context *ctx;
2759 +       reiser4_super_info_data *sbinfo;
2760 +
2761 +       assert("nikita-2788", atom != NULL);
2762 +       assert_spin_locked(&(atom->alock)); /* caller holds the atom lock */
2763 +
2764 +       ctx = get_current_context();
2765 +       sbinfo = get_super_private(ctx->super);
2766 +
2767 +       add_to_ctx_grabbed(ctx, count);
2768 +
2769 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
2770 +
2771 +       spin_lock_reiser4_super(sbinfo);
2772 +
2773 +       sbinfo->blocks_grabbed += count; /* flush reserved -> grabbed */
2774 +       sub_from_sb_flush_reserved(sbinfo, count);
2775 +
2776 +       /* NOTE(review): label also used in grabbed2flush_reserved_nolock() */
2777 +       assert("vpf-292", reiser4_check_block_counters(ctx->super));
2778 +
2779 +       spin_unlock_reiser4_super(sbinfo);
2780 +}
2780 +
2781 +/**
2782 + * all_grabbed2free - releases all blocks grabbed in context
2783 + *
2784 + * Decreases context's and super block's grabbed block counters by the number
2785 + * of blocks grabbed by the current context and increases super block's free
2786 + * block counter correspondingly.
2787 + */
2788 +void all_grabbed2free(void)
2789 +{
2790 +       reiser4_context *ctx = get_current_context();
2791 +
2792 +       grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
2793 +}
2794 +
2795 +/* adjust context and sb block counters if real (on-disk) blocks do not become
2796 +   free after freeing: @count blocks go from "used" back to "grabbed". */
2797 +static void
2798 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
2799 +            __u64 count)
2800 +{
2801 +       add_to_ctx_grabbed(ctx, count);
2802 +
2803 +       spin_lock_reiser4_super(sbinfo);
2804 +
2805 +       sbinfo->blocks_grabbed += count;
2806 +       sub_from_sb_used(sbinfo, count);
2807 +
2808 +       assert("nikita-2685", reiser4_check_block_counters(ctx->super));
2809 +
2810 +       spin_unlock_reiser4_super(sbinfo);
2811 +}
2812 +
2813 +/* used -> free directly (formerly done via used2grabbed and grabbed2free) */
2814 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
2815 +{
2816 +       spin_lock_reiser4_super(sbinfo);
2817 +
2818 +       sbinfo->blocks_free += count;
2819 +       sub_from_sb_used(sbinfo, count);
2820 +
2821 +       assert("nikita-2685",
2822 +              reiser4_check_block_counters(reiser4_get_current_sb()));
2823 +
2824 +       spin_unlock_reiser4_super(sbinfo);
2825 +}
2826 +
2827 +#if REISER4_DEBUG
2828 +
2829 +/* debug builds only: check "allocated" state of given block range */
2830 +static void
2831 +reiser4_check_blocks(const reiser4_block_nr * start,
2832 +                    const reiser4_block_nr * len, int desired)
2833 +{
2834 +       sa_check_blocks(start, len, desired); /* arguments passed unchanged */
2835 +}
2836 +
2837 +/* debug builds only: check "allocated" state of given block */
2838 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
2839 +{
2840 +       const reiser4_block_nr one = 1; /* range length: a single block */
2841 +
2842 +       reiser4_check_blocks(block, &one, desired);
2843 +}
2844 +
2845 +#endif
2846 +
2847 +/* Block deallocation either does the actual deallocation through the space
2848 +   allocator plugin or stores deleted block numbers in atom's delete_set data
2849 +   structure, depending on the BA_DEFER bit of @flags. */
2850 +
2851 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks
2852 +   which will be deleted from WORKING bitmap. They might be just unmapped from
2853 +   disk, or freed but disk space is still grabbed by current thread, or these
2854 +   blocks must not be counted in any reiser4 sb block counters,
2855 +   see block_stage_t comment */
2856 +
2857 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
2858 +   distinguish blocks allocated for unformatted and formatted nodes */
2859 +
2860 +int
2861 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
2862 +                      const reiser4_block_nr * len,
2863 +                      block_stage_t target_stage, reiser4_ba_flags_t flags)
2864 +{
2865 +       txn_atom *atom = NULL;
2866 +       int ret;
2867 +       reiser4_context *ctx;
2868 +       reiser4_super_info_data *sbinfo;
2869 +
2870 +       ctx = get_current_context();
2871 +       sbinfo = get_super_private(ctx->super);
2872 +
2873 +       if (REISER4_DEBUG) {
2874 +               assert("zam-431", *len != 0);
2875 +               assert("zam-432", *start != 0);
2876 +               assert("zam-558", !reiser4_blocknr_is_fake(start));
2877 +
2878 +               spin_lock_reiser4_super(sbinfo);
2879 +               assert("zam-562", *start < sbinfo->block_count);
2880 +               spin_unlock_reiser4_super(sbinfo);
2881 +       }
2882 +
2883 +       if (flags & BA_DEFER) {
2884 +               blocknr_set_entry *bsep = NULL;
2885 +
2886 +               /* storing deleted block numbers in a blocknr set
2887 +                  datastructure for further actual deletion */
2888 +               do {
2889 +                       atom = get_current_atom_locked();
2890 +                       assert("zam-430", atom != NULL);
2891 +
2892 +                       ret =
2893 +                           blocknr_set_add_extent(atom, &atom->delete_set,
2894 +                                                  &bsep, start, len);
2895 +
2896 +                       if (ret == -ENOMEM) /* NOTE(review): atom unlocked? confirm */
2897 +                               return ret;
2898 +
2899 +                       /* This loop might spin at most two times */
2900 +               } while (ret == -E_REPEAT);
2901 +
2902 +               assert("zam-477", ret == 0);
2903 +               assert("zam-433", atom != NULL);
2904 +
2905 +               spin_unlock_atom(atom);
2906 +
2907 +       } else {
2908 +               assert("zam-425", get_current_super_private() != NULL);
2909 +               sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
2910 +                                 *start, *len);
2911 +
2912 +               if (flags & BA_PERMANENT) {
2913 +                       /* These blocks were counted as allocated, we have to
2914 +                        * revert it back if allocation is discarded. */
2915 +                       txn_atom *atom = get_current_atom_locked();
2916 +                       atom->nr_blocks_allocated -= *len;
2917 +                       spin_unlock_atom(atom);
2918 +               }
2919 +
2920 +               switch (target_stage) {
2921 +               case BLOCK_NOT_COUNTED:
2922 +                       assert("vs-960", flags & BA_FORMATTED);
2923 +                       /* VITALY: This is what was grabbed for
2924 +                          internal/tx-lists/similar only */
2925 +                       used2free(sbinfo, *len);
2926 +                       break;
2927 +
2928 +               case BLOCK_GRABBED:
2929 +                       used2grabbed(ctx, sbinfo, *len);
2930 +                       break;
2931 +
2932 +               case BLOCK_UNALLOCATED:
2933 +                       used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
2934 +                       break;
2935 +
2936 +               case BLOCK_FLUSH_RESERVED:{
2937 +                               txn_atom *atom;
2938 +
2939 +                               atom = get_current_atom_locked();
2940 +                               used2flush_reserved(sbinfo, atom, *len,
2941 +                                                   flags & BA_FORMATTED);
2942 +                               spin_unlock_atom(atom);
2943 +                               break;
2944 +                       }
2945 +               default:
2946 +                       impossible("zam-532", "wrong block stage");
2947 +               }
2948 +       }
2949 +
2950 +       return 0; /* -ENOMEM from the deferred path is the only error returned */
2951 +}
2952 +
2953 +/* wrappers for block allocator plugin methods */
2954 +int reiser4_pre_commit_hook(void)
2955 +{
2956 +       assert("zam-502", get_current_super_private() != NULL);
2957 +       sa_pre_commit_hook();
2958 +       return 0; /* the only return value */
2959 +}
2960 +
2961 +/* an actor which applies delete set to block allocator data */
2962 +static int
2963 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
2964 +          const reiser4_block_nr * b, void *data UNUSED_ARG)
2965 +{
2966 +       reiser4_context *ctx;
2967 +       reiser4_super_info_data *sbinfo;
2968 +
2969 +       __u64 len = 1; /* a single block unless @b supplies a length */
2970 +
2971 +       ctx = get_current_context();
2972 +       sbinfo = get_super_private(ctx->super);
2973 +
2974 +       assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
2975 +       assert("zam-552", sbinfo != NULL);
2976 +
2977 +       if (b != NULL)
2978 +               len = *b;
2979 +
2980 +       if (REISER4_DEBUG) {
2981 +               spin_lock_reiser4_super(sbinfo);
2982 +
2983 +               assert("zam-554", *a < reiser4_block_count(ctx->super));
2984 +               assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
2985 +
2986 +               spin_unlock_reiser4_super(sbinfo);
2987 +       }
2988 +
2989 +       sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
2990 +       /* adjust sb block counters */
2991 +       used2free(sbinfo, len);
2992 +       return 0; /* the only return value */
2993 +}
2994 +
2995 +void reiser4_post_commit_hook(void)
2996 +{
2997 +       txn_atom *atom;
2998 +
2999 +       atom = get_current_atom_locked();
3000 +       assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
3001 +       spin_unlock_atom(atom); /* NOTE(review): atom is used below, unlocked */
3002 +
3003 +       /* do the block deallocation which was deferred
3004 +          until commit is done */
3005 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
3006 +
3007 +       assert("zam-504", get_current_super_private() != NULL);
3008 +       sa_post_commit_hook();
3009 +}
3010 +
3011 +void reiser4_post_write_back_hook(void)
3012 +{
3013 +       assert("zam-504", get_current_super_private() != NULL);
3014 +
3015 +       sa_post_commit_hook(); /* NOTE(review): commit hook here - intended? */
3016 +}
3017 +
3018 +/*
3019 +   Local variables:
3020 +   c-indentation-style: "K&R"
3021 +   mode-name: "LC"
3022 +   c-basic-offset: 8
3023 +   tab-width: 8
3024 +   fill-column: 120
3025 +   scroll-step: 1
3026 +   End:
3027 +*/
3028 diff -puN /dev/null fs/reiser4/block_alloc.h
3029 --- /dev/null
3030 +++ a/fs/reiser4/block_alloc.h
3031 @@ -0,0 +1,177 @@
3032 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3033 +
3034 +#if !defined(__FS_REISER4_BLOCK_ALLOC_H__)
3035 +#define __FS_REISER4_BLOCK_ALLOC_H__
3036 +
3037 +#include "dformat.h"
3038 +#include "forward.h"
3039 +
3040 +#include <linux/types.h>       /* for __u??  */
3041 +#include <linux/fs.h>
3042 +
3043 +/* Mask which, when applied to a given block number, shows whether that block
3044 +   number is a fake one */
3045 +#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
3046 +/* Mask which isolates a type of object this fake block number was assigned
3047 +   to */
3048 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
3049 +
3050 +/* the result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
3051 +   against these two values to tell whether the object is unallocated or a bitmap
3052 +   shadow object (WORKING BITMAP block, look at the plugin/space/bitmap.c) */
3053 +#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
3054 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
3055 +
3056 +/* specifies how a block is currently accounted for in sb block counters */
3057 +typedef enum {
3058 +       BLOCK_NOT_COUNTED = 0,  /* reiser4 has no info about this block yet */
3059 +       BLOCK_GRABBED = 1,      /* free space grabbed for further allocation
3060 +                                  of this block */
3061 +       BLOCK_FLUSH_RESERVED = 2,       /* block is reserved for flush needs. */
3062 +       BLOCK_UNALLOCATED = 3,  /* block is used for existing in-memory object
3063 +                                  (unallocated formatted or unformatted
3064 +                                  node) */
3065 +       BLOCK_ALLOCATED = 4     /* block is mapped to disk, real on-disk block
3066 +                                  number assigned */
3067 +} block_stage_t;
3068 +
3069 +/* a hint for block allocator */
3070 +struct reiser4_blocknr_hint {
3071 +       /* FIXME: I think we want to add a longterm lock on the bitmap block
3072 +          here. This is to prevent jnode_flush() calls from interleaving
3073 +          allocations on the same bitmap, once a hint is established. */
3074 +
3075 +       /* search start hint */
3076 +       reiser4_block_nr blk;
3077 +       /* if not zero, it is a region size we search for free blocks in */
3078 +       reiser4_block_nr max_dist;
3079 +       /* level for allocation, may be useful to have branch-level and higher
3080 +          write-optimized. */
3081 +       tree_level level;
3082 +       /* block allocator assumes that blocks, which will be mapped to disk,
3083 +          are in this specified block_stage */
3084 +       block_stage_t block_stage;
3085 +       /* If backward = 1, allocate blocks in backward direction from the end
3086 +        * of disk to the beginning of disk.  */
3087 +       unsigned int backward:1;
3088 +
3089 +};
3090 +
3091 +/* These flags control block allocation/deallocation behavior */
3092 +enum reiser4_ba_flags {
3093 +       /* do allocations from reserved (5%) area */
3094 +       BA_RESERVED = (1 << 0),
3095 +
3096 +       /* block allocator can do commit trying to recover free space */
3097 +       BA_CAN_COMMIT = (1 << 1),
3098 +
3099 +       /* if operation will be applied to formatted block */
3100 +       BA_FORMATTED = (1 << 2),
3101 +
3102 +       /* defer actual block freeing until transaction commit */
3103 +       BA_DEFER = (1 << 3),
3104 +
3105 +       /* allocate blocks for permanent fs objects (formatted or unformatted),
3106 +           not wandered or log blocks */
3107 +       BA_PERMANENT = (1 << 4),
3108 +
3109 +       /* grab space even if it was disabled */
3110 +       BA_FORCE = (1 << 5),
3111 +
3112 +       /* use default start value for free blocks search. */
3113 +       BA_USE_DEFAULT_SEARCH_START = (1 << 6)
3114 +};
3115 +
3116 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
3117 +
3118 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
3119 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
3120 +extern void update_blocknr_hint_default(const struct super_block *,
3121 +                                       const reiser4_block_nr *);
3122 +extern void get_blocknr_hint_default(reiser4_block_nr *);
3123 +
3124 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
3125 +
3126 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
3127 +reiser4_block_nr fake_blocknr_unformatted(int);
3128 +
3129 +/* free -> grabbed -> fake_allocated -> used */
3130 +
3131 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
3132 +void all_grabbed2free(void);
3133 +void grabbed2free(reiser4_context * , reiser4_super_info_data * , __u64 count);
3134 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
3135 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
3136 +void grabbed2flush_reserved(__u64 count);
3137 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
3138 +                        reiser4_block_nr * start,
3139 +                        reiser4_block_nr * len, reiser4_ba_flags_t flags);
3140 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
3141 +                          const reiser4_block_nr *,
3142 +                          block_stage_t, reiser4_ba_flags_t flags);
3143 +
3144 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
3145 +                                     reiser4_block_nr * start,
3146 +                                     reiser4_ba_flags_t flags)
3147 +{
3148 +       reiser4_block_nr one = 1;
3149 +       return reiser4_alloc_blocks(hint, start, &one, flags);
3150 +}
3151 +
3152 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
3153 +                                       block_stage_t stage,
3154 +                                       reiser4_ba_flags_t flags)
3155 +{
3156 +       const reiser4_block_nr one = 1;
3157 +       return reiser4_dealloc_blocks(block, &one, stage, flags);
3158 +}
3159 +
3160 +#define reiser4_grab_space_force(count, flags)         \
3161 +       reiser4_grab_space(count, flags | BA_FORCE)
3162 +
3163 +extern void grabbed2free_mark(__u64 mark);
3164 +extern int reiser4_grab_reserved(struct super_block *,
3165 +                                __u64, reiser4_ba_flags_t);
3166 +extern void reiser4_release_reserved(struct super_block *super);
3167 +
3168 +/* grabbed -> fake_allocated */
3169 +
3170 +/* fake_allocated -> used */
3171 +
3172 +/* used -> fake_allocated -> grabbed -> free */
3173 +
3174 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
3175 +
3176 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
3177 +
3178 +extern void grabbed2cluster_reserved(int count);
3179 +extern void cluster_reserved2grabbed(int count);
3180 +extern void cluster_reserved2free(int count);
3181 +
3182 +extern int reiser4_check_block_counters(const struct super_block *);
3183 +
3184 +#if REISER4_DEBUG
3185 +
3186 +extern void reiser4_check_block(const reiser4_block_nr *, int);
3187 +
3188 +#else
3189 +
3190 +#  define reiser4_check_block(beg, val)        noop
3191 +
3192 +#endif
3193 +
3194 +extern int reiser4_pre_commit_hook(void);
3195 +extern void reiser4_post_commit_hook(void);
3196 +extern void reiser4_post_write_back_hook(void);
3197 +
3198 +#endif                         /* __FS_REISER4_BLOCK_ALLOC_H__ */
3199 +
3200 +/* Make Linus happy.
3201 +   Local variables:
3202 +   c-indentation-style: "K&R"
3203 +   mode-name: "LC"
3204 +   c-basic-offset: 8
3205 +   tab-width: 8
3206 +   fill-column: 120
3207 +   End:
3208 +*/
3209 diff -puN /dev/null fs/reiser4/blocknrset.c
3210 --- /dev/null
3211 +++ a/fs/reiser4/blocknrset.c
3212 @@ -0,0 +1,371 @@
3213 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
3214 +reiser4/README */
3215 +
3216 +/* This file contains code for various block number sets used by the atom to
3217 +   track the deleted set and wandered block mappings. */
3218 +
3219 +#include "debug.h"
3220 +#include "dformat.h"
3221 +#include "txnmgr.h"
3222 +#include "context.h"
3223 +
3224 +#include <linux/slab.h>
3225 +
3226 +/* The proposed data structure for storing unordered block number sets is a
3227 +   list of elements, each of which contains an array of block numbers and/or
3228 +   an array of block number pairs. Such an element, a blocknr_set_entry,
3229 +   stores single block numbers from the beginning and pairs (extents) from
3230 +   the end of its ->entries[] array. The ->nr_singles and ->nr_pairs fields
3231 +   count the stored single blocks and pairs.
3232 +
3233 +   +----------------- blocknr_set_entry->entries -----------------+
3234 +   |block1|block2| ... <free space> ... |pair3|pair2|pair1|
3235 +   +------------------------------------------------------------+
3236 +
3237 +   When current blocknr_set_entry is full, allocate a new one. */
3238 +
3239 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
3240 + * set (single blocks and block extents), in that case blocknr pair represent an
3241 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
3242 + * there represent a (real block) -> (wandered block) mapping. */
3243 +
3244 +/* Protection: blocknr sets belong to reiser4 atom, and
3245 + * their modifications are performed with the atom lock held */
3246 +
3247 +/* The total size of a blocknr_set_entry. */
3248 +#define BLOCKNR_SET_ENTRY_SIZE 128
3249 +
3250 +/* The number of blocks that can fit the blocknr data area. */
3251 +#define BLOCKNR_SET_ENTRIES_NUMBER             \
3252 +       ((BLOCKNR_SET_ENTRY_SIZE -              \
3253 +       2 * sizeof(unsigned) -                  \
3254 +       sizeof(struct list_head)) /             \
3255 +       sizeof(reiser4_block_nr))
3256 +
3257 +/* An entry of the blocknr_set */
3258 +struct blocknr_set_entry {
3259 +       unsigned nr_singles;
3260 +       unsigned nr_pairs;
3261 +       struct list_head link;
3262 +       reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
3263 +};
3264 +
3265 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
3266 +struct blocknr_pair {
3267 +       reiser4_block_nr a;
3268 +       reiser4_block_nr b;
3269 +};
3270 +
3271 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
3272 +/* Audited by: green(2002.06.11) */
3273 +static unsigned bse_avail(blocknr_set_entry * bse)
3274 +{
3275 +       unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
3276 +
3277 +       assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
3278 +       cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
3279 +
3280 +       return BLOCKNR_SET_ENTRIES_NUMBER - used;
3281 +}
3282 +
3283 +/* Initialize a blocknr_set_entry. */
3284 +static void bse_init(blocknr_set_entry *bse)
3285 +{
3286 +       bse->nr_singles = 0;
3287 +       bse->nr_pairs = 0;
3288 +       INIT_LIST_HEAD(&bse->link);
3289 +}
3290 +
3291 +/* Allocate and initialize a blocknr_set_entry. */
3292 +/* Audited by: green(2002.06.11) */
3293 +static blocknr_set_entry *bse_alloc(void)
3294 +{
3295 +       blocknr_set_entry *e;
3296 +
3297 +       if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
3298 +                                          reiser4_ctx_gfp_mask_get())) == NULL)
3299 +               return NULL;
3300 +
3301 +       bse_init(e);
3302 +
3303 +       return e;
3304 +}
3305 +
3306 +/* Free a blocknr_set_entry. */
3307 +/* Audited by: green(2002.06.11) */
3308 +static void bse_free(blocknr_set_entry * bse)
3309 +{
3310 +       kfree(bse);
3311 +}
3312 +
3313 +/* Add a block number to a blocknr_set_entry */
3314 +/* Audited by: green(2002.06.11) */
3315 +static void
3316 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
3317 +{
3318 +       assert("jmacd-5099", bse_avail(bse) >= 1);
3319 +
3320 +       bse->entries[bse->nr_singles++] = *block;
3321 +}
3322 +
3323 +/* Get a pair of block numbers */
3324 +/* Audited by: green(2002.06.11) */
3325 +static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
3326 +                                               unsigned pno)
3327 +{
3328 +       assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
3329 +
3330 +       return (struct blocknr_pair *) (bse->entries +
3331 +                                       BLOCKNR_SET_ENTRIES_NUMBER -
3332 +                                       2 * (pno + 1));
3333 +}
3334 +
3335 +/* Add a pair of block numbers to a blocknr_set_entry */
3336 +/* Audited by: green(2002.06.11) */
3337 +static void
3338 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
3339 +            const reiser4_block_nr * b)
3340 +{
3341 +       struct blocknr_pair *pair;
3342 +
3343 +       assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
3344 +
3345 +       pair = bse_get_pair(bse, bse->nr_pairs++);
3346 +
3347 +       pair->a = *a;
3348 +       pair->b = *b;
3349 +}
3350 +
3351 +/* Add either a block or pair of blocks to the block number set.  The first
3352 +   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
3353 +   @b is non-NULL a pair is added.  The block number set belongs to atom, and
3354 +   the call is made with the atom lock held.  There may not be enough space in
3355 +   the head blocknr_set_entry.  If *new_bsep is then non-NULL, that entry is
3356 +   put on the head of the set and *new_bsep is set to NULL.  If *new_bsep is
3357 +   NULL, the atom lock is released and a new bse is allocated into *new_bsep:
3358 +   -E_REPEAT is then returned for the operation to be tried again, or
3359 +   RETERR(-ENOMEM) if the allocation failed; either way the atom is left
3360 +   unlocked.  If the operation succeeds, 0 is returned.  If *new_bsep is
3361 +   non-NULL and not used during the call, it is freed and reset to NULL. */
3362 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
3363 +                          blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
3364 +                          const reiser4_block_nr *b)
3365 +{
3366 +       blocknr_set_entry *bse;
3367 +       unsigned entries_needed;
3368 +
3369 +       assert("jmacd-5101", a != NULL);
3370 +
3371 +       entries_needed = (b == NULL) ? 1 : 2;
3372 +       if (list_empty(bset) ||
3373 +           bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
3374 +               /* See if a bse was previously allocated. */
3375 +               if (*new_bsep == NULL) {
3376 +                       spin_unlock_atom(atom);
3377 +                       *new_bsep = bse_alloc();
3378 +                       return (*new_bsep != NULL) ? -E_REPEAT :
3379 +                               RETERR(-ENOMEM);
3380 +               }
3381 +
3382 +               /* Put it on the head of the list. */
3383 +               list_add(&((*new_bsep)->link), bset);
3384 +
3385 +               *new_bsep = NULL;
3386 +       }
3387 +
3388 +       /* Add the single or pair. */
3389 +       bse = list_entry(bset->next, blocknr_set_entry, link);
3390 +       if (b == NULL) {
3391 +               bse_put_single(bse, a);
3392 +       } else {
3393 +               bse_put_pair(bse, a, b);
3394 +       }
3395 +
3396 +       /* If new_bsep is non-NULL then there was an allocation race, free this
3397 +          copy. */
3398 +       if (*new_bsep != NULL) {
3399 +               bse_free(*new_bsep);
3400 +               *new_bsep = NULL;
3401 +       }
3402 +
3403 +       return 0;
3404 +}
3405 +
3406 +/* Add an extent to the block set.  If the length is 1, it is stored as a
3407 +   single block, otherwise as a (*start, *len) pair. */
3408 +/* Audited by: green(2002.06.11) */
3409 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
3410 +   kmalloc might schedule. The only exception is atom spinlock, which is
3411 +   properly released. */
3412 +int
3413 +blocknr_set_add_extent(txn_atom * atom,
3414 +                      struct list_head *bset,
3415 +                      blocknr_set_entry ** new_bsep,
3416 +                      const reiser4_block_nr * start,
3417 +                      const reiser4_block_nr * len)
3418 +{
3419 +       assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
3420 +       return blocknr_set_add(atom, bset, new_bsep, start,
3421 +                              *len == 1 ? NULL : len);
3422 +}
3423 +
3424 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
3425 + * by an assertion that both arguments are not null.*/
3426 +/* Audited by: green(2002.06.11) */
3427 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
3428 +   kmalloc might schedule. The only exception is atom spinlock, which is
3429 +   properly freed. */
3430 +int
3431 +blocknr_set_add_pair(txn_atom * atom,
3432 +                    struct list_head *bset,
3433 +                    blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
3434 +                    const reiser4_block_nr * b)
3435 +{
3436 +       assert("jmacd-5103", a != NULL && b != NULL);
3437 +       return blocknr_set_add(atom, bset, new_bsep, a, b);
3438 +}
3439 +
3440 +/* Initialize a blocknr_set. */
3441 +void blocknr_set_init(struct list_head *bset)
3442 +{
3443 +       INIT_LIST_HEAD(bset);
3444 +}
3445 +
3446 +/* Release the entries of a blocknr_set. */
3447 +void blocknr_set_destroy(struct list_head *bset)
3448 +{
3449 +       blocknr_set_entry *bse;
3450 +
3451 +       while (!list_empty(bset)) {
3452 +               bse = list_entry(bset->next, blocknr_set_entry, link);
3453 +               list_del_init(&bse->link);
3454 +               bse_free(bse);
3455 +       }
3456 +}
3457 +
3458 +/* Merge blocknr_set entries out of @from into @into. */
3459 +/* Audited by: green(2002.06.11) */
3460 +/* Auditor comments: This merge does not know if merged sets contain
3461 +   block pairs (as for wandered sets) or extents, so it cannot really merge
3462 +   overlapping ranges if there are any. So I believe it may lead to
3463 +   some blocks being presented several times in one blocknr_set. To help
3464 +   debugging such problems it might help to check for duplicate entries on
3465 +   actual processing of this set. Testing this kind of stuff right here is
3466 +   also complicated by the fact that these sets are not sorted and going
3467 +   through whole set on each element addition is going to be CPU-heavy task */
3468 +void blocknr_set_merge(struct list_head *from, struct list_head *into)
3469 +{
3470 +       blocknr_set_entry *bse_into = NULL;
3471 +
3472 +       /* If @from is empty, no work to perform. */
3473 +       if (list_empty(from))
3474 +               return;
3475 +       /* If @into is not empty, try merging partial-entries. */
3476 +       if (!list_empty(into)) {
3477 +
3478 +               /* Neither set is empty, pop the front two members and try to
3479 +                  combine them. */
3480 +               blocknr_set_entry *bse_from;
3481 +               unsigned into_avail;
3482 +
3483 +               bse_into = list_entry(into->next, blocknr_set_entry, link);
3484 +               list_del_init(&bse_into->link);
3485 +               bse_from = list_entry(from->next, blocknr_set_entry, link);
3486 +               list_del_init(&bse_from->link);
3487 +
3488 +               /* Combine singles. */
3489 +               for (into_avail = bse_avail(bse_into);
3490 +                    into_avail != 0 && bse_from->nr_singles != 0;
3491 +                    into_avail -= 1) {
3492 +                       bse_put_single(bse_into,
3493 +                                      &bse_from->entries[--bse_from->
3494 +                                                         nr_singles]);
3495 +               }
3496 +
3497 +               /* Combine pairs. */
3498 +               for (; into_avail > 1 && bse_from->nr_pairs != 0;
3499 +                    into_avail -= 2) {
3500 +                       struct blocknr_pair *pair =
3501 +                               bse_get_pair(bse_from, --bse_from->nr_pairs);
3502 +                       bse_put_pair(bse_into, &pair->a, &pair->b);
3503 +               }
3504 +
3505 +               /* If bse_from is empty, delete it now. */
3506 +               if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
3507 +                       bse_free(bse_from);
3508 +               } else {
3509 +                       /* Otherwise, bse_into is full or nearly full (e.g.,
3510 +                          it could have one slot avail and bse_from has one
3511 +                          pair left).  Push it back onto the list.  bse_from
3512 +                          becomes bse_into, which will be the new partial. */
3513 +                       list_add(&bse_into->link, into);
3514 +                       bse_into = bse_from;
3515 +               }
3516 +       }
3517 +
3518 +       /* Splice lists together. */
3519 +       list_splice_init(from, into->prev);
3520 +
3521 +       /* Add the partial entry back to the head of the list. */
3522 +       if (bse_into != NULL)
3523 +               list_add(&bse_into->link, into);
3524 +}
3525 +
3526 +/* Call @actor on each single and pair of @bset, freeing entries if @delete. */
3527 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
3528 +                        blocknr_set_actor_f actor, void *data, int delete)
3529 +{
3530 +
3531 +       blocknr_set_entry *entry;
3532 +
3533 +       assert("zam-429", atom != NULL);
3534 +       assert("zam-430", atom_is_protected(atom));
3535 +       assert("zam-431", bset != 0);
3536 +       assert("zam-432", actor != NULL);
3537 +
3538 +       entry = list_entry(bset->next, blocknr_set_entry, link);
3539 +       while (bset != &entry->link) {
3540 +               blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
3541 +               unsigned int i;
3542 +               int ret;
3543 +
3544 +               for (i = 0; i < entry->nr_singles; i++) {
3545 +                       ret = actor(atom, &entry->entries[i], NULL, data);
3546 +
3547 +                       /* If delete is set, ignore the error and go on. */
3548 +                       if (ret != 0 && !delete)
3549 +                               return ret;
3550 +               }
3551 +
3552 +               for (i = 0; i < entry->nr_pairs; i++) {
3553 +                       struct blocknr_pair *ab;
3554 +
3555 +                       ab = bse_get_pair(entry, i);
3556 +
3557 +                       ret = actor(atom, &ab->a, &ab->b, data);
3558 +
3559 +                       if (ret != 0 && !delete)
3560 +                               return ret;
3561 +               }
3562 +
3563 +               if (delete) {
3564 +                       list_del(&entry->link);
3565 +                       bse_free(entry);
3566 +               }
3567 +
3568 +               entry = tmp;
3569 +       }
3570 +
3571 +       return 0;
3572 +}
3573 +
3574 +/*
3575 + * Local variables:
3576 + * c-indentation-style: "K&R"
3577 + * mode-name: "LC"
3578 + * c-basic-offset: 8
3579 + * tab-width: 8
3580 + * fill-column: 79
3581 + * scroll-step: 1
3582 + * End:
3583 + */
3584 diff -puN /dev/null fs/reiser4/carry.c
3585 --- /dev/null
3586 +++ a/fs/reiser4/carry.c
3587 @@ -0,0 +1,1398 @@
3588 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
3589 +   reiser4/README */
3590 +/* Functions to "carry" tree modification(s) upward. */
3591 +/* Tree is modified one level at a time. As we modify a level we accumulate a
3592 +   set of changes that need to be propagated to the next level.  We manage
3593 +   node locking such that any searches that collide with carrying are
3594 +   restarted, from the root if necessary.
3595 +
3596 +   Insertion of a new item may result in items being moved among nodes and
3597 +   this requires the delimiting key to be updated at the least common parent
3598 +   of the nodes modified to preserve search tree invariants. Also, insertion
3599 +   may require allocation of a new node. A pointer to the new node has to be
3600 +   inserted into some node on the parent level, etc.
3601 +
3602 +   Tree carrying is meant to be analogous to arithmetic carrying.
3603 +
3604 +   A carry operation is always associated with some node (&carry_node).
3605 +
3606 +   Carry process starts with some initial set of operations to be performed
3607 +   and an initial set of already locked nodes.  Operations are performed one
3608 +   by one. Performing each single operation has following possible effects:
3609 +
3610 +    - content of carry node associated with operation is modified
3611 +    - new carry nodes are locked and involved into carry process on this level
3612 +    - new carry operations are posted to the next level
3613 +
3614 +   After all carry operations on this level are done, process is repeated for
3615 +   the accumulated sequence on carry operations for the next level. This
3616 +   starts by trying to lock (in left to right order) all carry nodes
3617 +   associated with carry operations on the parent level. After this, we decide
3618 +   whether more nodes are required on the left of already locked set. If so,
3619 +   all locks taken on the parent level are released, new carry nodes are
3620 +   added, and locking process repeats.
3621 +
3622 +   It may happen that balancing process fails owing to unrecoverable error on
3623 +   some of upper levels of a tree (possible causes are io error, failure to
3624 +   allocate new node, etc.). In this case we should unmount the filesystem,
3625 +   rebooting if it is the root, and possibly advise the use of fsck.
3626 +
3627 +   USAGE:
3628 +
3629 +    int some_tree_operation( znode *node, ... )
3630 +    {
3631 +       // Allocate on a stack pool of carry objects: operations and nodes.
3632 +       // Most carry processes will only take objects from here, without
3633 +       // dynamic allocation.
3634 +
3635 +I feel uneasy about this pool.  It adds to code complexity, I understand why it
3636 +exists, but.... -Hans
3637 +
3638 +       carry_pool  pool;
3639 +       carry_level lowest_level;
3640 +       carry_op   *op;
3641 +
3642 +       init_carry_pool( &pool );
3643 +       init_carry_level( &lowest_level, &pool );
3644 +
3645 +       // operation may be one of:
3646 +       //   COP_INSERT    --- insert new item into node
3647 +       //   COP_CUT       --- remove part of or whole node
3648 +       //   COP_PASTE     --- increase size of item
3649 +       //   COP_DELETE    --- delete pointer from parent node
3650 +       //   COP_UPDATE    --- update delimiting key in least
3651 +       //                     common ancestor of two
3652 +
3653 +       op = reiser4_post_carry( &lowest_level, operation, node, 0 );
3654 +       if( IS_ERR( op ) || ( op == NULL ) ) {
3655 +               handle error
3656 +       } else {
3657 +       // fill in remaining fields in @op, according to carry.h:carry_op
3658 +               result = reiser4_carry(&lowest_level, NULL);
3659 +       }
3660 +       done_carry_pool(&pool);
3661 +    }
3662 +
3663 +   When you are implementing node plugin method that participates in carry
3664 +   (shifting, insertion, deletion, etc.), do the following:
3665 +
3666 +   int foo_node_method(znode * node, ..., carry_level * todo)
3667 +   {
3668 +       carry_op   *op;
3669 +
3670 +       ....
3671 +
3672 +       // note, that last argument to reiser4_post_carry() is non-null
3673 +       // here, because @op is to be applied to the parent of @node, rather
3674 +       // than to the @node itself as in the previous case.
3675 +
3676 +       op = node_post_carry(todo, operation, node, 1);
3677 +       // fill in remaining fields in @op, according to carry.h:carry_op
3678 +
3679 +       ....
3680 +
3681 +   }
3682 +
3683 +   BATCHING:
3684 +
3685 +   One of the main advantages of level-by-level balancing implemented here is
3686 +   ability to batch updates on a parent level and to perform them more
3687 +   efficiently as a result.
3688 +
3689 +   Description To Be Done (TBD).
3690 +
3691 +   DIFFICULTIES AND SUBTLE POINTS:
3692 +
3693 +   1. complex plumbing is required, because:
3694 +
3695 +       a. effective allocation through pools is needed
3696 +
3697 +       b. target of operation is not exactly known when operation is
3698 +       posted. This is worked around through bitfields in &carry_node and
3699 +       logic in lock_carry_node()
3700 +
3701 +       c. of interaction with locking code: node should be added into sibling
3702 +       list when pointer to it is inserted into its parent, which is some time
3703 +       after node was created. Between these moments, node is somewhat in
3704 +       suspended state and is only registered in the carry lists
3705 +
3706 +    2. whole balancing logic is implemented here, in particular, insertion
3707 +    logic is coded in make_space().
3708 +
3709 +    3. special cases like insertion (reiser4_add_tree_root()) or deletion
3710 +    (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
3711 +    (insert_paste()) have to be handled.
3712 +
3713 +    4. there is non-trivial interdependency between allocation of new nodes
3714 +    and almost everything else. This is mainly due to the (1.c) above. I shall
3715 +    write about this later.
3716 +
3717 +*/
3718 +
3719 +#include "forward.h"
3720 +#include "debug.h"
3721 +#include "key.h"
3722 +#include "coord.h"
3723 +#include "plugin/item/item.h"
3724 +#include "plugin/item/extent.h"
3725 +#include "plugin/node/node.h"
3726 +#include "jnode.h"
3727 +#include "znode.h"
3728 +#include "tree_mod.h"
3729 +#include "tree_walk.h"
3730 +#include "block_alloc.h"
3731 +#include "pool.h"
3732 +#include "tree.h"
3733 +#include "carry.h"
3734 +#include "carry_ops.h"
3735 +#include "super.h"
3736 +#include "reiser4.h"
3737 +
3738 +#include <linux/types.h>
3739 +
3740 +/* level locking/unlocking */
3741 +static int lock_carry_level(carry_level * level);
3742 +static void unlock_carry_level(carry_level * level, int failure);
3743 +static void done_carry_level(carry_level * level);
3744 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
3745 +
3746 +int lock_carry_node(carry_level * level, carry_node * node);
3747 +int lock_carry_node_tail(carry_node * node);
3748 +
3749 +/* carry processing proper */
3750 +static int carry_on_level(carry_level * doing, carry_level * todo);
3751 +
3752 +static carry_op *add_op(carry_level * level, pool_ordering order,
3753 +                       carry_op * reference);
3754 +
3755 +/* handlers for carry operations. */
3756 +
3757 +static void fatal_carry_error(carry_level * doing, int ecode);
3758 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
3759 +
3760 +static void print_level(const char *prefix, carry_level * level);
3761 +
3762 +#if REISER4_DEBUG
3763 +typedef enum {
3764 +       CARRY_TODO,
3765 +       CARRY_DOING
3766 +} carry_queue_state;
3767 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
3768 +#endif
3769 +
3770 +/* main entry point for tree balancing.
3771 +
3772 +   Tree carry performs operations from @doing and while doing so accumulates
3773 +   information about operations to be performed on the next level ("carried"
3774 +   to the parent level). Carried operations are performed, causing possibly
3775 +   more operations to be carried upward etc. reiser4_carry() takes care of
3776 +   locking and pinning znodes while operating on them.
3777 +
3778 +   For usage, see comment at the top of fs/reiser4/carry.c.  The two
3779 +   carry_level's following @doing in memory are used as @todo and @done.
3780 +*/
3781 +int reiser4_carry(carry_level * doing /* set of carry operations to be
3782 +                                      * performed */ ,
3783 +                 carry_level * done  /* set of nodes, already performed
3784 +                                      *  at the previous level.
3785 +                                      * Must be NULL, see BUG_ON() below */)
3786 +{
3787 +       int result = 0;
3788 +       /* queue of new requests */
3789 +       carry_level *todo;
3790 +       ON_DEBUG(STORE_COUNTERS);
3791 +
3792 +       assert("nikita-888", doing != NULL);
3793 +       BUG_ON(done != NULL);
3794 +
3795 +       todo = doing + 1;
3796 +       init_carry_level(todo, doing->pool);
3797 +
3798 +       /* queue of requests performed on the previous level */
3799 +       done = todo + 1;
3800 +       init_carry_level(done, doing->pool);
3801 +
3802 +       /* iterate until there is nothing more to do */
3803 +       while (result == 0 && doing->ops_num > 0) {
3804 +               carry_level *tmp;
3805 +
3806 +               /* at this point @done is locked. */
3807 +               /* repeat lock/do/unlock while
3808 +
3809 +                  (1) lock_carry_level() fails due to deadlock avoidance, or
3810 +
3811 +                  (2) carry_on_level() decides that more nodes have to
3812 +                  be involved.
3813 +
3814 +                  (3) some unexpected error occurred while balancing on the
3815 +                  upper levels. In this case all changes are rolled back.
3816 +
3817 +                */
3818 +               while (1) {
3819 +                       result = lock_carry_level(doing);
3820 +                       if (result == 0) {
3821 +                               /* perform operations from @doing and
3822 +                                  accumulate new requests in @todo */
3823 +                               result = carry_on_level(doing, todo);
3824 +                               if (result == 0)
3825 +                                       break;
3826 +                               else if (result != -E_REPEAT ||
3827 +                                        !doing->restartable) {
3828 +                                       warning("nikita-1043",
3829 +                                               "Fatal error during carry: %i",
3830 +                                               result);
3831 +                                       print_level("done", done);
3832 +                                       print_level("doing", doing);
3833 +                                       print_level("todo", todo);
3834 +                                       /* do some rough stuff like aborting
3835 +                                          all pending transcrashes and thus
3836 +                                          pushing tree back to the consistent
3837 +                                          state. Alternatively, just panic.
3838 +                                        */
3839 +                                       fatal_carry_error(doing, result);
3840 +                                       return result;
3841 +                               }
3842 +                       } else if (result != -E_REPEAT) {
3843 +                               fatal_carry_error(doing, result);
3844 +                               return result;
3845 +                       }
3846 +                       unlock_carry_level(doing, 1);
3847 +               }
3848 +               /* at this point @done can be safely unlocked */
3849 +               done_carry_level(done);
3850 +
3851 +               /* cyclically shift queues */
3852 +               tmp = done;
3853 +               done = doing;
3854 +               doing = todo;
3855 +               todo = tmp;
3856 +               init_carry_level(todo, doing->pool);
3857 +
3858 +               /* give other threads chance to run */
3859 +               reiser4_preempt_point();
3860 +       }
3861 +       done_carry_level(done);
3862 +
3863 +       /* all counters, but x_refs should remain the same. x_refs can change
3864 +          owing to transaction manager */
3865 +       ON_DEBUG(CHECK_COUNTERS);
3866 +       return result;
3867 +}
3868 +
3869 +/* perform carry operations on given level.
3870 +
3871 +   Optimizations proposed by pooh:
3872 +
3873 +   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
3874 +   required;
3875 +
3876 +   (2) unlock node if there are no more operations to be performed upon it and
3877 +   node didn't add any operation to @todo. This can be implemented by
3878 +   attaching to each node two counters: counter of operations working on this
3879 +   node and counter of operations carried upward from this node.
3880 +
3881 +*/
3882 +static int carry_on_level(carry_level * doing  /* queue of carry operations to
3883 +                                                * do on this level */ ,
3884 +                         carry_level * todo    /* queue where new carry
3885 +                                                * operations to be performed on
3886 +                                                * the parent level are
3887 +                                                * accumulated during @doing
3888 +                                                * processing. */ )
3889 +{
3890 +       int result;
3891 +       int (*f) (carry_op *, carry_level *, carry_level *);
3892 +       carry_op *op;
3893 +       carry_op *tmp_op;
3894 +
3895 +       assert("nikita-1034", doing != NULL);
3896 +       assert("nikita-1035", todo != NULL);
3897 +
3898 +       /* @doing->nodes are locked. */
3899 +
3900 +       /* This function can be split into two phases: analysis and modification
3901 +
3902 +          Analysis calculates precisely what items should be moved between
3903 +          nodes. This information is gathered in some structures attached to
3904 +          each carry_node in a @doing queue. Analysis also determines whether
3905 +          new nodes are to be allocated etc.
3906 +
3907 +          After analysis is completed, actual modification is performed. Here
3908 +          we can take advantage of "batch modification": if there are several
3909 +          operations acting on the same node, modifications can be performed
3910 +          more efficiently when batched together.
3911 +
3912 +          Above is an optimization left for the future.
3913 +        */
3914 +       /* Important, but delayed optimization: it's possible to batch
3915 +          operations together and perform them more efficiently as a
3916 +          result. For example, deletion of several neighboring items from a
3917 +          node can be converted to a single ->cut() operation.
3918 +
3919 +          Before processing queue, it should be scanned and "mergeable"
3920 +          operations merged.
3921 +        */
3922 +       result = 0;
3923 +       for_all_ops(doing, op, tmp_op) {
3924 +               carry_opcode opcode;
3925 +
3926 +               assert("nikita-1041", op != NULL);
3927 +               opcode = op->op;
3928 +               assert("nikita-1042", op->op < COP_LAST_OP);
3929 +               f = op_dispatch_table[op->op].handler;
3930 +               result = f(op, doing, todo);
3931 +               /* locking can fail with -E_REPEAT. Any different error is fatal
3932 +                  and will be handled by fatal_carry_error() sledgehammer.
3933 +                */
3934 +               if (result != 0)
3935 +                       break;
3936 +       }
3937 +       if (result == 0) {
3938 +               carry_plugin_info info;
3939 +               carry_node *scan;
3940 +               carry_node *tmp_scan;
3941 +
3942 +               info.doing = doing;
3943 +               info.todo = todo;
3944 +
3945 +               assert("nikita-3002",
3946 +                      carry_level_invariant(doing, CARRY_DOING));
3947 +               for_all_nodes(doing, scan, tmp_scan) {
3948 +                       znode *node;
3949 +
3950 +                       node = reiser4_carry_real(scan);
3951 +                       assert("nikita-2547", node != NULL);
3952 +                       if (node_is_empty(node)) {
3953 +                               result =
3954 +                                   node_plugin_by_node(node)->
3955 +                                   prepare_removal(node, &info);
3956 +                               if (result != 0)
3957 +                                       break;
3958 +                       }
3959 +               }
3960 +       }
3961 +       return result;
3962 +}
3963 +
3964 +/* post carry operation
3965 +
3966 +   This is main function used by external carry clients: node layout plugins
3967 +   and tree operations to create new carry operation to be performed on some
3968 +   level.
3969 +
3970 +   New operation will be included in the @level queue. To actually perform it,
3971 +   call carry( level, ... ). This function expects @node to be write locked. Carry
3972 +   manages all its locks by itself, don't worry about this.
3973 +
3974 +   This function adds operation and node at the end of the queue. It is up to
3975 +   caller to guarantee proper ordering of node queue.
3976 +
3977 +*/
3978 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
3979 +                                                  * is to be posted at */ ,
3980 +                             carry_opcode op /* opcode of operation */ ,
3981 +                             znode * node      /* node on which this operation
3982 +                                                * will operate */ ,
3983 +                             int apply_to_parent_p /* whether operation will
3984 +                                                    * operate directly on @node
3985 +                                                    * or on its parent. */)
3986 +{
3987 +       carry_op *result;
3988 +       carry_node *child;
3989 +
3990 +       assert("nikita-1046", level != NULL);
3991 +       assert("nikita-1788", znode_is_write_locked(node));
3992 +
3993 +       result = add_op(level, POOLO_LAST, NULL);
3994 +       if (IS_ERR(result))
3995 +               return result;
3996 +       child = reiser4_add_carry(level, POOLO_LAST, NULL);
3997 +       if (IS_ERR(child)) {
3998 +               reiser4_pool_free(&level->pool->op_pool, &result->header);
3999 +               return (carry_op *) child;
4000 +       }
4001 +       result->node = child;
4002 +       result->op = op;
4003 +       child->parent = apply_to_parent_p;
4004 +       if (ZF_ISSET(node, JNODE_ORPHAN))
4005 +               child->left_before = 1;
4006 +       child->node = node;
4007 +       return result;
4008 +}
4009 +
4010 +/* initialize carry queue */
4011 +void init_carry_level(carry_level * level /* level to initialize */ ,
4012 +                     carry_pool * pool /* pool @level will allocate objects
4013 +                                        * from */ )
4014 +{
4015 +       assert("nikita-1045", level != NULL);
4016 +       assert("nikita-967", pool != NULL);
4017 +
4018 +       memset(level, 0, sizeof *level);
4019 +       level->pool = pool;
4020 +
4021 +       INIT_LIST_HEAD(&level->nodes);
4022 +       INIT_LIST_HEAD(&level->ops);
4023 +}
4024 +
4025 +/* allocate carry pool and initialize pools within queue */
4026 +carry_pool *init_carry_pool(int size)
4027 +{
4028 +       carry_pool *pool;
4029 +
4030 +       assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
4031 +       pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
4032 +       if (pool == NULL)
4033 +               return ERR_PTR(RETERR(-ENOMEM));
4034 +
4035 +       reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
4036 +                         (char *)pool->op);
4037 +       reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
4038 +                         NODES_LOCKED_POOL_SIZE, (char *)pool->node);
4039 +       return pool;
4040 +}
4041 +
4042 +/* finish with queue pools */
4043 +void done_carry_pool(carry_pool * pool/* pool to destroy */)
4044 +{
4045 +       reiser4_done_pool(&pool->op_pool);
4046 +       reiser4_done_pool(&pool->node_pool);
4047 +       kfree(pool);
4048 +}
4049 +
4050 +/* add new carry node to the @level.
4051 +
4052 +   Returns pointer to the new carry node allocated from pool.  It's up to
4053 +   callers to maintain proper order in the @level. Assumption is that if carry
4054 +   nodes on one level are already sorted and modifications are performed from
4055 +   left to right, carry nodes added on the parent level will be ordered
4056 +   automatically. To control ordering use @order and @reference parameters.
4057 +
4058 +*/
4059 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
4060 +                                                        * node to */ ,
4061 +                                  pool_ordering order  /* where to insert:
4062 +                                                        * at the beginning of
4063 +                                                        * @level,
4064 +                                                        * before @reference,
4065 +                                                        * after @reference,
4066 +                                                        * at the end of @level
4067 +                                                        */ ,
4068 +                                  carry_node * reference/* reference node for
4069 +                                                         * insertion */)
4070 +{
4071 +       ON_DEBUG(carry_node * orig_ref = reference);
4072 +
4073 +       if (order == POOLO_BEFORE) {
4074 +               reference = find_left_carry(reference, level);
4075 +               if (reference == NULL)
4076 +                       reference = list_entry(level->nodes.next, carry_node,
4077 +                                              header.level_linkage);
4078 +               else
4079 +                       reference = list_entry(reference->header.level_linkage.next,
4080 +                                              carry_node, header.level_linkage);
4081 +       } else if (order == POOLO_AFTER) {
4082 +               reference = find_right_carry(reference, level);
4083 +               if (reference == NULL)
4084 +                       reference = list_entry(level->nodes.prev, carry_node,
4085 +                                              header.level_linkage);
4086 +               else
4087 +                       reference = list_entry(reference->header.level_linkage.prev,
4088 +                                              carry_node, header.level_linkage);
4089 +       }
4090 +       assert("nikita-2209",
4091 +              ergo(orig_ref != NULL,
4092 +                   reiser4_carry_real(reference) ==
4093 +                   reiser4_carry_real(orig_ref)));
4094 +       return reiser4_add_carry(level, order, reference);
4095 +}
4096 +
4097 +carry_node *reiser4_add_carry(carry_level * level,   /* carry_level to add
4098 +                                                       node to */
4099 +                             pool_ordering order,   /* where to insert:
4100 +                                                     * at the beginning of
4101 +                                                     * @level;
4102 +                                                     * before @reference;
4103 +                                                     * after @reference;
4104 +                                                     * at the end of @level
4105 +                                                     */
4106 +                             carry_node * reference /* reference node for
4107 +                                                     * insertion */)
4108 +{
4109 +       carry_node *result;
4110 +
4111 +       result =
4112 +           (carry_node *) reiser4_add_obj(&level->pool->node_pool,
4113 +                                          &level->nodes,
4114 +                                          order, &reference->header);
4115 +       if (!IS_ERR(result) && (result != NULL))
4116 +               ++level->nodes_num;
4117 +       return result;
4118 +}
4119 +
4120 +/**
4121 + * add new carry operation to the @level.
4122 + *
4123 + * Returns pointer to the new carry operations allocated from pool. It's up to
4124 + * callers to maintain proper order in the @level. To control ordering use
4125 + * @order and @reference parameters.
4126 + */
4127 +static carry_op *add_op(carry_level * level, /* &carry_level to add node to */
4128 +                       pool_ordering order, /* where to insert:
4129 +                                             * at the beginning of @level;
4130 +                                             * before @reference;
4131 +                                             * after @reference;
4132 +                                             * at the end of @level */
4133 +                       carry_op * reference /* reference node for insertion */)
4134 +{
4135 +       carry_op *result;
4136 +
4137 +       result =
4138 +           (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
4139 +                                        order, &reference->header);
4140 +       if (!IS_ERR(result) && (result != NULL))
4141 +               ++level->ops_num;
4142 +       return result;
4143 +}
4144 +
4145 +/**
4146 + * Return node on the right of which @node was created.
4147 + *
4148 + * Each node is created on the right of some existing node (or it is new root,
4149 + * which is special case not handled here).
4150 + *
4151 + * @node is new node created on some level, but not yet inserted into its
4152 + * parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
4153 + */
4154 +static carry_node *find_begetting_brother(carry_node * node,/* node to start
4155 +                                                               search from */
4156 +                                         carry_level * kin UNUSED_ARG
4157 +                                                           /* level to scan */)
4158 +{
4159 +       carry_node *scan;
4160 +
4161 +       assert("nikita-1614", node != NULL);
4162 +       assert("nikita-1615", kin != NULL);
4163 +       assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
4164 +       assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
4165 +                                  ZF_ISSET(reiser4_carry_real(node),
4166 +                                           JNODE_ORPHAN)));
4167 +       for (scan = node;;
4168 +            scan = list_entry(scan->header.level_linkage.prev, carry_node,
4169 +                              header.level_linkage)) {
4170 +               assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
4171 +               if ((scan->node != node->node) &&
4172 +                   !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
4173 +                       assert("nikita-1618", reiser4_carry_real(scan) != NULL);
4174 +                       break;
4175 +               }
4176 +       }
4177 +       return scan;
4178 +}
4179 +
4180 +static cmp_t
4181 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
4182 +{
4183 +       assert("nikita-2199", n1 != NULL);
4184 +       assert("nikita-2200", n2 != NULL);
4185 +
4186 +       if (n1 == n2)
4187 +               return EQUAL_TO;
4188 +       while (1) {
4189 +               n1 = carry_node_next(n1);
4190 +               if (carry_node_end(level, n1))
4191 +                       return GREATER_THAN;
4192 +               if (n1 == n2)
4193 +                       return LESS_THAN;
4194 +       }
4195 +       impossible("nikita-2201", "End of level reached");
4196 +}
4197 +
4198 +carry_node *find_carry_node(carry_level * level, const znode * node)
4199 +{
4200 +       carry_node *scan;
4201 +       carry_node *tmp_scan;
4202 +
4203 +       assert("nikita-2202", level != NULL);
4204 +       assert("nikita-2203", node != NULL);
4205 +
4206 +       for_all_nodes(level, scan, tmp_scan) {
4207 +               if (reiser4_carry_real(scan) == node)
4208 +                       return scan;
4209 +       }
4210 +       return NULL;
4211 +}
4212 +
4213 +znode *reiser4_carry_real(const carry_node * node)
4214 +{
4215 +       assert("nikita-3061", node != NULL);
4216 +
4217 +       return node->lock_handle.node;
4218 +}
4219 +
4220 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
4221 +                             const znode * node)
4222 +{
4223 +       carry_node *base;
4224 +       carry_node *scan;
4225 +       carry_node *tmp_scan;
4226 +       carry_node *proj;
4227 +
4228 +       base = find_carry_node(doing, node);
4229 +       assert("nikita-2204", base != NULL);
4230 +
4231 +       for_all_nodes(todo, scan, tmp_scan) {
4232 +               proj = find_carry_node(doing, scan->node);
4233 +               assert("nikita-2205", proj != NULL);
4234 +               if (carry_node_cmp(doing, proj, base) != LESS_THAN)
4235 +                       break;
4236 +       }
4237 +       return scan;
4238 +}
4239 +
4240 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
4241 +                                    znode * node)
4242 +{
4243 +       carry_node *reference;
4244 +
4245 +       assert("nikita-2994", doing != NULL);
4246 +       assert("nikita-2995", todo != NULL);
4247 +       assert("nikita-2996", node != NULL);
4248 +
4249 +       reference = insert_carry_node(doing, todo, node);
4250 +       assert("nikita-2997", reference != NULL);
4251 +
4252 +       return reiser4_add_carry(todo, POOLO_BEFORE, reference);
4253 +}
4254 +
4255 +/* like reiser4_post_carry(), but designed to be called from node plugin
4256 +   methods. This function is different from reiser4_post_carry() in that it
4257 +   finds proper place to insert node in the queue. */
4258 +carry_op *node_post_carry(carry_plugin_info * info     /* carry parameters
4259 +                                                        * passed down to node
4260 +                                                        * plugin */ ,
4261 +                         carry_opcode op /* opcode of operation */ ,
4262 +                         znode * node  /* node on which this
4263 +                                        * operation will operate */ ,
4264 +                         int apply_to_parent_p /* whether operation will
4265 +                                                * operate directly on @node
4266 +                                                * or on its parent. */ )
4267 +{
4268 +       carry_op *result;
4269 +       carry_node *child;
4270 +
4271 +       assert("nikita-2207", info != NULL);
4272 +       assert("nikita-2208", info->todo != NULL);
4273 +
4274 +       if (info->doing == NULL)
4275 +               return reiser4_post_carry(info->todo, op, node,
4276 +                                         apply_to_parent_p);
4277 +
4278 +       result = add_op(info->todo, POOLO_LAST, NULL);
4279 +       if (IS_ERR(result))
4280 +               return result;
4281 +       child = add_carry_atplace(info->doing, info->todo, node);
4282 +       if (IS_ERR(child)) {
4283 +               reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
4284 +               return (carry_op *) child;
4285 +       }
4286 +       result->node = child;
4287 +       result->op = op;
4288 +       child->parent = apply_to_parent_p;
4289 +       if (ZF_ISSET(node, JNODE_ORPHAN))
4290 +               child->left_before = 1;
4291 +       child->node = node;
4292 +       return result;
4293 +}
4294 +
4295 +/* lock all carry nodes in @level */
4296 +static int lock_carry_level(carry_level * level/* level to lock */)
4297 +{
4298 +       int result;
4299 +       carry_node *node;
4300 +       carry_node *tmp_node;
4301 +
4302 +       assert("nikita-881", level != NULL);
4303 +       assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
4304 +
4305 +       /* lock nodes from left to right */
4306 +       result = 0;
4307 +       for_all_nodes(level, node, tmp_node) {
4308 +               result = lock_carry_node(level, node);
4309 +               if (result != 0)
4310 +                       break;
4311 +       }
4312 +       return result;
4313 +}
4314 +
4315 +/* Synchronize delimiting keys between @node and its left neighbor.
4316 +
4317 +   To reduce contention on dk key and simplify carry code, we synchronize
4318 +   delimiting keys only when carry ultimately leaves tree level (carrying
4319 +   changes upward) and unlocks nodes at this level.
4320 +
4321 +   This function first finds left neighbor of @node and then updates left
4322 +   neighbor's right delimiting key to coincide with least key in @node.
4323 +
4324 +*/
4325 +
4326 +ON_DEBUG(extern atomic_t delim_key_version;
4327 +    )
4328 +
4329 +static void sync_dkeys(znode * spot/* node to update */)
4330 +{
4331 +       reiser4_key pivot;
4332 +       reiser4_tree *tree;
4333 +
4334 +       assert("nikita-1610", spot != NULL);
4335 +       assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
4336 +
4337 +       tree = znode_get_tree(spot);
4338 +       read_lock_tree(tree);
4339 +       write_lock_dk(tree);
4340 +
4341 +       assert("nikita-2192", znode_is_loaded(spot));
4342 +
4343 +       /* sync left delimiting key of @spot with key in its leftmost item */
4344 +       if (node_is_empty(spot))
4345 +               pivot = *znode_get_rd_key(spot);
4346 +       else
4347 +               leftmost_key_in_node(spot, &pivot);
4348 +
4349 +       znode_set_ld_key(spot, &pivot);
4350 +
4351 +       /* there can be sequence of empty nodes pending removal on the left of
4352 +          @spot. Scan them and update their left and right delimiting keys to
4353 +          match left delimiting key of @spot. Also, update right delimiting
4354 +          key of first non-empty left neighbor.
4355 +        */
4356 +       while (1) {
4357 +               if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
4358 +                       break;
4359 +
4360 +               spot = spot->left;
4361 +               if (spot == NULL)
4362 +                       break;
4363 +
4364 +               znode_set_rd_key(spot, &pivot);
4365 +               /* don't sink into the domain of another balancing */
4366 +               if (!znode_is_write_locked(spot))
4367 +                       break;
4368 +               if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
4369 +                       znode_set_ld_key(spot, &pivot);
4370 +               else
4371 +                       break;
4372 +       }
4373 +
4374 +       write_unlock_dk(tree);
4375 +       read_unlock_tree(tree);
4376 +}
4377 +
4378 +/* unlock all carry nodes in @level */
4379 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
4380 +                              int failure      /* true if unlocking owing to
4381 +                                                * failure */ )
4382 +{
4383 +       carry_node *node;
4384 +       carry_node *tmp_node;
4385 +
4386 +       assert("nikita-889", level != NULL);
4387 +
4388 +       if (!failure) {
4389 +               znode *spot;
4390 +
4391 +               spot = NULL;
4392 +               /* update delimiting keys */
4393 +               for_all_nodes(level, node, tmp_node) {
4394 +                       if (reiser4_carry_real(node) != spot) {
4395 +                               spot = reiser4_carry_real(node);
4396 +                               sync_dkeys(spot);
4397 +                       }
4398 +               }
4399 +       }
4400 +
4401 +       /* nodes can be unlocked in arbitrary order.  In preemptible
4402 +          environment it's better to unlock in reverse order of locking,
4403 +          though.
4404 +        */
4405 +       for_all_nodes_back(level, node, tmp_node) {
4406 +               /* all allocated nodes should be already linked to their
4407 +                  parents at this moment. */
4408 +               assert("nikita-1631",
4409 +                      ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
4410 +                                               JNODE_ORPHAN)));
4411 +               ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
4412 +               unlock_carry_node(level, node, failure);
4413 +       }
4414 +       level->new_root = NULL;
4415 +}
4416 +
4417 +/* finish with @level
4418 +
4419 +   Unlock nodes and release all allocated resources */
4420 +static void done_carry_level(carry_level * level/* level to finish */)
4421 +{
4422 +       carry_node *node;
4423 +       carry_node *tmp_node;
4424 +       carry_op *op;
4425 +       carry_op *tmp_op;
4426 +
4427 +       assert("nikita-1076", level != NULL);
4428 +
4429 +       unlock_carry_level(level, 0);
4430 +       for_all_nodes(level, node, tmp_node) {
4431 +               assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
4432 +               assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
4433 +               reiser4_pool_free(&level->pool->node_pool, &node->header);
4434 +       }
4435 +       for_all_ops(level, op, tmp_op)
4436 +           reiser4_pool_free(&level->pool->op_pool, &op->header);
4437 +}
4438 +
4439 +/* helper function to complete locking of carry node
4440 +
4441 +   Finish locking of carry node. There are several ways in which new carry
4442 +   node can be added into carry level and locked. Normal is through
4443 +   lock_carry_node(), but also from find_{left|right}_neighbor(). This
4444 +   function factors out common final part of all locking scenarios. It
4445 +   supposes that @node -> lock_handle is lock handle for lock just taken and
4446 +   fills ->real_node from this lock handle.
4447 +
4448 +*/
4449 +int lock_carry_node_tail(carry_node * node/* node to complete locking of */)
4450 +{
4451 +       assert("nikita-1052", node != NULL);
4452 +       assert("nikita-1187", reiser4_carry_real(node) != NULL);
4453 +       assert("nikita-1188", !node->unlock);
4454 +
4455 +       node->unlock = 1;
4456 +       /* Load node content into memory and install node plugin by
4457 +          looking at the node header.
4458 +
4459 +          Most of the time this call is cheap because the node is
4460 +          already in memory.
4461 +
4462 +          Corresponding zrelse() is in unlock_carry_node()
4463 +        */
4464 +       return zload(reiser4_carry_real(node));
4465 +}
4466 +
4467 +/* lock carry node
4468 +
4469 +   "Resolve" node to real znode, lock it and mark as locked.
4470 +   This requires recursive locking of znodes.
4471 +
4472 +   When operation is posted to the parent level, node it will be applied to is
4473 +   not yet known. For example, when shifting data between two nodes,
4474 +   delimiting key has to be updated in parent or parents of nodes involved. But
4475 +   their parents are not yet locked and, moreover, said nodes can be reparented
4476 +   by concurrent balancing.
4477 +
4478 +   To work around this, carry operation is applied to special "carry node"
4479 +   rather than to the znode itself. Carry node consists of some "base" or
4480 +   "reference" znode and flags indicating how to get to the target of carry
4481 +   operation (->real_node field of carry_node) from base.
4482 +
4483 +*/
4484 +int lock_carry_node(carry_level * level /* level @node is in */ ,
4485 +                   carry_node * node/* node to lock */)
4486 +{
4487 +       int result;
4488 +       znode *reference_point;
4489 +       lock_handle lh;
4490 +       lock_handle tmp_lh;
4491 +       reiser4_tree *tree;
4492 +
4493 +       assert("nikita-887", level != NULL);
4494 +       assert("nikita-882", node != NULL);
4495 +
4496 +       result = 0;
4497 +       reference_point = node->node;
4498 +       init_lh(&lh);
4499 +       init_lh(&tmp_lh);
4500 +       if (node->left_before) {
4501 +               /* handling of new nodes, allocated on the previous level:
4502 +
4503 +                  some carry ops were probably posted from the new node, but
4504 +                  this node neither has parent pointer set, nor is
4505 +                  connected. This will be done in ->create_hook() for
4506 +                  internal item.
4507 +
4508 +                  Nonetheless, parent of new node has to be locked. To do
4509 +                  this, first go to the "left" in the carry order. This
4510 +                  depends on the decision to always allocate new node on the
4511 +                  right of existing one.
4512 +
4513 +                  Loop handles case when multiple nodes, all orphans, were
4514 +                  inserted.
4515 +
4516 +                  Strictly speaking, taking tree lock is not necessary here,
4517 +                  because all nodes scanned by loop in
4518 +                  find_begetting_brother() are write-locked by this thread,
4519 +                  and thus, their sibling linkage cannot change.
4520 +
4521 +                */
4522 +               tree = znode_get_tree(reference_point);
4523 +               read_lock_tree(tree);
4524 +               reference_point = find_begetting_brother(node, level)->node;
4525 +               read_unlock_tree(tree);
4526 +               assert("nikita-1186", reference_point != NULL);
4527 +       }
4528 +       if (node->parent && (result == 0)) {
4529 +               result =
4530 +                   reiser4_get_parent(&tmp_lh, reference_point,
4531 +                                      ZNODE_WRITE_LOCK);
4532 +               if (result != 0) {
4533 +                       ;       /* nothing */
4534 +               } else if (znode_get_level(tmp_lh.node) == 0) {
4535 +                       assert("nikita-1347", znode_above_root(tmp_lh.node));
4536 +                       result = add_new_root(level, node, tmp_lh.node);
4537 +                       if (result == 0) {
4538 +                               reference_point = level->new_root;
4539 +                               move_lh(&lh, &node->lock_handle);
4540 +                       }
4541 +               } else if ((level->new_root != NULL)
4542 +                          && (level->new_root !=
4543 +                              znode_parent_nolock(reference_point))) {
4544 +                       /* parent of node exists, but this level already
4545 +                          created different new root, so */
4546 +                       warning("nikita-1109",
4547 +                               /* it should be "radicis", but tradition is
4548 +                                  tradition.  do banshees read latin? */
4549 +                               "hodie natus est radici frater");
4550 +                       result = -EIO;
4551 +               } else {
4552 +                       move_lh(&lh, &tmp_lh);
4553 +                       reference_point = lh.node;
4554 +               }
4555 +       }
4556 +       if (node->left && (result == 0)) {
4557 +               assert("nikita-1183", node->parent);
4558 +               assert("nikita-883", reference_point != NULL);
4559 +               result =
4560 +                   reiser4_get_left_neighbor(&tmp_lh, reference_point,
4561 +                                             ZNODE_WRITE_LOCK,
4562 +                                             GN_CAN_USE_UPPER_LEVELS);
4563 +               if (result == 0) {
4564 +                       done_lh(&lh);
4565 +                       move_lh(&lh, &tmp_lh);
4566 +                       reference_point = lh.node;
4567 +               }
4568 +       }
4569 +       if (!node->parent && !node->left && !node->left_before) {
4570 +               result =
4571 +                   longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
4572 +                                       ZNODE_LOCK_HIPRI);
4573 +       }
4574 +       if (result == 0) {
4575 +               move_lh(&node->lock_handle, &lh);
4576 +               result = lock_carry_node_tail(node);
4577 +       }
4578 +       done_lh(&tmp_lh);
4579 +       done_lh(&lh);
4580 +       return result;
4581 +}
4582 +
4583 +/* release a lock on &carry_node.
4584 +
4585 +   Release if necessary lock on @node. This operation is pair of
4586 +   lock_carry_node() and is idempotent: you can call it more than once on the
4587 +   same node.
4588 +
4589 +*/
4590 +static void
4591 +unlock_carry_node(carry_level * level /* level whose pool @node is freed to */ ,
4592 +                 carry_node * node /* node to be released */ ,
4593 +                 int failure   /* non-0 if node is unlocked due
4594 +                                * to some error */ )
4595 +{
4596 +       znode *real_node;
4597 +
4598 +       assert("nikita-884", node != NULL);
4599 +
4600 +       real_node = reiser4_carry_real(node);
4601 +       /* pair to zload() in lock_carry_node_tail() */
4602 +       zrelse(real_node);
4603 +       if (node->unlock && (real_node != NULL)) {
4604 +               assert("nikita-899", real_node == node->lock_handle.node);
4605 +               longterm_unlock_znode(&node->lock_handle);
4606 +       }
4607 +       if (failure) {
4608 +               if (node->deallocate && (real_node != NULL)) {
4609 +                       /* free node in bitmap
4610 +
4611 +                          Prepare node for removal. Last zput() will finish
4612 +                          with it.
4613 +                        */
4614 +                       ZF_SET(real_node, JNODE_HEARD_BANSHEE);
4615 +               }
4616 +               if (node->free) {
4617 +                       assert("nikita-2177",
4618 +                              list_empty_careful(&node->lock_handle.locks_link));
4619 +                       assert("nikita-2112",
4620 +                              list_empty_careful(&node->lock_handle.owners_link));
4621 +                       reiser4_pool_free(&level->pool->node_pool,
4622 +                                         &node->header);
4623 +               }
4624 +       }
4625 +}
4626 +
4627 +/* fatal_carry_error() - all-catching error handling function
4628 +
4629 +   It is possible that carry faces unrecoverable error, like inability to
4630 +   insert pointer at the internal level. Our simple solution is just panic in
4631 +   this situation. More sophisticated things like attempt to remount
4632 +   file-system as read-only can be implemented without much difficulties.
4633 +
4634 +   It is believed, that:
4635 +
4636 +   1. instead of panicking, all current transactions can be aborted rolling
4637 +   system back to the consistent state.
4638 +
4639 +Umm, if you simply panic without doing anything more at all, then all current
4640 +transactions are aborted and the system is rolled back to a consistent state,
4641 +by virtue of the design of the transactional mechanism. Well, wait, let's be
4642 +precise.  If an internal node is corrupted on disk due to hardware failure,
4643 +then there may be no consistent state that can be rolled back to, so instead
4644 +we should say that it will rollback the transactions, which barring other
4645 +factors means rolling back to a consistent state.
4646 +
4647 +# Nikita: there is a subtle difference between panic and aborting
4648 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
4649 +# not using reiser4 (not that we care about such processes), or using other
4650 +# reiser4 mounts (about them we do care) will simply continue to run. With
4651 +# some luck, even application using aborted file system can survive: it will
4652 +# get some error, like EBADF, from each file descriptor on failed file system,
4653 +# but applications that do care about tolerance will cope with this (squid
4654 +# will).
4655 +
4656 +It would be a nice feature though to support rollback without rebooting
4657 +followed by remount, but this can wait for later versions.
4658 +
4659 +   2. once isolated transactions will be implemented it will be possible to
4660 +   roll back offending transaction.
4661 +
4662 +2. is additional code complexity of inconsistent value (it implies that a
4663 +broken tree should be kept in operation), so we must think about it more
4664 +before deciding if it should be done.  -Hans
4665 +
4666 +*/
4667 +static void fatal_carry_error(carry_level * doing UNUSED_ARG   /* carry level
4668 +                                                                * where
4669 +                                                                * unrecoverable
4670 +                                                                * error
4671 +                                                                * occurred */ ,
4672 +                             int ecode/* error code, negative */)
4673 +{
4674 +       assert("nikita-1230", doing != NULL);
4675 +       assert("nikita-1231", ecode < 0);
4676 +
4677 +       reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
4678 +}
4679 +
4680 +/**
4681 + * Add new root to the tree
4682 + *
4683 + * This function itself only manages changes in carry structures and delegates
4684 + * all hard work (allocation of znode for new root, changes of parent and
4685 + * sibling pointers) to reiser4_add_tree_root().
4686 + *
4687 + * Locking: old tree root is locked by carry at this point. Fake znode is also
4688 + * locked. Returns 0, or the error from creating or locking the new root.
4689 + */
4690 +static int add_new_root(carry_level * level,/* carry level in context of which
4691 +                                            * operation is performed */
4692 +                       carry_node * node,  /* carry node for existing root */
4693 +                       znode * fake        /* "fake" znode already locked by
4694 +                                            * us */)
4695 +{
4696 +       int result;
4697 +
4698 +       assert("nikita-1104", level != NULL);
4699 +       assert("nikita-1105", node != NULL);
4700 +
4701 +       assert("nikita-1403", znode_is_write_locked(node->node));
4702 +       assert("nikita-1404", znode_is_write_locked(fake));
4703 +
4704 +       /* trying to create new root. */
4705 +       /* @node is root and it's already locked by us. This
4706 +          means that nobody else can be trying to add/remove
4707 +          tree root right now.
4708 +        */
4709 +       if (level->new_root == NULL)
4710 +               level->new_root = reiser4_add_tree_root(node->node, fake);
4711 +       if (!IS_ERR(level->new_root)) {
4712 +               assert("nikita-1210", znode_is_root(level->new_root));
4713 +               node->deallocate = 1;
4714 +               result =
4715 +                   longterm_lock_znode(&node->lock_handle, level->new_root,
4716 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
4717 +               if (result == 0)
4718 +                       zput(level->new_root);
4719 +       } else {
4720 +               result = PTR_ERR(level->new_root);
4721 +               level->new_root = NULL;
4722 +       }
4723 +       return result;
4724 +}
4725 +
4726 +/* allocate new znode and add the operation that inserts the
4727 +   pointer to it into the parent node into the todo level
4728 +
4729 +   Allocate new znode, add it into carry queue and post into @todo queue
4730 +   request to add pointer to new node into its parent.
4731 +
4732 +   This is carry related routine that calls reiser4_new_node() to allocate new
4733 +   node. Returns the new carry node, or an error pointer on failure.
4734 +*/
4735 +carry_node *add_new_znode(znode * brother      /* existing left neighbor of new
4736 +                                                * node */ ,
4737 +                         carry_node * ref      /* carry node after which new
4738 +                                                * carry node is to be inserted
4739 +                                                * into queue. This affects
4740 +                                                * locking. */ ,
4741 +                         carry_level * doing   /* carry queue where new node is
4742 +                                                * to be added */ ,
4743 +                         carry_level * todo    /* carry queue where COP_INSERT
4744 +                                                * operation to add pointer to
4745 +                                                * new node will be added */ )
4746 +{
4747 +       carry_node *fresh;
4748 +       znode *new_znode;
4749 +       carry_op *add_pointer;
4750 +       carry_plugin_info info;
4751 +
4752 +       assert("nikita-1048", brother != NULL);
4753 +       assert("nikita-1049", todo != NULL);
4754 +
4755 +       /* There is a lot of possible variations here: to what parent
4756 +          new node will be attached and where. For simplicity, always
4757 +          do the following:
4758 +
4759 +          (1) new node and @brother will have the same parent.
4760 +
4761 +          (2) new node is added on the right of @brother
4762 +
4763 +        */
4764 +
4765 +       fresh = reiser4_add_carry_skip(doing,
4766 +                                      ref ? POOLO_AFTER : POOLO_LAST, ref);
4767 +       if (IS_ERR(fresh))
4768 +               return fresh;
4769 +
4770 +       fresh->deallocate = 1;
4771 +       fresh->free = 1;
4772 +
4773 +       new_znode = reiser4_new_node(brother, znode_get_level(brother));
4774 +       if (IS_ERR(new_znode))
4775 +               /* @fresh will be deallocated automatically by error
4776 +                  handling code in the caller. */
4777 +               return (carry_node *) new_znode;
4778 +
4779 +       /* new_znode returned znode with x_count 1. Caller has to decrease
4780 +          it. make_space() does. */
4781 +
4782 +       ZF_SET(new_znode, JNODE_ORPHAN);
4783 +       fresh->node = new_znode;
4784 +
4785 +       while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
4786 +               ref = carry_node_prev(ref);
4787 +               assert("nikita-1606", !carry_node_end(doing, ref));
4788 +       }
4789 +
4790 +       info.todo = todo;
4791 +       info.doing = doing;
4792 +       add_pointer = node_post_carry(&info, COP_INSERT,
4793 +                                     reiser4_carry_real(ref), 1);
4794 +       if (IS_ERR(add_pointer)) {
4795 +               /* no need to deallocate @new_znode here: it will be
4796 +                  deallocated during carry error handling. */
4797 +               return (carry_node *) add_pointer;
4798 +       }
4799 +
4800 +       add_pointer->u.insert.type = COPT_CHILD;
4801 +       add_pointer->u.insert.child = fresh;
4802 +       add_pointer->u.insert.brother = brother;
4803 +       /* initially new node spans empty key range */
4804 +       write_lock_dk(znode_get_tree(brother));
4805 +       znode_set_ld_key(new_znode,
4806 +                        znode_set_rd_key(new_znode,
4807 +                                         znode_get_rd_key(brother)));
4808 +       write_unlock_dk(znode_get_tree(brother));
4809 +       return fresh;
4810 +}
4811 +
4812 +/* DEBUGGING FUNCTIONS.
4813 +
4814 +   Probably we also should leave them on even when
4815 +   debugging is turned off to print dumps at errors.
4816 +*/
4817 +#if REISER4_DEBUG
4818 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
4819 +{
4820 +       carry_node *node;
4821 +       carry_node *tmp_node;
4822 +
4823 +       if (level == NULL)
4824 +               return 0;
4825 +
4826 +       if (level->track_type != 0 &&
4827 +           level->track_type != CARRY_TRACK_NODE &&
4828 +           level->track_type != CARRY_TRACK_CHANGE)
4829 +               return 0;
4830 +
4831 +       /* nodes must be in ascending key order; NULL/empty nodes are skipped */
4832 +       for_all_nodes(level, node, tmp_node) {
4833 +               znode *left;
4834 +               znode *right;
4835 +
4836 +               reiser4_key lkey;
4837 +               reiser4_key rkey;
4838 +
4839 +               if (node != carry_node_front(level)) {
4840 +                       if (state == CARRY_TODO) {
4841 +                               right = node->node;
4842 +                               left = carry_node_prev(node)->node;
4843 +                       } else {
4844 +                               right = reiser4_carry_real(node);
4845 +                               left = reiser4_carry_real(carry_node_prev(node));
4846 +                       }
4847 +                       if (right == NULL || left == NULL)
4848 +                               continue;
4849 +                       if (node_is_empty(right) || node_is_empty(left))
4850 +                               continue;
4851 +                       if (!keyle(leftmost_key_in_node(left, &lkey),
4852 +                                  leftmost_key_in_node(right, &rkey))) {
4853 +                               warning("", "wrong key order");
4854 +                               return 0;
4855 +                       }
4856 +               }
4857 +       }
4858 +       return 1;
4859 +}
4860 +#endif
4861 +
4862 +/* get symbolic name for boolean: "t" when non-zero, "f" when zero */
4863 +static const char *tf(int boolean/* truth value */)
4864 +{
4865 +       return boolean ? "t" : "f";
4866 +}
4867 +
4868 +/* symbolic name for carry operation; unknown opcodes are printed in hex */
4869 +static const char *carry_op_name(carry_opcode op/* carry opcode */)
4870 +{
4871 +       switch (op) {
4872 +       case COP_INSERT:
4873 +               return "COP_INSERT";
4874 +       case COP_DELETE:
4875 +               return "COP_DELETE";
4876 +       case COP_CUT:
4877 +               return "COP_CUT";
4878 +       case COP_PASTE:
4879 +               return "COP_PASTE";
4880 +       case COP_UPDATE:
4881 +               return "COP_UPDATE";
4882 +       case COP_EXTENT:
4883 +               return "COP_EXTENT";
4884 +       case COP_INSERT_FLOW:
4885 +               return "COP_INSERT_FLOW";
4886 +       default:{
4887 +                       /* not mt safe, but who cares? bounded: %x may be 8 digits */
4888 +                       static char buf[20];
4889 +
4890 +                       snprintf(buf, sizeof(buf), "unknown op: %x", op);
4891 +                       return buf;
4892 +               }
4893 +       }
4894 +}
4895 +
4896 +/* dump address and flag bits of carry node, or "null" when @node is NULL */
4897 +static void print_carry(const char *prefix /* prefix to print */ ,
4898 +                       carry_node * node/* node to print */)
4899 +{
4900 +       if (node == NULL) {
4901 +               printk("%s: null\n", prefix);
4902 +               return;
4903 +       }
4904 +       printk
4905 +           ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
4906 +            prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
4907 +            tf(node->free), tf(node->deallocate));
4908 +}
4909 +
4910 +/* dump carry operation: opcode, target carry node and per-opcode operands */
4911 +static void print_op(const char *prefix /* prefix to print */ ,
4912 +                    carry_op * op/* operation to print */)
4913 +{
4914 +       if (op == NULL) {
4915 +               printk("%s: null\n", prefix);
4916 +               return;
4917 +       }
4918 +       printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
4919 +       print_carry("\tnode", op->node);
4920 +       switch (op->op) {
4921 +       case COP_INSERT:
4922 +       case COP_PASTE:
4923 +               print_coord("\tcoord",
4924 +                           op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
4925 +               reiser4_print_key("\tkey",
4926 +                                 op->u.insert.d ? op->u.insert.d->key : NULL);
4927 +               print_carry("\tchild", op->u.insert.child);
4928 +               break;
4929 +       case COP_DELETE:
4930 +               print_carry("\tchild", op->u.delete.child);
4931 +               break;
4932 +       case COP_CUT:   /* ->is_cut selects ->u.cut, otherwise ->u.kill */
4933 +               if (op->u.cut_or_kill.is_cut) {
4934 +                       print_coord("\tfrom",
4935 +                                   op->u.cut_or_kill.u.cut->params.from, 0);
4936 +                       print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
4937 +                                   0);
4938 +               } else {
4939 +                       print_coord("\tfrom",
4940 +                                   op->u.cut_or_kill.u.kill->params.from, 0);
4941 +                       print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
4942 +                                   0);
4943 +               }
4944 +               break;
4945 +       case COP_UPDATE:
4946 +               print_carry("\tleft", op->u.update.left);
4947 +               break;
4948 +       default:
4949 +               /* do nothing */
4950 +               break;
4951 +       }
4952 +}
4953 +
4954 +/* dump @level itself, then each carry node and each carry op queued on it */
4955 +static void print_level(const char *prefix /* prefix to print */ ,
4956 +                       carry_level * level/* level to print */)
4957 +{
4958 +       carry_node *node;
4959 +       carry_node *tmp_node;
4960 +       carry_op *op;
4961 +       carry_op *tmp_op;
4962 +
4963 +       if (level == NULL) {
4964 +               printk("%s: null\n", prefix);
4965 +               return;
4966 +       }
4967 +       printk("%s: %p, restartable: %s\n",
4968 +              prefix, level, tf(level->restartable));
4969 +
4970 +       for_all_nodes(level, node, tmp_node)
4971 +           print_carry("\tcarry node", node);
4972 +       for_all_ops(level, op, tmp_op)
4973 +           print_op("\tcarry op", op);
4974 +}
4975 +
4976 +/* Make Linus happy.
4977 +   Local variables:
4978 +   c-indentation-style: "K&R"
4979 +   mode-name: "LC"
4980 +   c-basic-offset: 8
4981 +   tab-width: 8
4982 +   fill-column: 120
4983 +   scroll-step: 1
4984 +   End:
4985 +*/
4986 diff -puN /dev/null fs/reiser4/carry.h
4987 --- /dev/null
4988 +++ a/fs/reiser4/carry.h
4989 @@ -0,0 +1,445 @@
4990 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
4991 +   reiser4/README */
4992 +
4993 +/* Functions and data types to "carry" tree modification(s) upward.
4994 +   See fs/reiser4/carry.c for details. */
4995 +
4996 +#if !defined(__FS_REISER4_CARRY_H__)
4997 +#define __FS_REISER4_CARRY_H__
4998 +
4999 +#include "forward.h"
5000 +#include "debug.h"
5001 +#include "pool.h"
5002 +#include "znode.h"
5003 +
5004 +#include <linux/types.h>
5005 +
5006 +/* &carry_node - "location" of carry node.
5007 +
5008 +   "location" of node that is involved or going to be involved into
5009 +   carry process. Node where operation will be carried to on the
5010 +   parent level cannot be recorded explicitly. Operation will be carried
5011 +   usually to the parent of some node (where changes are performed at
5012 +   the current level) or, to the left neighbor of its parent. But while
5013 +   modifications are performed at the current level, parent may
5014 +   change. So, we have to allow some indirection (or, positively,
5015 +   flexibility) in locating carry nodes.
5016 +
5017 +*/
5018 +typedef struct carry_node {
5019 +       /* pool linkage */
5020 +       struct reiser4_pool_header header;
5021 +
5022 +       /* base node from which real_node is calculated. See
5023 +          fs/reiser4/carry.c:lock_carry_node(). */
5024 +       znode *node;
5025 +
5026 +       /* how to get ->real_node */
5027 +       /* to get ->real_node obtain parent of ->node */
5028 +       __u32 parent:1;
5029 +       /* to get ->real_node obtain left neighbor of parent of
5030 +          ->node */
5031 +       __u32 left:1;
5032 +       __u32 left_before:1;
5033 +
5034 +       /* locking */
5035 +
5036 +       /* this node was locked by carry process and should be
5037 +          unlocked when carry leaves a level */
5038 +       __u32 unlock:1;
5039 +
5040 +       /* disk block for this node was allocated by carry process and
5041 +          should be deallocated when carry leaves a level */
5042 +       __u32 deallocate:1;
5043 +       /* this carry node was allocated by carry process and should be
5044 +          freed when carry leaves a level */
5045 +       __u32 free:1;
5046 +
5047 +       /* handle of the long term lock carry takes on this node */
5048 +       lock_handle lock_handle;
5049 +} carry_node;
5050 +
5051 +/* &carry_opcode - elementary operations that can be carried upward
5052 +
5053 +   Operations that carry() can handle. This list is supposed to be
5054 +   expanded.
5055 +
5056 +   Each carry operation (cop) is handled by appropriate function defined
5057 +   in fs/reiser4/carry.c. For example COP_INSERT is handled by
5058 +   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
5059 +   call plugins of nodes affected by operation to modify nodes' content
5060 +   and to gather operations to be performed on the next level.
5061 +
5062 +*/
5063 +typedef enum {
5064 +       /* insert new item into node. */
5065 +       COP_INSERT,
5066 +       /* delete pointer from parent node */
5067 +       COP_DELETE,
5068 +       /* remove part of or whole node. */
5069 +       COP_CUT,
5070 +       /* increase size of item. */
5071 +       COP_PASTE,
5072 +       /* insert extent (that is sequence of unformatted nodes). */
5073 +       COP_EXTENT,
5074 +       /* update delimiting key in least common ancestor of two
5075 +          nodes. This is performed when items are moved between two
5076 +          nodes.
5077 +        */
5078 +       COP_UPDATE,
5079 +       /* insert flow */
5080 +       COP_INSERT_FLOW,
5081 +       COP_LAST_OP,    /* sentinel: equals the count of opcodes above */
5082 +} carry_opcode;
5083 +
5084 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
5085 +
5086 +/* mode (or subtype) of COP_{INSERT|PASTE} operation, kept in the ->type field
5087 +   of carry_op's insert data. Specifies how target item is determined. */
5088 +typedef enum {
5089 +       /* target item is one containing pointer to the ->child node */
5090 +       COPT_CHILD,
5091 +       /* target item is given explicitly by @coord */
5092 +       COPT_ITEM_DATA,
5093 +       /* target item is given by key */
5094 +       COPT_KEY,
5095 +       /* see insert_paste_common() for more comments on this. */
5096 +       COPT_PASTE_RESTARTED,
5097 +} cop_insert_pos_type;
5098 +
5099 +/* flags to cut and delete */
5100 +typedef enum {
5101 +       /* don't kill node even if it became completely empty as a result of
5102 +        * cut. This is needed for eottl handling. See carry_extent() for
5103 +        * details. */
5104 +       DELETE_RETAIN_EMPTY = (1 << 0)
5105 +} cop_delete_flag;
5106 +
5107 +/*
5108 + * carry() implements "lock handle tracking" feature.
5109 + *
5110 + * Callers supply carry with node where to perform initial operation and lock
5111 + * handle on this node. Trying to optimize node utilization carry may actually
5112 + * move insertion point to different node. Callers expect that lock handle
5113 + * will be transferred to the new node also.
5114 + *
5115 + */
5116 +typedef enum {
5117 +       /* transfer lock handle along with insertion point */
5118 +       CARRY_TRACK_CHANGE = 1,
5119 +       /* acquire new lock handle to the node where insertion point is. This
5120 +        * is used when carry() client doesn't initially possess lock handle
5121 +        * on the insertion point node, for example, by extent insertion
5122 +        * code. See carry_extent(). */
5123 +       CARRY_TRACK_NODE = 2
5124 +} carry_track_type;
5125 +
5126 +/* data supplied to COP_{INSERT|PASTE} by callers */
5127 +typedef struct carry_insert_data {
5128 +       /* position where new item is to be inserted */
5129 +       coord_t *coord;
5130 +       /* new item description */
5131 +       reiser4_item_data * data;
5132 +       /* key of new item */
5133 +       const reiser4_key * key;
5134 +} carry_insert_data;
5135 +
5136 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the
5137 +   below structure of parameters */
5138 +struct cut_kill_params {
5139 +       /* coord where cut starts (inclusive) */
5140 +       coord_t *from;
5141 +       /* coord where cut stops (inclusive, this item/unit will also be
5142 +        * cut) */
5143 +       coord_t *to;
5144 +       /* starting key. This is necessary when item and unit pos don't
5145 +        * uniquely identify what portion of tree to remove. For example, this
5146 +        * indicates what portion of extent unit will be affected. */
5147 +       const reiser4_key * from_key;
5148 +       /* exclusive stop key */
5149 +       const reiser4_key * to_key;
5150 +       /* if this is not NULL, smallest actually removed key is stored
5151 +        * here. */
5152 +       reiser4_key *smallest_removed;
5153 +       /* kill_node_content()  is called for file truncate */
5154 +       int truncate;
5155 +};
5156 +
5157 +struct carry_cut_data {
5158 +       struct cut_kill_params params;
5159 +};
5160 +
5161 +struct carry_kill_data {
5162 +       struct cut_kill_params params;
5163 +       /* parameter to be passed to the ->kill_hook() method of item
5164 +        * plugin */
5165 +       /*void *iplug_params; *//* FIXME: unused currently */
5166 +       /* if not NULL---inode whose items are being removed. This is needed
5167 +        * for ->kill_hook() of extent item to update VM structures when
5168 +        * removing pages. */
5169 +       struct inode *inode;
5170 +       /* sibling list maintenance is complicated by existence of eottl. When
5171 +        * eottl whose left and right neighbors are formatted leaves is
5172 +        * removed, one has to connect said leaves in the sibling list. This
5173 +        * cannot be done when extent removal is just started as locking rules
5174 +        * require sibling list update to happen atomically with removal of
5175 +        * extent item. Therefore: 1. pointers to left and right neighbors
5176 +        * have to be passed down to the ->kill_hook() of extent item, and
5177 +        * 2. said neighbors have to be locked. */
5178 +       lock_handle *left;
5179 +       lock_handle *right;
5180 +       /* flags modifying behavior of kill. Currently, it may have
5181 +          DELETE_RETAIN_EMPTY set. */
5182 +       unsigned flags;
5183 +       char *buf;
5184 +};
5185 +
5186 +/* &carry_op - operation to "carry" upward.
5187 +
5188 +   Description of an operation we want to "carry" to the upper level of
5189 +   a tree: e.g, when we insert something and there is not enough space
5190 +   we allocate a new node and "carry" the operation of inserting a
5191 +   pointer to the new node to the upper level, on removal of empty node,
5192 +   we carry up operation of removing appropriate entry from parent.
5193 +
5194 +   There are two types of carry ops: when adding or deleting node the
5195 +   node at the parent level where appropriate modification has to be
5196 +   performed is known in advance. When shifting items between nodes
5197 +   (split, merge), delimiting key should be changed in the least common
5198 +   parent of the nodes involved that is not known in advance.
5199 +
5200 +   For the operations of the first type we store in &carry_op pointer to
5201 +   the &carry_node at the parent level. For the operation of the second
5202 +   type we store &carry_node or parents of the left and right nodes
5203 +   modified and keep track of them upward until they coincide.
5204 +
5205 +*/
5206 +typedef struct carry_op {
5207 +       /* pool linkage */
5208 +       struct reiser4_pool_header header;
5209 +       carry_opcode op;
5210 +       /* node on which operation is to be performed:
5211 +
5212 +          for insert, paste: node where new item is to be inserted
5213 +
5214 +          for delete: node where pointer is to be deleted
5215 +
5216 +          for cut: node to cut from
5217 +
5218 +          for update: node where delimiting key is to be modified
5219 +
5220 +          for modify: parent of modified node
5221 +
5222 +        */
5223 +       carry_node *node;
5224 +       union {
5225 +               struct {
5226 +                       /* (sub-)type of insertion/paste. Taken from
5227 +                          cop_insert_pos_type. */
5228 +                       __u8 type;
5229 +                       /* various operation flags. Taken from
5230 +                          cop_insert_flag. */
5231 +                       __u8 flags;
5232 +                       carry_insert_data *d;
5233 +                       carry_node *child;
5234 +                       znode *brother;
5235 +               } insert, paste, extent;
5236 +
5237 +               struct {
5238 +                       int is_cut;
5239 +                       union {
5240 +                               carry_kill_data *kill;
5241 +                               carry_cut_data *cut;
5242 +                       } u;
5243 +               } cut_or_kill;
5244 +
5245 +               struct {
5246 +                       carry_node *left;
5247 +               } update;
5248 +               struct {
5249 +                       /* changed child */
5250 +                       carry_node *child;
5251 +                       /* bitmask of changes. See &cop_modify_flag */
5252 +                       __u32 flag;
5253 +               } modify;
5254 +               struct {
5255 +                       /* flags to deletion operation. Are taken from
5256 +                          cop_delete_flag */
5257 +                       __u32 flags;
5258 +                       /* child to delete from parent. If this is
5259 +                          NULL, delete op->node.  */
5260 +                       carry_node *child;
5261 +               } delete;
5262 +               struct {
5263 +                       /* various operation flags. Taken from
5264 +                          cop_insert_flag. */
5265 +                       __u32 flags;
5266 +                       flow_t *flow;
5267 +                       coord_t *insert_point;
5268 +                       reiser4_item_data *data;
5269 +                       /* flow insertion is limited by number of new blocks
5270 +                          added in that operation which do not get any data
5271 +                          but part of flow. This limit is set by macro
5272 +                          CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
5273 +                          of nodes added already during one carry_flow */
5274 +                       int new_nodes;
5275 +               } insert_flow;
5276 +       } u;
5277 +} carry_op;
5278 +
5279 +/* &carry_pool - preallocated pool of carry operations, and nodes */
5280 +typedef struct carry_pool {
5281 +       carry_op op[CARRIES_POOL_SIZE];
5282 +       struct reiser4_pool op_pool;
5283 +       carry_node node[NODES_LOCKED_POOL_SIZE];
5284 +       struct reiser4_pool node_pool;
5285 +} carry_pool;
5286 +
5287 +/* &carry_level - carry process on given level
5288 +
5289 +   Description of balancing process on the given level.
5290 +
5291 +   No need for locking here, as carry_level is essentially per
5292 +   thread thing (for now).
5293 +
5294 +*/
5295 +struct carry_level {
5296 +       /* this level may be restarted */
5297 +       __u32 restartable:1;
5298 +       /* list of carry nodes on this level, ordered by key order */
5299 +       struct list_head nodes;
5300 +       struct list_head ops;
5301 +       /* pool where new objects are allocated from */
5302 +       carry_pool *pool;
5303 +       int ops_num;
5304 +       int nodes_num;
5305 +       /* new root created on this level, if any */
5306 +       znode *new_root;
5307 +       /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
5308 +          when they want ->tracked to automagically wander to the node where
5309 +          insertion point moved after insert or paste.
5310 +        */
5311 +       carry_track_type track_type;
5312 +       /* lock handle supplied by user that we are tracking. See
5313 +          above. */
5314 +       lock_handle *tracked;
5315 +};
5316 +
5317 +/* information carry passes to plugin methods that may add new operations to
5318 +   the @todo queue  */
5319 +struct carry_plugin_info {
5320 +       carry_level *doing;
5321 +       carry_level *todo;
5322 +};
5323 +
5324 +int reiser4_carry(carry_level * doing, carry_level * done);
5325 +
5326 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
5327 +                             carry_node * reference);
5328 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
5329 +                                  carry_node * reference);
5330 +
5331 +extern carry_node *insert_carry_node(carry_level * doing,
5332 +                                    carry_level * todo, const znode * node);
5333 +
5334 +extern carry_pool *init_carry_pool(int);
5335 +extern void done_carry_pool(carry_pool * pool);
5336 +
5337 +extern void init_carry_level(carry_level * level, carry_pool * pool);
5338 +
5339 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
5340 +                                   znode * node, int apply_to_parent);
5341 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
5342 +                                znode * node, int apply_to_parent_p);
5343 +
5344 +carry_node *add_new_znode(znode * brother, carry_node * reference,
5345 +                         carry_level * doing, carry_level * todo);
5346 +
5347 +carry_node *find_carry_node(carry_level * level, const znode * node);
5348 +
5349 +extern znode *reiser4_carry_real(const carry_node * node);
5350 +
5351 +/* helper macros to iterate over carry queues */
5352 +/* carry_node_next: entry following @node on header.level_linkage */
5353 +#define carry_node_next(node)                                          \
5354 +       list_entry((node)->header.level_linkage.next, carry_node,       \
5355 +                  header.level_linkage)
5356 +/* carry_node_prev: entry preceding @node on header.level_linkage */
5357 +#define carry_node_prev(node)                                          \
5358 +       list_entry((node)->header.level_linkage.prev, carry_node,       \
5359 +                  header.level_linkage)
5360 +/* carry_node_front: first entry of @level->nodes (validate with carry_node_end()) */
5361 +#define carry_node_front(level)                                                \
5362 +       list_entry((level)->nodes.next, carry_node, header.level_linkage)
5363 +/* carry_node_back: last entry of @level->nodes (validate with carry_node_end()) */
5364 +#define carry_node_back(level)                                         \
5365 +       list_entry((level)->nodes.prev, carry_node, header.level_linkage)
5366 +/* carry_node_end: true when @node's linkage is the list head of @level->nodes */
5367 +#define carry_node_end(level, node)                            \
5368 +       (&(level)->nodes == &(node)->header.level_linkage)
5369 +
5370 +/* macro to iterate over all operations in a @level; @tmp is fetched before the body runs, so @op may be removed */
5371 +#define for_all_ops(level /* carry level (of type carry_level *) */,          \
5372 +                   op    /* pointer to carry operation, modified by loop (of  \
5373 +                          * type carry_op *) */,                              \
5374 +                   tmp   /* pointer to carry operation (of type carry_op *),  \
5375 +                          * used to make iterator stable in the face of       \
5376 +                          * deletions from the level */ )                     \
5377 +for (op = list_entry((level)->ops.next, carry_op, header.level_linkage),      \
5378 +     tmp = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage);  \
5379 +     &(op)->header.level_linkage != &(level)->ops;                            \
5380 +     op = tmp,                                                                \
5381 +     tmp = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage))
5382 +
5383 +#if 0
5384 +for (op = (carry_op *) pool_level_list_front(&level->ops),            \
5385 +     tmp = (carry_op *) pool_level_list_next(&op->header) ;           \
5386 +     !pool_level_list_end(&level->ops, &op->header) ;                 \
5387 +     op = tmp, tmp = (carry_op *) pool_level_list_next(&op->header))
5388 +#endif
5389 +
5390 +/* macro to iterate over all nodes in a @level; @tmp is fetched before the body runs, so @node may be removed */
5391 +#define for_all_nodes(level /* carry level (of type carry_level *) */,        \
5392 +                     node  /* pointer to carry node, modified by loop (of     \
5393 +                             * type carry_node *) */,                         \
5394 +                     tmp   /* pointer to carry node (of type carry_node *),   \
5395 +                             * used to make iterator stable in the face of    \
5396 +                             * deletions from the level */ )                  \
5397 +for (node = list_entry((level)->nodes.next, carry_node, header.level_linkage), \
5398 +     tmp = list_entry((node)->header.level_linkage.next, carry_node, header.level_linkage); \
5399 +     &(node)->header.level_linkage != &(level)->nodes;                        \
5400 +     node = tmp,                                                              \
5401 +     tmp = list_entry((node)->header.level_linkage.next, carry_node, header.level_linkage))
5402 +
5403 +#if 0
5404 +for (node = carry_node_front(level),                                   \
5405 +     tmp = carry_node_next(node) ; !carry_node_end(level, node) ;      \
5406 +     node = tmp, tmp = carry_node_next(node))
5407 +#endif
5408 +
5409 +/* macro to iterate over all nodes in a @level in reverse order
5410 +   (@tmp holds the predecessor of @node, fetched before the loop body runs).
5411 +   This is used, because nodes are unlocked in reversed order of locking */
5412 +#define for_all_nodes_back(level /* carry level (of type carry_level *) */,    \
5413 +                          node  /* pointer to carry node, modified by loop    \
5414 +                                  * (of type carry_node *) */,                \
5415 +                          tmp   /* pointer to carry node (of type carry_node  \
5416 +                                  * *), used to make iterator stable in the   \
5417 +                                  * face of deletions from the level */ )     \
5418 +for (node = carry_node_back(level),            \
5419 +     tmp = carry_node_prev(node) ; !carry_node_end(level, node) ;      \
5420 +     node = tmp, tmp = carry_node_prev(node))
5421 +
5422 +/* __FS_REISER4_CARRY_H__ */
5423 +#endif
5424 +
5425 +/* Make Linus happy.
5426 +   Local variables:
5427 +   c-indentation-style: "K&R"
5428 +   mode-name: "LC"
5429 +   c-basic-offset: 8
5430 +   tab-width: 8
5431 +   fill-column: 120
5432 +   scroll-step: 1
5433 +   End:
5434 +*/
5435 diff -puN /dev/null fs/reiser4/carry_ops.c
5436 --- /dev/null
5437 +++ a/fs/reiser4/carry_ops.c
5438 @@ -0,0 +1,2132 @@
5439 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
5440 +   reiser4/README */
5441 +
5442 +/* implementation of carry operations */
5443 +
5444 +#include "forward.h"
5445 +#include "debug.h"
5446 +#include "key.h"
5447 +#include "coord.h"
5448 +#include "plugin/item/item.h"
5449 +#include "plugin/node/node.h"
5450 +#include "jnode.h"
5451 +#include "znode.h"
5452 +#include "block_alloc.h"
5453 +#include "tree_walk.h"
5454 +#include "pool.h"
5455 +#include "tree_mod.h"
5456 +#include "carry.h"
5457 +#include "carry_ops.h"
5458 +#include "tree.h"
5459 +#include "super.h"
5460 +#include "reiser4.h"
5461 +
5462 +#include <linux/types.h>
5463 +#include <linux/err.h>
5464 +
5465 +static int carry_shift_data(sideof side, coord_t *insert_coord, znode * node,
5466 +                           carry_level * doing, carry_level * todo,
5467 +                           unsigned int including_insert_coord_p);
5468 +
5469 +extern int lock_carry_node(carry_level * level, carry_node * node);
5470 +extern int lock_carry_node_tail(carry_node * node);
5471 +
5472 +/* find left neighbor of a carry node
5473 +
5474 +   Look for left neighbor of @node and add it to the @doing queue. See
5475 +   comments in the body.
5476 +   Returns the neighbor's carry node, NULL when the neighbor is to be ignored,
5477 +   or ERR_PTR(); -E_REPEAT is returned only when @doing is restartable. */
5478 +static carry_node *find_left_neighbor(carry_op * op    /* node to find left
5479 +                                                        * neighbor of */ ,
5480 +                                     carry_level * doing/* level to scan */)
5481 +{
5482 +       int result;
5483 +       carry_node *node;
5484 +       carry_node *left;
5485 +       int flags;
5486 +       reiser4_tree *tree;
5487 +
5488 +       node = op->node;
5489 +
5490 +       tree = current_tree;
5491 +       read_lock_tree(tree);
5492 +       /* first, check whether left neighbor is already in a @doing queue */
5493 +       if (reiser4_carry_real(node)->left != NULL) {
5494 +               /* NOTE: there is locking subtlety here. Look into
5495 +                * find_right_neighbor() for more info */
5496 +               if (find_carry_node(doing,
5497 +                                   reiser4_carry_real(node)->left) != NULL) {
5498 +                       read_unlock_tree(tree);
5499 +                       left = node;
5500 +                       do {
5501 +                               left = list_entry(left->header.level_linkage.prev,
5502 +                                                 carry_node, header.level_linkage);
5503 +                               assert("nikita-3408", !carry_node_end(doing,
5504 +                                                                     left));
5505 +                       } while (reiser4_carry_real(left) ==
5506 +                                reiser4_carry_real(node));
5507 +                       return left;
5508 +               }
5509 +       }
5510 +       read_unlock_tree(tree);
5511 +
5512 +       left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
5513 +       if (IS_ERR(left))
5514 +               return left;
5515 +
5516 +       left->node = node->node;
5517 +       left->free = 1;
5518 +       /* unless COPI_LOAD_LEFT was requested, also pass GN_NO_ALLOC */
5519 +       flags = GN_TRY_LOCK;
5520 +       if (!(op->u.insert.flags & COPI_LOAD_LEFT))
5521 +               flags |= GN_NO_ALLOC;
5522 +
5523 +       /* then, feeling lucky, peek left neighbor in the cache. */
5524 +       result = reiser4_get_left_neighbor(&left->lock_handle,
5525 +                                          reiser4_carry_real(node),
5526 +                                          ZNODE_WRITE_LOCK, flags);
5527 +       if (result == 0) {
5528 +               /* ok, node found and locked. */
5529 +               result = lock_carry_node_tail(left);
5530 +               if (result != 0)
5531 +                       left = ERR_PTR(result);
5532 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
5533 +               /* node is leftmost node in a tree, or neighbor wasn't in
5534 +                  cache, or there is an extent on the left. */
5535 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
5536 +               left = NULL;
5537 +       } else if (doing->restartable) {
5538 +               /* if left neighbor is locked, and level is restartable, add
5539 +                  new node to @doing and restart. */
5540 +               assert("nikita-913", node->parent != 0);
5541 +               assert("nikita-914", node->node != NULL);
5542 +               left->left = 1;
5543 +               left->free = 0;
5544 +               left = ERR_PTR(-E_REPEAT);
5545 +       } else {
5546 +               /* left neighbor is locked, level cannot be restarted. Just
5547 +                  ignore left neighbor. */
5548 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
5549 +               left = NULL;
5550 +       }
5551 +       return left;
5552 +}
5553 +
5554 +/* find right neighbor of a carry node
5555 +
5556 +   Look for right neighbor of @node and add it to the @doing queue. See
5557 +   comments in the body.
5558 +   Returns the neighbor's carry node, NULL when there is no usable neighbor,
5559 +   or ERR_PTR() for any other failure. */
5560 +static carry_node *find_right_neighbor(carry_op * op   /* node to find right
5561 +                                                        * neighbor of */ ,
5562 +                                      carry_level * doing/* level to scan */)
5563 +{
5564 +       int result;
5565 +       carry_node *node;
5566 +       carry_node *right;
5567 +       lock_handle lh;
5568 +       int flags;
5569 +       reiser4_tree *tree;
5570 +
5571 +       init_lh(&lh);
5572 +
5573 +       node = op->node;
5574 +
5575 +       tree = current_tree;
5576 +       read_lock_tree(tree);
5577 +       /* first, check whether right neighbor is already in a @doing queue */
5578 +       if (reiser4_carry_real(node)->right != NULL) {
5579 +               /*
5580 +                * Tree lock is taken here anyway, because, even if _outcome_
5581 +                * of (find_carry_node() != NULL) doesn't depends on
5582 +                * concurrent updates to ->right, find_carry_node() cannot
5583 +                * work with second argument NULL. Hence, following comment is
5584 +                * of historic importance only.
5585 +                *
5586 +                * Subtle:
5587 +                *
5588 +                * Q: why don't we need tree lock here, looking for the right
5589 +                * neighbor?
5590 +                *
5591 +                * A: even if value of node->real_node->right were changed
5592 +                * during find_carry_node() execution, outcome of execution
5593 +                * wouldn't change, because (in short) other thread cannot add
5594 +                * elements to the @doing, and if node->real_node->right
5595 +                * already was in @doing, value of node->real_node->right
5596 +                * couldn't change, because node cannot be inserted between
5597 +                * locked neighbors.
5598 +                */
5599 +               if (find_carry_node(doing,
5600 +                                   reiser4_carry_real(node)->right) != NULL) {
5601 +                       read_unlock_tree(tree);
5602 +                       /*
5603 +                        * What we are doing here (this is also applicable to
5604 +                        * the find_left_neighbor()).
5605 +                        *
5606 +                        * tree_walk.c code requires that insertion of a
5607 +                        * pointer to a child, modification of parent pointer
5608 +                        * in the child, and insertion of the child into
5609 +                        * sibling list are atomic (see
5610 +                        * plugin/item/internal.c:create_hook_internal()).
5611 +                        *
5612 +                        * carry allocates new node long before pointer to it
5613 +                        * is inserted into parent and, actually, long before
5614 +                        * parent is even known. Such allocated-but-orphaned
5615 +                        * nodes are only trackable through carry level lists.
5616 +                        *
5617 +                        * Situation that is handled here is following: @node
5618 +                        * has valid ->right pointer, but there is
5619 +                        * allocated-but-orphaned node in the carry queue that
5620 +                        * is logically between @node and @node->right. Here
5621 +                        * we are searching for it. Critical point is that
5622 +                        * this is only possible if @node->right is also in
5623 +                        * the carry queue (this is checked above), because
5624 +                        * this is the only way new orphaned node could be
5625 +                        * inserted between them (before inserting new node,
5626 +                        * make_space() first tries to shift to the right, so,
5627 +                        * right neighbor will be locked and queued).
5628 +                        *
5629 +                        */
5630 +                       right = node;
5631 +                       do {
5632 +                               right = list_entry(right->header.level_linkage.next,
5633 +                                                  carry_node, header.level_linkage);
5634 +                               assert("nikita-3408", !carry_node_end(doing,
5635 +                                                                     right));
5636 +                       } while (reiser4_carry_real(right) ==
5637 +                                reiser4_carry_real(node));
5638 +                       return right;
5639 +               }
5640 +       }
5641 +       read_unlock_tree(tree);
5642 +       /* NOTE(review): GN_NO_ALLOC replaces (not ORs into) flags below, dropping GN_CAN_USE_UPPER_LEVELS -- confirm intended */
5643 +       flags = GN_CAN_USE_UPPER_LEVELS;
5644 +       if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
5645 +               flags = GN_NO_ALLOC;
5646 +
5647 +       /* then, try to lock right neighbor */
5648 +       init_lh(&lh);
5649 +       result = reiser4_get_right_neighbor(&lh,
5650 +                                           reiser4_carry_real(node),
5651 +                                           ZNODE_WRITE_LOCK, flags);
5652 +       if (result == 0) {
5653 +               /* ok, node found and locked. */
5654 +               right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
5655 +               if (!IS_ERR(right)) {
5656 +                       right->node = lh.node;
5657 +                       move_lh(&right->lock_handle, &lh);
5658 +                       right->free = 1;
5659 +                       result = lock_carry_node_tail(right);
5660 +                       if (result != 0)
5661 +                               right = ERR_PTR(result);
5662 +               }
5663 +       } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
5664 +               /* node is rightmost node in a tree, or neighbor wasn't in
5665 +                  cache, or there is an extent on the right. */
5666 +               right = NULL;
5667 +       } else
5668 +               right = ERR_PTR(result);
5669 +       done_lh(&lh);
5670 +       return right;
5671 +}
5672 +
5673 +/* how much free space in a @node is needed for @op
5674 +
5675 +   How much space in @node is required for completion of @op, where @op is
5676 +   insert or paste operation.
5677 +   Delegates to space_needed(): COP_INSERT passes no coord, COP_PASTE passes the op's coord. */
5678 +static unsigned int space_needed_for_op(znode * node   /* znode data are
5679 +                                                        * inserted or
5680 +                                                        * pasted in */ ,
5681 +                                       carry_op * op   /* carry
5682 +                                                          operation */ )
5683 +{
5684 +       assert("nikita-919", op != NULL);
5685 +
5686 +       switch (op->op) {
5687 +       default:
5688 +               impossible("nikita-1701", "Wrong opcode");      /* no break: continues into COP_INSERT if this returns */
5689 +       case COP_INSERT:
5690 +               return space_needed(node, NULL, op->u.insert.d->data, 1);
5691 +       case COP_PASTE:
5692 +               return space_needed(node, op->u.insert.d->coord,
5693 +                                   op->u.insert.d->data, 0);
5694 +       }
5695 +}
5696 +
5697 +/* how much space in @node is required to insert or paste @data at
5698 +   @coord. */
5699 +unsigned int space_needed(const znode * node   /* node data are inserted or
5700 +                                                * pasted in */ ,
5701 +                         const coord_t *coord  /* coord where data are
5702 +                                                * inserted or pasted
5703 +                                                * at */ ,
5704 +                         const reiser4_item_data * data /* data to insert or
5705 +                                                         * paste */ ,
5706 +                         int insertion/* non-0 is inserting, 0---paste */)
5707 +{
5708 +       int result;
5709 +       item_plugin *iplug;
5710 +
5711 +       assert("nikita-917", node != NULL);
5712 +       assert("nikita-918", node_plugin_by_node(node) != NULL);
5713 +       assert("vs-230", !insertion || (coord == NULL));        /* an insertion must not carry a coord */
5714 +
5715 +       result = 0;
5716 +       iplug = data->iplug;
5717 +       if (iplug->b.estimate != NULL) {
5718 +               /* ask item plugin how much space is needed to insert this
5719 +                  item */
5720 +               result += iplug->b.estimate(insertion ? NULL : coord, data);
5721 +       } else {
5722 +               /* reasonable default */
5723 +               result += data->length;
5724 +       }
5725 +       if (insertion) {
5726 +               node_plugin *nplug;
5727 +
5728 +               nplug = node->nplug;
5729 +               /* and add node overhead */
5730 +               if (nplug->item_overhead != NULL)
5731 +                       result += nplug->item_overhead(node, NULL);
5732 +       }
5733 +       return result;  /* int accumulator is converted to the unsigned return type */
5734 +}
5735 +
5736 +/* find &coord in parent where pointer to new child is to be stored. */
5737 +static int find_new_child_coord(carry_op * op  /* COP_INSERT carry operation to
5738 +                                                * insert pointer to new
5739 +                                                * child */ )
5740 +{
5741 +       int result;
5742 +       znode *node;
5743 +       znode *child;
5744 +
5745 +       assert("nikita-941", op != NULL);
5746 +       assert("nikita-942", op->op == COP_INSERT);
5747 +
5748 +       node = reiser4_carry_real(op->node);
5749 +       assert("nikita-943", node != NULL);
5750 +       assert("nikita-944", node_plugin_by_node(node) != NULL);
5751 +
5752 +       child = reiser4_carry_real(op->u.insert.child);
5753 +       result =
5754 +           find_new_child_ptr(node, child, op->u.insert.brother,
5755 +                              op->u.insert.d->coord);
5756 +       /* item data is built regardless of @result; @result is returned unchanged */
5757 +       build_child_ptr_data(child, op->u.insert.d->data);
5758 +       return result;
5759 +}
5760 +
5761 +/* additional amount of free space in @node required to complete @op */
5762 +static int free_space_shortage(znode * node /* node to check */ ,
5763 +                              carry_op * op/* operation being performed */)
5764 +{
5765 +       assert("nikita-1061", node != NULL);
5766 +       assert("nikita-1062", op != NULL);
5767 +       /* result > 0: more room is needed; <= 0: @op fits (this is how make_space() tests it) */
5768 +       switch (op->op) {
5769 +       default:
5770 +               impossible("nikita-1702", "Wrong opcode");      /* no break: continues into COP_INSERT if this returns */
5771 +       case COP_INSERT:
5772 +       case COP_PASTE:
5773 +               return space_needed_for_op(node, op) - znode_free_space(node);
5774 +       case COP_EXTENT:
5775 +               /* when inserting extent shift data around until insertion
5776 +                  point is utmost in the node. */
5777 +               if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
5778 +                       return +1;
5779 +               else
5780 +                       return -1;
5781 +       }
5782 +}
5783 +
5784 +/* helper function: update node pointer in operation after insertion
5785 +   point was probably shifted into @target. Returns the znode of op's insertion coord. */
5786 +static znode *sync_op(carry_op * op, carry_node * target)
5787 +{
5788 +       znode *insertion_node;
5789 +
5790 +       /* reget node from coord: shift might move insertion coord to
5791 +          the neighbor */
5792 +       insertion_node = op->u.insert.d->coord->node;
5793 +       /* if insertion point was actually moved into new node,
5794 +          update carry node pointer in operation. */
5795 +       if (insertion_node != reiser4_carry_real(op->node)) {
5796 +               op->node = target;
5797 +               assert("nikita-2540",
5798 +                      reiser4_carry_real(target) == insertion_node);
5799 +       }
5800 +       assert("nikita-2541",
5801 +              reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
5802 +       return insertion_node;
5803 +}
5804 +
5805 +/*
5806 + * complete make_space() call: update tracked lock handle if necessary. See
5807 + * comments for fs/reiser4/carry.h:carry_track_type
5808 + */
5809 +static int
5810 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
5811 +{
5812 +       int result;
5813 +       carry_track_type tracking;
5814 +       znode *node;
5815 +
5816 +       tracking = doing->track_type;
5817 +       node = op->u.insert.d->coord->node;
5818 +       /* CARRY_TRACK_NODE always re-locks; CARRY_TRACK_CHANGE only when the insertion node changed */
5819 +       if (tracking == CARRY_TRACK_NODE ||
5820 +           (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
5821 +               /* inserting or pasting into node different from
5822 +                  original. Update lock handle supplied by caller. */
5823 +               assert("nikita-1417", doing->tracked != NULL);
5824 +               done_lh(doing->tracked);
5825 +               init_lh(doing->tracked);
5826 +               result = longterm_lock_znode(doing->tracked, node,
5827 +                                            ZNODE_WRITE_LOCK,
5828 +                                            ZNODE_LOCK_HIPRI);
5829 +       } else
5830 +               result = 0;     /* nothing to re-lock */
5831 +       return result;
5832 +}
5833 +
5834 +/* This is insertion policy function. It shifts data to the left and right
5835 +   neighbors of insertion coord and allocates new nodes until there is enough
5836 +   free space to complete @op.
5837 +
5838 +   See comments in the body.
5839 +
5840 +   Assumes that the node format favors insertions at the right end of the node
5841 +   as node40 does.
5842 +
5843 +   See carry_flow() on detail about flow insertion
5844 +*/
5845 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
5846 +                     carry_level * doing /* current carry queue */ ,
5847 +                     carry_level * todo/* carry queue on the parent level */)
5848 +{
5849 +       znode *node;
5850 +       int result;
5851 +       int not_enough_space;
5852 +       int blk_alloc;
5853 +       znode *orig_node;
5854 +       __u32 flags;
5855 +
5856 +       coord_t *coord;
5857 +
5858 +       assert("nikita-890", op != NULL);
5859 +       assert("nikita-891", todo != NULL);
5860 +       assert("nikita-892",
5861 +              op->op == COP_INSERT ||
5862 +              op->op == COP_PASTE || op->op == COP_EXTENT);
5863 +       assert("nikita-1607",
5864 +              reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
5865 +
5866 +       flags = op->u.insert.flags;
5867 +
5868 +       /* NOTE check that new node can only be allocated after checking left
5869 +        * and right neighbors. This is necessary for proper work of
5870 +        * find_{left,right}_neighbor(). */
5871 +       assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
5872 +                                  flags & COPI_DONT_SHIFT_LEFT));
5873 +       assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
5874 +                                  flags & COPI_DONT_SHIFT_RIGHT));
5875 +
5876 +       coord = op->u.insert.d->coord;
5877 +       orig_node = node = coord->node;
5878 +
5879 +       assert("nikita-908", node != NULL);
5880 +       assert("nikita-909", node_plugin_by_node(node) != NULL);
5881 +
5882 +       result = 0;
5883 +       /* If there is not enough space in a node, try to shift something to
5884 +          the left neighbor. This is a bit tricky, as locking to the left is
5885 +          low priority. This is handled by restart logic in carry().
5886 +        */
5887 +       not_enough_space = free_space_shortage(node, op);
5888 +       if (not_enough_space <= 0)
5889 +               /* it is possible that carry was called when there actually
5890 +                  was enough space in the node. For example, when inserting
5891 +                  leftmost item so that delimiting keys have to be updated.
5892 +                */
5893 +               return make_space_tail(op, doing, orig_node);
5894 +       if (!(flags & COPI_DONT_SHIFT_LEFT)) {
5895 +               carry_node *left;
5896 +               /* make note in statistics of an attempt to move
5897 +                  something into the left neighbor */
5898 +               left = find_left_neighbor(op, doing);
5899 +               if (unlikely(IS_ERR(left))) {
5900 +                       if (PTR_ERR(left) == -E_REPEAT)
5901 +                               return -E_REPEAT;
5902 +                       else {
5903 +                               /* some error other than restart request
5904 +                                  occurred. This shouldn't happen. Issue a
5905 +                                  warning and continue as if left neighbor
5906 +                                  weren't existing.
5907 +                                */
5908 +                               warning("nikita-924",
5909 +                                       "Error accessing left neighbor: %li",
5910 +                                       PTR_ERR(left));
5911 +                       }
5912 +               } else if (left != NULL) {
5913 +
5914 +                       /* shift everything possible on the left of and
5915 +                          including insertion coord into the left neighbor */
5916 +                       result = carry_shift_data(LEFT_SIDE, coord,
5917 +                                                 reiser4_carry_real(left),
5918 +                                                 doing, todo,
5919 +                                                 flags & COPI_GO_LEFT);
5920 +
5921 +                       /* reget node from coord: shift_left() might move
5922 +                          insertion coord to the left neighbor */
5923 +                       node = sync_op(op, left);
5924 +
5925 +                       not_enough_space = free_space_shortage(node, op);
5926 +                       /* There is not enough free space in @node, but
5927 +                          may be, there is enough free space in
5928 +                          @left. Various balancing decisions are valid here.
5929 +                          The same for the shifting to the right.
5930 +                        */
5931 +               }
5932 +       }
5933 +       /* If there still is not enough space, shift to the right */
5934 +       if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
5935 +               carry_node *right;
5936 +
5937 +               right = find_right_neighbor(op, doing);
5938 +               if (IS_ERR(right)) {
5939 +                       warning("nikita-1065",
5940 +                               "Error accessing right neighbor: %li",
5941 +                               PTR_ERR(right));
5942 +               } else if (right != NULL) {
5943 +                       /* node containing insertion point, and its right
5944 +                          neighbor node are write locked by now.
5945 +
5946 +                          shift everything possible on the right of but
5947 +                          excluding insertion coord into the right neighbor
5948 +                        */
5949 +                       result = carry_shift_data(RIGHT_SIDE, coord,
5950 +                                                 reiser4_carry_real(right),
5951 +                                                 doing, todo,
5952 +                                                 flags & COPI_GO_RIGHT);
5953 +                       /* reget node from coord: shift_right() might move
5954 +                          insertion coord to the right neighbor */
5955 +                       node = sync_op(op, right);
5956 +                       not_enough_space = free_space_shortage(node, op);
5957 +               }
5958 +       }
5959 +       /* If there is still not enough space, allocate new node(s).
5960 +
5961 +          We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
5962 +          the carry operation flags (currently this is needed during flush
5963 +          only).
5964 +        */
5965 +       for (blk_alloc = 0;
5966 +            not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
5967 +            !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
5968 +               carry_node *fresh;      /* new node we are allocating */
5969 +               coord_t coord_shadow;   /* remembered insertion point before
5970 +                                        * shifting data into new node */
5971 +               carry_node *node_shadow;        /* remembered insertion node
5972 +                                                * before shifting */
5973 +               unsigned int gointo;    /* whether insertion point should move
5974 +                                        * into newly allocated node */
5975 +
5976 +               /* allocate new node on the right of @node. Znode and disk
5977 +                  fake block number for new node are allocated.
5978 +
5979 +                  add_new_znode() posts carry operation COP_INSERT with
5980 +                  COPT_CHILD option to the parent level to add
5981 +                  pointer to newly created node to its parent.
5982 +
5983 +                  Subtle point: if several new nodes are required to complete
5984 +                  insertion operation at this level, they will be inserted
5985 +                  into their parents in the order of creation, which means
5986 +                  that @node will be valid "cookie" at the time of insertion.
5987 +
5988 +                */
5989 +               fresh = add_new_znode(node, op->node, doing, todo);
5990 +               if (IS_ERR(fresh))
5991 +                       return PTR_ERR(fresh);
5992 +
5993 +               /* Try to shift into new node. */
5994 +               result = lock_carry_node(doing, fresh);
5995 +               zput(reiser4_carry_real(fresh));
5996 +               if (result != 0) {
5997 +                       warning("nikita-947",
5998 +                               "Cannot lock new node: %i", result);
5999 +                       return result;
6000 +               }
6001 +
6002 +               /* both nodes are write locked by now.
6003 +
6004 +                  shift everything possible on the right of and
6005 +                  including insertion coord into the right neighbor.
6006 +                */
6007 +               coord_dup(&coord_shadow, op->u.insert.d->coord);
6008 +               node_shadow = op->node;
6009 +               /* move insertion point into newly created node if:
6010 +
6011 +                  . insertion point is rightmost in the source node, or
6012 +                  . this is not the first node we are allocating in a row.
6013 +                */
6014 +               gointo =
6015 +                   (blk_alloc > 0) ||
6016 +                   coord_is_after_rightmost(op->u.insert.d->coord);
6017 +
6018 +               if (gointo &&
6019 +                   op->op == COP_PASTE &&
6020 +                   coord_is_existing_item(op->u.insert.d->coord) &&
6021 +                   is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
6022 +                       /* paste into solid (atomic) item, which can contain
6023 +                          only one unit, so we need to shift it right, where
6024 +                          insertion point supposed to be */
6025 +
6026 +                       assert("edward-1444", op->u.insert.d->data->iplug ==
6027 +                              item_plugin_by_id(STATIC_STAT_DATA_ID));
6028 +                       assert("edward-1445",
6029 +                              op->u.insert.d->data->length >
6030 +                              node_plugin_by_node(coord->node)->free_space
6031 +                              (coord->node));
6032 +
6033 +                       op->u.insert.d->coord->between = BEFORE_UNIT;
6034 +               }
6035 +
6036 +               result = carry_shift_data(RIGHT_SIDE, coord,
6037 +                                         reiser4_carry_real(fresh),
6038 +                                         doing, todo, gointo);
6039 +               /* if insertion point was actually moved into new node,
6040 +                  update carry node pointer in operation. */
6041 +               node = sync_op(op, fresh);
6042 +               not_enough_space = free_space_shortage(node, op);
6043 +               if ((not_enough_space > 0) && (node != coord_shadow.node)) {
6044 +                       /* there is not enough free in new node. Shift
6045 +                          insertion point back to the @shadow_node so that
6046 +                          next new node would be inserted between
6047 +                          @shadow_node and @fresh.
6048 +                        */
6049 +                       coord_normalize(&coord_shadow);
6050 +                       coord_dup(coord, &coord_shadow);
6051 +                       node = coord->node;
6052 +                       op->node = node_shadow;
6053 +                       if (1 || (flags & COPI_STEP_BACK)) {
6054 +                               /* still not enough space?! Maybe there is
6055 +                                  enough space in the source node (i.e., node
6056 +                                  data are moved from) now.
6057 +                                */
6058 +                               not_enough_space =
6059 +                                   free_space_shortage(node, op);
6060 +                       }
6061 +               }
6062 +       }
6063 +       if (not_enough_space > 0) {
6064 +               if (!(flags & COPI_DONT_ALLOCATE))
6065 +                       warning("nikita-948", "Cannot insert new item");
6066 +               result = -E_NODE_FULL;
6067 +       }
6068 +       assert("nikita-1622", ergo(result == 0,
6069 +                                 reiser4_carry_real(op->node) == coord->node));
6070 +       assert("nikita-2616", coord == op->u.insert.d->coord);
6071 +       if (result == 0)
6072 +               result = make_space_tail(op, doing, orig_node);
6073 +       return result;
6074 +}
6075 +
6076 +/* insert_paste_common() - common part of insert and paste operations
6077 +
6078 +   This function performs common part of COP_INSERT and COP_PASTE.
6079 +
6080 +   There are three ways in which insertion/paste can be requested:
6081 +
6082 +    . by directly supplying reiser4_item_data. In this case, op ->
6083 +    u.insert.type is set to COPT_ITEM_DATA.
6084 +
6085 +    . by supplying child, pointer to which is to be inserted into parent. In
6086 +    this case op -> u.insert.type == COPT_CHILD.
6087 +
6088 +    . by supplying key of new item/unit. This is currently only used during
6089 +    extent insertion
6090 +
6091 +   This is required, because when new node is allocated we don't know at what
6092 +   position pointer to it is to be stored in the parent. Actually, we don't
6093 +   even know what its parent will be, because parent can be re-balanced
6094 +   concurrently and new node re-parented, and because parent can be full and
6095 +   pointer to the new node will go into some other node.
6096 +
6097 +   insert_paste_common() resolves pointer to child node into position in the
6098 +   parent by calling find_new_child_coord(), that fills
6099 +   reiser4_item_data. After this, insertion/paste proceeds uniformly.
6100 +
6101 +   Another complication is with finding free space during pasting. It may
6102 +   happen that while shifting items to the neighbors and newly allocated
6103 +   nodes, insertion coord can no longer be in the item we wanted to paste
6104 +   into. At this point, paste becomes (morphs) into insert. Moreover free
6105 +   space analysis has to be repeated, because amount of space required for
6106 +   insertion is different from that of paste (item header overhead, etc).
6107 +
6108 +   This function "unifies" different insertion modes (by resolving child
6109 +   pointer or key into insertion coord), and then calls make_space() to free
6110 +   enough space in the node by shifting data to the left and right and by
6111 +   allocating new nodes if necessary. Carry operation knows amount of space
6112 +   required for its completion. After enough free space is obtained, caller of
6113 +   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
6114 +   by calling item plugin method.
6115 +
6116 +*/
6117 +static int insert_paste_common(carry_op * op   /* carry operation being
6118 +                                                * performed; op->u.insert.type selects the branch taken below */ ,
6119 +                              carry_level * doing /* current carry level */ ,
6120 +                              carry_level * todo /* next carry level */ ,
6121 +                              carry_insert_data * cdata        /* caller-provided storage; installed as
6122 +                                                                * op->u.insert.d in the COPT_CHILD branch only */ ,
6123 +                              coord_t *coord /* insertion/paste coord; attached to op in the COPT_KEY and COPT_CHILD branches */ ,
6124 +                              reiser4_item_data * data /* data to be
6125 +                                                        * inserted/pasted; attached to op in the COPT_CHILD branch only */ )
6126 +{ /* returns make_space() result, or the lookup/locking result that ended a branch early */
6127 +       assert("nikita-981", op != NULL);
6128 +       assert("nikita-980", todo != NULL);
6129 +       assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
6130 +              || (op->op == COP_EXTENT));
6131 +
6132 +       if (op->u.insert.type == COPT_PASTE_RESTARTED) {
6133 +               /* nothing to do. Fall through to make_space(). */
6134 +               ;
6135 +       } else if (op->u.insert.type == COPT_KEY) {
6136 +               node_search_result intra_node;
6137 +               znode *node;
6138 +               /* Problem with doing batching at the lowest level, is that
6139 +                  operations here are given by coords where modification is
6140 +                  to be performed, and one modification can invalidate coords
6141 +                  of all following operations.
6142 +
6143 +                  So, we are implementing yet another type for operation that
6144 +                  will use (the only) "locator" stable across shifting of
6145 +                  data between nodes, etc.: key (COPT_KEY).
6146 +
6147 +                  This clause resolves key to the coord in the node.
6148 +
6149 +                  But node can change also. Probably some pieces have to be
6150 +                  added to the lock_carry_node(), to lock node by its key.
6151 +
6152 +                */
6153 +               /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
6154 +                  if you need something else. */
6155 +               op->u.insert.d->coord = coord;
6156 +               node = reiser4_carry_real(op->node);
6157 +               intra_node = node_plugin_by_node(node)->lookup
6158 +                   (node, op->u.insert.d->key, FIND_EXACT,
6159 +                    op->u.insert.d->coord);
6160 +               if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) { /* any other lookup result is treated as failure */
6161 +                       warning("nikita-1715", "Intra node lookup failure: %i",
6162 +                               intra_node);
6163 +                       return intra_node; /* lookup result is handed back unchanged */
6164 +               }
6165 +       } else if (op->u.insert.type == COPT_CHILD) {
6166 +               /* if we are asked to insert pointer to the child into
6167 +                  internal node, first convert pointer to the child into
6168 +                  coord within parent node.
6169 +                */
6170 +               znode *child;
6171 +               int result;
6172 +
6173 +               op->u.insert.d = cdata; /* this branch wires up the insert data from the function arguments */
6174 +               op->u.insert.d->coord = coord;
6175 +               op->u.insert.d->data = data;
6176 +               op->u.insert.d->coord->node = reiser4_carry_real(op->node);
6177 +               result = find_new_child_coord(op);
6178 +               child = reiser4_carry_real(op->u.insert.child);
6179 +               if (result != NS_NOT_FOUND) { /* NS_NOT_FOUND is the only result accepted here; anything else is returned as is */
6180 +                       warning("nikita-993",
6181 +                               "Cannot find a place for child pointer: %i",
6182 +                               result);
6183 +                       return result;
6184 +               }
6185 +               /* This only happens when we did multiple insertions at
6186 +                  the previous level, trying to insert single item and
6187 +                  it so happened, that insertion of pointers to all new
6188 +                  nodes before this one already caused parent node to
6189 +                  split (may be several times).
6190 +
6191 +                  I am going to come up with better solution.
6192 +
6193 +                  You are not expected to understand this.
6194 +                  -- v6root/usr/sys/ken/slp.c
6195 +
6196 +                  Basically, what happens here is the following: carry came
6197 +                  to the parent level and is about to insert internal item
6198 +                  pointing to the child node that it just inserted in the
6199 +                  level below. Position where internal item is to be inserted
6200 +                  was found by find_new_child_coord() above, but node of the
6201 +                  current carry operation (that is, parent node of child
6202 +                  inserted on the previous level), was determined earlier in
6203 +                  the lock_carry_level/lock_carry_node. It could so happen
6204 +                  that other carry operations already performed on the parent
6205 +                  level already split parent node, so that insertion point
6206 +                  moved into another node. Handle this by creating new carry
6207 +                  node for insertion point if necessary.
6208 +                */
6209 +               if (reiser4_carry_real(op->node) !=
6210 +                   op->u.insert.d->coord->node) {
6211 +                       pool_ordering direction;
6212 +                       znode *z1;
6213 +                       znode *z2;
6214 +                       reiser4_key k1;
6215 +                       reiser4_key k2;
6216 +
6217 +                       /*
6218 +                        * determine in what direction insertion point
6219 +                        * moved. Do this by comparing delimiting keys.
6220 +                        */
6221 +                       z1 = op->u.insert.d->coord->node; /* node the insertion point is in now */
6222 +                       z2 = reiser4_carry_real(op->node); /* node the operation was originally bound to */
6223 +                       if (keyle(leftmost_key_in_node(z1, &k1),
6224 +                                 leftmost_key_in_node(z2, &k2)))
6225 +                               /* insertion point moved to the left */
6226 +                               direction = POOLO_BEFORE;
6227 +                       else
6228 +                               /* insertion point moved to the right */
6229 +                               direction = POOLO_AFTER;
6230 +
6231 +                       op->node = reiser4_add_carry_skip(doing,
6232 +                                                         direction, op->node);
6233 +                       if (IS_ERR(op->node))
6234 +                               return PTR_ERR(op->node);
6235 +                       op->node->node = op->u.insert.d->coord->node; /* bind the new carry node to the node holding the insertion point */
6236 +                       op->node->free = 1;
6237 +                       result = lock_carry_node(doing, op->node);
6238 +                       if (result != 0)
6239 +                               return result;
6240 +               }
6241 +
6242 +               /*
6243 +                * set up key of an item being inserted: we are inserting
6244 +                * internal item and its key is (by the very definition of
6245 +                * search tree) is leftmost key in the child node.
6246 +                */
6247 +               write_lock_dk(znode_get_tree(child)); /* key below is computed between write_lock_dk and write_unlock_dk on the tree of child */
6248 +               op->u.insert.d->key = leftmost_key_in_node(child,
6249 +                                                          znode_get_ld_key(child));
6250 +               write_unlock_dk(znode_get_tree(child));
6251 +               op->u.insert.d->data->arg = op->u.insert.brother;
6252 +       } else {
6253 +               assert("vs-243", op->u.insert.d->coord != NULL);
6254 +               op->u.insert.d->coord->node = reiser4_carry_real(op->node); /* coord is already attached to op; only its node is refreshed from the carry node */
6255 +       }
6256 +
6257 +       /* find free space. */
6258 +       return make_space(op, doing, todo);
6259 +}
6260 +
6261 +/* handle carry COP_INSERT operation.
6262 +
6263 +   Insert new item into node. New item can be given in one of two ways:
6264 +
6265 +   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
6266 +   only applicable at the leaf/twig level.
6267 +
6268 +   - by passing a child node, pointer to which is to be inserted by this
6269 +   operation.
6270 +   Returns the nonzero result of insert_paste_common() if that step fails,
6271 +   otherwise the result of the node plugin create_item() call. */
6272 +static int carry_insert(carry_op * op /* operation to perform */ ,
6273 +                       carry_level * doing     /* queue of operations @op
6274 +                                                * is part of */ ,
6275 +                       carry_level * todo      /* queue where new operations
6276 +                                                * are accumulated */ )
6277 +{
6278 +       znode *node;
6279 +       carry_insert_data cdata; /* stack storage handed to insert_paste_common() */
6280 +       coord_t coord;
6281 +       reiser4_item_data data;
6282 +       carry_plugin_info info;
6283 +       int result;
6284 +
6285 +       assert("nikita-1036", op != NULL);
6286 +       assert("nikita-1037", todo != NULL);
6287 +       assert("nikita-1038", op->op == COP_INSERT);
6288 +
6289 +       coord_init_zero(&coord);
6290 +
6291 +       /* perform common functionality of insert and paste. */
6292 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
6293 +       if (result != 0)
6294 +               return result;
6295 +
6296 +       node = op->u.insert.d->coord->node; /* node the insertion coord ended up in after space was made */
6297 +       assert("nikita-1039", node != NULL);
6298 +       assert("nikita-1040", node_plugin_by_node(node) != NULL);
6299 +
6300 +       assert("nikita-949",
6301 +              space_needed_for_op(node, op) <= znode_free_space(node));
6302 +
6303 +       /* ask node layout to create new item. */
6304 +       info.doing = doing;
6305 +       info.todo = todo;
6306 +       result = node_plugin_by_node(node)->create_item
6307 +           (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
6308 +            &info);
6309 +       doing->restartable = 0; /* NOTE(review): presumably the level must not be restarted once the node was modified - confirm */
6310 +       znode_make_dirty(node); /* done regardless of the create_item() result */
6311 +
6312 +       return result;
6313 +}
6314 +
6315 +/*
6316 + * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
6317 + * supplied with a "flow" (that is, a stream of data) and inserts it into the
6318 + * tree by slicing it into multiple items. The macros below name the fields of op->u.insert_flow.
6319 + */
6320 +
6321 +#define flow_insert_point(op) ((op)->u.insert_flow.insert_point) /* coord at which flow data is inserted */
6322 +#define flow_insert_flow(op) ((op)->u.insert_flow.flow) /* the flow itself (key, data, remaining length) */
6323 +#define flow_insert_data(op) ((op)->u.insert_flow.data) /* item data descriptor composed for each insertion step */
6324 +
6325 +static size_t item_data_overhead(carry_op * op) /* bytes the item plugin estimate() asks for beyond the current data length */
6326 +{
6327 +       if (flow_insert_data(op)->iplug->b.estimate == NULL)
6328 +               return 0; /* item plugin has no estimate method: no extra overhead is assumed */
6329 +       return (flow_insert_data(op)->iplug->b.
6330 +               estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
6331 +               flow_insert_data(op)->length);
6332 +}
6333 +
6334 +/* Extra bytes needed when flow data cannot be pasted at the insert point and has
6335 +   to start a new item: node plugin item_overhead() plus item_data_overhead().
6336 +   Returns 0 when the node plugin has no item_overhead method or can_paste() returns nonzero.
6337 +   FIXME-VS: this is called several times during one flow insertion and, per the original note, always returns the same result.
6338 +   It could be calculated once at the beginning and passed around, at the cost of some flexibility in future changes */
6339 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
6340 +static size_t flow_insertion_overhead(carry_op * op)
6341 +{
6342 +       znode *node;
6343 +       size_t insertion_overhead;
6344 +
6345 +       node = flow_insert_point(op)->node;
6346 +       insertion_overhead = 0;
6347 +       if (node->nplug->item_overhead &&
6348 +           !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
6349 +                      flow_insert_data(op))) /* overhead is charged only when a new item has to be created */
6350 +               insertion_overhead =
6351 +                   node->nplug->item_overhead(node, NULL) +
6352 +                       item_data_overhead(op);
6353 +       return insertion_overhead;
6354 +}
6355 +
6356 +/* how many bytes of flow fit into the insert point node: free space minus insertion overhead, capped by the remaining flow length */
6357 +static int what_can_fit_into_node(carry_op * op)
6358 +{
6359 +       size_t free, overhead;
6360 +
6361 +       overhead = flow_insertion_overhead(op);
6362 +       free = znode_free_space(flow_insert_point(op)->node);
6363 +       if (free <= overhead)
6364 +               return 0; /* overhead alone consumes all free space: nothing fits */
6365 +       free -= overhead;
6366 +       /* FIXME: flow->length is loff_t only to not get overflowed in case of
6367 +          expanding truncate */
6368 +       if (free < op->u.insert_flow.flow->length)
6369 +               return free; /* only part of the flow fits */
6370 +       return (int)op->u.insert_flow.flow->length; /* whole remaining flow fits */
6371 +}
6372 +
6373 +/* in make_space_for_flow_insertion we need to check either whether whole flow
6374 +   fits into a node or whether minimal fraction of flow fits into a node. This one returns nonzero when what_can_fit_into_node() equals the remaining flow length */
6375 +static int enough_space_for_whole_flow(carry_op * op)
6376 +{
6377 +       return (unsigned)what_can_fit_into_node(op) ==
6378 +           op->u.insert_flow.flow->length;
6379 +}
6380 +
6381 +#define MIN_FLOW_FRACTION 1 /* minimal number of flow bytes that must fit for a node to be used */
6382 +static int enough_space_for_min_flow_fraction(carry_op * op) /* nonzero when at least MIN_FLOW_FRACTION bytes of flow fit into the insert point node */
6383 +{
6384 +       assert("vs-902", coord_is_after_rightmost(flow_insert_point(op))); /* only asked when the insert point is at the end of its node */
6385 +
6386 +       return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
6387 +}
6388 +
6389 +/* this returns 0 if left neighbor was obtained successfully and everything
6390 +   up to insertion point including it were shifted and left neighbor still has
6391 +   some free space to put minimal fraction of flow into it; returns 1 in every other case (no usable neighbor, insert point did not move, or no room) */
6392 +static int
6393 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
6394 +{
6395 +       carry_node *left;
6396 +       znode *orig;
6397 +
6398 +       left = find_left_neighbor(op, doing);
6399 +       if (unlikely(IS_ERR(left))) {
6400 +               warning("vs-899",
6401 +                       "make_space_by_shift_left: "
6402 +                       "error accessing left neighbor: %li", PTR_ERR(left));
6403 +               return 1; /* the error is only logged; caller sees the generic failure value */
6404 +       }
6405 +       if (left == NULL)
6406 +               /* left neighbor either does not exist or is unformatted
6407 +                  node */
6408 +               return 1;
6409 +
6410 +       orig = flow_insert_point(op)->node; /* remembered so the insert point can be restored below */
6411 +       /* try to shift content of node @orig from its head up to insert point
6412 +          including insertion point into the left neighbor */
6413 +       carry_shift_data(LEFT_SIDE, flow_insert_point(op),
6414 +                        reiser4_carry_real(left), doing, todo,
6415 +                        1/* including insert point */); /* return value is not checked; outcome is judged by where the insert point ended up */
6416 +       if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
6417 +               /* insertion point did not move */
6418 +               return 1;
6419 +       }
6420 +
6421 +       /* insertion point is set after last item in the node */
6422 +       assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
6423 +
6424 +       if (!enough_space_for_min_flow_fraction(op)) {
6425 +               /* insertion point node does not have enough free space to put
6426 +                  even minimal portion of flow into it, therefore, move
6427 +                  insertion point back to orig node (before first item) */
6428 +               coord_init_before_first_item(flow_insert_point(op), orig);
6429 +               return 1;
6430 +       }
6431 +
6432 +       /* part of flow is to be written to the end of node */
6433 +       op->node = left; /* the operation now targets the left neighbor */
6434 +       return 0;
6435 +}
6436 +
6437 +/* this returns 0 if, after everything to the right of insertion point was
6438 +   shifted into the right neighbor (when there is one), insertion point is at the end of its node and the node has enough free
6439 +   space to put minimal fraction of flow into it; returns 1 otherwise, including when the neighbor lookup fails */
6440 +static int
6441 +make_space_by_shift_right(carry_op * op, carry_level * doing,
6442 +                         carry_level * todo)
6443 +{
6444 +       carry_node *right;
6445 +
6446 +       right = find_right_neighbor(op, doing);
6447 +       if (unlikely(IS_ERR(right))) {
6448 +               warning("nikita-1065", "shift_right_excluding_insert_point: "
6449 +                       "error accessing right neighbor: %li", PTR_ERR(right)); /* NOTE(review): message names a function not defined in this part of the file, presumably an older name - confirm */
6450 +               return 1;
6451 +       }
6452 +       if (right) {
6453 +               /* shift everything possible on the right of but excluding
6454 +                  insertion coord into the right neighbor */
6455 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
6456 +                                reiser4_carry_real(right), doing, todo,
6457 +                                0/* not including insert point */); /* return value is not checked; the coord test below decides */
6458 +       } else {
6459 +               /* right neighbor either does not exist or is unformatted
6460 +                  node */
6461 +               ;
6462 +       }
6463 +       if (coord_is_after_rightmost(flow_insert_point(op))) { /* checked even when there was no neighbor to shift into */
6464 +               if (enough_space_for_min_flow_fraction(op)) {
6465 +                       /* part of flow is to be written to the end of node */
6466 +                       return 0;
6467 +               }
6468 +       }
6469 +
6470 +       /* new node is to be added if insert point node did not get enough
6471 +          space for whole flow */
6472 +       return 1;
6473 +}
6474 +
6475 +/* this returns 0 when insert coord is set at the node end and fraction of flow
6476 +   fits into that node, or when insert coord was moved into a newly added node; returns -E_NODE_FULL once CARRY_FLOW_NEW_NODES_LIMIT nodes were added, or the error from adding/locking a node */
6477 +static int
6478 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
6479 +{
6480 +       int result;
6481 +       znode *node;
6482 +       carry_node *new;
6483 +
6484 +       node = flow_insert_point(op)->node;
6485 +
6486 +       if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
6487 +               return RETERR(-E_NODE_FULL); /* per-operation budget of new nodes is used up */
6488 +       /* add new node after insert point node */
6489 +       new = add_new_znode(node, op->node, doing, todo);
6490 +       if (unlikely(IS_ERR(new)))
6491 +               return PTR_ERR(new);
6492 +       result = lock_carry_node(doing, new);
6493 +       zput(reiser4_carry_real(new)); /* NOTE(review): presumably drops a reference taken by add_new_znode() - confirm */
6494 +       if (unlikely(result))
6495 +               return result;
6496 +       op->u.insert_flow.new_nodes++;
6497 +       if (!coord_is_after_rightmost(flow_insert_point(op))) { /* there is data to the right of the insert point: move it into the new node first */
6498 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
6499 +                                reiser4_carry_real(new), doing, todo,
6500 +                                0/* not including insert point */);
6501 +               assert("vs-901",
6502 +                      coord_is_after_rightmost(flow_insert_point(op)));
6503 +
6504 +               if (enough_space_for_min_flow_fraction(op))
6505 +                       return 0; /* insert point stays in the original node, which now has room */
6506 +               if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
6507 +                       return RETERR(-E_NODE_FULL);
6508 +
6509 +               /* add one more new node */
6510 +               new = add_new_znode(node, op->node, doing, todo); /* same arguments as the first call above */
6511 +               if (unlikely(IS_ERR(new)))
6512 +                       return PTR_ERR(new);
6513 +               result = lock_carry_node(doing, new);
6514 +               zput(reiser4_carry_real(new));
6515 +               if (unlikely(result))
6516 +                       return result;
6517 +               op->u.insert_flow.new_nodes++;
6518 +       }
6519 +
6520 +       /* move insertion point to new node */
6521 +       coord_init_before_first_item(flow_insert_point(op),
6522 +                                    reiser4_carry_real(new));
6523 +       op->node = new; /* the operation now targets the most recently added node */
6524 +       return 0;
6525 +}
6526 +
6527 +static int
6528 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
6529 +                             carry_level * todo) /* tries in order: node as is, shift left, shift right, new nodes; 0 means the insert point node can take flow data */
6530 +{
6531 +       __u32 flags = op->u.insert_flow.flags; /* COPI_DONT_SHIFT_LEFT / COPI_DONT_SHIFT_RIGHT disable the respective step */
6532 +
6533 +       if (enough_space_for_whole_flow(op)) {
6534 +               /* whole flow fits into insert point node */
6535 +               return 0;
6536 +       }
6537 +
6538 +       if (!(flags & COPI_DONT_SHIFT_LEFT)
6539 +           && (make_space_by_shift_left(op, doing, todo) == 0)) {
6540 +               /* insert point is shifted to left neighbor of original insert
6541 +                  point node and is set after last unit in that node. It has
6542 +                  enough space to fit at least minimal fraction of flow. */
6543 +               return 0;
6544 +       }
6545 +
6546 +       if (enough_space_for_whole_flow(op)) { /* re-checked because the step above may have run carry_shift_data() even though it reported failure */
6547 +               /* whole flow fits into insert point node */
6548 +               return 0;
6549 +       }
6550 +
6551 +       if (!(flags & COPI_DONT_SHIFT_RIGHT)
6552 +           && (make_space_by_shift_right(op, doing, todo) == 0)) {
6553 +               /* insert point is still set to the same node, but there is
6554 +                  nothing to the right of insert point. */
6555 +               return 0;
6556 +       }
6557 +
6558 +       if (enough_space_for_whole_flow(op)) { /* same re-check after the right shift attempt */
6559 +               /* whole flow fits into insert point node */
6560 +               return 0;
6561 +       }
6562 +
6563 +       return make_space_by_new_nodes(op, doing, todo); /* last resort; its result (0 or error) is returned as is */
6564 +}
6565 +
6566 +/* implements COP_INSERT_FLOW operation: while the flow has data, make space, then paste into or create an item with as much as fits, and advance the flow */
6567 +static int
6568 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
6569 +{
6570 +       int result;
6571 +       flow_t *f;
6572 +       coord_t *insert_point;
6573 +       node_plugin *nplug;
6574 +       carry_plugin_info info;
6575 +       znode *orig_node;
6576 +       lock_handle *orig_lh;
6577 +
6578 +       f = op->u.insert_flow.flow;
6579 +       result = 0;
6580 +
6581 +       /* carry system needs this to work */
6582 +       info.doing = doing;
6583 +       info.todo = todo;
6584 +
6585 +       orig_node = flow_insert_point(op)->node; /* remembered to detect, after the loop, that the insert point left this node */
6586 +       orig_lh = doing->tracked; /* lock handle that is re-pointed at the final insert point node below */
6587 +
6588 +       while (f->length) {
6589 +               result = make_space_for_flow_insertion(op, doing, todo);
6590 +               if (result)
6591 +                       break; /* no space could be made; remaining flow is left uninserted */
6592 +
6593 +               insert_point = flow_insert_point(op);
6594 +               nplug = node_plugin_by_node(insert_point->node);
6595 +
6596 +               /* compose item data for insertion/pasting */
6597 +               flow_insert_data(op)->data = f->data;
6598 +               flow_insert_data(op)->length = what_can_fit_into_node(op);
6599 +
6600 +               if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
6601 +                       /* insert point is set to item of file we are writing to
6602 +                          and we have to append to it */
6603 +                       assert("vs-903", insert_point->between == AFTER_UNIT);
6604 +                       nplug->change_item_size(insert_point,
6605 +                                               flow_insert_data(op)->length);
6606 +                       flow_insert_data(op)->iplug->b.paste(insert_point,
6607 +                                                            flow_insert_data
6608 +                                                            (op), &info);
6609 +               } else {
6610 +                       /* new item must be inserted */
6611 +                       pos_in_node_t new_pos;
6612 +                       flow_insert_data(op)->length += item_data_overhead(op);
6613 +
6614 +                       /* FIXME-VS: this is because node40_create_item changes
6615 +                          insert_point for obscure reasons */
6616 +                       switch (insert_point->between) { /* compute the position the new item will occupy, to restore it after create_item() */
6617 +                       case AFTER_ITEM:
6618 +                               new_pos = insert_point->item_pos + 1;
6619 +                               break;
6620 +                       case EMPTY_NODE:
6621 +                               new_pos = 0;
6622 +                               break;
6623 +                       case BEFORE_ITEM:
6624 +                               assert("vs-905", insert_point->item_pos == 0);
6625 +                               new_pos = 0;
6626 +                               break;
6627 +                       default:
6628 +                               impossible("vs-906",
6629 +                                          "carry_insert_flow: invalid coord");
6630 +                               new_pos = 0;
6631 +                               break;
6632 +                       }
6633 +
6634 +                       nplug->create_item(insert_point, &f->key,
6635 +                                          flow_insert_data(op), &info); /* return value is not checked here */
6636 +                       coord_set_item_pos(insert_point, new_pos);
6637 +               }
6638 +               coord_init_after_item_end(insert_point); /* next chunk of flow continues after what was just written */
6639 +               doing->restartable = 0;
6640 +               znode_make_dirty(insert_point->node);
6641 +
6642 +               move_flow_forward(f, (unsigned)flow_insert_data(op)->length); /* NOTE(review): in the create branch length was increased by item_data_overhead() above - confirm the flow should advance by that sum */
6643 +       }
6644 +
6645 +       if (orig_node != flow_insert_point(op)->node) {
6646 +               /* move lock to new insert point */
6647 +               done_lh(orig_lh);
6648 +               init_lh(orig_lh);
6649 +               result =
6650 +                   longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
6651 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI); /* NOTE(review): this assignment replaces any nonzero result left by the loop above - confirm that is intended */
6652 +       }
6653 +
6654 +       return result;
6655 +}
6656 +
6657 +/* implements COP_DELETE operation
6658 +
6659 +   Remove pointer to @op -> u.delete.child from its parent.
6660 +
6661 +   This function also handles killing of a tree root if last pointer from it
6662 +   was removed. This is complicated by our handling of "twig" level: root on
6663 +   twig level is never killed. Returns 0 on success, find_child_ptr() status
6664 +   if the child pointer is not found, or a negative error code otherwise.
6665 +*/
6666 +static int carry_delete(carry_op * op /* operation to be performed */ ,
6667 +                       carry_level * doing UNUSED_ARG  /* current carry
6668 +                                                        * level */ ,
6669 +                       carry_level * todo/* next carry level */)
6670 +{
6671 +       int result;
6672 +       coord_t coord;
6673 +       coord_t coord2;
6674 +       znode *parent;
6675 +       znode *child;
6676 +       carry_plugin_info info;
6677 +       reiser4_tree *tree;
6678 +
6679 +       /*
6680 +        * This operation is called to delete internal item pointing to the
6681 +        * child node that was removed by carry from the tree on the previous
6682 +        * tree level.
6683 +        */
6684 +
6685 +       assert("nikita-893", op != NULL);
6686 +       assert("nikita-894", todo != NULL);
6687 +       assert("nikita-895", op->op == COP_DELETE);
6688 +
6689 +       coord_init_zero(&coord);
6690 +       coord_init_zero(&coord2);
6691 +
6692 +       parent = reiser4_carry_real(op->node);
6693 +       child = op->u.delete.child ?
6694 +               reiser4_carry_real(op->u.delete.child) : op->node->node;
6695 +       tree = znode_get_tree(child);
6696 +       read_lock_tree(tree);
6697 +
6698 +       /*
6699 +        * @parent was determined when carry entered parent level
6700 +        * (lock_carry_level/lock_carry_node). Since then, actual parent of
6701 +        * @child node could change due to other carry operations performed on
6702 +        * the parent level. Check for this.
6703 +        */
6704 +
6705 +       if (znode_parent(child) != parent) {
6706 +               /* NOTE-NIKITA add stat counter for this. */
6707 +               parent = znode_parent(child);
6708 +               assert("nikita-2581", find_carry_node(doing, parent));
6709 +       }
6710 +       read_unlock_tree(tree);
6711 +
6712 +       assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
6713 +
6714 +       /* Twig level horrors: tree should be of height at least 2. So, last
6715 +          pointer from the root at twig level is preserved even if child is
6716 +          empty. This is ugly, but so it was architectured.
6717 +        */
6718 +
6719 +       if (znode_is_root(parent) &&
6720 +           znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
6721 +           node_num_items(parent) == 1) {
6722 +               /* Delimiting key manipulations. */
6723 +               write_lock_dk(tree);
6724 +               znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
6725 +               znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
6726 +               ZF_SET(child, JNODE_DKSET);
6727 +               write_unlock_dk(tree);
6728 +
6729 +               /* @child escaped imminent death! */
6730 +               ZF_CLR(child, JNODE_HEARD_BANSHEE);
6731 +               return 0;
6732 +       }
6733 +
6734 +       /* convert child pointer to the coord_t */
6735 +       result = find_child_ptr(parent, child, &coord);
6736 +       if (result != NS_FOUND) {
6737 +               warning("nikita-994", "Cannot find child pointer: %i", result);
6738 +               print_coord_content("coord", &coord);
6739 +               return result;
6740 +       }
6741 +
6742 +       coord_dup(&coord2, &coord);
6743 +       info.doing = doing;
6744 +       info.todo = todo;
6745 +       {
6746 +               /*
6747 +                * Actually kill internal item: prepare structure with
6748 +                * arguments for ->cut_and_kill() method...
6749 +                */
6750 +
6751 +               struct carry_kill_data kdata;
6752 +               kdata.params.from = &coord;
6753 +               kdata.params.to = &coord2;
6754 +               kdata.params.from_key = NULL;
6755 +               kdata.params.to_key = NULL;
6756 +               kdata.params.smallest_removed = NULL;
6757 +               kdata.params.truncate = 1;
6758 +               kdata.flags = op->u.delete.flags;
6759 +               kdata.inode = NULL;
6760 +               kdata.left = NULL;
6761 +               kdata.right = NULL;
6762 +               kdata.buf = NULL;
6763 +               /* ... and call it. */
6764 +               result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
6765 +                                                                  &info);
6766 +       }
6767 +       doing->restartable = 0;
6768 +
6769 +       /* check whether root should be killed violently */
6770 +       if (znode_is_root(parent) &&
6771 +           /* don't kill roots at and lower than twig level */
6772 +           znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
6773 +           node_num_items(parent) == 1)
6774 +               result = reiser4_kill_tree_root(coord.node);
6775 +       /* pass negative error codes up; non-negative results mean success */
6776 +       return result < 0 ? result : 0;
6777 +}
6778 +
6779 +/* implements COP_CUT operation
6780 +
6781 +   Cuts part or whole content of node. Returns 0 on success or the negative
6782 +   error code of the node plugin's ->cut() / ->cut_and_kill() method.
6783 +*/
6784 +static int carry_cut(carry_op * op /* operation to be performed */ ,
6785 +                    carry_level * doing /* current carry level */ ,
6786 +                    carry_level * todo/* next carry level */)
6787 +{
6788 +       int result;
6789 +       carry_plugin_info info;
6790 +       node_plugin *nplug;
6791 +
6792 +       assert("nikita-896", op != NULL);
6793 +       assert("nikita-897", todo != NULL);
6794 +       assert("nikita-898", op->op == COP_CUT);
6795 +
6796 +       info.doing = doing;
6797 +       info.todo = todo;
6798 +
6799 +       nplug = node_plugin_by_node(reiser4_carry_real(op->node));
6800 +       if (op->u.cut_or_kill.is_cut)
6801 +               result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
6802 +       else
6803 +               result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
6804 +
6805 +       doing->restartable = 0;
6806 +       return result < 0 ? result : 0;
6807 +}
6808 +
6809 +/* helper function for carry_paste(): returns true if @op can be continued as
6810 +   paste; on success @icoord may be moved to the unit to paste next to */
6811 +static int
6812 +can_paste(coord_t *icoord, const reiser4_key * key,
6813 +         const reiser4_item_data * data)
6814 +{
6815 +       coord_t circa;
6816 +       item_plugin *new_iplug;
6817 +       item_plugin *old_iplug;
6818 +       int result = 0;         /* to keep gcc shut */
6819 +
6820 +       assert("", icoord->between != AT_UNIT);
6821 +
6822 +       /* obviously, one cannot paste when node is empty---there is nothing
6823 +          to paste into. */
6824 +       if (node_is_empty(icoord->node))
6825 +               return 0;
6826 +       /* if insertion point is at the middle of the item, then paste */
6827 +       if (!coord_is_between_items(icoord))
6828 +               return 1;
6829 +       coord_dup(&circa, icoord);
6830 +       circa.between = AT_UNIT;
6831 +
6832 +       old_iplug = item_plugin_by_coord(&circa);
6833 +       new_iplug = data->iplug;
6834 +
6835 +       /* check whether we can paste to the item @icoord is "at" when we
6836 +          ignore ->between field */
6837 +       if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data))
6838 +               result = 1;
6839 +       else if (icoord->between == BEFORE_UNIT
6840 +                  || icoord->between == BEFORE_ITEM) {
6841 +               /* otherwise, try to glue to the item at the left, if any */
6842 +               coord_dup(&circa, icoord);
6843 +               if (coord_set_to_left(&circa)) {
6844 +                       result = 0;
6845 +                       coord_init_before_item(icoord);
6846 +               } else {
6847 +                       old_iplug = item_plugin_by_coord(&circa);
6848 +                       result = (old_iplug == new_iplug)
6849 +                           && item_can_contain_key(icoord, key, data);
6850 +                       if (result) {
6851 +                               coord_dup(icoord, &circa);
6852 +                               icoord->between = AFTER_UNIT;
6853 +                       }
6854 +               }
6855 +       } else if (icoord->between == AFTER_UNIT
6856 +                  || icoord->between == AFTER_ITEM) {
6857 +               coord_dup(&circa, icoord);
6858 +               /* otherwise, try to glue to the item at the right, if any */
6859 +               if (coord_set_to_right(&circa)) {
6860 +                       result = 0;
6861 +                       coord_init_after_item(icoord);
6862 +               } else {
6863 +                       int (*cck) (const coord_t *, const reiser4_key *,
6864 +                                   const reiser4_item_data *);
6865 +
6866 +                       old_iplug = item_plugin_by_coord(&circa);
6867 +
6868 +                       cck = old_iplug->b.can_contain_key;
6869 +                       if (cck == NULL)
6870 +                               /* item doesn't define ->can_contain_key
6871 +                                  method? So it is not expandable. */
6872 +                               result = 0;
6873 +                       else {
6874 +                               result = (old_iplug == new_iplug)
6875 +                                   && cck(&circa /*icoord */ , key, data);
6876 +                               if (result) {
6877 +                                       coord_dup(icoord, &circa);
6878 +                                       icoord->between = BEFORE_UNIT;
6879 +                               }
6880 +                       }
6881 +               }
6882 +       } else
6883 +               impossible("nikita-2513", "Nothing works");
6884 +       if (result) {
6885 +               if (icoord->between == BEFORE_ITEM) {
6886 +                       assert("vs-912", icoord->unit_pos == 0);
6887 +                       icoord->between = BEFORE_UNIT;
6888 +               } else if (icoord->between == AFTER_ITEM) {
6889 +                       coord_init_after_item_end(icoord);
6890 +               }
6891 +       }
6892 +       return result;
6893 +}
6894 +
6895 +/* implements COP_PASTE operation
6896 +
6897 +   Paste data into existing item. This is complicated by the fact that after
6898 +   we shifted something to the left or right neighbors trying to free some
6899 +   space, item we were supposed to paste into can be in different node than
6900 +   insertion coord. If so, we are no longer doing paste, but insert. See
6901 +   comments in insert_paste_common(). Returns the result of
6902 +   insert_paste_common(), of the restarted insert, or of ->b.paste().
6903 +*/
6904 +static int carry_paste(carry_op * op /* operation to be performed */ ,
6905 +                      carry_level * doing UNUSED_ARG   /* current carry
6906 +                                                        * level */ ,
6907 +                      carry_level * todo/* next carry level */)
6908 +{
6909 +       znode *node;
6910 +       carry_insert_data cdata;
6911 +       coord_t dcoord;
6912 +       reiser4_item_data data;
6913 +       int result;
6914 +       int real_size;
6915 +       item_plugin *iplug;
6916 +       carry_plugin_info info;
6917 +       coord_t *coord;
6918 +
6919 +       assert("nikita-982", op != NULL);
6920 +       assert("nikita-983", todo != NULL);
6921 +       assert("nikita-984", op->op == COP_PASTE);
6922 +
6923 +       coord_init_zero(&dcoord);
6924 +
6925 +       result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
6926 +       if (result != 0)
6927 +               return result;
6928 +
6929 +       coord = op->u.insert.d->coord;
6930 +
6931 +       /* handle case when op -> u.insert.coord doesn't point to the item
6932 +          of required type. restart as insert. */
6933 +       if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
6934 +               op->op = COP_INSERT;
6935 +               op->u.insert.type = COPT_PASTE_RESTARTED;
6936 +               result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
6937 +
6938 +               return result;
6939 +       }
6940 +
6941 +       node = coord->node;
6942 +       iplug = item_plugin_by_coord(coord);
6943 +       assert("nikita-992", iplug != NULL);
6944 +
6945 +       assert("nikita-985", node != NULL);
6946 +       assert("nikita-986", node_plugin_by_node(node) != NULL);
6947 +
6948 +       assert("nikita-987",
6949 +              space_needed_for_op(node, op) <= znode_free_space(node));
6950 +
6951 +       assert("nikita-1286", coord_is_existing_item(coord));
6952 +
6953 +       /*
6954 +        * if item is expanded as a result of this operation, we should first
6955 +        * change item size, then call ->b.paste item method. If item is
6956 +        * shrunk, it should be done other way around: first call ->b.paste
6957 +        * method, then reduce item size.
6958 +        */
6959 +
6960 +       real_size = space_needed_for_op(node, op);
6961 +       if (real_size > 0)
6962 +               node->nplug->change_item_size(coord, real_size);
6963 +
6964 +       doing->restartable = 0;
6965 +       info.doing = doing;
6966 +       info.todo = todo;
6967 +
6968 +       result = iplug->b.paste(coord, op->u.insert.d->data, &info);
6969 +
6970 +       if (real_size < 0)
6971 +               node->nplug->change_item_size(coord, real_size);
6972 +
6973 +       /* if we pasted at the beginning of the item, update item's key. */
6974 +       if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
6975 +               node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
6976 +
6977 +       znode_make_dirty(node);
6978 +       return result;
6979 +}
6980 +
6981 +/* handle carry COP_EXTENT operation: posts COP_INSERT to the @todo level. */
6982 +static int carry_extent(carry_op * op /* operation to perform */ ,
6983 +                       carry_level * doing     /* queue of operations @op
6984 +                                                * is part of */ ,
6985 +                       carry_level * todo      /* queue where new operations
6986 +                                                * are accumulated */ )
6987 +{
6988 +       znode *node;
6989 +       carry_insert_data cdata;
6990 +       coord_t coord;
6991 +       reiser4_item_data data;
6992 +       carry_op *delete_dummy;
6993 +       carry_op *insert_extent;
6994 +       int result;
6995 +       carry_plugin_info info;
6996 +
6997 +       assert("nikita-1751", op != NULL);
6998 +       assert("nikita-1752", todo != NULL);
6999 +       assert("nikita-1753", op->op == COP_EXTENT);
7000 +
7001 +       /* extent insertion overview:
7002 +
7003 +          extents live on the TWIG LEVEL, which is level one above the leaf
7004 +          one. This complicates extent insertion logic somewhat: it may
7005 +          happen (and is going to happen all the time) that in logical key
7006 +          ordering extent has to be placed between items I1 and I2, located
7007 +          at the leaf level, but I1 and I2 are in the same formatted leaf
7008 +          node N1. To insert extent one has to
7009 +
7010 +          (1) reach node N1 and shift data between N1, its neighbors and
7011 +          possibly newly allocated nodes until I1 and I2 fall into different
7012 +          nodes. Since I1 and I2 are still neighboring items in logical key
7013 +          order, they will necessarily be outermost items in their respective
7014 +          nodes.
7015 +
7016 +          (2) After this new extent item is inserted into node on the twig
7017 +          level.
7018 +
7019 +          Fortunately this process can reuse almost all code from standard
7020 +          insertion procedure (viz. make_space() and insert_paste_common()),
7021 +          due to the following observation: make_space() only shifts data up
7022 +          to and excluding or including insertion point. It never
7023 +          "over-moves" through insertion point. Thus, one can use
7024 +          make_space() to perform step (1). All required for this is just to
7025 +          instruct free_space_shortage() to keep make_space() shifting data
7026 +          until insertion point is at the node border.
7027 +
7028 +        */
7029 +
7030 +       /* perform common functionality of insert and paste. */
7031 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
7032 +       if (result != 0)
7033 +               return result;
7034 +
7035 +       node = op->u.extent.d->coord->node;
7036 +       assert("nikita-1754", node != NULL);
7037 +       assert("nikita-1755", node_plugin_by_node(node) != NULL);
7038 +       assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
7039 +
7040 +       /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
7041 +          extent fits between items. */
7042 +
7043 +       info.doing = doing;
7044 +       info.todo = todo;
7045 +
7046 +       /* there is another complication due to placement of extents on the
7047 +          twig level: extents are "rigid" in the sense that key-range
7048 +          occupied by extent cannot grow indefinitely to the right as it is
7049 +          for the formatted leaf nodes. Because of this when search finds two
7050 +          adjacent extents on the twig level, it has to "drill" to the leaf
7051 +          level, creating new node. Here we are removing this node.
7052 +        */
7053 +       if (node_is_empty(node)) {
7054 +               delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
7055 +               if (IS_ERR(delete_dummy))
7056 +                       return PTR_ERR(delete_dummy);
7057 +               delete_dummy->u.delete.child = NULL;
7058 +               delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
7059 +               ZF_SET(node, JNODE_HEARD_BANSHEE);
7060 +       }
7061 +
7062 +       /* proceed with inserting extent item into parent. We are definitely
7063 +          inserting rather than pasting if we get that far. */
7064 +       insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
7065 +       if (IS_ERR(insert_extent))
7066 +               /* @delete_dummy will be automatically destroyed on the level
7067 +                  exiting  */
7068 +               return PTR_ERR(insert_extent);
7069 +       /* NOTE-NIKITA insertion by key is simplest option here. Another
7070 +          possibility is to insert on the left or right of already existing
7071 +          item.
7072 +        */
7073 +       insert_extent->u.insert.type = COPT_KEY;
7074 +       insert_extent->u.insert.d = op->u.extent.d;
7075 +       assert("nikita-1719", op->u.extent.d->key != NULL);
7076 +       insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
7077 +       insert_extent->u.insert.flags =
7078 +           znode_get_tree(node)->carry.new_extent_flags;
7079 +
7080 +       /*
7081 +        * if carry was asked to track lock handle we should actually track
7082 +        * lock handle on the twig node rather than on the leaf where
7083 +        * operation was started from. Transfer tracked lock handle.
7084 +        */
7085 +       if (doing->track_type) {
7086 +               assert("nikita-3242", doing->tracked != NULL);
7087 +               assert("nikita-3244", todo->tracked == NULL);
7088 +               todo->tracked = doing->tracked;
7089 +               todo->track_type = CARRY_TRACK_NODE;
7090 +               doing->tracked = NULL;
7091 +               doing->track_type = 0;
7092 +       }
7093 +
7094 +       return 0;
7095 +}
7096 +
7097 +/* update key in @parent between pointers to @left and @right.
7098 +
7099 +   Find coords of @left and @right and update delimiting key between them.
7100 +   This is helper function called by carry_update(). Finds position of
7101 +   internal item involved. Updates item key. Returns 0 on success; on failure
7102 +   returns non-zero and stores a description of the problem in @error_msg.
7103 +*/
7104 +static int update_delimiting_key(znode * parent        /* node key is updated
7105 +                                                * in */ ,
7106 +                                znode * left /* child of @parent */ ,
7107 +                                znode * right /* child of @parent */ ,
7108 +                                carry_level * doing    /* current carry
7109 +                                                        * level */ ,
7110 +                                carry_level * todo     /* parent carry
7111 +                                                        * level */ ,
7112 +                                const char **error_msg /* place to
7113 +                                                        * store error
7114 +                                                        * message */ )
7115 +{
7116 +       coord_t left_pos;
7117 +       coord_t right_pos;
7118 +       int result;
7119 +       reiser4_key ldkey;
7120 +       carry_plugin_info info;
7121 +
7122 +       assert("nikita-1177", right != NULL);
7123 +       /* find position of the right child in a parent */
7124 +       result = find_child_ptr(parent, right, &right_pos);
7125 +       if (result != NS_FOUND) {
7126 +               *error_msg = "Cannot find position of right child";
7127 +               return result;
7128 +       }
7129 +
7130 +       if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
7131 +               /* find position of the left child in a parent */
7132 +               result = find_child_ptr(parent, left, &left_pos);
7133 +               if (result != NS_FOUND) {
7134 +                       *error_msg = "Cannot find position of left child";
7135 +                       return result;
7136 +               }
7137 +               assert("nikita-1355", left_pos.node != NULL);
7138 +       } else
7139 +               left_pos.node = NULL;
7140 +
7141 +       /* check that they are separated by exactly one key and are basically
7142 +          sane (only when REISER4_DEBUG is enabled) */
7143 +       if (REISER4_DEBUG) {
7144 +               if ((left_pos.node != NULL)
7145 +                   && !coord_is_existing_unit(&left_pos)) {
7146 +                       *error_msg = "Left child is bastard";
7147 +                       return RETERR(-EIO);
7148 +               }
7149 +               if (!coord_is_existing_unit(&right_pos)) {
7150 +                       *error_msg = "Right child is bastard";
7151 +                       return RETERR(-EIO);
7152 +               }
7153 +               if (left_pos.node != NULL &&
7154 +                   !coord_are_neighbors(&left_pos, &right_pos)) {
7155 +                       *error_msg = "Children are not direct siblings";
7156 +                       return RETERR(-EIO);
7157 +               }
7158 +       }
7159 +       *error_msg = NULL;
7160 +
7161 +       info.doing = doing;
7162 +       info.todo = todo;
7163 +
7164 +       /*
7165 +        * If child node is not empty, new key of internal item is a key of
7166 +        * leftmost item in the child node. If the child is empty, take its
7167 +        * right delimiting key as a new key of the internal item. Precise key
7168 +        * in the latter case is not important per se, because the child (and
7169 +        * the internal item) are going to be killed shortly anyway, but we
7170 +        * have to preserve correct order of keys in the parent node.
7171 +        */
7172 +
7173 +       if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
7174 +               leftmost_key_in_node(right, &ldkey);
7175 +       else {
7176 +               read_lock_dk(znode_get_tree(parent));
7177 +               ldkey = *znode_get_rd_key(right);
7178 +               read_unlock_dk(znode_get_tree(parent));
7179 +       }
7180 +       node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
7181 +       doing->restartable = 0;
7182 +       znode_make_dirty(parent);
7183 +       return 0;
7184 +}
7185 +
7186 +/* implements COP_UPDATE operation
7187 +
7188 +   Update delimiting keys. Returns result of update_delimiting_key(), or
7189 +   -EIO if @op->node has no parent; a warning is logged on any failure.
7190 +*/
7191 +static int carry_update(carry_op * op /* operation to be performed */ ,
7192 +                       carry_level * doing /* current carry level */ ,
7193 +                       carry_level * todo/* next carry level */)
7194 +{
7195 +       int result;
7196 +       carry_node *missing UNUSED_ARG;
7197 +       znode *left;
7198 +       znode *right;
7199 +       carry_node *lchild;
7200 +       carry_node *rchild;
7201 +       const char *error_msg;
7202 +       reiser4_tree *tree;
7203 +
7204 +       /*
7205 +        * This operation is called to update key of internal item. This is
7206 +        * necessary when carry shifted or cut data on the child
7207 +        * level. Arguments of this operation are:
7208 +        *
7209 +        *     @right --- child node. Operation should update key of internal
7210 +        *     item pointing to @right.
7211 +        *
7212 +        *     @left --- left neighbor of @right. This parameter is optional.
7213 +        */
7214 +
7215 +       assert("nikita-902", op != NULL);
7216 +       assert("nikita-903", todo != NULL);
7217 +       assert("nikita-904", op->op == COP_UPDATE);
7218 +
7219 +       lchild = op->u.update.left;
7220 +       rchild = op->node;
7221 +
7222 +       if (lchild != NULL) {
7223 +               assert("nikita-1001", lchild->parent);
7224 +               assert("nikita-1003", !lchild->left);
7225 +               left = reiser4_carry_real(lchild);
7226 +       } else
7227 +               left = NULL;
7228 +
7229 +       tree = znode_get_tree(rchild->node);
7230 +       read_lock_tree(tree);
7231 +       right = znode_parent(rchild->node);
7232 +       read_unlock_tree(tree);
7233 +
7234 +       if (right != NULL) {
7235 +               result = update_delimiting_key(right,
7236 +                                              lchild ? lchild->node : NULL,
7237 +                                              rchild->node,
7238 +                                              doing, todo, &error_msg);
7239 +       } else {
7240 +               error_msg = "Cannot find node to update key in";
7241 +               result = RETERR(-EIO);
7242 +       }
7243 +       /* operation will be reposted to the next level by the
7244 +          ->update_item_key() method of node plugin, if necessary. */
7245 +
7246 +       if (result != 0) {
7247 +               warning("nikita-999", "Error updating delimiting key: %s (%i)",
7248 +                       error_msg ? : "", result);
7249 +       }
7250 +       return result;
7251 +}
7252 +
7253 +/* move items from @node during carry. Always returns 0. */
7254 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
7255 +                           coord_t *insert_coord       /* coord where new item
7256 +                                                        * is to be inserted */,
7257 +                           znode * node /* node which data are moved from */ ,
7258 +                           carry_level * doing /* active carry queue */ ,
7259 +                           carry_level * todo  /* carry queue where new
7260 +                                                * operations are to be put
7261 +                                                * in */ ,
7262 +                           unsigned int including_insert_coord_p
7263 +                               /* true if @insert_coord can be moved */ )
7264 +{
7265 +       int result;
7266 +       znode *source;
7267 +       carry_plugin_info info;
7268 +       node_plugin *nplug;
7269 +
7270 +       source = insert_coord->node;
7271 +
7272 +       info.doing = doing;
7273 +       info.todo = todo;
7274 +
7275 +       nplug = node_plugin_by_node(node);
7276 +       result = nplug->shift(insert_coord, node,
7277 +                             (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
7278 +                             (int)including_insert_coord_p, &info);
7279 +       /* the only error ->shift() method of node plugin can return is
7280 +          -ENOMEM due to carry node/operation allocation. */
7281 +       assert("nikita-915", result >= 0 || result == -ENOMEM);
7282 +       if (result > 0) {
7283 +               /*
7284 +                * if some number of bytes was actually shifted, mark nodes
7285 +                * dirty, and carry level as non-restartable.
7286 +                */
7287 +               doing->restartable = 0;
7288 +               znode_make_dirty(source);
7289 +               znode_make_dirty(node);
7290 +       }
7291 +
7292 +       assert("nikita-2077", coord_check(insert_coord));
7293 +       return 0;
7294 +}
7295 +
7296 +typedef carry_node *(*carry_iterator) (carry_node * node);
7297 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
7298 +                                 carry_iterator iterator);
7299 +/* carry_iterator: entry linked before @node via header.level_linkage */
7300 +static carry_node *pool_level_list_prev(carry_node *node)
7301 +{
7302 +       return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
7303 +}
7304 +
7305 +/* look for the left neighbor of given carry node in a carry queue.
7306 +
7307 +   This is used by find_left_neighbor(), but I am not sure that this
7308 +   really gives any advantage. More statistics required.
7309 +   Thin wrapper: find_dir_carry() stepping with pool_level_list_prev().
7310 +*/
7311 +carry_node *find_left_carry(carry_node * node  /* node to find left neighbor
7312 +                                                * of */ ,
7313 +                           carry_level * level/* level to scan */)
7314 +{
7315 +       return find_dir_carry(node, level,
7316 +                             (carry_iterator) pool_level_list_prev);
7317 +}
7318 +/* carry_iterator: entry linked after @node via header.level_linkage */
7319 +static carry_node *pool_level_list_next(carry_node *node)
7320 +{
7321 +       return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
7322 +}
7323 +
7324 +/* look for the right neighbor of given carry node in a
7325 +   carry queue.
7326 +
7327 +   This is used by find_right_neighbor(), but I am not sure that this
7328 +   really gives any advantage. More statistics required.
7329 +   Thin wrapper: find_dir_carry() stepping with pool_level_list_next().
7330 +*/
7331 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
7332 +                                                * of */ ,
7333 +                            carry_level * level/* level to scan */)
7334 +{
7335 +       return find_dir_carry(node, level,
7336 +                             (carry_iterator) pool_level_list_next);
7337 +}
7338 +
7339 +/* look for the left or right neighbor of given carry node in a carry
7340 +   queue. Returns NULL if the end of @level's list is reached first.
7341 +
7342 +   Helper function used by find_{left|right}_carry().
7343 +*/
7344 +static carry_node *find_dir_carry(carry_node * node    /* node to start
7345 +                                                        * scanning from */ ,
7346 +                                 carry_level * level /* level to scan */ ,
7347 +                                 carry_iterator iterator       /* operation to
7348 +                                                                * move to the
7349 +                                                                * next node */)
7350 +{
7351 +       carry_node *neighbor;
7352 +
7353 +       assert("nikita-1059", node != NULL);
7354 +       assert("nikita-1060", level != NULL);
7355 +
7356 +       /* scan list of carry nodes on this level in the iterator's direction,
7357 +          skipping all carry nodes referencing the same znode. */
7358 +       neighbor = node;
7359 +       while (1) {
7360 +               neighbor = iterator(neighbor);
7361 +               if (carry_node_end(level, neighbor))
7362 +                       /* list head is reached */
7363 +                       return NULL;
7364 +               if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
7365 +                       return neighbor;
7366 +       }
7367 +}
7368 +
7369 +/*
7370 + * Memory reservation estimation.
7371 + *
7372 + * Carry process proceeds through tree levels upwards. Carry assumes that it
7373 + * takes tree in consistent state (e.g., that search tree invariants hold),
7374 + * and leaves tree consistent after it finishes. This means that when some
7375 + * error occurs carry cannot simply return if there are pending carry
7376 + * operations. Generic solution for this problem is carry-undo either as
7377 + * transaction manager feature (requiring checkpoints and isolation), or
7378 + * through some carry specific mechanism.
7379 + *
7380 + * Our current approach is to panic if carry hits an error while tree is
7381 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
7382 + * this "memory reservation" mechanism was added.
7383 + *
7384 + * Memory reservation is implemented by perthread-pages.diff patch from
7385 + * core-patches. Its API is defined in <linux/gfp.h>
7386 + *
7387 + *     int  perthread_pages_reserve(int nrpages, gfp_t gfp);
7388 + *     void perthread_pages_release(int nrpages);
7389 + *     int  perthread_pages_count(void);
7390 + *
7391 + * carry estimates its worst case memory requirements at the entry, reserves
7392 + * enough memory, and releases unused pages before returning.
7393 + *
7394 + * Code below estimates worst case memory requirements for a given carry
7395 + * queue. This is done by summing worst case memory requirements for each
7396 + * operation in the queue.
7397 + *
7398 + */
7399 +
7400 +/*
7401 + * Memory requirements of many operations depend on the tree
7402 + * height. For example, item insertion requires new node to be inserted at
7403 + * each tree level in the worst case. What tree height should be used for
7404 + * estimation? Current tree height is wrong, because tree height can change
7405 + * between the time when estimation was done and the time when operation is
7406 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
7407 + * is also not desirable, because it would lead to the huge over-estimation
7408 + * all the time. Plausible solution is "capped tree height": if current tree
7409 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
7410 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
7411 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
7412 + * to be increased even more during short interval of time.
7413 + */
7414 +#define TREE_HEIGHT_CAP (5)
7415 +
7416 +/* return capped tree height for @tree: max(tree->height, TREE_HEIGHT_CAP). */
7417 +static int cap_tree_height(reiser4_tree * tree)
7418 +{
7419 +       return max_t(int, tree->height, TREE_HEIGHT_CAP);
7420 +}
7421 +
7422 +/* return capped tree height for current_tree (see cap_tree_height()). */
7423 +static int capped_height(void)
7424 +{
7425 +       return cap_tree_height(current_tree);
7426 +}
7427 +
7428 +/* return number of PAGE_CACHE_SIZE pages needed to store @bytes (rounded up) */
7429 +static int bytes_to_pages(int bytes)
7430 +{
7431 +       return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
7432 +}
7433 +
7434 +/* how many pages are required to allocate znodes during item insertion. */
7435 +static int carry_estimate_znodes(void)
7436 +{
7437 +       /*
7438 +        * Note that there is a problem here: there is no way to
7439 +        * reserve pages specifically for the given slab. This means that
7440 +        * these pages can be hijacked for some other end.
7441 +        */
7442 +
7443 +       /* in the worst case we need 3 new znodes on each tree level */
7444 +       return bytes_to_pages(capped_height() * sizeof(znode) * 3);
7445 +}
7446 +
7447 +/* how many pages are required to load bitmaps. One bitmap per level.
7448 + * Zero unless the current super block has REISER4_DONT_LOAD_BITMAP set.
7449 + */
7450 +static int carry_estimate_bitmaps(void)
7451 +{
7452 +       if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
7453 +               int bytes;
7454 +
7455 +               bytes = capped_height() * (0 +  /* bnode should be added, but
7456 +                                                * it is private to bitmap.c,
7457 +                                                * skip for now. */
7458 +                                          2 * sizeof(jnode));
7459 +                                               /* working and commit jnodes */
7460 +               return bytes_to_pages(bytes) + 2;       /* and their contents */
7461 +       } else
7462 +               /* bitmaps were pre-loaded during mount */
7463 +               return 0;
7464 +}
7465 +
7466 +/* worst case item insertion memory requirements; @op and @level are unused */
7467 +static int carry_estimate_insert(carry_op * op, carry_level * level)
7468 +{
7469 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
7470 +                                                               /* new atom */
7471 +           capped_height() +   /* new block on each level */
7472 +           1 +         /* and possibly extra new block at the leaf level */
7473 +           3;                  /* loading of leaves into memory */
7474 +}
7475 +
7476 +/* worst case item deletion memory requirements; @op and @level are unused */
7477 +static int carry_estimate_delete(carry_op * op, carry_level * level)
7478 +{
7479 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
7480 +                                                               /* new atom */
7481 +           3;                  /* loading of leaves into memory */
7482 +}
7483 +
7484 +/* worst case tree cut memory requirements; @op and @level are unused */
7485 +static int carry_estimate_cut(carry_op * op, carry_level * level)
7486 +{
7487 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
7488 +                                                               /* new atom */
7489 +           3;                  /* loading of leaves into memory */
7490 +}
7491 +
7492 +/* worst case memory requirements of pasting into item; arguments are unused */
7493 +static int carry_estimate_paste(carry_op * op, carry_level * level)
7494 +{
7495 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
7496 +                                                               /* new atom */
7497 +           capped_height() +   /* new block on each level */
7498 +           1 +         /* and possibly extra new block at the leaf level */
7499 +           3;                  /* loading of leaves into memory */
7500 +}
7501 +
7502 +/* worst case memory requirements of extent insertion: insert plus delete */
7503 +static int carry_estimate_extent(carry_op * op, carry_level * level)
7504 +{
7505 +       return carry_estimate_insert(op, level) +       /* insert extent */
7506 +           carry_estimate_delete(op, level);   /* kill leaf */
7507 +}
7508 +
7509 +/* worst case memory requirements of key update: nothing is reserved */
7510 +static int carry_estimate_update(carry_op * op, carry_level * level)
7511 +{
7512 +       return 0;
7513 +}
7514 +
7515 +/* worst case memory requirements of flow insertion */
7516 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
7517 +{
7518 +       int newnodes;
7519 +
7520 +       newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
7521 +                      CARRY_FLOW_NEW_NODES_LIMIT);
7522 +       /*
7523 +        * roughly estimate insert_flow as a sequence of @newnodes insertions.
7524 +        */
7525 +       return newnodes * carry_estimate_insert(op, level);
7526 +}
7527 +
7528 +/* Dispatch table for carry operations, indexed by COP_* code. It can be
7529 +   trivially abstracted into a useful plugin: tunable balancing policy is
7530 +   a good thing. */
7531 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
7532 +       [COP_INSERT] = {
7533 +                       .handler = carry_insert,
7534 +                       .estimate = carry_estimate_insert}
7535 +       ,
7536 +       [COP_DELETE] = {
7537 +                       .handler = carry_delete,
7538 +                       .estimate = carry_estimate_delete}
7539 +       ,
7540 +       [COP_CUT] = {
7541 +                    .handler = carry_cut,
7542 +                    .estimate = carry_estimate_cut}
7543 +       ,
7544 +       [COP_PASTE] = {
7545 +                      .handler = carry_paste,
7546 +                      .estimate = carry_estimate_paste}
7547 +       ,
7548 +       [COP_EXTENT] = {
7549 +                       .handler = carry_extent,
7550 +                       .estimate = carry_estimate_extent}
7551 +       ,
7552 +       [COP_UPDATE] = {
7553 +                       .handler = carry_update,
7554 +                       .estimate = carry_estimate_update}
7555 +       ,
7556 +       [COP_INSERT_FLOW] = {
7557 +                            .handler = carry_insert_flow,
7558 +                            .estimate = carry_estimate_insert_flow}
7559 +};
7560 +
7561 +/* Make Linus happy.
7562 +   Local variables:
7563 +   c-indentation-style: "K&R"
7564 +   mode-name: "LC"
7565 +   c-basic-offset: 8
7566 +   tab-width: 8
7567 +   fill-column: 120
7568 +   scroll-step: 1
7569 +   End:
7570 +*/
7571 diff -puN /dev/null fs/reiser4/carry_ops.h
7572 --- /dev/null
7573 +++ a/fs/reiser4/carry_ops.h
7574 @@ -0,0 +1,43 @@
7575 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
7576 +   reiser4/README */
7577 +
7578 +/* implementation of carry operations. See carry_ops.c for details. */
7579 +
7580 +#if !defined(__CARRY_OPS_H__)
7581 +#define __CARRY_OPS_H__
7582 +
7583 +#include "forward.h"
7584 +#include "znode.h"
7585 +#include "carry.h"
7586 +
7587 +/* carry operation handlers: element type of op_dispatch_table[] */
7588 +typedef struct carry_op_handler {
7589 +       /* perform operation @op */
7590 +       int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
7591 +       /* estimate memory requirements for @op */
7592 +       int (*estimate) (carry_op * op, carry_level * level);
7593 +} carry_op_handler;
7594 +
7595 +/* This is the dispatch table for carry operations. It can be trivially
7596 +   abstracted into a useful plugin: tunable balancing policy is a good
7597 +   thing. */
7598 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
7599 +
7600 +unsigned int space_needed(const znode * node, const coord_t *coord,
7601 +                         const reiser4_item_data * data, int inserting);
7602 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
7603 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
7604 +
7605 +/* __CARRY_OPS_H__ */
7606 +#endif
7607 +
7608 +/* Make Linus happy.
7609 +   Local variables:
7610 +   c-indentation-style: "K&R"
7611 +   mode-name: "LC"
7612 +   c-basic-offset: 8
7613 +   tab-width: 8
7614 +   fill-column: 120
7615 +   scroll-step: 1
7616 +   End:
7617 +*/
7618 diff -puN /dev/null fs/reiser4/context.c
7619 --- /dev/null
7620 +++ a/fs/reiser4/context.c
7621 @@ -0,0 +1,289 @@
7622 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7623 +
7624 +/* Manipulation of reiser4_context */
7625 +
7626 +/*
7627 + * global context used during system call. Variable of this type is allocated
7628 + * on the stack at the beginning of the reiser4 part of the system call and
7629 + * pointer to it is stored in the current->fs_context. This allows us to avoid
7630 + * passing pointer to current transaction and current lockstack (both in
7631 + * one-to-one mapping with threads) all over the call chain.
7632 + *
7633 + * It's kind of like those global variables the prof used to tell you not to
7634 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
7635 + *
7636 + * In some situations it is desirable to have ability to enter reiser4_context
7637 + * more than once for the same thread (nested contexts). For example, there
7638 + * are some functions that can be called either directly from VFS/VM or from
7639 + * already active reiser4 context (->writepage, for example).
7640 + *
7641 + * In such situations "child" context acts like dummy: all activity is
7642 + * actually performed in the top level context, and get_current_context()
7643 + * always returns top level context.
7644 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
7645 + * nested anyway.
7646 + *
7647 + * Note that there is an important difference between reiser4 uses
7648 + * ->fs_context and the way other file systems use it. Other file systems
7649 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
7650 + * (this is why ->fs_context was initially called ->journal_info). This means,
7651 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
7652 + * to the file system, they assume that some transaction is already underway,
7653 + * and usually bail out, because starting nested transaction would most likely
7654 + * lead to the deadlock. This gives false positives with reiser4, because we
7655 + * set ->fs_context before starting transaction.
7656 + */
7657 +
7658 +#include "debug.h"
7659 +#include "super.h"
7660 +#include "context.h"
7661 +
7662 +#include <linux/writeback.h>   /* balance_dirty_pages() */
7663 +#include <linux/hardirq.h>
7664 +
7665 +static void _reiser4_init_context(reiser4_context * context,
7666 +                                 struct super_block *super)
7667 +{
7668 +       memset(context, 0, sizeof(*context));
7669 +
7670 +       context->super = super;
7671 +       context->magic = context_magic;
7672 +       context->outer = current->journal_info; /* saved for restore on exit */
7673 +       current->journal_info = (void *)context;        /* bind to this thread */
7674 +       context->nr_children = 0;
7675 +       context->gfp_mask = GFP_KERNEL;
7676 +
7677 +       init_lock_stack(&context->stack);
7678 +
7679 +       reiser4_txn_begin(context);
7680 +
7681 +       /* initialize head of tap list */
7682 +       INIT_LIST_HEAD(&context->taps);
7683 +#if REISER4_DEBUG
7684 +       context->task = current;
7685 +#endif
7686 +       grab_space_enable();    /* reads the context via ->journal_info set above */
7687 +}
7688 +
7689 +/* initialize context and bind it to the current thread
7690 +
7691 +   This function should be called at the beginning of reiser4 part of
7692 +   syscall. Returns the context, or an ERR_PTR() value if kmalloc fails.
7693 +*/
7694 +reiser4_context * reiser4_init_context(struct super_block *super)
7695 +{
7696 +       reiser4_context *context;
7697 +
7698 +       assert("nikita-2662", !in_interrupt() && !in_irq());
7699 +       assert("nikita-3357", super != NULL);
7700 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
7701 +
7702 +       context = get_current_context_check();
7703 +       if (context && context->super == super) {
7704 +               context = (reiser4_context *) current->journal_info;
7705 +               context->nr_children++; /* nested entry: reuse existing context */
7706 +               return context;
7707 +       }
7708 +
7709 +       context = kmalloc(sizeof(*context), GFP_KERNEL);
7710 +       if (context == NULL)
7711 +               return ERR_PTR(RETERR(-ENOMEM));
7712 +
7713 +       _reiser4_init_context(context, super);
7714 +       return context;
7715 +}
7716 +
7717 +/* set up caller-provided @context, marked ->on_stack so it is not kfree-ed.
7718 +   Used in scan_mgr (called with spinlock held) and in reiser4_fill_super */
7719 +void init_stack_context(reiser4_context *context, struct super_block *super)
7720 +{
7721 +       assert("nikita-2662", !in_interrupt() && !in_irq());
7722 +       assert("nikita-3357", super != NULL);
7723 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
7724 +       assert("vs-12", !is_in_reiser4_context());
7725 +
7726 +       _reiser4_init_context(context, super);
7727 +       context->on_stack = 1;
7728 +       return;
7729 +}
7730 +
7731 +/* map lock stack @owner, embedded into a reiser4 context, to its container */
7732 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
7733 +{
7734 +       return container_of(owner, reiser4_context, stack);
7735 +}
7736 +
7737 +/* true if there is already _any_ reiser4 context for the current thread */
7738 +int is_in_reiser4_context(void)
7739 +{
7740 +       reiser4_context *ctx;
7741 +
7742 +       ctx = current->journal_info;
7743 +       return ctx != NULL && ((unsigned long)ctx->magic) == context_magic; /* magic identifies reiser4 */
7744 +}
7745 +
7746 +/*
7747 + * call balance_dirty_pages_ratelimited() for @context, if appropriate.
7748 + *
7749 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
7750 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
7751 + * write---this covers vast majority of all dirty traffic), but we cannot do
7752 + * this immediately when formatted node is dirtied, because long term lock is
7753 + * usually held at that time. To work around this, dirtying of formatted node
7754 + * simply increases ->nr_marked_dirty counter in the current reiser4
7755 + * context. When we are about to leave this context,
7756 + * balance_dirty_pages_ratelimited() is called, if necessary.
7757 + *
7758 + * This introduces another problem: sometimes we do not want to run
7759 + * balance_dirty_pages_ratelimited() when leaving a context, for example
7760 + * because some important lock (like ->i_mutex on the parent directory) is
7761 + * held. To achieve this, ->nobalance flag can be set in the current context.
7762 + */
7763 +static void balance_dirty_pages_at(reiser4_context *context)
7764 +{
7765 +       reiser4_super_info_data *sbinfo = get_super_private(context->super);
7766 +
7767 +       /*
7768 +        * call balance_dirty_pages_ratelimited() to process formatted nodes
7769 +        * dirtied during this system call. Do that only if we are not in mount
7770 +        * and there were nodes dirtied in this context and we are not in
7771 +        * writepage (to avoid deadlock) and not in pdflush
7772 +        */
7773 +       if (sbinfo != NULL && sbinfo->fake != NULL &&
7774 +           context->nr_marked_dirty != 0 &&
7775 +           !(current->flags & PF_MEMALLOC) &&
7776 +           !current_is_pdflush())
7777 +               balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
7778 +}
7779 +
7780 +/* release resources associated with context.
7781 +
7782 +   This function should be called at the end of "session" with reiser4,
7783 +   typically just before leaving reiser4 driver back to VFS.
7784 +
7785 +   This is a good place to put some debugging consistency checks, like that
7786 +   thread released all locks and closed transcrash etc.
7787 +
7788 +*/
7789 +static void reiser4_done_context(reiser4_context * context)
7790 +                               /* context being released */
7791 +{
7792 +       assert("nikita-860", context != NULL);
7793 +       assert("nikita-859", context->magic == context_magic);
7794 +       assert("vs-646", (reiser4_context *) current->journal_info == context);
7795 +       assert("zam-686", !in_interrupt() && !in_irq());
7796 +
7797 +       /* only do anything when leaving top-level reiser4 context. All nested
7798 +        * contexts are just dummies. */
7799 +       if (context->nr_children == 0) {
7800 +               assert("jmacd-673", context->trans == NULL);
7801 +               assert("jmacd-1002", lock_stack_isclean(&context->stack));
7802 +               assert("nikita-1936", reiser4_no_counters_are_held());
7803 +               assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
7804 +               assert("zam-1004", ergo(get_super_private(context->super),
7805 +                                       get_super_private(context->super)->delete_mutex_owner !=
7806 +                                       current));
7807 +
7808 +               /* release all grabbed but as yet unused blocks */
7809 +               if (context->grabbed_blocks != 0)
7810 +                       all_grabbed2free();
7811 +
7812 +               /*
7813 +                * synchronize against longterm_unlock_znode():
7814 +                * wake_up_requestor() wakes up requestors without holding
7815 +                * zlock (otherwise they will immediately bump into that lock
7816 +                * after wake up on another CPU). To work around (rare)
7817 +                * situation where requestor has been woken up asynchronously
7818 +                * and managed to run until completion (and destroy its
7819 +                * context and lock stack) before wake_up_requestor() called
7820 +                * wake_up() on it, wake_up_requestor() synchronize on lock
7821 +                * stack spin lock. It has actually been observed that spin
7822 +                * lock _was_ locked at this point, because
7823 +                * wake_up_requestor() took interrupt.
7824 +                */
7825 +               spin_lock_stack(&context->stack);
7826 +               spin_unlock_stack(&context->stack);
7827 +
7828 +               assert("zam-684", context->nr_children == 0);
7829 +               /* restore original ->fs_context value */
7830 +               current->journal_info = context->outer;
7831 +               if (context->on_stack == 0)
7832 +                       kfree(context);
7833 +       } else {
7834 +               context->nr_children--;
7835 +#if REISER4_DEBUG
7836 +               assert("zam-685", context->nr_children >= 0);
7837 +#endif
7838 +       }
7839 +}
7840 +
7841 +/*
7842 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
7843 + * transaction. Call reiser4_done_context() to do context related book-keeping.
7844 + */
7845 +void reiser4_exit_context(reiser4_context * context)
7846 +{
7847 +       assert("nikita-3021", reiser4_schedulable());
7848 +
7849 +       if (context->nr_children == 0) {
7850 +               if (!context->nobalance) {
7851 +                       reiser4_txn_restart(context);
7852 +                       balance_dirty_pages_at(context);
7853 +               }
7854 +
7855 +               /* if filesystem is mounted with -o sync or -o dirsync - commit
7856 +                  transaction.  FIXME: TXNH_DONT_COMMIT is used to avoid
7857 +                  committing on exit_context when inode semaphore is held and
7858 +                  to have ktxnmgrd to do commit instead to get better
7859 +                  concurrent filesystem accesses. But, when one mounts with -o
7860 +                  sync, he cares more about reliability than about
7861 +                  performance. So, for now we have this simple mount -o sync
7862 +                  support. */
7863 +               if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
7864 +                       txn_atom *atom;
7865 +
7866 +                       atom = get_current_atom_locked_nocheck();
7867 +                       if (atom) {
7868 +                               atom->flags |= ATOM_FORCE_COMMIT;
7869 +                               context->trans->flags &= ~TXNH_DONT_COMMIT;
7870 +                               spin_unlock_atom(atom);
7871 +                       }
7872 +               }
7873 +               reiser4_txn_end(context);
7874 +       }
7875 +       reiser4_done_context(context);
7876 +}
7877 +
7878 +void reiser4_ctx_gfp_mask_set(void)
7879 +{
7880 +       reiser4_context *ctx;
7881 +
7882 +       ctx = get_current_context();
7883 +       if (ctx->entd == 0 &&
7884 +           list_empty(&ctx->stack.locks) &&
7885 +           ctx->trans->atom == NULL)
7886 +               ctx->gfp_mask = GFP_KERNEL;     /* not entd, lock list empty, no atom */
7887 +       else
7888 +               ctx->gfp_mask = GFP_NOFS;       /* any of the above does not hold */
7889 +}
7890 +
7891 +void reiser4_ctx_gfp_mask_force(gfp_t mask /* value stored unconditionally */)
7892 +{
7893 +       reiser4_context *ctx;
7894 +       ctx = get_current_context();
7895 +
7896 +       assert("edward-1454", ctx != NULL);
7897 +
7898 +       ctx->gfp_mask = mask;
7899 +}
7900 +
7901 +/*
7902 + * Local variables:
7903 + * c-indentation-style: "K&R"
7904 + * mode-name: "LC"
7905 + * c-basic-offset: 8
7906 + * tab-width: 8
7907 + * fill-column: 120
7908 + * scroll-step: 1
7909 + * End:
7910 + */
7911 diff -puN /dev/null fs/reiser4/context.h
7912 --- /dev/null
7913 +++ a/fs/reiser4/context.h
7914 @@ -0,0 +1,228 @@
7915 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
7916 + * reiser4/README */
7917 +
7918 +/* Reiser4 context. See context.c for details. */
7919 +
7920 +#if !defined( __REISER4_CONTEXT_H__ )
7921 +#define __REISER4_CONTEXT_H__
7922 +
7923 +#include "forward.h"
7924 +#include "debug.h"
7925 +#include "dformat.h"
7926 +#include "tap.h"
7927 +#include "lock.h"
7928 +
7929 +#include <linux/types.h>       /* for __u??  */
7930 +#include <linux/fs.h>          /* for struct super_block  */
7931 +#include <linux/spinlock.h>
7932 +#include <linux/sched.h>       /* for struct task_struct */
7933 +
7934 +/* reiser4 per-thread context */
7935 +struct reiser4_context {
7936 +       /* magic constant. For identification of reiser4 contexts. */
7937 +       __u32 magic;
7938 +
7939 +       /* current lock stack. See lock.[ch]. This is where list of all
7940 +          locks taken by current thread is kept. This is also used in
7941 +          deadlock detection. */
7942 +       lock_stack stack;
7943 +
7944 +       /* current transcrash. */
7945 +       txn_handle *trans;
7946 +       /* transaction handle embedded into reiser4_context. ->trans points
7947 +        * here by default. */
7948 +       txn_handle trans_in_ctx;
7949 +
7950 +       /* super block we are working with.  To get the current tree
7951 +          use &get_super_private (reiser4_get_current_sb ())->tree. */
7952 +       struct super_block *super;
7953 +
7954 +       /* parent fs activation */
7955 +       struct fs_activation *outer;
7956 +
7957 +       /* per-thread grabbed (for further allocation) blocks counter */
7958 +       reiser4_block_nr grabbed_blocks;
7959 +
7960 +       /* list of taps currently monitored. See tap.c */
7961 +       struct list_head taps;
7962 +
7963 +       /* grabbing space is enabled */
7964 +       unsigned int grab_enabled:1;
7965 +       /* should be set when we are writing dirty nodes to disk in jnode_flush
7966 +        * or reiser4_write_logs() */
7967 +       unsigned int writeout_mode:1;
7968 +       /* true, if current thread is an ent thread */
7969 +       unsigned int entd:1;
7970 +       /* true, if balance_dirty_pages() should not be run when leaving this
7971 +        * context. This is used to avoid lengthy balance_dirty_pages()
7972 +        * operation when holding some important resource, like directory
7973 +        * ->i_mutex */
7974 +       unsigned int nobalance:1;
7975 +
7976 +       /* this bit is used on reiser4_done_context to decide whether context is
7977 +          kmalloc-ed and has to be kfree-ed */
7978 +       unsigned int on_stack:1;
7979 +
7980 +       /* count non-trivial jnode_set_dirty() calls */
7981 +       unsigned long nr_marked_dirty;
7982 +
7983 +       /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
7984 +        * reiser4_writepages for each of dirty inodes. Reiser4_writepages
7985 +        * captures pages. When number of pages captured in one
7986 +        * reiser4_sync_inodes reaches some threshold - some atoms get
7987 +        * flushed */
7988 +       int nr_captured;
7989 +       int nr_children;        /* number of child contexts */
7990 +#if REISER4_DEBUG
7991 +       /* debugging information about reiser4 locks held by the current
7992 +        * thread */
7993 +       reiser4_lock_cnt_info locks;
7994 +       struct task_struct *task;       /* so we can easily find owner of the stack */
7995 +
7996 +       /*
7997 +        * disk space grabbing debugging support
7998 +        */
7999 +       /* how many disk blocks were grabbed by the first call to
8000 +        * reiser4_grab_space() in this context */
8001 +       reiser4_block_nr grabbed_initially;
8002 +
8003 +       /* list of all threads doing flush currently */
8004 +       struct list_head flushers_link;
8005 +       /* information about last error encountered by reiser4 */
8006 +       err_site err;
8007 +#endif
8008 +       void *vp;
8009 +       gfp_t gfp_mask;
8010 +};
8011 +
8012 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
8013 +
8014 +/* Debugging helps. */
8015 +#if REISER4_DEBUG
8016 +extern void print_contexts(void);
8017 +#endif
8018 +
8019 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
8020 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
8021 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
8022 +
8023 +extern reiser4_context *reiser4_init_context(struct super_block *);
8024 +extern void init_stack_context(reiser4_context *, struct super_block *);
8025 +extern void reiser4_exit_context(reiser4_context *);
8026 +
8027 +/* magic constant we store in reiser4_context allocated at the stack. Used to
8028 +   catch accesses to stale or uninitialized contexts. */
8029 +#define context_magic ((__u32) 0x4b1b5d0b)
8030 +
8031 +extern int is_in_reiser4_context(void);
8032 +
8033 +/*
8034 + * return reiser4_context for the thread @tsk; its magic is asserted
8035 + */
8036 +static inline reiser4_context *get_context(const struct task_struct *tsk)
8037 +{
8038 +       assert("vs-1682",
8039 +              ((reiser4_context *) tsk->journal_info)->magic == context_magic);
8040 +       return (reiser4_context *) tsk->journal_info;
8041 +}
8042 +
8043 +/*
8044 + * return reiser4 context of the current thread, or NULL if there is none
8045 + * (as reported by is_in_reiser4_context()).
8046 +static inline reiser4_context *get_current_context_check(void)
8047 +{
8048 +       if (is_in_reiser4_context())
8049 +               return get_context(current);
8050 +       else
8051 +               return NULL;
8052 +}
8053 +
8054 +static inline reiser4_context *get_current_context(void);      /* __attribute__((const)); */
8055 +
8056 +/* return context associated with current thread; no NULL check is made */
8057 +static inline reiser4_context *get_current_context(void)
8058 +{
8059 +       return get_context(current);
8060 +}
8061 +
8062 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
8063 +{
8064 +       reiser4_context *ctx;
8065 +
8066 +       ctx = get_current_context_check();
8067 +       return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;      /* GFP_KERNEL if no context */
8068 +}
8069 +
8070 +void reiser4_ctx_gfp_mask_set(void);
8071 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
8072 +
8073 +/*
8074 + * true if current thread is in the write-out mode. Thread enters write-out
8075 + * mode during jnode_flush and reiser4_write_logs().
8076 + */
8077 +static inline int is_writeout_mode(void)
8078 +{
8079 +       return get_current_context()->writeout_mode;    /* 1-bit flag of the context */
8080 +}
8081 +
8082 +/*
8083 + * enter write-out mode; must not already be in it (asserted)
8084 + */
8085 +static inline void writeout_mode_enable(void)
8086 +{
8087 +       assert("zam-941", !get_current_context()->writeout_mode);
8088 +       get_current_context()->writeout_mode = 1;
8089 +}
8090 +
8091 +/*
8092 + * leave write-out mode; must currently be in it (asserted)
8093 + */
8094 +static inline void writeout_mode_disable(void)
8095 +{
8096 +       assert("zam-942", get_current_context()->writeout_mode);
8097 +       get_current_context()->writeout_mode = 0;
8098 +}
8099 +
8100 +static inline void grab_space_enable(void)
8101 +{
8102 +       get_current_context()->grab_enabled = 1;        /* set flag in current context */
8103 +}
8104 +
8105 +static inline void grab_space_disable(void)
8106 +{
8107 +       get_current_context()->grab_enabled = 0;        /* clear flag in current context */
8108 +}
8109 +
8110 +static inline void grab_space_set_enabled(int enabled /* zero disables, non-zero enables */)
8111 +{
8112 +       get_current_context()->grab_enabled = !!enabled;        /* 1-bit field: keep any non-zero as 1 */
8113 +}
8114 +
8115 +static inline int is_grab_enabled(reiser4_context * ctx /* context to query */)
8116 +{
8117 +       return ctx->grab_enabled;
8118 +}
8119 +
8120 +/* mark transaction handle in @context as TXNH_DONT_COMMIT, so that no commit
8121 + * or flush would be performed when it is closed. This is necessary when handle
8122 + * has to be closed under some coarse semaphore, like i_mutex of
8123 + * directory. Commit will be performed by ktxnmgrd. */
8124 +static inline void context_set_commit_async(reiser4_context * context)
8125 +{
8126 +       context->nobalance = 1; /* also skip balance_dirty_pages on exit */
8127 +       context->trans->flags |= TXNH_DONT_COMMIT;
8128 +}
8129 +
8130 +/* __REISER4_CONTEXT_H__ */
8131 +#endif
8132 +
8133 +/* Make Linus happy.
8134 +   Local variables:
8135 +   c-indentation-style: "K&R"
8136 +   mode-name: "LC"
8137 +   c-basic-offset: 8
8138 +   tab-width: 8
8139 +   fill-column: 120
8140 +   scroll-step: 1
8141 +   End:
8142 +*/
8143 diff -puN /dev/null fs/reiser4/coord.c
8144 --- /dev/null
8145 +++ a/fs/reiser4/coord.c
8146 @@ -0,0 +1,928 @@
8147 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8148 +   reiser4/README */
8149 +
8150 +#include "forward.h"
8151 +#include "debug.h"
8152 +#include "dformat.h"
8153 +#include "tree.h"
8154 +#include "plugin/item/item.h"
8155 +#include "znode.h"
8156 +#include "coord.h"
8157 +
8158 +/* Internal constructor. */
8159 +static inline void
8160 +coord_init_values(coord_t *coord, const znode * node, pos_in_node_t item_pos,
8161 +                 pos_in_node_t unit_pos, between_enum between)
8162 +{
8163 +       coord->node = (znode *) node;
8164 +       coord_set_item_pos(coord, item_pos);
8165 +       coord->unit_pos = unit_pos;
8166 +       coord->between = between;
8167 +       ON_DEBUG(coord->plug_v = 0);
8168 +       ON_DEBUG(coord->body_v = 0);
8169 +
8170 +       /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord,
8171 + node, item_pos, unit_pos, coord_tween_tostring (between)); */
8172 +}
8173 +
8174 +/* After shifting of node content, a coord that was previously set properly may
8175 +   point past the end of the node or item; try to "normalize" it in place. */
8176 +void coord_normalize(coord_t *coord)
8177 +{
8178 +       znode *node;
8179 +
8180 +       node = coord->node;
8181 +       assert("vs-683", node);
8182 +
8183 +       coord_clear_iplug(coord);
8184 +       /* test ->between and item_pos before asking an item for its units */
8185 +       if (node_is_empty(node))
8186 +               coord_init_first_unit(coord, node);
8187 +       else if ((coord->between == AFTER_ITEM)
8188 +                  || (coord->between == AFTER_UNIT))
8189 +               return;
8190 +       else if (coord->between == BEFORE_ITEM
8191 +                  && coord->item_pos == coord_num_items(coord)) {
8192 +               coord_dec_item_pos(coord);
8193 +               coord->between = AFTER_ITEM;
8194 +       } else if (coord->between == BEFORE_UNIT && coord->unit_pos == 0
8195 +                  && coord->item_pos == coord_num_items(coord)) {
8196 +               coord_dec_item_pos(coord);
8197 +               coord->unit_pos = 0;
8198 +               coord->between = AFTER_ITEM;
8199 +       } else if (coord->between == BEFORE_UNIT
8200 +                  && coord->unit_pos == coord_num_units(coord)) {
8201 +               coord->unit_pos--;
8202 +               coord->between = AFTER_UNIT;
8203 +       }
8204 +}
8205 +
8206 +/* Copy a coordinate. */
8207 +void coord_dup(coord_t *coord, const coord_t *old_coord)
8208 +{
8209 +       assert("jmacd-9800", coord_check(old_coord));
8210 +       coord_dup_nocheck(coord, old_coord);
8211 +}
8212 +
8213 +/* Copy a coordinate without check. Useful when old_coord->node is not
8214 +   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
8215 +void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord)
8216 +{
8217 +       coord->node = old_coord->node;
8218 +       coord_set_item_pos(coord, old_coord->item_pos);
8219 +       coord->unit_pos = old_coord->unit_pos;
8220 +       coord->between = old_coord->between;
8221 +       coord->iplugid = old_coord->iplugid;
8222 +       ON_DEBUG(coord->plug_v = old_coord->plug_v);
8223 +       ON_DEBUG(coord->body_v = old_coord->body_v);
8224 +}
8225 +
8226 +/* Initialize an invalid coordinate. */
8227 +void coord_init_invalid(coord_t *coord, const znode * node)
8228 +{
8229 +       coord_init_values(coord, node, 0, 0, INVALID_COORD);
8230 +}
8231 +
8232 +void coord_init_first_unit_nocheck(coord_t *coord, const znode * node)
8233 +{
8234 +       coord_init_values(coord, node, 0, 0, AT_UNIT);
8235 +}
8236 +
8237 +/* Initialize a coordinate to point at the first unit of the first item. If the
8238 +   node is empty, it is positioned at the EMPTY_NODE. */
8239 +void coord_init_first_unit(coord_t *coord, const znode * node)
8240 +{
8241 +       int is_empty = node_is_empty(node);
8242 +
8243 +       coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
8244 +
8245 +       assert("jmacd-9801", coord_check(coord));
8246 +}
8247 +
8248 +/* Initialize a coordinate to point at the last unit of the last item.  If the
8249 +   node is empty, it is positioned at the EMPTY_NODE. */
8250 +void coord_init_last_unit(coord_t *coord, const znode * node)
8251 +{
8252 +       int is_empty = node_is_empty(node);
8253 +
8254 +       coord_init_values(coord, node,
8255 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
8256 +                         (is_empty ? EMPTY_NODE : AT_UNIT));
8257 +       if (!is_empty)
8258 +               coord->unit_pos = coord_last_unit_pos(coord);
8259 +       assert("jmacd-9802", coord_check(coord));
8260 +}
8261 +
8262 +/* Initialize a coordinate to before the first item. If the node is empty, it is
8263 +   positioned at the EMPTY_NODE. */
8264 +void coord_init_before_first_item(coord_t *coord, const znode * node)
8265 +{
8266 +       int is_empty = node_is_empty(node);
8267 +
8268 +       coord_init_values(coord, node, 0, 0,
8269 +                         (is_empty ? EMPTY_NODE : BEFORE_UNIT));
8270 +
8271 +       assert("jmacd-9803", coord_check(coord));
8272 +}
8273 +
8274 +/* Initialize a coordinate to after the last item. If the node is empty, it is
8275 +   positioned at the EMPTY_NODE. */
8276 +void coord_init_after_last_item(coord_t *coord, const znode * node)
8277 +{
8278 +       int is_empty = node_is_empty(node);
8279 +
8280 +       coord_init_values(coord, node,
8281 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
8282 +                         (is_empty ? EMPTY_NODE : AFTER_ITEM));
8283 +
8284 +       assert("jmacd-9804", coord_check(coord));
8285 +}
8286 +
8287 +/* Initialize a coordinate to after last unit in the item. Coord must be set
8288 +   already to existing item */
8289 +void coord_init_after_item_end(coord_t *coord)
8290 +{
8291 +       coord->between = AFTER_UNIT;
8292 +       coord->unit_pos = coord_last_unit_pos(coord);
8293 +}
8294 +
8295 +/* Initialize a coordinate to before the item. Coord must be set already to
8296 +   existing item */
8297 +void coord_init_before_item(coord_t *coord)
8298 +{
8299 +       coord->unit_pos = 0;
8300 +       coord->between = BEFORE_ITEM;
8301 +}
8302 +
8303 +/* Initialize a coordinate to after the item. Coord must be set already to
8304 +   existing item */
8305 +void coord_init_after_item(coord_t *coord)
8306 +{
8307 +       coord->unit_pos = 0;
8308 +       coord->between = AFTER_ITEM;
8309 +}
8310 +
8311 +/* Initialize a coordinate with zeros. Used in places where init_coord was
8312 +   used and it was not clear how exactly. */
8313 +void coord_init_zero(coord_t *coord)
8314 +{
8315 +       memset(coord, 0, sizeof(*coord));
8316 +}
8317 +
8318 +/* Return the number of units at the present item.
8319 +   Asserts coord_is_existing_item(). */
8320 +unsigned coord_num_units(const coord_t *coord)
8321 +{
8322 +       assert("jmacd-9806", coord_is_existing_item(coord));
8323 +
8324 +       return item_plugin_by_coord(coord)->b.nr_units(coord);
8325 +}
8326 +
8327 +/* Returns true if the coord was initialized by coord_init_invalid(). */
8328 +/* Audited by: green(2002.06.15) */
8329 +int coord_is_invalid(const coord_t *coord)
8330 +{
8331 +       return coord->between == INVALID_COORD;
8332 +}
8333 +
8334 +/* Returns true if the coordinate is positioned at an existing item, not before
8335 +   or after an item.  It may be placed at, before, or after any unit within the
8336 +   item, whether existing or not. */
8337 +int coord_is_existing_item(const coord_t *coord)
8338 +{
8339 +       switch (coord->between) {
8340 +       case EMPTY_NODE:
8341 +       case BEFORE_ITEM:
8342 +       case AFTER_ITEM:
8343 +       case INVALID_COORD:
8344 +               return 0;
8345 +
8346 +       case BEFORE_UNIT:
8347 +       case AT_UNIT:
8348 +       case AFTER_UNIT:
8349 +               return coord->item_pos < coord_num_items(coord);
8350 +       }
8351 +
8352 +       impossible("jmacd-9900", "unreachable coord: %p", coord);
8353 +       return 0;
8354 +}
8355 +
8356 +/* Returns true if the coordinate is positioned at an existing unit, not before
8357 +   or after a unit. */
8358 +/* Audited by: green(2002.06.15) */
8359 +int coord_is_existing_unit(const coord_t *coord)
8360 +{
8361 +       switch (coord->between) {
8362 +       case EMPTY_NODE:
8363 +       case BEFORE_UNIT:
8364 +       case AFTER_UNIT:
8365 +       case BEFORE_ITEM:
8366 +       case AFTER_ITEM:
8367 +       case INVALID_COORD:
8368 +               return 0;
8369 +
8370 +       case AT_UNIT:
8371 +               return (coord->item_pos < coord_num_items(coord)
8372 +                       && coord->unit_pos < coord_num_units(coord));
8373 +       }
8374 +
8375 +       impossible("jmacd-9902", "unreachable");
8376 +       return 0;
8377 +}
8378 +
8379 +/* Returns true if the coordinate is positioned at the first unit of the first
8380 +   item. Not true for empty nodes nor coordinates positioned before the first
8381 +   item. */
8382 +/* Audited by: green(2002.06.15) */
8383 +int coord_is_leftmost_unit(const coord_t *coord)
8384 +{
8385 +       return (coord->between == AT_UNIT && coord->item_pos == 0
8386 +               && coord->unit_pos == 0);
8387 +}
8388 +
8389 +#if REISER4_DEBUG
8390 +/* For assertions only, checks for a valid coordinate. */
8391 +int coord_check(const coord_t *coord)
8392 +{
8393 +       if (coord->node == NULL)
8394 +               return 0;
8395 +       if (znode_above_root(coord->node))
8396 +               return 1;
8397 +
8398 +       switch (coord->between) {
8399 +       default:
8400 +       case INVALID_COORD:
8401 +               return 0;
8402 +       case EMPTY_NODE:
8403 +               if (!node_is_empty(coord->node))
8404 +                       return 0;
8405 +               return coord->item_pos == 0 && coord->unit_pos == 0;
8406 +
8407 +       case BEFORE_UNIT:
8408 +       case AFTER_UNIT:
8409 +               if (node_is_empty(coord->node) && (coord->item_pos == 0)
8410 +                   && (coord->unit_pos == 0))
8411 +                       return 1;
8412 +       case AT_UNIT:
8413 +               break;
8414 +       case AFTER_ITEM:
8415 +       case BEFORE_ITEM:
8416 +               /* before/after item should not set unit_pos. */
8417 +               if (coord->unit_pos != 0)
8418 +                       return 0;
8419 +               break;
8420 +       }
8421 +
8422 +       if (coord->item_pos >= node_num_items(coord->node))
8423 +               return 0;
8424 +
8425 +       /* FIXME-VS: we are going to check unit_pos. This makes no sense when
8426 +          between is set either AFTER_ITEM or BEFORE_ITEM */
8427 +       if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
8428 +               return 1;
8429 +
8430 +       if (coord_is_iplug_set(coord) &&
8431 +           coord->unit_pos >
8432 +           item_plugin_by_coord(coord)->b.nr_units(coord) - 1)
8433 +               return 0;
8434 +       return 1;
8435 +}
8436 +#endif
8437 +
8438 +/* Adjust coordinate boundaries based on the number of items prior to
8439 +   coord_next/prev. Returns 1 if the new position does not exist. */
8440 +static int coord_adjust_items(coord_t *coord, unsigned items, int is_next)
8441 +{
8442 +       /* If the coord is invalid, leave it. */
8443 +       if (coord->between == INVALID_COORD)
8444 +               return 1;
8445 +
8446 +       /* If the node is empty, set it appropriately. */
8447 +       if (items == 0) {
8448 +               coord->between = EMPTY_NODE;
8449 +               coord_set_item_pos(coord, 0);
8450 +               coord->unit_pos = 0;
8451 +               return 1;
8452 +       }
8453 +
8454 +       /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
8455 +       if (coord->between == EMPTY_NODE) {
8456 +               coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
8457 +               coord_set_item_pos(coord, 0);
8458 +               coord->unit_pos = 0;
8459 +               return 0;
8460 +       }
8461 +
8462 +       /* If the item_pos is out-of-range, set it appropriately. */
8463 +       if (coord->item_pos >= items) {
8464 +               coord->between = AFTER_ITEM;
8465 +               coord_set_item_pos(coord, items - 1);
8466 +               coord->unit_pos = 0;
8467 +               /* If is_next, return 1 (can't go any further). */
8468 +               return is_next;
8469 +       }
8470 +
8471 +       return 0;
8472 +}
8473 +
8474 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
8475 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
8476 +   position is an existing unit. */
8477 +int coord_next_unit(coord_t *coord)
8478 +{
8479 +       unsigned items = coord_num_items(coord);
8480 +
8481 +       if (coord_adjust_items(coord, items, 1) == 1)
8482 +               return 1;
8483 +
8484 +       switch (coord->between) {
8485 +       case BEFORE_UNIT:
8486 +               /* Now it is positioned at the same unit. */
8487 +               coord->between = AT_UNIT;
8488 +               return 0;
8489 +
8490 +       case AFTER_UNIT:
8491 +       case AT_UNIT:
8492 +               /* If it was at or after a unit and there are more units in this
8493 +                  item, advance to the next one. */
8494 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
8495 +                       coord->unit_pos += 1;
8496 +                       coord->between = AT_UNIT;
8497 +                       return 0;
8498 +               }
8499 +
8500 +               /* Otherwise, it is crossing an item boundary and treated as if
8501 +                  it was after the current item. */
8502 +               coord->between = AFTER_ITEM;
8503 +               coord->unit_pos = 0;
8504 +               /* FALLTHROUGH */
8505 +
8506 +       case AFTER_ITEM:
8507 +               /* Check for end-of-node. */
8508 +               if (coord->item_pos == items - 1)
8509 +                       return 1;
8510 +
8511 +               coord_inc_item_pos(coord);
8512 +               coord->unit_pos = 0;
8513 +               coord->between = AT_UNIT;
8514 +               return 0;
8515 +
8516 +       case BEFORE_ITEM:
8517 +               /* The adjust_items checks ensure that we are valid here. */
8518 +               coord->unit_pos = 0;
8519 +               coord->between = AT_UNIT;
8520 +               return 0;
8521 +
8522 +       case INVALID_COORD:
8523 +       case EMPTY_NODE:
8524 +               /* Handled in coord_adjust_items(). */
8525 +               break;
8526 +       }
8527 +
8528 +       impossible("jmacd-9902", "unreachable");
8529 +       return 0;
8530 +}
8531 +
8532 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
8533 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
8534 +   position is an existing item. */
8535 +int coord_next_item(coord_t *coord)
8536 +{
8537 +       unsigned items = coord_num_items(coord);
8538 +
8539 +       if (coord_adjust_items(coord, items, 1) == 1)
8540 +               return 1;
8541 +
8542 +       switch (coord->between) {
8543 +       case AFTER_UNIT:
8544 +       case AT_UNIT:
8545 +       case BEFORE_UNIT:
8546 +       case AFTER_ITEM:
8547 +               /* Check for end-of-node. */
8548 +               if (coord->item_pos == items - 1) {
8549 +                       coord->between = AFTER_ITEM;
8550 +                       coord->unit_pos = 0;
8551 +                       coord_clear_iplug(coord);
8552 +                       return 1;
8553 +               }
8554 +
8555 +               /* Anywhere in an item, go to the next one. */
8556 +               coord->between = AT_UNIT;
8557 +               coord_inc_item_pos(coord);
8558 +               coord->unit_pos = 0;
8559 +               return 0;
8560 +
8561 +       case BEFORE_ITEM:
8562 +               /* The out-of-range check ensures that we are valid here. */
8563 +               coord->unit_pos = 0;
8564 +               coord->between = AT_UNIT;
8565 +               return 0;
8566 +       case INVALID_COORD:
8567 +       case EMPTY_NODE:
8568 +               /* Handled in coord_adjust_items(). */
8569 +               break;
8570 +       }
8571 +
8572 +       impossible("jmacd-9903", "unreachable");
8573 +       return 0;
8574 +}
8575 +
8576 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
8577 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new
8578 +   position is an existing unit. */
8579 +int coord_prev_unit(coord_t *coord)
8580 +{
8581 +       unsigned items = coord_num_items(coord);
8582 +
8583 +       if (coord_adjust_items(coord, items, 0) == 1)
8584 +               return 1;
8585 +
8586 +       switch (coord->between) {
8587 +       case AT_UNIT:
8588 +       case BEFORE_UNIT:
8589 +               if (coord->unit_pos > 0) {
8590 +                       coord->unit_pos -= 1;
8591 +                       coord->between = AT_UNIT;
8592 +                       return 0;
8593 +               }
8594 +
8595 +               if (coord->item_pos == 0) {
8596 +                       coord->between = BEFORE_ITEM;
8597 +                       return 1;
8598 +               }
8599 +
8600 +               coord_dec_item_pos(coord);
8601 +               coord->unit_pos = coord_last_unit_pos(coord);
8602 +               coord->between = AT_UNIT;
8603 +               return 0;
8604 +
8605 +       case AFTER_UNIT:
8606 +               /* What if unit_pos is out-of-range? */
8607 +               assert("jmacd-5442",
8608 +                      coord->unit_pos <= coord_last_unit_pos(coord));
8609 +               coord->between = AT_UNIT;
8610 +               return 0;
8611 +
8612 +       case BEFORE_ITEM:
8613 +               if (coord->item_pos == 0)
8614 +                       return 1;
8615 +
8616 +               coord_dec_item_pos(coord);
8617 +               /* FALLTHROUGH */
8618 +
8619 +       case AFTER_ITEM:
8620 +               coord->between = AT_UNIT;
8621 +               coord->unit_pos = coord_last_unit_pos(coord);
8622 +               return 0;
8623 +
8624 +       case INVALID_COORD:
8625 +       case EMPTY_NODE:
8626 +               break;
8627 +       }
8628 +
8629 +       impossible("jmacd-9904", "unreachable");
8630 +       return 0;
8631 +}
8632 +
8633 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
8634 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new
8635 +   position is an existing item. */
8636 +int coord_prev_item(coord_t *coord)
8637 +{
8638 +       unsigned items = coord_num_items(coord);
8639 +
8640 +       if (coord_adjust_items(coord, items, 0) == 1)
8641 +               return 1;
8642 +
8643 +       switch (coord->between) {
8644 +       case AT_UNIT:
8645 +       case AFTER_UNIT:
8646 +       case BEFORE_UNIT:
8647 +       case BEFORE_ITEM:
8648 +
8649 +               if (coord->item_pos == 0) {
8650 +                       coord->between = BEFORE_ITEM;
8651 +                       coord->unit_pos = 0;
8652 +                       return 1;
8653 +               }
8654 +
8655 +               coord_dec_item_pos(coord);
8656 +               coord->unit_pos = 0;
8657 +               coord->between = AT_UNIT;
8658 +               return 0;
8659 +
8660 +       case AFTER_ITEM:
8661 +               coord->between = AT_UNIT;
8662 +               coord->unit_pos = 0;
8663 +               return 0;
8664 +
8665 +       case INVALID_COORD:
8666 +       case EMPTY_NODE:
8667 +               break;
8668 +       }
8669 +
8670 +       impossible("jmacd-9905", "unreachable");
8671 +       return 0;
8672 +}
8673 +
8674 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on
8675 +   sideof argument. */
8676 +void coord_init_sideof_unit(coord_t *coord, const znode * node, sideof dir)
8677 +{
8678 +       assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
8679 +       if (dir == LEFT_SIDE) {
8680 +               coord_init_first_unit(coord, node);
8681 +       } else {
8682 +               coord_init_last_unit(coord, node);
8683 +       }
8684 +}
8685 +
8686 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
8687 +   on sideof argument. */
8688 +/* Audited by: green(2002.06.15) */
8689 +int coord_is_after_sideof_unit(coord_t *coord, sideof dir)
8690 +{
8691 +       assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
8692 +       if (dir == LEFT_SIDE) {
8693 +               return coord_is_before_leftmost(coord);
8694 +       } else {
8695 +               return coord_is_after_rightmost(coord);
8696 +       }
8697 +}
8698 +
8699 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument.
8700 + */
8701 +/* Audited by: green(2002.06.15) */
8702 +int coord_sideof_unit(coord_t *coord, sideof dir)
8703 +{
8704 +       assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
8705 +       if (dir == LEFT_SIDE) {
8706 +               return coord_prev_unit(coord);
8707 +       } else {
8708 +               return coord_next_unit(coord);
8709 +       }
8710 +}
8711 +
8712 +#if REISER4_DEBUG
8713 +int coords_equal(const coord_t *c1, const coord_t *c2)
8714 +{
8715 +       assert("nikita-2840", c1 != NULL);
8716 +       assert("nikita-2841", c2 != NULL);
8717 +
8718 +       return
8719 +           c1->node == c2->node &&
8720 +           c1->item_pos == c2->item_pos &&
8721 +           c1->unit_pos == c2->unit_pos && c1->between == c2->between;
8722 +}
8723 +#endif  /*  REISER4_DEBUG  */
8724 +
8725 +/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, if
8726 +   coord_is_after_rightmost return COORD_ON_THE_RIGHT, otherwise return
8727 +   COORD_INSIDE. */
8728 +/* Audited by: green(2002.06.15) */
8729 +coord_wrt_node coord_wrt(const coord_t *coord)
8730 +{
8731 +       if (coord_is_before_leftmost(coord))
8732 +               return COORD_ON_THE_LEFT;
8733 +
8734 +       if (coord_is_after_rightmost(coord))
8735 +               return COORD_ON_THE_RIGHT;
8736 +
8737 +       return COORD_INSIDE;
8738 +}
8739 +
8740 +/* Returns true if the coordinate is positioned after the last item or after the
8741 +   last unit of the last item or it is an empty node. */
8742 +/* Audited by: green(2002.06.15) */
8743 +int coord_is_after_rightmost(const coord_t *coord)
8744 +{
8745 +       assert("jmacd-7313", coord_check(coord));
8746 +
8747 +       switch (coord->between) {
8748 +       case INVALID_COORD:
8749 +       case AT_UNIT:
8750 +       case BEFORE_UNIT:
8751 +       case BEFORE_ITEM:
8752 +               return 0;
8753 +
8754 +       case EMPTY_NODE:
8755 +               return 1;
8756 +
8757 +       case AFTER_ITEM:
8758 +               return (coord->item_pos == node_num_items(coord->node) - 1);
8759 +
8760 +       case AFTER_UNIT:
8761 +               return ((coord->item_pos == node_num_items(coord->node) - 1) &&
8762 +                       coord->unit_pos == coord_last_unit_pos(coord));
8763 +       }
8764 +
8765 +       impossible("jmacd-9908", "unreachable");
8766 +       return 0;
8767 +}
8768 +
8769 +/* Returns true if the coordinate is positioned before the first item or it is
8770 +   an empty node. */
8771 +int coord_is_before_leftmost(const coord_t *coord)
8772 +{
8773 +       /* FIXME-VS: coord_check requires node to be loaded whereas it is not
8774 +          necessary to check if coord is set before leftmost
8775 +          assert ("jmacd-7313", coord_check (coord)); */
8776 +       switch (coord->between) {
8777 +       case INVALID_COORD:
8778 +       case AT_UNIT:
8779 +       case AFTER_ITEM:
8780 +       case AFTER_UNIT:
8781 +               return 0;
8782 +
8783 +       case EMPTY_NODE:
8784 +               return 1;
8785 +
8786 +       case BEFORE_ITEM:
8787 +       case BEFORE_UNIT:
8788 +               return (coord->item_pos == 0) && (coord->unit_pos == 0);
8789 +       }
8790 +
8791 +       impossible("jmacd-9908", "unreachable");
8792 +       return 0;
8793 +}
8794 +
8795 +/* Returns true if the coordinate is positioned after an item, before an item,
8796 +   after the last unit of an item, before the first unit of an item, or at an
8797 +   empty node. */
8798 +/* Audited by: green(2002.06.15) */
8799 +int coord_is_between_items(const coord_t *coord)
8800 +{
8801 +       assert("jmacd-7313", coord_check(coord));
8802 +
8803 +       switch (coord->between) {
8804 +       case INVALID_COORD:
8805 +       case AT_UNIT:
8806 +               return 0;
8807 +
8808 +       case AFTER_ITEM:
8809 +       case BEFORE_ITEM:
8810 +       case EMPTY_NODE:
8811 +               return 1;
8812 +
8813 +       case BEFORE_UNIT:
8814 +               return coord->unit_pos == 0;
8815 +
8816 +       case AFTER_UNIT:
8817 +               return coord->unit_pos == coord_last_unit_pos(coord);
8818 +       }
8819 +
8820 +       impossible("jmacd-9908", "unreachable");
8821 +       return 0;
8822 +}
8823 +
8824 +#if REISER4_DEBUG
8825 +/* Returns true if the coordinates are positioned at adjacent units, regardless
8826 +   of before-after or item boundaries. */
8827 +int coord_are_neighbors(coord_t *c1, coord_t *c2)
8828 +{
8829 +       coord_t *left;
8830 +       coord_t *right;
8831 +
8832 +       assert("nikita-1241", c1 != NULL);
8833 +       assert("nikita-1242", c2 != NULL);
8834 +       assert("nikita-1243", c1->node == c2->node);
8835 +       assert("nikita-1244", coord_is_existing_unit(c1));
8836 +       assert("nikita-1245", coord_is_existing_unit(c2));
8837 +
8838 +       left = right = NULL;
8839 +       switch (coord_compare(c1, c2)) {
8840 +       case COORD_CMP_ON_LEFT:
8841 +               left = c1;
8842 +               right = c2;
8843 +               break;
8844 +       case COORD_CMP_ON_RIGHT:
8845 +               left = c2;
8846 +               right = c1;
8847 +               break;
8848 +       case COORD_CMP_SAME:
8849 +               return 0;
8850 +       default:
8851 +               wrong_return_value("nikita-1246", "compare_coords()");
8852 +       }
8853 +       assert("vs-731", left && right);
8854 +       if (left->item_pos == right->item_pos) {
8855 +               return left->unit_pos + 1 == right->unit_pos;
8856 +       } else if (left->item_pos + 1 == right->item_pos) {
8857 +               return (left->unit_pos == coord_last_unit_pos(left))
8858 +                   && (right->unit_pos == 0);
8859 +       } else {
8860 +               return 0;
8861 +       }
8862 +}
8863 +#endif  /*  REISER4_DEBUG  */
8864 +
8865 +/* Assuming two coordinates are positioned in the same node, return
8866 +   COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
8867 +   position relative to c2.  */
8868 +/* Audited by: green(2002.06.15) */
8869 +coord_cmp coord_compare(coord_t *c1, coord_t *c2)
8870 +{
8871 +       assert("vs-209", c1->node == c2->node);
8872 +       assert("vs-194", coord_is_existing_unit(c1)
8873 +              && coord_is_existing_unit(c2));
8874 +
8875 +       if (c1->item_pos > c2->item_pos)
8876 +               return COORD_CMP_ON_RIGHT;
8877 +       if (c1->item_pos < c2->item_pos)
8878 +               return COORD_CMP_ON_LEFT;
8879 +       if (c1->unit_pos > c2->unit_pos)
8880 +               return COORD_CMP_ON_RIGHT;
8881 +       if (c1->unit_pos < c2->unit_pos)
8882 +               return COORD_CMP_ON_LEFT;
8883 +       return COORD_CMP_SAME;
8884 +}
8885 +
8886 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on
8887 +   success and non-zero if there is no position to the right. */
8888 +int coord_set_to_right(coord_t *coord)
8889 +{
8890 +       unsigned items = coord_num_items(coord);
8891 +
8892 +       if (coord_adjust_items(coord, items, 1) == 1)
8893 +               return 1;
8894 +
8895 +       switch (coord->between) {
8896 +       case AT_UNIT:
8897 +               return 0;
8898 +
8899 +       case BEFORE_ITEM:
8900 +       case BEFORE_UNIT:
8901 +               coord->between = AT_UNIT;
8902 +               return 0;
8903 +
8904 +       case AFTER_UNIT:
8905 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
8906 +                       coord->unit_pos += 1;
8907 +                       coord->between = AT_UNIT;
8908 +                       return 0;
8909 +               } else {
8910 +
8911 +                       coord->unit_pos = 0;
8912 +
8913 +                       if (coord->item_pos == items - 1) {
8914 +                               coord->between = AFTER_ITEM;
8915 +                               return 1;
8916 +                       }
8917 +
8918 +                       coord_inc_item_pos(coord);
8919 +                       coord->between = AT_UNIT;
8920 +                       return 0;
8921 +               }
8922 +
8923 +       case AFTER_ITEM:
8924 +               if (coord->item_pos == items - 1)
8925 +                       return 1;
8926 +
8927 +               coord_inc_item_pos(coord);
8928 +               coord->unit_pos = 0;
8929 +               coord->between = AT_UNIT;
8930 +               return 0;
8931 +
8932 +       case EMPTY_NODE:
8933 +               return 1;
8934 +
8935 +       case INVALID_COORD:
8936 +               break;
8937 +       }
8938 +
8939 +       impossible("jmacd-9920", "unreachable");
8940 +       return 0;
8941 +}
8942 +
8943 +/* If the coordinate is between items or units, shifts it to the unit on the
8944 +   left.  Returns 0 on success, non-zero if there is no position to the left. */
8945 +int coord_set_to_left(coord_t *coord)
8946 +{
8947 +       unsigned items = coord_num_items(coord);
8948 +
8949 +       if (coord_adjust_items(coord, items, 0) == 1)
8950 +               return 1;
8951 +
8952 +       switch (coord->between) {
8953 +       case AT_UNIT:
8954 +               return 0;
8955 +
8956 +       case AFTER_UNIT:
8957 +               coord->between = AT_UNIT;
8958 +               return 0;
8959 +
8960 +       case AFTER_ITEM:
8961 +               coord->between = AT_UNIT;
8962 +               coord->unit_pos = coord_last_unit_pos(coord);
8963 +               return 0;
8964 +
8965 +       case BEFORE_UNIT:
8966 +               if (coord->unit_pos > 0) {
8967 +                       coord->unit_pos -= 1;
8968 +                       coord->between = AT_UNIT;
8969 +                       return 0;
8970 +               } else {
8971 +                       /* before the first unit: nothing left in this item */
8972 +                       if (coord->item_pos == 0) {
8973 +                               coord->between = BEFORE_ITEM;
8974 +                               return 1;
8975 +                       }
8976 +                       /* move first: we want the PREVIOUS item's last unit */
8977 +                       coord_dec_item_pos(coord);
8978 +                       coord->unit_pos = coord_last_unit_pos(coord);
8979 +                       coord->between = AT_UNIT;
8980 +                       return 0;
8981 +               }
8982 +
8983 +       case BEFORE_ITEM:
8984 +               if (coord->item_pos == 0)
8985 +                       return 1;
8986 +
8987 +               coord_dec_item_pos(coord);
8988 +               coord->unit_pos = coord_last_unit_pos(coord);
8989 +               coord->between = AT_UNIT;
8990 +               return 0;
8991 +
8992 +       case EMPTY_NODE:
8993 +               return 1;
8994 +
8995 +       case INVALID_COORD:
8996 +               break;
8997 +       }
8998 +
8999 +       impossible("jmacd-9920", "unreachable");
9000 +       return 0;
9001 +}
9002 +
9003 +static const char *coord_tween_tostring(between_enum n)
9004 +{
9005 +       switch (n) {
9006 +       case BEFORE_UNIT:
9007 +               return "before unit";
9008 +       case BEFORE_ITEM:
9009 +               return "before item";
9010 +       case AT_UNIT:
9011 +               return "at unit";
9012 +       case AFTER_UNIT:
9013 +               return "after unit";
9014 +       case AFTER_ITEM:
9015 +               return "after item";
9016 +       case EMPTY_NODE:
9017 +               return "empty node";
9018 +       case INVALID_COORD:
9019 +               return "invalid";
9020 +       default:
9021 +       {
9022 +               static char buf[30];
9023 +
9024 +               snprintf(buf, sizeof(buf), "unknown: %i", n);
9025 +               return buf;
9026 +       }
9027 +       }
9028 +}
9029 +
9030 +void print_coord(const char *mes, const coord_t *coord, int node)
9031 +{
9032 +       if (coord == NULL) {
9033 +               printk("%s: null\n", mes);
9034 +               return;
9035 +       }
9036 +       printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
9037 +              mes, coord->item_pos, coord->unit_pos,
9038 +              coord_tween_tostring(coord->between), coord->iplugid);
9039 +}
9040 +
9041 +int
9042 +item_utmost_child_real_block(const coord_t *coord, sideof side,
9043 +                            reiser4_block_nr * blk)
9044 +{
9045 +       return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
9046 +                                                                     side,
9047 +                                                                     blk);
9048 +}
9049 +
9050 +int item_utmost_child(const coord_t *coord, sideof side, jnode ** child)
9051 +{
9052 +       return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
9053 +}
9054 +
9055 +/* @count bytes of flow @f got written, update correspondingly f->length,
9056 +   f->data and f->key */
9057 +void move_flow_forward(flow_t *f, unsigned count)
9058 +{
9059 +       if (f->data)
9060 +               f->data += count;
9061 +       f->length -= count;
9062 +       set_key_offset(&f->key, get_key_offset(&f->key) + count);
9063 +}
9064 +
9065 +/*
9066 +   Local variables:
9067 +   c-indentation-style: "K&R"
9068 +   mode-name: "LC"
9069 +   c-basic-offset: 8
9070 +   tab-width: 8
9071 +   fill-column: 120
9072 +   scroll-step: 1
9073 +   End:
9074 +*/
9075 diff -puN /dev/null fs/reiser4/coord.h
9076 --- /dev/null
9077 +++ a/fs/reiser4/coord.h
9078 @@ -0,0 +1,399 @@
9079 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9080 +   reiser4/README */
9081 +
9082 +/* Coords */
9083 +
9084 +#if !defined(__REISER4_COORD_H__)
9085 +#define __REISER4_COORD_H__
9086 +
9087 +#include "forward.h"
9088 +#include "debug.h"
9089 +#include "dformat.h"
9090 +#include "key.h"
9091 +
9092 +/* insertions happen between coords in the tree, so we need some means
9093 +   of specifying the sense of betweenness. */
9094 +typedef enum {
9095 +       BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
9096 +       AT_UNIT,
9097 +       AFTER_UNIT,
9098 +       BEFORE_ITEM,
9099 +       AFTER_ITEM,
9100 +       INVALID_COORD,
9101 +       EMPTY_NODE,
9102 +} between_enum;
9103 +
9104 +/* location of coord w.r.t. its node */
9105 +typedef enum {
9106 +       COORD_ON_THE_LEFT = -1,
9107 +       COORD_ON_THE_RIGHT = +1,
9108 +       COORD_INSIDE = 0
9109 +} coord_wrt_node;
9110 +
9111 +typedef enum {
9112 +       COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
9113 +} coord_cmp;
9114 +
9115 +struct coord {
9116 +       /* node in a tree */
9117 +       /*  0 */ znode *node;
9118 +
9119 +       /* position of item within node */
9120 +       /*  4 */ pos_in_node_t item_pos;
9121 +       /* position of unit within item */
9122 +       /*  6 */ pos_in_node_t unit_pos;
9123 +       /* optimization: plugin of item is stored in coord_t. Until this was
9124 +          implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
9125 +          is invalidated (set to 0xff) on each modification of ->item_pos,
9126 +          and all such modifications are funneled through coord_*_item_pos()
9127 +          functions below.
9128 +        */
9129 +       /*  8 */ char iplugid;
9130 +       /* position of coord w.r.t. neighboring items and/or units.
9131 +          Values are taken from &between_enum above.
9132 +        */
9133 +       /*  9 */ char between;
9134 +       /* padding. It will be added by the compiler anyway to conform to the
9135 +        * C language alignment requirements. We keep it here to be on the
9136 +        * safe side and to have a clear picture of the memory layout of this
9137 +        * structure. */
9138 +       /* 10 */ __u16 pad;
9139 +       /* 12 */ int offset;
9140 +#if REISER4_DEBUG
9141 +       unsigned long plug_v;
9142 +       unsigned long body_v;
9143 +#endif
9144 +};
9145 +
9146 +#define INVALID_PLUGID  ((char)((1 << 8) - 1))
9147 +#define INVALID_OFFSET -1
9148 +
9149 +static inline void coord_clear_iplug(coord_t *coord)
9150 +{
9151 +       assert("nikita-2835", coord != NULL);
9152 +       coord->iplugid = INVALID_PLUGID;
9153 +       coord->offset = INVALID_OFFSET;
9154 +}
9155 +
9156 +static inline int coord_is_iplug_set(const coord_t *coord)
9157 +{
9158 +       assert("nikita-2836", coord != NULL);
9159 +       return coord->iplugid != INVALID_PLUGID;
9160 +}
9161 +
9162 +static inline void coord_set_item_pos(coord_t *coord, pos_in_node_t pos)
9163 +{
9164 +       assert("nikita-2478", coord != NULL);
9165 +       coord->item_pos = pos;
9166 +       coord_clear_iplug(coord);
9167 +}
9168 +
9169 +static inline void coord_dec_item_pos(coord_t *coord)
9170 +{
9171 +       assert("nikita-2480", coord != NULL);
9172 +       --coord->item_pos;
9173 +       coord_clear_iplug(coord);
9174 +}
9175 +
9176 +static inline void coord_inc_item_pos(coord_t *coord)
9177 +{
9178 +       assert("nikita-2481", coord != NULL);
9179 +       ++coord->item_pos;
9180 +       coord_clear_iplug(coord);
9181 +}
9182 +
9183 +static inline void coord_add_item_pos(coord_t *coord, int delta)
9184 +{
9185 +       assert("nikita-2482", coord != NULL);
9186 +       coord->item_pos += delta;
9187 +       coord_clear_iplug(coord);
9188 +}
9189 +
9190 +static inline void coord_invalid_item_pos(coord_t *coord)
9191 +{
9192 +       assert("nikita-2832", coord != NULL);
9193 +       coord->item_pos = (unsigned short)~0;
9194 +       coord_clear_iplug(coord);
9195 +}
9196 +
9197 +/* Reverse a direction. */
9198 +static inline sideof sideof_reverse(sideof side)
9199 +{
9200 +       return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
9201 +}
9202 +
9203 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
9204 +
9205 +   "first" and "last"
9206 +   "next" and "prev"
9207 +   "before" and "after"
9208 +   "leftmost" and "rightmost"
9209 +
9210 +   But I think the chosen names are decent the way they are.
9211 +*/
9212 +
9213 +/* COORD INITIALIZERS */
9214 +
9215 +/* Initialize an invalid coordinate. */
9216 +extern void coord_init_invalid(coord_t *coord, const znode * node);
9217 +
9218 +extern void coord_init_first_unit_nocheck(coord_t *coord, const znode * node);
9219 +
9220 +/* Initialize a coordinate to point at the first unit of the first item. If the
9221 +   node is empty, it is positioned at the EMPTY_NODE. */
9222 +extern void coord_init_first_unit(coord_t *coord, const znode * node);
9223 +
9224 +/* Initialize a coordinate to point at the last unit of the last item. If the
9225 +   node is empty, it is positioned at the EMPTY_NODE. */
9226 +extern void coord_init_last_unit(coord_t *coord, const znode * node);
9227 +
9228 +/* Initialize a coordinate to before the first item. If the node is empty, it is
9229 +   positioned at the EMPTY_NODE. */
9230 +extern void coord_init_before_first_item(coord_t *coord, const znode * node);
9231 +
9232 +/* Initialize a coordinate to after the last item. If the node is empty, it is
9233 +   positioned at the EMPTY_NODE. */
9234 +extern void coord_init_after_last_item(coord_t *coord, const znode * node);
9235 +
9236 +/* Initialize a coordinate to after last unit in the item. Coord must be set
9237 +   already to existing item */
9238 +void coord_init_after_item_end(coord_t *coord);
9239 +
9240 +/* Initialize a coordinate to before the item. Coord must be set already to
9241 +   existing item */
9242 +void coord_init_before_item(coord_t *);
9243 +/* Initialize a coordinate to after the item. Coord must be set already to
9244 +   existing item */
9245 +void coord_init_after_item(coord_t *);
9246 +
9247 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on
9248 +   sideof argument. */
9249 +extern void coord_init_sideof_unit(coord_t *coord, const znode * node,
9250 +                                  sideof dir);
9251 +
9252 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
9253 +   it was not clear how the coord should actually be initialized.
9254 +   FIXME-VS: added by vs (2002, june, 8) */
9255 +extern void coord_init_zero(coord_t *coord);
9256 +
9257 +/* COORD METHODS */
9258 +
9259 +/* after shifting of node content, coord previously set properly may become
9260 +   invalid, try to "normalize" it. */
9261 +void coord_normalize(coord_t *coord);
9262 +
9263 +/* Copy a coordinate. */
9264 +extern void coord_dup(coord_t *coord, const coord_t *old_coord);
9265 +
9266 +/* Copy a coordinate without check. */
9267 +void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord);
9268 +
9269 +unsigned coord_num_units(const coord_t *coord);
9270 +
9271 +/* Return the last valid unit number at the present item (i.e.,
9272 +   coord_num_units() - 1). */
9273 +static inline unsigned coord_last_unit_pos(const coord_t *coord)
9274 +{
9275 +       return coord_num_units(coord) - 1;
9276 +}
9277 +
9278 +#if REISER4_DEBUG
9279 +/* For assertions only, checks for a valid coordinate. */
9280 +extern int coord_check(const coord_t *coord);
9281 +
9282 +extern unsigned long znode_times_locked(const znode * z);
9283 +
9284 +static inline void coord_update_v(coord_t *coord)
9285 +{
9286 +       coord->plug_v = coord->body_v = znode_times_locked(coord->node);
9287 +}
9288 +#endif
9289 +
9290 +extern int coords_equal(const coord_t *c1, const coord_t *c2);
9291 +
9292 +extern void print_coord(const char *mes, const coord_t *coord, int print_node);
9293 +
9294 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if
9295 +   coord_is_before_leftmost return COORD_ON_THE_LEFT, otherwise return
9296 +   COORD_INSIDE. */
9297 +extern coord_wrt_node coord_wrt(const coord_t *coord);
9298 +
9299 +/* Returns true if the coordinates are positioned at adjacent units, regardless
9300 +   of before-after or item boundaries. */
9301 +extern int coord_are_neighbors(coord_t *c1, coord_t *c2);
9302 +
9303 +/* Assuming two coordinates are positioned in the same node, return
9304 +   COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
9305 +   position relative to c2.  */
9306 +extern coord_cmp coord_compare(coord_t *c1, coord_t *c2);
9307 +
9308 +/* COORD PREDICATES */
9309 +
9310 +/* Returns true if the coord was initialized by coord_init_invalid(). */
9311 +extern int coord_is_invalid(const coord_t *coord);
9312 +
9313 +/* Returns true if the coordinate is positioned at an existing item, not before
9314 +   or after an item. It may be placed at, before, or after any unit within the
9315 +   item, whether existing or not. If this is true you can call methods of the
9316 +   item plugin.  */
9317 +extern int coord_is_existing_item(const coord_t *coord);
9318 +
9319 +/* Returns true if the coordinate is positioned after an item, before an item,
9320 +   after the last unit of an item, before the first unit of an item, or at an
9321 +   empty node. */
9322 +extern int coord_is_between_items(const coord_t *coord);
9323 +
9324 +/* Returns true if the coordinate is positioned at an existing unit, not before
9325 +   or after a unit. */
9326 +extern int coord_is_existing_unit(const coord_t *coord);
9327 +
9328 +/* Returns true if the coordinate is positioned at an empty node. */
9329 +extern int coord_is_empty(const coord_t *coord);
9330 +
9331 +/* Returns true if the coordinate is positioned at the first unit of the first
9332 +   item. Not true for empty nodes nor coordinates positioned before the first
9333 +   item. */
9334 +extern int coord_is_leftmost_unit(const coord_t *coord);
9335 +
9336 +/* Returns true if the coordinate is positioned after the last item or after the
9337 +   last unit of the last item or it is an empty node. */
9338 +extern int coord_is_after_rightmost(const coord_t *coord);
9339 +
9340 +/* Returns true if the coordinate is positioned before the first item or it is
9341 +    an empty node. */
9342 +extern int coord_is_before_leftmost(const coord_t *coord);
9343 +
9344 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
9345 +   on sideof argument. */
9346 +extern int coord_is_after_sideof_unit(coord_t *coord, sideof dir);
9347 +
9348 +/* COORD MODIFIERS */
9349 +
9350 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
9351 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
9352 +   position is an existing unit. */
9353 +extern int coord_next_unit(coord_t *coord);
9354 +
9355 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
9356 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
9357 +   position is an existing item. */
9358 +extern int coord_next_item(coord_t *coord);
9359 +
9360 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
9361 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new
9362 +   position is an existing unit. */
9363 +extern int coord_prev_unit(coord_t *coord);
9364 +
9365 +/* Advances the coordinate by one item to the left.  If empty, no change. If
9366 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
9367 +   position is an existing item. */
9368 +extern int coord_prev_item(coord_t *coord);
9369 +
9370 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on
9371 +   success and non-zero if there is no position to the right. */
9372 +extern int coord_set_to_right(coord_t *coord);
9373 +
9374 +/* If the coordinate is between items, shifts it to the left.  Returns 0 on
9375 +   success and non-zero if there is no position to the left. */
9376 +extern int coord_set_to_left(coord_t *coord);
9377 +
9378 +/* If the coordinate is at an existing unit, set to after that unit.  Returns 0
9379 +   on success and non-zero if the unit did not exist. */
9380 +extern int coord_set_after_unit(coord_t *coord);
9381 +
9382 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof
9383 +   argument. */
9384 +extern int coord_sideof_unit(coord_t *coord, sideof dir);
9385 +
9386 +/* iterate over all units in @node */
9387 +#define for_all_units(coord, node)                                     \
9388 +       for (coord_init_before_first_item((coord), (node)) ;            \
9389 +            coord_next_unit(coord) == 0 ;)
9390 +
9391 +/* iterate over all items in @node */
9392 +#define for_all_items(coord, node)                                     \
9393 +       for (coord_init_before_first_item((coord), (node)) ;            \
9394 +            coord_next_item(coord) == 0 ;)
9395 +
9396 +/* COORD/ITEM METHODS */
9397 +
9398 +extern int item_utmost_child_real_block(const coord_t *coord, sideof side,
9399 +                                       reiser4_block_nr * blk);
9400 +extern int item_utmost_child(const coord_t *coord, sideof side,
9401 +                            jnode ** child);
9402 +
9403 +/* a flow is a sequence of bytes being written to or read from the tree.  The
9404 +   tree will slice the flow into items while storing it into nodes, but all of
9405 +   that is hidden from anything outside the tree.  */
9406 +
9407 +struct flow {
9408 +       reiser4_key key;        /* key of start of flow's sequence of bytes */
9409 +       loff_t length;          /* length of flow's sequence of bytes */
9410 +       char *data;             /* start of flow's sequence of bytes */
9411 +       int user;               /* if 1 data is user space, 0 - kernel space */
9412 +       rw_op op;               /* NIKITA-FIXME-HANS: comment is where?  */
9413 +};
9414 +
9415 +void move_flow_forward(flow_t *f, unsigned count);
9416 +
9417 +/* &reiser4_item_data - description of data to be inserted or pasted
9418 +
9419 +   Q: articulate the reasons for the difference between this and flow.
9420 +
9421 +   A: Besides flow we insert into tree other things: stat data, directory
9422 +   entry, etc.  To insert them into tree one has to provide this structure. If
9423 +   one is going to insert flow - he can use insert_flow, where this structure
9424 +   does not have to be created
9425 +*/
9426 +struct reiser4_item_data {
9427 +       /* actual data to be inserted. If NULL, ->create_item() will not
9428 +          do xmemcpy itself, leaving this up to the caller. This can
9429 +          save some amount of unnecessary memory copying, for example,
9430 +          during insertion of stat data.
9431 +
9432 +        */
9433 +       char *data;
9434 +       /* 1 if 'char * data' contains pointer to user space and 0 if it is
9435 +          kernel space */
9436 +       int user;
9437 +       /* amount of data we are going to insert or paste */
9438 +       int length;
9439 +       /* "Arg" is opaque data that is passed down to the
9440 +          ->create_item() method of node layout, which in turn
9441 +          hands it to the ->create_hook() of item being created. This
9442 +          arg is currently used by:
9443 +
9444 +          .  ->create_hook() of internal item
9445 +          (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
9446 +          . ->paste() method of directory item.
9447 +          . ->create_hook() of extent item
9448 +
9449 +          For internal item, this is left "brother" of new node being
9450 +          inserted and it is used to add new node into sibling list
9451 +          after parent to it was just inserted into parent.
9452 +
9453 +          While ->arg does look like a somewhat unnecessary complication,
9454 +          it actually saves a lot of headache in many places, because
9455 +          all data necessary to insert or paste new data into tree are
9456 +          collected in one place, and this eliminates a lot of extra
9457 +          argument passing and storing everywhere.
9458 +
9459 +        */
9460 +       void *arg;
9461 +       /* plugin of item we are inserting */
9462 +       item_plugin *iplug;
9463 +};
9464 +
9465 +/* __REISER4_COORD_H__ */
9466 +#endif
9467 +
9468 +/* Make Linus happy.
9469 +   Local variables:
9470 +   c-indentation-style: "K&R"
9471 +   mode-name: "LC"
9472 +   c-basic-offset: 8
9473 +   tab-width: 8
9474 +   fill-column: 120
9475 +   scroll-step: 1
9476 +   End:
9477 +*/
9478 diff -puN /dev/null fs/reiser4/debug.c
9479 --- /dev/null
9480 +++ a/fs/reiser4/debug.c
9481 @@ -0,0 +1,308 @@
9482 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9483 + * reiser4/README */
9484 +
9485 +/* Debugging facilities. */
9486 +
9487 +/*
9488 + * This file contains generic debugging functions used by reiser4. Roughly
9489 + * following:
9490 + *
9491 + *     panicking: reiser4_do_panic(), reiser4_print_prefix().
9492 + *
9493 + *     locking:
9494 + *     reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
9495 + *     reiser4_no_counters_are_held(), reiser4_commit_check_locks()
9496 + *
9497 + *     error code monitoring (see comment before RETERR macro):
9498 + *     reiser4_return_err(), reiser4_report_err().
9499 + *
9500 + *     stack back-tracing: fill_backtrace()
9501 + *
9502 + *     miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
9503 + *     reiser4_debugtrap().
9504 + *
9505 + */
9506 +
9507 +#include "reiser4.h"
9508 +#include "context.h"
9509 +#include "super.h"
9510 +#include "txnmgr.h"
9511 +#include "znode.h"
9512 +
9513 +#include <linux/sysfs.h>
9514 +#include <linux/slab.h>
9515 +#include <linux/types.h>
9516 +#include <linux/fs.h>
9517 +#include <linux/spinlock.h>
9518 +#include <linux/kallsyms.h>
9519 +#include <linux/vmalloc.h>
9520 +#include <linux/ctype.h>
9521 +#include <linux/sysctl.h>
9522 +#include <linux/hardirq.h>
9523 +
9524 +#if 0
9525 +#if REISER4_DEBUG
9526 +static void reiser4_report_err(void);
9527 +#else
9528 +#define reiser4_report_err() noop
9529 +#endif
9530 +#endif  /*  0  */
9531 +
9532 +/*
9533 + * global buffer where message given to reiser4_panic is formatted.
9534 + */
9535 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
9536 +
9537 +/*
9538 + * lock protecting consistency of panic_buf under concurrent panics
9539 + */
9540 +static DEFINE_SPINLOCK(panic_guard);
9541 +
9542 +/* Your best friend. Call it on each occasion.  This is called by
9543 +    fs/reiser4/debug.h:reiser4_panic(). */
9544 +void reiser4_do_panic(const char *format/* format string */ , ... /* rest */)
9545 +{
9546 +       static int in_panic = 0;
9547 +       va_list args;
9548 +
9549 +       /*
9550 +        * check for recursive panic.
9551 +        */
9552 +       if (in_panic == 0) {
9553 +               in_panic = 1;
9554 +
9555 +               spin_lock(&panic_guard);
9556 +               va_start(args, format);
9557 +               vsnprintf(panic_buf, sizeof(panic_buf), format, args);
9558 +               va_end(args);
9559 +               printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
9560 +               spin_unlock(&panic_guard);
9561 +
9562 +               /*
9563 +                * if kernel debugger is configured---drop in. Early dropping
9564 +                * into kgdb is not always convenient, because panic message
9565 +                * is not yet printed most of the times. But:
9566 +                *
9567 +                *     (1) message can be extracted from printk_buf[]
9568 +                *     (declared static inside of printk()), and
9569 +                *
9570 +                *     (2) sometimes serial/kgdb combo dies while printing
9571 +                *     long panic message, so it's more prudent to break into
9572 +                *     debugger earlier.
9573 +                *
9574 +                */
9575 +               DEBUGON(1);
9576 +       }
9577 +       /* to make gcc happy about noreturn attribute */
9578 +       panic("%s", panic_buf);
9579 +}
9580 +
9581 +#if 0
9582 +void
9583 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
9584 +                    const char *function, const char *file, int lineno)
9585 +{
9586 +       const char *comm;
9587 +       int pid;
9588 +
9589 +       if (unlikely(in_interrupt() || in_irq())) {
9590 +               comm = "interrupt";
9591 +               pid = 0;
9592 +       } else {
9593 +               comm = current->comm;
9594 +               pid = current->pid;
9595 +       }
9596 +       printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
9597 +              level, comm, pid, function, file, lineno, mid);
9598 +       if (reperr)
9599 +               reiser4_report_err();
9600 +}
9601 +#endif  /*  0  */
9602 +
9603 +/* Preemption point: this should be called periodically during long running
9604 +   operations (carry, allocate, and squeeze are best examples) */
9605 +int reiser4_preempt_point(void)
9606 +{
9607 +       assert("nikita-3008", reiser4_schedulable());
9608 +       cond_resched();
9609 +       return signal_pending(current);
9610 +}
9611 +
9612 +#if REISER4_DEBUG
9613 +/* Debugging aid: return struct where information about locks taken by current
9614 +   thread is accumulated. This can be used to formulate lock ordering
9615 +   constraints and various assertions.
9616 +
9617 +*/
9618 +reiser4_lock_cnt_info *reiser4_lock_counters(void)
9619 +{
9620 +       reiser4_context *ctx = get_current_context();
9621 +       assert("jmacd-1123", ctx != NULL);
9622 +       return &ctx->locks;
9623 +}
9624 +
9625 +/*
9626 + * print human readable information about locks held by the reiser4 context.
9627 + */
9628 +static void print_lock_counters(const char *prefix,
9629 +                               const reiser4_lock_cnt_info * info)
9630 +{
9631 +       printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
9632 +              "jload: %i, "
9633 +              "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
9634 +              "ktxnmgrd: %i, fq: %i\n"
9635 +              "inode: %i, "
9636 +              "cbk_cache: %i (r:%i,w:%i), "
9637 +              "eflush: %i, "
9638 +              "zlock: %i,\n"
9639 +              "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
9640 +              "d: %i, x: %i, t: %i\n", prefix,
9641 +              info->spin_locked_jnode,
9642 +              info->rw_locked_tree, info->read_locked_tree,
9643 +              info->write_locked_tree,
9644 +              info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
9645 +              info->spin_locked_jload,
9646 +              info->spin_locked_txnh,
9647 +              info->spin_locked_atom, info->spin_locked_stack,
9648 +              info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
9649 +              info->spin_locked_fq,
9650 +              info->spin_locked_inode,
9651 +              info->rw_locked_cbk_cache,
9652 +              info->read_locked_cbk_cache,
9653 +              info->write_locked_cbk_cache,
9654 +              info->spin_locked_super_eflush,
9655 +              info->spin_locked_zlock,
9656 +              info->spin_locked,
9657 +              info->long_term_locked_znode,
9658 +              info->inode_sem_r, info->inode_sem_w,
9659 +              info->d_refs, info->x_refs, info->t_refs);
9660 +}
9661 +
9662 +/* check that no spinlocks are held */
9663 +int reiser4_schedulable(void)
9664 +{
9665 +       if (get_current_context_check() != NULL) {
9666 +               if (!LOCK_CNT_NIL(spin_locked)) {
9667 +                       print_lock_counters("in atomic", reiser4_lock_counters());
9668 +                       return 0;
9669 +               }
9670 +       }
9671 +       might_sleep();
9672 +       return 1;
9673 +}
9674 +/*
9675 + * return true, iff no locks are held.
9676 + */
9677 +int reiser4_no_counters_are_held(void)
9678 +{
9679 +       reiser4_lock_cnt_info *counters;
9680 +
9681 +       counters = reiser4_lock_counters();
9682 +       return
9683 +           (counters->spin_locked_zlock == 0) &&
9684 +           (counters->spin_locked_jnode == 0) &&
9685 +           (counters->rw_locked_tree == 0) &&
9686 +           (counters->read_locked_tree == 0) &&
9687 +           (counters->write_locked_tree == 0) &&
9688 +           (counters->rw_locked_dk == 0) &&
9689 +           (counters->read_locked_dk == 0) &&
9690 +           (counters->write_locked_dk == 0) &&
9691 +           (counters->spin_locked_txnh == 0) &&
9692 +           (counters->spin_locked_atom == 0) &&
9693 +           (counters->spin_locked_stack == 0) &&
9694 +           (counters->spin_locked_txnmgr == 0) &&
9695 +           (counters->spin_locked_inode == 0) &&
9696 +           (counters->spin_locked == 0) &&
9697 +           (counters->long_term_locked_znode == 0) &&
9698 +           (counters->inode_sem_r == 0) &&
9699 +           (counters->inode_sem_w == 0) && (counters->d_refs == 0);
9700 +}
9701 +
9702 +/*
9703 + * return true, iff transaction commit can be done under locks held by the
9704 + * current thread.
9705 + */
9706 +int reiser4_commit_check_locks(void)
9707 +{
9708 +       reiser4_lock_cnt_info *counters;
9709 +       int inode_sem_r;
9710 +       int inode_sem_w;
9711 +       int result;
9712 +
9713 +       /*
9714 +        * inode's read/write semaphore is the only reiser4 lock that can be
9715 +        * held during commit.
9716 +        */
9717 +
9718 +       counters = reiser4_lock_counters();
9719 +       inode_sem_r = counters->inode_sem_r;
9720 +       inode_sem_w = counters->inode_sem_w;
9721 +
9722 +       counters->inode_sem_r = counters->inode_sem_w = 0;
9723 +       result = reiser4_no_counters_are_held();
9724 +       counters->inode_sem_r = inode_sem_r;
9725 +       counters->inode_sem_w = inode_sem_w;
9726 +       return result;
9727 +}
9728 +
9729 +/*
9730 + * fill "error site" in the current reiser4 context. See comment before RETERR
9731 + * macro for more details.
9732 + */
9733 +void reiser4_return_err(int code, const char *file, int line)
9734 +{
9735 +       if (code < 0 && is_in_reiser4_context()) {
9736 +               reiser4_context *ctx = get_current_context();
9737 +
9738 +               if (ctx != NULL) {
9739 +                       ctx->err.code = code;
9740 +                       ctx->err.file = file;
9741 +                       ctx->err.line = line;
9742 +               }
9743 +       }
9744 +}
9745 +
9746 +#if 0
9747 +/*
9748 + * report error information recorded by reiser4_return_err().
9749 + */
9750 +static void reiser4_report_err(void)
9751 +{
9752 +       reiser4_context *ctx = get_current_context_check();
9753 +
9754 +       if (ctx != NULL) {
9755 +               if (ctx->err.code != 0) {
9756 +                       printk("code: %i at %s:%i\n",
9757 +                              ctx->err.code, ctx->err.file, ctx->err.line);
9758 +               }
9759 +       }
9760 +}
9761 +#endif  /*  0  */
9762 +
9763 +#endif                         /* REISER4_DEBUG */
9764 +
9765 +#if KERNEL_DEBUGGER
9766 +
9767 +/*
9768 + * this function just drops into kernel debugger. It is a convenient place to
9769 + * put a breakpoint in.
9770 + */
9771 +void reiser4_debugtrap(void)
9772 +{
9773 +       /* do nothing. Put break point here. */
9774 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
9775 +       extern void kgdb_breakpoint(void);
9776 +       kgdb_breakpoint();
9777 +#endif
9778 +}
9779 +#endif
9780 +
9781 +/* Make Linus happy.
9782 +   Local variables:
9783 +   c-indentation-style: "K&R"
9784 +   mode-name: "LC"
9785 +   c-basic-offset: 8
9786 +   tab-width: 8
9787 +   fill-column: 120
9788 +   End:
9789 +*/
9790 diff -puN /dev/null fs/reiser4/debug.h
9791 --- /dev/null
9792 +++ a/fs/reiser4/debug.h
9793 @@ -0,0 +1,351 @@
9794 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9795 +   reiser4/README */
9796 +
9797 +/* Declarations of debug macros. */
9798 +
9799 +#if !defined(__FS_REISER4_DEBUG_H__)
9800 +#define __FS_REISER4_DEBUG_H__
9801 +
9802 +#include "forward.h"
9803 +#include "reiser4.h"
9804 +
9805 +/* generic function to produce formatted output, decorating it with
9806 +   whatever standard prefixes/postfixes we want. "Fun" is a function
9807 +   that will be actually called, can be printk, panic etc.
9808 +   This is for use by other debugging macros, not by users. */
9809 +#define DCALL(lev, fun, reperr, label, format, ...)                    \
9810 +({                                                                     \
9811 +       fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" ,   \
9812 +           current->comm, task_pid_nr(current), __FUNCTION__,          \
9813 +           __FILE__, __LINE__, label, ## __VA_ARGS__);                 \
9814 +})
9815 +
9816 +/*
9817 + * cause kernel to crash
9818 + */
9819 +#define reiser4_panic(mid, format, ...)                                \
9820 +       DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
9821 +
9822 +/* print message with indication of current process, file, line and
9823 +   function */
9824 +#define reiser4_log(label, format, ...)                                \
9825 +       DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
9826 +
9827 +/* Assertion checked during compilation.
9828 +    If "cond" is false (0) we get duplicate case label in switch.
9829 +    Use this to check something like famous
9830 +       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
9831 +    in 3.x journal.c. If cassertion fails you get compiler error,
9832 +    so no "maintainer-id".
9833 +*/
9834 +#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } })
9835 +
9836 +#define noop   do {; } while (0)
9837 +
9838 +#if REISER4_DEBUG
9839 +/* version of info that only actually prints anything when _d_ebugging
9840 +    is on */
9841 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
9842 +/* macro to catch logical errors. Put it into `default' clause of
9843 +    switch() statement. */
9844 +#define impossible(label, format, ...)                         \
9845 +       reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
9846 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
9847 +   called. Use this for checking logical consistency and _never_ call
9848 +   this to check correctness of external data: disk blocks and user-input . */
9849 +#define assert(label, cond)                                            \
9850 +({                                                                     \
9851 +       /* call_on_each_assert(); */                                    \
9852 +       if (cond) {                                                     \
9853 +               /* put negated check to avoid using !(cond) that would lose \
9854 +                * warnings for things like assert(a = b); */           \
9855 +               ;                                                       \
9856 +       } else {                                                        \
9857 +               DEBUGON(1);                                             \
9858 +               reiser4_panic(label, "assertion failed: %s", #cond);    \
9859 +       }                                                               \
9860 +})
9861 +
9862 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
9863 +#define check_me(label, expr)  assert(label, (expr))
9864 +
9865 +#define ON_DEBUG(exp) exp
9866 +
9867 +extern int reiser4_schedulable(void);
9868 +extern void call_on_each_assert(void);
9869 +
9870 +#else
9871 +
9872 +#define dinfo(format, args...) noop
9873 +#define impossible(label, format, args...) noop
9874 +#define assert(label, cond) noop
9875 +#define check_me(label, expr)  ((void) (expr))
9876 +#define ON_DEBUG(exp)
9877 +#define reiser4_schedulable() might_sleep()
9878 +
9879 +/* REISER4_DEBUG */
9880 +#endif
9881 +
9882 +#if REISER4_DEBUG
9883 +/* per-thread information about lock acquired by this thread. Used by lock
9884 + * ordering checking in spin_macros.h */
9885 +typedef struct reiser4_lock_cnt_info {
9886 +       int rw_locked_tree;
9887 +       int read_locked_tree;
9888 +       int write_locked_tree;
9889 +
9890 +       int rw_locked_dk;
9891 +       int read_locked_dk;
9892 +       int write_locked_dk;
9893 +
9894 +       int rw_locked_cbk_cache;
9895 +       int read_locked_cbk_cache;
9896 +       int write_locked_cbk_cache;
9897 +
9898 +       int spin_locked_zlock;
9899 +       int spin_locked_jnode;
9900 +       int spin_locked_jload;
9901 +       int spin_locked_txnh;
9902 +       int spin_locked_atom;
9903 +       int spin_locked_stack;
9904 +       int spin_locked_txnmgr;
9905 +       int spin_locked_ktxnmgrd;
9906 +       int spin_locked_fq;
9907 +       int spin_locked_inode;
9908 +       int spin_locked_super_eflush;
9909 +       int spin_locked;
9910 +       int long_term_locked_znode;
9911 +
9912 +       int inode_sem_r;
9913 +       int inode_sem_w;
9914 +
9915 +       int d_refs;
9916 +       int x_refs;
9917 +       int t_refs;
9918 +} reiser4_lock_cnt_info;
9919 +
9920 +extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
9921 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
9922 +
9923 +/* increment lock-counter @counter, if present */
9924 +#define LOCK_CNT_INC(counter)                                  \
9925 +       IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
9926 +
9927 +/* decrement lock-counter @counter, if present */
9928 +#define LOCK_CNT_DEC(counter)                                  \
9929 +       IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
9930 +
9931 +/* check that lock-counter is zero. This is for use in assertions */
9932 +#define LOCK_CNT_NIL(counter)                                  \
9933 +       IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
9934 +
9935 +/* check that lock-counter is greater than zero. This is for use in
9936 + * assertions */
9937 +#define LOCK_CNT_GTZ(counter)                                  \
9938 +       IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
9939 +/* check that lock-counter is less than @n; evaluates to 1 outside of reiser4 context */
9940 +#define LOCK_CNT_LT(counter, n) IN_CONTEXT(reiser4_lock_counters()->counter < (n), 1)
9941 +
9942 +#else                          /* REISER4_DEBUG */
9943 +
9944 +/* no-op versions of the above */
9945 +
9946 +typedef struct reiser4_lock_cnt_info {
9947 +} reiser4_lock_cnt_info;
9948 +
9949 +#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
9950 +#define LOCK_CNT_INC(counter) noop
9951 +#define LOCK_CNT_DEC(counter) noop
9952 +#define LOCK_CNT_NIL(counter) (1)
9953 +#define LOCK_CNT_GTZ(counter) (1)
9954 +#define LOCK_CNT_LT(counter, n) (1)
9955 +
9956 +#endif                         /* REISER4_DEBUG */
9957 +
9958 +#define assert_spin_not_locked(lock) BUG_ON(0)
9959 +#define assert_rw_write_locked(lock) BUG_ON(0)
9960 +#define assert_rw_read_locked(lock) BUG_ON(0)
9961 +#define assert_rw_locked(lock) BUG_ON(0)
9962 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
9963 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
9964 +#define assert_rw_not_locked(lock) BUG_ON(0)
9965 +
9966 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
9967 +   option. */
9968 +typedef enum {
9969 +       /* print a lot of information during panic. When this is on all jnodes
9970 +        * are listed. This can be *very* large output. Usually you don't want
9971 +        * this. Especially over serial line. */
9972 +       REISER4_VERBOSE_PANIC = 0x00000001,
9973 +       /* print a lot of information during umount */
9974 +       REISER4_VERBOSE_UMOUNT = 0x00000002,
9975 +       /* print gathered statistics on umount */
9976 +       REISER4_STATS_ON_UMOUNT = 0x00000004,
9977 +       /* check node consistency */
9978 +       REISER4_CHECK_NODE = 0x00000008
9979 +} reiser4_debug_flags;
9980 +
9981 +extern int is_in_reiser4_context(void);
9982 +
9983 +/*
9984 + * evaluate expression @e only if with reiser4 context
9985 + */
9986 +#define ON_CONTEXT(e)  do {                    \
9987 +       if (is_in_reiser4_context()) {          \
9988 +               e;                              \
9989 +       } } while (0)
9990 +
9991 +/*
9992 + * evaluate expression @e only when within reiser4_context and debugging is
9993 + * on.
9994 + */
9995 +#define ON_DEBUG_CONTEXT(e) ON_DEBUG(ON_CONTEXT(e))
9996 +
9997 +/*
9998 + * complain about unexpected function result and crash. Used in "default"
9999 + * branches of switch statements and alike to assert that invalid results are
10000 + * not silently ignored.
10001 + */
10002 +#define wrong_return_value(label, function)                            \
10003 +       impossible(label, "wrong return value from " function)
10004 +
10005 +/* Issue different types of reiser4 messages to the console */
10006 +#define warning(label, format, ...)                                    \
10007 +       DCALL(KERN_WARNING,                                             \
10008 +              printk, 1, label, "WARNING: " format , ## __VA_ARGS__)
10009 +#define notice(label, format, ...)                                     \
10010 +       DCALL(KERN_NOTICE,                                              \
10011 +              printk, 1, label, "NOTICE: " format , ## __VA_ARGS__)
10012 +
10013 +/* mark not yet implemented functionality */
10014 +#define not_yet(label, format, ...)                            \
10015 +       reiser4_panic(label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__)
10016 +
10017 +extern void reiser4_do_panic(const char *format, ...)
10018 +    __attribute__ ((noreturn, format(printf, 1, 2)));
10019 +
10020 +extern int reiser4_preempt_point(void);
10021 +extern void reiser4_print_stats(void);
10022 +
10023 +#if REISER4_DEBUG
10024 +extern int reiser4_no_counters_are_held(void);
10025 +extern int reiser4_commit_check_locks(void);
10026 +#else
10027 +#define reiser4_no_counters_are_held() (1)
10028 +#define reiser4_commit_check_locks() (1)
10029 +#endif
10030 +
10031 +/* true if @i is a power of two (note: also true for 0). Useful for rate-limited warnings, etc. */
10032 +#define IS_POW(i)                              \
10033 +({                                             \
10034 +       typeof(i) __i;                          \
10035 +                                               \
10036 +       __i = (i);                              \
10037 +       !(__i & (__i - 1));                     \
10038 +})
10039 +
10040 +#define KERNEL_DEBUGGER (1)
10041 +
10042 +#if KERNEL_DEBUGGER
10043 +
10044 +extern void reiser4_debugtrap(void);
10045 +
10046 +/*
10047 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
10048 + * kgdb is not compiled in, do nothing.
10049 + */
10050 +#define DEBUGON(cond)                                  \
10051 +({                                                     \
10052 +       if (unlikely(cond))                             \
10053 +               reiser4_debugtrap();                    \
10054 +})
10055 +#else
10056 +#define DEBUGON(cond) noop
10057 +#endif
10058 +
10059 +/*
10060 + * Error code tracing facility. (Idea is borrowed from XFS code.)
10061 + *
10062 + * Suppose some strange and/or unexpected code is returned from some function
10063 + * (for example, write(2) returns -EEXIST). It is possible to place a
10064 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
10065 + * in what particular place -EEXIST was generated first?
10066 + *
10067 + * In reiser4 all places where actual error codes are produced (that is,
10068 + * statements of the form
10069 + *
10070 + *     return -EFOO;        // (1), or
10071 + *
10072 + *     result = -EFOO;      // (2)
10073 + *
10074 + * are replaced with
10075 + *
10076 + *     return RETERR(-EFOO);        // (1a), and
10077 + *
10078 + *     result = RETERR(-EFOO);      // (2a) respectively
10079 + *
10080 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
10081 + * printed in error and warning messages. Moreover, it's possible to put a
10082 + * conditional breakpoint in reiser4_return_err (low-level function called
10083 + * by RETERR() to do the actual work) to break into debugger immediately
10084 + * when particular error happens.
10085 + *
10086 + */
10087 +
10088 +#if REISER4_DEBUG
10089 +
10090 +/*
10091 + * data-type to store information about where error happened ("error site").
10092 + */
10093 +typedef struct err_site {
10094 +       int code;               /* error code */
10095 +       const char *file;       /* source file, filled by __FILE__ */
10096 +       int line;               /* source file line, filled by __LINE__ */
10097 +} err_site;
10098 +
10099 +extern void reiser4_return_err(int code, const char *file, int line);
10100 +
10101 +/*
10102 + * fill &get_current_context()->err_site with error information.
10103 + */
10104 +#define RETERR(code)                                   \
10105 +({                                                     \
10106 +       typeof(code) __code;                            \
10107 +                                                       \
10108 +       __code = (code);                                \
10109 +       reiser4_return_err(__code, __FILE__, __LINE__); \
10110 +       __code;                                         \
10111 +})
10112 +
10113 +#else
10114 +
10115 +/*
10116 + * no-op versions of the above
10117 + */
10118 +
10119 +typedef struct err_site {
10120 +} err_site;
10121 +#define RETERR(code) (code)
10122 +#endif
10123 +
10124 +#if REISER4_LARGE_KEY
10125 +/*
10126 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
10127 + */
10128 +#define ON_LARGE_KEY(...) __VA_ARGS__
10129 +#else
10130 +#define ON_LARGE_KEY(...)
10131 +#endif
10132 +
10133 +/* __FS_REISER4_DEBUG_H__ */
10134 +#endif
10135 +
10136 +/* Make Linus happy.
10137 +   Local variables:
10138 +   c-indentation-style: "K&R"
10139 +   mode-name: "LC"
10140 +   c-basic-offset: 8
10141 +   tab-width: 8
10142 +   fill-column: 120
10143 +   End:
10144 +*/
10145 diff -puN /dev/null fs/reiser4/dformat.h
10146 --- /dev/null
10147 +++ a/fs/reiser4/dformat.h
10148 @@ -0,0 +1,71 @@
10149 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10150 +   reiser4/README */
10151 +
10152 +/* Formats of on-disk data and conversion functions. */
10153 +
10154 +/* put all item formats in the files describing the particular items,
10155 +   our model is, everything you need to do to add an item to reiser4,
10156 +   (excepting the changes to the plugin that uses the item which go
10157 +   into the file defining that plugin), you put into one file. */
10158 +/* Data on disk are stored in little-endian format.
10159 +   To declare fields of on-disk structures, use d8, d16, d32 and d64.
10160 +   d??tocpu() and cputod??() to convert. */
10161 +
10162 +#if !defined(__FS_REISER4_DFORMAT_H__)
10163 +#define __FS_REISER4_DFORMAT_H__
10164 +
10165 +#include <asm/byteorder.h>
10166 +#include <asm/unaligned.h>
10167 +#include <linux/types.h>
10168 +
10169 +typedef __u8 d8;
10170 +typedef __le16 d16;
10171 +typedef __le32 d32;
10172 +typedef __le64 d64;
10173 +
10174 +#define PACKED __attribute__((packed))
10175 +
10176 +/* data-type for block number */
10177 +typedef __u64 reiser4_block_nr;
10178 +
10179 +/* data-type for block number on disk, disk format */
10180 +typedef __le64 reiser4_dblock_nr;
10181 +
10182 +/**
10183 + * disk_addr_eq - compare disk addresses
10184 + * @b1: pointer to the first block number to compare
10185 + * @b2: pointer to the second block number to compare
10186 + *
10187 + * Returns true if the disk addresses are the same
10188 + */
10189 +static inline int disk_addr_eq(const reiser4_block_nr * b1,
10190 +                              const reiser4_block_nr * b2)
10191 +{
10192 +       assert("nikita-1033", b1 != NULL);
10193 +       assert("nikita-1266", b2 != NULL);
10194 +
10195 +       return memcmp(b1, b2, sizeof(*b1)) == 0;
10196 +}
10197 +
10198 +/* structure of master reiser4 super block */
10199 +typedef struct reiser4_master_sb {
10200 +       char magic[16];         /* "ReIsEr4" */
10201 +       __le16 disk_plugin_id;  /* id of disk layout plugin */
10202 +       __le16 blocksize;
10203 +       char uuid[16];          /* unique id */
10204 +       char label[16];         /* filesystem label */
10205 +       __le64 diskmap;         /* location of the diskmap. 0 if not present */
10206 +} reiser4_master_sb;
10207 +
10208 +/* __FS_REISER4_DFORMAT_H__ */
10209 +#endif
10210 +
10211 +/*
10212 + * Local variables:
10213 + * c-indentation-style: "K&R"
10214 + * mode-name: "LC"
10215 + * c-basic-offset: 8
10216 + * tab-width: 8
10217 + * fill-column: 79
10218 + * End:
10219 + */
10220 diff -puN /dev/null fs/reiser4/dscale.c
10221 --- /dev/null
10222 +++ a/fs/reiser4/dscale.c
10223 @@ -0,0 +1,192 @@
10224 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10225 + * reiser4/README */
10226 +
10227 +/* Scalable on-disk integers */
10228 +
10229 +/*
10230 + * Various on-disk structures contain integer-like structures. Stat-data
10231 + * contain [yes, "data" is plural, check the dictionary] file size, link
10232 + * count; extent unit contains extent width etc. To accommodate for general
10233 + * case enough space is reserved to keep largest possible value. 64 bits in
10234 + * all cases above. But in overwhelming majority of cases numbers actually
10235 + * stored in these fields will be comparatively small and reserving 8 bytes is
10236 + * a waste of precious disk bandwidth.
10237 + *
10238 + * Scalable integers are one way to solve this problem. dscale_write()
10239 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
10240 + * depending on the magnitude of the value supplied. dscale_read() reads value
10241 + * previously stored by dscale_write().
10242 + *
10243 + * dscale_write() produces format not completely unlike of UTF: two highest
10244 + * bits of the first byte are used to store "tag". One of 4 possible tag
10245 + * values is chosen depending on the number being encoded:
10246 + *
10247 + *           0 ... 0x3f               => 0           [table 1]
10248 + *        0x40 ... 0x3fff             => 1
10249 + *      0x4000 ... 0x3fffffff         => 2
10250 + *  0x40000000 ... 0xffffffffffffffff => 3
10251 + *
10252 + * (see dscale_range() function)
10253 + *
10254 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
10255 + * to be stored, so in this case there is no place in the first byte to store
10256 + * tag. For such values tag is stored in an extra 9th byte.
10257 + *
10258 + * As _highest_ bits are used for the test (which is natural) scaled integers
10259 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
10260 + * uses LITTLE-ENDIAN.
10261 + *
10262 + */
10263 +
10264 +#include "debug.h"
10265 +#include "dscale.h"
10266 +
10267 +/* extract the tag of the scaled integer stored at @address */
10268 +static int gettag(const unsigned char *address)
10269 +{
10270 +       /* the tag lives in the two most significant bits of the first byte */
10271 +       return address[0] >> 6;
10272 +}
10273 +
10274 +/* clear the tag embedded into @value; @tag is the tag it was decoded with */
10275 +static void cleartag(__u64 *value, int tag)
10276 +{
10277 +       /*
10278 +        * W-w-what ?!
10279 +        *
10280 +        * Actually, this is rather simple: @value passed here was read by
10281 +        * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
10282 +        * zeroes. Tag is still stored in the highest (arithmetically)
10283 +        * non-zero bits of @value, but relative position of tag within __u64
10284 +        * depends on @tag.
10285 +        *
10286 +        * For example if @tag is 0, it's stored in 2 highest bits of lowest
10287 +        * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
10288 +        *
10289 +        * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
10290 +        * and its offset is (2 * 8) - 2 == 14 bits.
10291 +        *
10292 +        * See table 1 above for details. All these cases are captured by the
10293 +        * formula:
10294 +        */
10295 +       *value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
10296 +       /*
10297 +        * That is, clear two (3 == 0b11) bits at the offset
10298 +        *
10299 +        *                  8 * (2 ^ tag) - 2,
10300 +        *
10301 +        * that is, two highest bits of (2 ^ tag)-th byte of @value. The mask is
10302 +        * built as __u64: for @tag 2, 3 << 30 does not fit into a signed int.
10303 +        */
10304 +}
10305 +
10306 +/* map @value to its tag (0..3) according to table 1 above. */
10307 +static int dscale_range(__u64 value)
10308 +{
10309 +       if (value <= 0x3f)
10310 +               return 0;
10311 +       if (value <= 0x3fff)
10312 +               return 1;
10313 +       if (value <= 0x3fffffff)
10314 +               return 2;
10315 +       return 3;
10316 +}
10317 +
10318 +/* decode the value stored at @address by dscale_write() into @value and
10319 + * return the number of bytes consumed */
10320 +int dscale_read(unsigned char *address, __u64 *value)
10321 +{
10322 +       int tag;
10323 +
10324 +       /* the tag selects the width of the encoding */
10325 +       tag = gettag(address);
10326 +       switch (tag) {
10327 +       case 0:
10328 +               *value = get_unaligned(address);
10329 +               break;
10330 +       case 1:
10331 +               *value = __be16_to_cpu(get_unaligned((__be16 *)address));
10332 +               break;
10333 +       case 2:
10334 +               *value = __be32_to_cpu(get_unaligned((__be32 *)address));
10335 +               break;
10336 +       case 3:
10337 +               /* The tag occupies a byte of its own here: skip it and
10338 +                * decode the 8 bytes that follow. */
10339 +               *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
10340 +               /* 8 bytes of value plus the tag byte; there is no embedded
10341 +                * tag to clear. */
10342 +               return 9;
10343 +       default:
10344 +               return RETERR(-EIO);
10345 +       }
10346 +       /* strip the tag bits that were read together with the value */
10347 +       cleartag(value, tag);
10348 +       /* tags 0..2 consume (2 ^ tag) bytes---see table 1. */
10349 +       return 1 << tag;
10350 +}
10351 +
10352 +/* number of bytes occupied by the scaled integer stored at @address */
10353 +int dscale_bytes_to_read(unsigned char *address)
10354 +{
10355 +       const int tag = gettag(address);
10356 +
10357 +       /* tag 3: a separate tag byte followed by 8 bytes of value */
10358 +       if (tag == 3)
10359 +               return 9;
10360 +       /* tags 0..2: (2 ^ tag) bytes with the tag embedded in the first one */
10361 +       if (tag >= 0 && tag <= 2)
10362 +               return 1 << tag;
10363 +       /*
10364 +        * not reachable as long as gettag() yields a two-bit value (0..3);
10365 +        * any other tag is reported as -EIO
10366 +        */
10367 +       return RETERR(-EIO);
10368 +}
10369 +
10370 +/* store @value at @address and return number of bytes consumed (1 to 9) */
10371 +int dscale_write(unsigned char *address, __u64 value)
10372 +{
10373 +       int tag, shift;
10374 +       __be64 v = __cpu_to_be64(value);
10375 +       unsigned char *valarr = (unsigned char *)&v;
10376 +
10377 +       tag = dscale_range(value);
10378 +       shift = (tag == 3) ? 1 : 0;
10379 +       /* a separate tag byte is not covered by memcpy(): drop its stale bits */
10380 +       if (shift)
10381 +               *address = 0;
10382 +       memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
10383 +       *address |= (tag << 6);
10384 +       return shift + (1 << tag);
10385 +}
10386 +
10387 +/* number of bytes dscale_write() needs to store @value */
10388 +int dscale_bytes_to_write(__u64 value)
10389 +{
10390 +       const int tag = dscale_range(value);
10391 +
10392 +       /* tag 3 needs a separate tag byte in front of the 8 value bytes */
10393 +       if (tag == 3)
10394 +               return 9;
10395 +       return 1 << tag;
10396 +}
10397 +
10398 +/* returns true if @value and @other require the same number of bytes to be
10399 + * stored. Used to detect when data structure (like stat-data) has to be
10400 + * expanded or contracted. */
10401 +int dscale_fit(__u64 value, __u64 other)
10402 +{
10403 +       return dscale_range(other) == dscale_range(value);
10404 +}
10405 +
10406 +/* Make Linus happy.
10407 +   Local variables:
10408 +   c-indentation-style: "K&R"
10409 +   mode-name: "LC"
10410 +   c-basic-offset: 8
10411 +   tab-width: 8
10412 +   fill-column: 120
10413 +   scroll-step: 1
10414 +   End:
10415 +*/
10416 diff -puN /dev/null fs/reiser4/dscale.h
10417 --- /dev/null
10418 +++ a/fs/reiser4/dscale.h
10419 @@ -0,0 +1,28 @@
10420 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10421 + * reiser4/README */
10422 +
10423 +/* Scalable on-disk integers. See dscale.c for details. */
10424 +
10425 +#if !defined(__FS_REISER4_DSCALE_H__)
10426 +#define __FS_REISER4_DSCALE_H__
10427 +
10428 +#include "dformat.h"
10429 +
10430 +extern int dscale_read(unsigned char *address, __u64 *value);
10431 +extern int dscale_write(unsigned char *address, __u64 value);
10432 +extern int dscale_bytes_to_read(unsigned char *address);
10433 +extern int dscale_bytes_to_write(__u64 value);
10434 +extern int dscale_fit(__u64 value, __u64 other);
10435 +
10436 +/* __FS_REISER4_DSCALE_H__ */
10437 +#endif
10438 +
10439 +/* Make Linus happy.
10440 +   Local variables:
10441 +   c-indentation-style: "K&R"
10442 +   mode-name: "LC"
10443 +   c-basic-offset: 8
10444 +   tab-width: 8
10445 +   fill-column: 120
10446 +   End:
10447 +*/
10448 diff -puN /dev/null fs/reiser4/entd.c
10449 --- /dev/null
10450 +++ a/fs/reiser4/entd.c
10451 @@ -0,0 +1,335 @@
10452 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
10453 + * reiser4/README */
10454 +
10455 +/* Ent daemon. */
10456 +
10457 +#include "debug.h"
10458 +#include "txnmgr.h"
10459 +#include "tree.h"
10460 +#include "entd.h"
10461 +#include "super.h"
10462 +#include "context.h"
10463 +#include "reiser4.h"
10464 +#include "vfs_ops.h"
10465 +#include "page_cache.h"
10466 +#include "inode.h"
10467 +
10468 +#include <linux/sched.h>       /* struct task_struct */
10469 +#include <linux/suspend.h>
10470 +#include <linux/kernel.h>
10471 +#include <linux/writeback.h>
10472 +#include <linux/time.h>                /* INITIAL_JIFFIES */
10473 +#include <linux/backing-dev.h> /* bdi_write_congested */
10474 +#include <linux/wait.h>
10475 +#include <linux/kthread.h>
10476 +#include <linux/freezer.h>
10477 +
10478 +#define DEF_PRIORITY 12
10479 +#define MAX_ENTD_ITERS 10
10480 +
10481 +static void entd_flush(struct super_block *, struct wbq *);
10482 +static int entd(void *arg);
10483 +
10484 +/*
10485 + * set ->comm field of ent thread to make its state visible to the user level
10486 + */
10487 +#define entd_set_comm(state)                                   \
10488 +       snprintf(current->comm, sizeof(current->comm),  \
10489 +               "ent:%s%s", super->s_id, (state))
10490 +
10491 +/**
10492 + * reiser4_init_entd - initialize entd context and start kernel daemon
10493 + * @super: super block to start ent thread for
10494 + *
10495 + * Zeroes the entd context of @super, initializes its lock, wait queue and
10496 + * request lists and starts the ent thread. Returns 0 or kthread_run() error.
10497 + */
10498 +int reiser4_init_entd(struct super_block *super)
10499 +{
10500 +       entd_context *ctx;
10501 +
10502 +       assert("nikita-3104", super != NULL);
10503 +
10504 +       ctx = get_entd_context(super);
10505 +
10506 +       memset(ctx, 0, sizeof *ctx);
10507 +       spin_lock_init(&ctx->guard);
10508 +       init_waitqueue_head(&ctx->wait);
10509 +#if REISER4_DEBUG
10510 +       INIT_LIST_HEAD(&ctx->flushers_list);
10511 +#endif
10512 +       /* lists of writepage requests */
10513 +       INIT_LIST_HEAD(&ctx->todo_list);
10514 +       INIT_LIST_HEAD(&ctx->done_list);
10515 +       /* start entd; on failure ctx->tsk is left holding the error pointer */
10516 +       ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
10517 +       if (IS_ERR(ctx->tsk))
10518 +               return PTR_ERR(ctx->tsk);
10519 +       return 0;
10520 +}
10521 +
10522 +static void put_wbq(struct wbq *rq)
10523 +{
10524 +       iput(rq->mapping->host);
10525 +       complete(&rq->completion);
10526 +}
10527 +
10528 +/* detach and return the first request of ent->todo_list, or NULL if it is empty. ent should be locked */
10529 +static struct wbq *__get_wbq(entd_context * ent)
10530 +{
10531 +       struct wbq *wbq;
10532 +
10533 +       if (list_empty(&ent->todo_list))
10534 +               return NULL;
10535 +
10536 +       ent->nr_todo_reqs--;
10537 +       wbq = list_entry(ent->todo_list.next, struct wbq, link);
10538 +       list_del_init(&wbq->link);
10539 +       return wbq;
10540 +}
10541 +
10542 +/* ent thread function: serves queued requests of @arg (a super block) until asked to stop */
10543 +static int entd(void *arg)
10544 +{
10545 +       struct super_block *super;
10546 +       entd_context *ent;
10547 +       int done = 0;
10548 +
10549 +       super = arg;
10550 +       /* do_fork() just copies task_struct into the new
10551 +          thread. ->fs_context shouldn't be copied of course. This shouldn't
10552 +          be a problem for the rest of the code though.
10553 +        */
10554 +       current->journal_info = NULL;
10555 +
10556 +       ent = get_entd_context(super);
10557 +
10558 +       while (!done) {
10559 +               try_to_freeze();
10560 +
10561 +               spin_lock(&ent->guard);
10562 +               while (ent->nr_todo_reqs != 0) {
10563 +                       struct wbq *rq;
10564 +
10565 +                       assert("", list_empty(&ent->done_list));
10566 +
10567 +                       /* take request from the queue head; flush without the lock */
10568 +                       rq = __get_wbq(ent);
10569 +                       assert("", rq != NULL);
10570 +                       ent->cur_request = rq;
10571 +                       spin_unlock(&ent->guard);
10572 +
10573 +                       entd_set_comm("!");
10574 +                       entd_flush(super, rq);
10575 +
10576 +                       put_wbq(rq);
10577 +
10578 +                       /*
10579 +                        * wake up all requestors on done_list and iput their inodes
10580 +                        */
10581 +                       spin_lock(&ent->guard);
10582 +                       while (!list_empty(&ent->done_list)) {
10583 +                               rq = list_entry(ent->done_list.next, struct wbq, link);
10584 +                               list_del_init(&rq->link);
10585 +                               ent->nr_done_reqs--;
10586 +                               spin_unlock(&ent->guard);
10587 +                               assert("", rq->written == 1);
10588 +                               put_wbq(rq);
10589 +                               spin_lock(&ent->guard);
10590 +                       }
10591 +               }
10592 +               spin_unlock(&ent->guard);
10593 +
10594 +               entd_set_comm(".");
10595 +
10596 +               {
10597 +                       DEFINE_WAIT(__wait);
10598 +
10599 +                       do {
10600 +                               prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
10601 +                               if (kthread_should_stop()) {
10602 +                                       done = 1;
10603 +                                       break;
10604 +                               }
10605 +                               if (ent->nr_todo_reqs != 0)
10606 +                                       break;
10607 +                               schedule();
10608 +                       } while (0);
10609 +                       finish_wait(&ent->wait, &__wait);
10610 +               }
10611 +       }
10612 +       BUG_ON(ent->nr_todo_reqs != 0);
10613 +       return 0;
10614 +}
10615 +
10616 +/**
10617 + * reiser4_done_entd - stop entd kernel thread
10618 + * @super: super block to stop ent thread for
10619 + *
10620 + * It is called on umount. Sends stop signal to entd and wait until it handles
10621 + * it.
10622 + */
10623 +void reiser4_done_entd(struct super_block *super)
10624 +{
10625 +       entd_context *ent;
10626 +
10627 +       assert("nikita-3103", super != NULL);
10628 +
10629 +       ent = get_entd_context(super);
10630 +       assert("zam-1055", ent->tsk != NULL);
10631 +       kthread_stop(ent->tsk);
10632 +}
10633 +
10634 +/* called at the beginning of jnode_flush to register flusher thread with ent
10635 + * daemon */
10636 +void reiser4_enter_flush(struct super_block *super)
10637 +{
10638 +       entd_context *ent;
10639 +
10640 +       assert("zam-1029", super != NULL);
10641 +       ent = get_entd_context(super);
10642 +
10643 +       assert("zam-1030", ent != NULL);
10644 +
10645 +       spin_lock(&ent->guard);
10646 +       ent->flushers++;
10647 +#if REISER4_DEBUG
10648 +       list_add(&get_current_context()->flushers_link, &ent->flushers_list);
10649 +#endif
10650 +       spin_unlock(&ent->guard);
10651 +}
10652 +
10653 +/* called at the end of jnode_flush */
10654 +void reiser4_leave_flush(struct super_block *super)
10655 +{
10656 +       entd_context *ent;
10657 +       int wake_up_ent;
10658 +
10659 +       assert("zam-1027", super != NULL);
10660 +       ent = get_entd_context(super);
10661 +
10662 +       assert("zam-1028", ent != NULL);
10663 +
10664 +       spin_lock(&ent->guard);
10665 +       ent->flushers--;
10666 +       wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
10667 +#if REISER4_DEBUG
10668 +       list_del_init(&get_current_context()->flushers_link);
10669 +#endif
10670 +       spin_unlock(&ent->guard);
10671 +       if (wake_up_ent)
10672 +               wake_up_process(ent->tsk);
10673 +}
10674 +
10675 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
10676 +
10677 +static void entd_flush(struct super_block *super, struct wbq *rq)
10678 +{
10679 +       reiser4_context ctx;
10680 +
10681 +       /* serve @rq in an on-stack context flagged as entd's (GFP_NOFS) */
10682 +       init_stack_context(&ctx, super);
10683 +       ctx.entd = 1;
10684 +       ctx.gfp_mask = GFP_NOFS;
10685 +
10686 +       /* first write a burst of pages starting at the requested page */
10687 +       rq->wbc->range_start = page_offset(rq->page);
10688 +       rq->wbc->range_end = rq->wbc->range_start +
10689 +               (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
10690 +       rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
10691 +
10692 +       if (rq->wbc->nr_to_write > 0) {
10693 +               rq->wbc->range_start = 0;
10694 +               rq->wbc->range_end = LLONG_MAX;
10695 +               generic_sync_sb_inodes(super, rq->wbc);
10696 +       }
10697 +       rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
10698 +       reiser4_writeout(super, rq->wbc);
10699 +
10700 +       context_set_commit_async(&ctx);
10701 +       reiser4_exit_context(&ctx);
10702 +}
10703 +
10704 +/**
10705 + * write_page_by_ent - ask entd thread to flush this page as part of slum
10706 + * @page: page to be written
10707 + * @wbc: writeback control passed to reiser4_writepage
10708 + *
10709 + * Creates a request, puts it on entd's list of requests, wakes entd up if
10710 + * necessary and waits until entd completes the request. Always returns 0.
10711 + */
10712 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
10713 +{
10714 +       struct super_block *sb;
10715 +       struct inode *inode;
10716 +       entd_context *ent;
10717 +       struct wbq rq;
10718 +
10719 +       assert("", PageLocked(page));
10720 +       assert("", page->mapping != NULL);
10721 +
10722 +       sb = page->mapping->host->i_sb;
10723 +       ent = get_entd_context(sb);
10724 +       assert("", ent && ent->done == 0);
10725 +
10726 +       /*
10727 +        * we are going to unlock page and ask ent thread to write the
10728 +        * page. Re-dirty page before unlocking so that if ent thread fails to
10729 +        * write it - it will remain dirty
10730 +        */
10731 +       reiser4_set_page_dirty_internal(page);
10732 +
10733 +       /*
10734 +        * pin inode in memory, unlock page; entd will iput it. We can not
10735 +        * iput here because we can not allow delete_inode to be called here
10736 +        */
10737 +       inode = igrab(page->mapping->host);
10738 +       unlock_page(page);
10739 +       if (inode == NULL)
10740 +               /* inode is getting freed */
10741 +               return 0;
10742 +
10743 +       /* init wbq: it lives on this stack until entd completes it */
10744 +       INIT_LIST_HEAD(&rq.link);
10745 +       rq.magic = WBQ_MAGIC;
10746 +       rq.wbc = wbc;
10747 +       rq.page = page;
10748 +       rq.mapping = inode->i_mapping;
10749 +       rq.node = NULL;
10750 +       rq.written = 0;
10751 +       init_completion(&rq.completion);
10752 +
10753 +       /* add request to entd's list of writepage requests */
10754 +       spin_lock(&ent->guard);
10755 +       ent->nr_todo_reqs++;
10756 +       list_add_tail(&rq.link, &ent->todo_list);
10757 +       if (ent->nr_todo_reqs == 1) /* counter was 0 before this request */
10758 +               wake_up_process(ent->tsk);
10759 +
10760 +       spin_unlock(&ent->guard);
10761 +
10762 +       /* wait until entd finishes */
10763 +       wait_for_completion(&rq.completion);
10764 +
10765 +       if (rq.written)
10766 +               /* Eventually ENTD has written the page to disk. */
10767 +               return 0;
10768 +       return 0; /* not written: the page was re-dirtied above */
10769 +}
10770 +
10771 +int wbq_available(void)
10772 +{
10773 +       struct super_block *sb = reiser4_get_current_sb();
10774 +       entd_context *ent = get_entd_context(sb);
10775 +       return ent->nr_todo_reqs; /* queued requests, read without guard */
10776 +}
10777 +
10778 +/*
10779 + * Local variables:
10780 + * c-indentation-style: "K&R"
10781 + * mode-name: "LC"
10782 + * c-basic-offset: 8
10783 + * tab-width: 8
10784 + * fill-column: 79
10785 + * End:
10786 + */
10787 diff -puN /dev/null fs/reiser4/entd.h
10788 --- /dev/null
10789 +++ a/fs/reiser4/entd.h
10790 @@ -0,0 +1,90 @@
10791 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
10792 +
10793 +/* Ent daemon. */
10794 +
10795 +#ifndef __ENTD_H__
10796 +#define __ENTD_H__
10797 +
10798 +#include "context.h"
10799 +
10800 +#include <linux/fs.h>
10801 +#include <linux/completion.h>
10802 +#include <linux/wait.h>
10803 +#include <linux/spinlock.h>
10804 +#include <linux/sched.h>       /* for struct task_struct */
10805 +
10806 +#define WBQ_MAGIC 0x7876dc76
10807 +
10808 +/* write-back request, filled in by write_page_by_ent(). */
10809 +struct wbq {
10810 +       int magic; /* set to WBQ_MAGIC */
10811 +       struct list_head link; /* list head of this list is in entd context */
10812 +       struct writeback_control *wbc; /* requester's writeback control */
10813 +       struct page *page; /* page the requester asked to write */
10814 +       struct address_space *mapping; /* i_mapping of the pinned inode */
10815 +       struct completion completion; /* requester waits on this */
10816 +       jnode *node; /* set if ent thread captured requested page */
10817 +       int written; /* set if ent thread wrote requested page */
10818 +};
10819 +
10820 +/* ent-thread context. This is used to synchronize starting/stopping ent
10821 + * threads. */
10822 +typedef struct entd_context {
10823 +        /* wait queue that ent thread waits on for more work. NOTE(review):
10824 +         * write_page_by_ent() wakes @tsk via wake_up_process() - confirm use. */
10825 +       wait_queue_head_t wait;
10826 +       /* spinlock protecting other fields */
10827 +       spinlock_t guard;
10828 +       /* ent thread */
10829 +       struct task_struct *tsk;
10830 +       /* set to indicate that ent thread should leave. */
10831 +       int done;
10832 +       /* counter of active flushers */
10833 +       int flushers;
10834 +       /*
10835 +        * when reiser4_writepage asks entd to write a page - it adds struct
10836 +        * wbq to this list
10837 +        */
10838 +       struct list_head todo_list;
10839 +       /* number of elements on the above list */
10840 +       int nr_todo_reqs;
10841 +
10842 +       struct wbq *cur_request; /* request in progress? TODO confirm */
10843 +       /*
10844 +        * when entd writes a page it moves write-back request from todo_list
10845 +        * to done_list. This list is used at the end of entd iteration to
10846 +        * wake up requestors and iput inodes.
10847 +        */
10848 +       struct list_head done_list;
10849 +       /* number of elements on the above list */
10850 +       int nr_done_reqs;
10851 +
10852 +#if REISER4_DEBUG
10853 +       /* list of all active flushers */
10854 +       struct list_head flushers_list;
10855 +#endif
10856 +} entd_context;
10857 +
10858 +extern int  reiser4_init_entd(struct super_block *);
10859 +extern void reiser4_done_entd(struct super_block *);
10860 +
10861 +extern void reiser4_enter_flush(struct super_block *);
10862 +extern void reiser4_leave_flush(struct super_block *);
10863 +
10864 +extern int write_page_by_ent(struct page *, struct writeback_control *);
10865 +extern int wbq_available(void);
10866 +extern void ent_writes_page(struct super_block *, struct page *);
10867 +
10868 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
10869 +/* __ENTD_H__ */
10870 +#endif
10871 +
10872 +/* Make Linus happy.
10873 +   Local variables:
10874 +   c-indentation-style: "K&R"
10875 +   mode-name: "LC"
10876 +   c-basic-offset: 8
10877 +   tab-width: 8
10878 +   fill-column: 120
10879 +   End:
10880 +*/
10881 diff -puN /dev/null fs/reiser4/eottl.c
10882 --- /dev/null
10883 +++ a/fs/reiser4/eottl.c
10884 @@ -0,0 +1,510 @@
10885 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10886 +   reiser4/README */
10887 +
10888 +#include "forward.h"
10889 +#include "debug.h"
10890 +#include "key.h"
10891 +#include "coord.h"
10892 +#include "plugin/item/item.h"
10893 +#include "plugin/node/node.h"
10894 +#include "znode.h"
10895 +#include "block_alloc.h"
10896 +#include "tree_walk.h"
10897 +#include "tree_mod.h"
10898 +#include "carry.h"
10899 +#include "tree.h"
10900 +#include "super.h"
10901 +
10902 +#include <linux/types.h>       /* for __u??  */
10903 +
10904 +/*
10905 + * Extents on the twig level (EOTTL) handling.
10906 + *
10907 + * EOTTL poses some problems to the tree traversal, that are better explained
10908 + * by example.
10909 + *
10910 + * Suppose we have block B1 on the twig level with the following items:
10911 + *
10912 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
10913 + * offset)
10914 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
10915 + * 2. internal item I2 with key (10:0:0:0)
10916 + *
10917 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
10918 + * then intra-node lookup is done. This lookup finished on the E1, because the
10919 + * key we are looking for is larger than the key of E1 and is smaller than the
10920 + * key of I2.
10921 + *
10922 + * Here search is stuck.
10923 + *
10924 + * After some thought it is clear what is wrong here: extents on the twig level
10925 + * break some basic property of the *search* tree (on the pretext, that they
10926 + * restore property of balanced tree).
10927 + *
10928 + * Said property is the following: if in the internal node of the search tree
10929 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
10930 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
10931 + * through the Pointer.
10932 + *
10933 + * This is not true, when Pointer is Extent-Pointer, simply because extent
10934 + * cannot expand indefinitely to the right to include any item with
10935 + *
10936 + *   Key1 <= Key <= Key2.
10937 + *
10938 + * For example, our E1 extent is only responsible for the data with keys
10939 + *
10940 + *   (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
10941 + *
10942 + * so, key range
10943 + *
10944 + *   ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
10945 + *
10946 + * is orphaned: there is no way to get there from the tree root.
10947 + *
10948 + * In other words, extent pointers are different than normal child pointers as
10949 + * far as search tree is concerned, and this creates such problems.
10950 + *
10951 + * Possible solution for this problem is to insert our item into node pointed
10952 + * to by I2. There are some problems though:
10953 + *
10954 + * (1) I2 can be in a different node.
10955 + * (2) E1 can be immediately followed by another extent E2.
10956 + *
10957 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
10958 + * for locks/coords as necessary.
10959 + *
10960 + * (2) is more complex. Solution here is to insert new empty leaf node and
10961 + * insert internal item between E1 and E2 pointing to said leaf node. This is
10962 + * further complicated by possibility that E2 is in a different node, etc.
10963 + *
10964 + * Problems:
10965 + *
10966 + * (1) if there was internal item I2 immediately on the right of an extent E1
10967 + * and we decided to insert new item S1 into node N2 pointed to by I2, then
10968 + * key of S1 will be less than smallest key in the N2. Normally, search key
10969 + * checks that key we are looking for is in the range of keys covered by the
10970 + * node key is being looked in. To work around this situation, while
10971 + * preserving useful consistency check, new flag CBK_TRUST_DK was added to the
10972 + * cbk flags bitmask. This flag is automatically set on entrance to the
10973 + * coord_by_key() and is only cleared when we are about to enter situation
10974 + * described above.
10975 + *
10976 + * (2) If extent E1 is immediately followed by another extent E2 and we are
10977 + * searching for the key that is between E1 and E2 we only have to insert new
10978 + * empty leaf node when coord_by_key was called for insertion, rather than just
10979 + * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
10980 + * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
10981 + * performed by insert_by_key() and friends.
10982 + *
10983 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
10984 + * case it requires modification of node content which is only possible under
10985 + * write lock. It may well happen that we only have read lock on the node where
10986 + * new internal pointer is to be inserted (common case: lookup of non-existent
10987 + * stat-data that falls between two extents). If only read lock is held, tree
10988 + * traversal is restarted with lock_level modified so that next time we hit
10989 + * this problem, write lock will be held. Once we have write lock, balancing
10990 + * will be performed.
10991 + */
10992 +
10993 +/**
10994 + * is_next_item_internal - check whether next item is internal
10995 + * @coord: coordinate of extent item in twig node
10996 + * @key: search key
10997 + * @lh: twig node lock handle
10998 + *
10999 + * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
11000 + * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
11001 + * to that node, @coord is set to its first unit. If next item is not internal
11002 + * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
11003 + * is returned if search restart has to be done. Negative value is an error.
11004 + */
11005 +static int
11006 +is_next_item_internal(coord_t *coord, const reiser4_key * key,
11007 +                     lock_handle * lh)
11008 +{
11009 +       coord_t next;
11010 +       lock_handle rn;
11011 +       int result;
11012 +
11013 +       coord_dup(&next, coord);
11014 +       if (coord_next_unit(&next) == 0) {
11015 +               /* next unit is in this node */
11016 +               if (item_is_internal(&next)) {
11017 +                       coord_dup(coord, &next);
11018 +                       return 1;
11019 +               }
11020 +               assert("vs-3", item_is_extent(&next));
11021 +               return 0;
11022 +       }
11023 +
11024 +       /*
11025 +        * next unit either does not exist or is in right neighbor. If it is in
11026 +        * right neighbor we have to check right delimiting key because
11027 +        * concurrent thread could get there first and insert item with a key
11028 +        * smaller than @key
11029 +        */
11030 +       read_lock_dk(current_tree);
11031 +       result = keycmp(key, znode_get_rd_key(coord->node));
11032 +       read_unlock_dk(current_tree);
11033 +       assert("vs-6", result != EQUAL_TO);
11034 +       if (result == GREATER_THAN)
11035 +               return 2;
11036 +
11037 +       /* lock right neighbor in the same mode as coord->node is locked */
11038 +       init_lh(&rn);
11039 +       result = reiser4_get_right_neighbor(&rn, coord->node,
11040 +                                           znode_is_wlocked(coord->node) ?
11041 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
11042 +                                           GN_CAN_USE_UPPER_LEVELS);
11043 +       if (result == -E_NO_NEIGHBOR) {
11044 +               /* we are on the rightmost edge of the tree */
11045 +               done_lh(&rn);
11046 +               return 0;
11047 +       }
11048 +
11049 +       if (result) {
11050 +               assert("vs-4", result < 0);
11051 +               done_lh(&rn);
11052 +               return result;
11053 +       }
11054 +
11055 +       /*
11056 +        * check whether concurrent thread managed to insert item with a key
11057 +        * smaller than @key
11058 +        */
11059 +       read_lock_dk(current_tree);
11060 +       result = keycmp(key, znode_get_ld_key(rn.node));
11061 +       read_unlock_dk(current_tree);
11062 +       assert("vs-6", result != EQUAL_TO);
11063 +       if (result == GREATER_THAN) {
11064 +               done_lh(&rn);
11065 +               return 2;
11066 +       }
11067 +
11068 +       result = zload(rn.node);
11069 +       if (result) {
11070 +               assert("vs-5", result < 0);
11071 +               done_lh(&rn);
11072 +               return result;
11073 +       }
11074 +
11075 +       coord_init_first_unit(&next, rn.node);
11076 +       if (item_is_internal(&next)) {
11077 +               /*
11078 +                * next unit is in right neighbor and it is an unit of internal
11079 +                * item. Unlock coord->node. Move @lh to right neighbor. @coord
11080 +                * is set to the first unit of right neighbor.
11081 +                */
11082 +               coord_dup(coord, &next);
11083 +               zrelse(rn.node);
11084 +               done_lh(lh);
11085 +               move_lh(lh, &rn);
11086 +               return 1;
11087 +       }
11088 +
11089 +       /*
11090 +        * next unit is unit of extent item. Return without changing @lh and
11091 +        * @coord.
11092 +        */
11093 +       assert("vs-6", item_is_extent(&next));
11094 +       zrelse(rn.node);
11095 +       done_lh(&rn);
11096 +       return 0;
11097 +}
11098 +
11099 +/**
11100 + * rd_key - calculate key of an item next to the given one
11101 + * @coord: position in a node
11102 + * @key: storage for result key
11103 + *
11104 + * @coord is set between items or after the last item in a node. Calculate key
11105 + * of item to the right of @coord, store it in @key and return @key.
11106 + */
11107 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
11108 +{
11109 +       coord_t dup;
11110 +
11111 +       assert("nikita-2281", coord_is_between_items(coord));
11112 +       coord_dup(&dup, coord);
11113 +
11114 +       if (coord_set_to_right(&dup) == 0)
11115 +               /* next item is in this node. Return its key. */
11116 +               unit_key_by_coord(&dup, key);
11117 +       else {
11118 +               /*
11119 +                * next item either does not exist or is in right
11120 +                * neighbor. Return znode's right delimiting key, copied
11121 +                * while holding the delimiting-key read lock.
11122 +                */
11123 +               read_lock_dk(current_tree);
11124 +               *key = *znode_get_rd_key(coord->node);
11125 +               read_unlock_dk(current_tree);
11126 +       }
11127 +       return key;
11128 +}
11128 +
11129 +/**
11130 + * add_empty_leaf - insert empty leaf between two extents
11131 + * @insert_coord: position in twig node between two extents
11132 + * @lh: twig node lock handle
11133 + * @key: left delimiting key of new node
11134 + * @rdkey: right delimiting key of new node
11135 + *
11136 + * Inserts an empty leaf node between two extent items on the twig level, which
11137 + * is needed to insert a leaf-level item there. Returns 0 and moves @lh and
11138 + * @insert_coord to the new leaf on success, an error code otherwise.
11139 + */
11140 +static int
11141 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
11142 +              const reiser4_key *key, const reiser4_key *rdkey)
11143 +{
11144 +       int result;
11145 +       carry_pool *pool;
11146 +       carry_level *todo;
11147 +       reiser4_item_data *item;
11148 +       carry_insert_data *cdata;
11149 +       carry_op *op;
11150 +       znode *node;
11151 +       reiser4_tree *tree;
11152 +
11153 +       assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
11154 +       tree = znode_get_tree(insert_coord->node);
11155 +       node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
11156 +       if (IS_ERR(node))
11157 +               return PTR_ERR(node);
11158 +
11159 +       /* setup delimiting keys for node being inserted */
11160 +       write_lock_dk(tree);
11161 +       znode_set_ld_key(node, key);
11162 +       znode_set_rd_key(node, rdkey);
11163 +       ON_DEBUG(node->creator = current);
11164 +       ON_DEBUG(node->first_key = *key);
11165 +       write_unlock_dk(tree);
11166 +
11167 +       ZF_SET(node, JNODE_ORPHAN);
11168 +
11169 +       /* carry_pool, 3 carry_level-s, reiser4_item_data, carry_insert_data */
11170 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
11171 +                              sizeof(*item) + sizeof(*cdata));
11172 +       if (IS_ERR(pool)) {
11173 +               /* drop the reference on @node, as the exit path below does */
11174 +               zput(node);
11175 +               return PTR_ERR(pool);
11176 +       }
11177 +       todo = (carry_level *) (pool + 1);
11178 +       init_carry_level(todo, pool);
11179 +
11180 +       item = (reiser4_item_data *) (todo + 3);
11181 +       cdata = (carry_insert_data *) (item + 1);
11182 +
11183 +       op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
11184 +       if (!IS_ERR(op)) {
11185 +               cdata->coord = insert_coord;
11186 +               cdata->key = key;
11187 +               cdata->data = item;
11188 +               op->u.insert.d = cdata;
11189 +               op->u.insert.type = COPT_ITEM_DATA;
11190 +               build_child_ptr_data(node, item);
11191 +               item->arg = NULL;
11192 +               /* have @insert_coord to be set at inserted item after
11193 +                  insertion is done */
11194 +               todo->track_type = CARRY_TRACK_CHANGE;
11195 +               todo->tracked = lh;
11196 +
11197 +               result = reiser4_carry(todo, NULL);
11198 +               if (result == 0) {
11199 +                       /*
11200 +                        * pin node in memory. This is necessary for
11201 +                        * znode_make_dirty() below.
11202 +                        */
11203 +                       result = zload(node);
11204 +                       if (result == 0) {
11205 +                               lock_handle local_lh;
11206 +
11207 +                               /*
11208 +                                * if we inserted new child into tree we have
11209 +                                * to mark it dirty so that flush will be able
11210 +                                * to process it.
11211 +                                */
11212 +                               init_lh(&local_lh);
11213 +                               result = longterm_lock_znode(&local_lh, node,
11214 +                                                            ZNODE_WRITE_LOCK,
11215 +                                                            ZNODE_LOCK_LOPRI);
11216 +                               if (result == 0) {
11217 +                                       znode_make_dirty(node);
11218 +
11219 +                                       /*
11220 +                                        * when internal item pointing to @node
11221 +                                        * was inserted into twig node
11222 +                                        * create_hook_internal did not connect
11223 +                                        * it properly because its right
11224 +                                        * neighbor was not known. Do it
11225 +                                        * here
11226 +                                        */
11227 +                                       write_lock_tree(tree);
11228 +                                       assert("nikita-3312",
11229 +                                              znode_is_right_connected(node));
11230 +                                       assert("nikita-2984",
11231 +                                              node->right == NULL);
11232 +                                       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
11233 +                                       write_unlock_tree(tree);
11234 +                                       result =
11235 +                                           connect_znode(insert_coord, node);
11236 +                                       ON_DEBUG(if (result == 0) check_dkeys(node););
11237 +
11238 +                                       done_lh(lh);
11239 +                                       move_lh(lh, &local_lh);
11240 +                                       assert("vs-1676", node_is_empty(node));
11241 +                                       coord_init_first_unit(insert_coord,
11242 +                                                             node);
11243 +                               } else {
11244 +                                       warning("nikita-3136",
11245 +                                               "Cannot lock child");
11246 +                               }
11247 +                               done_lh(&local_lh);
11248 +                               zrelse(node);
11249 +                       }
11250 +               }
11251 +       } else
11252 +               result = PTR_ERR(op);
11253 +       zput(node);
11254 +       done_carry_pool(pool);
11255 +       return result;
11256 +}
11257 +
11258 +/**
11259 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
11260 + * @h: search handle
11261 + * @outcome: flag saying whether search has to restart or is done
11262 + *
11263 + * Handles search on twig level. If this function completes search itself then
11264 + * it returns 1. If search has to go one level down then 0 is returned. If
11265 + * error happens then LOOKUP_DONE is returned via @outcome and error code is
11266 + * saved in @h->result. LOOKUP_REST via @outcome asks for a search restart.
11267 + */
11268 +int handle_eottl(cbk_handle *h, int *outcome)
11269 +{
11270 +       int result;
11271 +       reiser4_key key;
11272 +       coord_t *coord;
11273 +
11274 +       coord = h->coord;
11275 +
11276 +       if (h->level != TWIG_LEVEL ||
11277 +           (coord_is_existing_item(coord) && item_is_internal(coord))) {
11278 +               /* Continue to traverse tree downward. */
11279 +               return 0;
11280 +       }
11281 +
11282 +       /*
11283 +        * make sure that @h->coord is set to twig node and that it is either
11284 +        * set to extent item or after extent item
11285 +        */
11286 +       assert("vs-356", h->level == TWIG_LEVEL);
11287 +       assert("vs-357", ({
11288 +                         coord_t lcoord;
11289 +                         coord_dup(&lcoord, coord);
11290 +                         check_me("vs-733", coord_set_to_left(&lcoord) == 0);
11291 +                         item_is_extent(&lcoord);
11292 +                         }
11293 +              ));
11294 +
11295 +       if (*outcome == NS_FOUND) {
11296 +               /* we have found desired key on twig level in extent item */
11297 +               h->result = CBK_COORD_FOUND;
11298 +               *outcome = LOOKUP_DONE;
11299 +               return 1;
11300 +       }
11301 +
11302 +       if (!(h->flags & CBK_FOR_INSERT)) {
11303 +               /* tree traversal is not for insertion. Just return
11304 +                  CBK_COORD_NOTFOUND. */
11305 +               h->result = CBK_COORD_NOTFOUND;
11306 +               *outcome = LOOKUP_DONE;
11307 +               return 1;
11308 +       }
11309 +
11310 +       /* take a look at the item to the right of h->coord */
11311 +       result = is_next_item_internal(coord, h->key, h->active_lh);
11312 +       if (unlikely(result < 0)) {
11313 +               h->error = "get_right_neighbor failed";
11314 +               h->result = result;
11315 +               *outcome = LOOKUP_DONE;
11316 +               return 1;
11317 +       }
11318 +       if (result == 0) {
11319 +               /*
11320 +                * item to the right is also an extent one. Allocate a new node
11321 +                * and insert pointer to it after item h->coord.
11322 +                *
11323 +                * This is a result of extents being located at the twig
11324 +                * level. For explanation, see comment just above
11325 +                * is_next_item_internal().
11326 +                */
11327 +               znode *loaded;
11328 +
11329 +               if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
11330 +                       /*
11331 +                        * we got node read locked, restart coord_by_key to
11332 +                        * have write lock on twig level
11333 +                        */
11334 +                       h->lock_level = TWIG_LEVEL;
11335 +                       h->lock_mode = ZNODE_WRITE_LOCK;
11336 +                       *outcome = LOOKUP_REST;
11337 +                       return 1;
11338 +               }
11339 +
11340 +               loaded = coord->node; /* NOTE(review): never read afterwards */
11341 +               result =
11342 +                   add_empty_leaf(coord, h->active_lh, h->key,
11343 +                                  rd_key(coord, &key));
11344 +               if (result) {
11345 +                       h->error = "could not add empty leaf";
11346 +                       h->result = result;
11347 +                       *outcome = LOOKUP_DONE;
11348 +                       return 1;
11349 +               }
11350 +               /* added empty leaf is locked (h->active_lh), its parent node
11351 +                  is unlocked, h->coord is set as EMPTY */
11352 +               assert("vs-13", coord->between == EMPTY_NODE);
11353 +               assert("vs-14", znode_is_write_locked(coord->node));
11354 +               assert("vs-15",
11355 +                      WITH_DATA(coord->node, node_is_empty(coord->node)));
11356 +               assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
11357 +               assert("vs-17", coord->node == h->active_lh->node);
11358 +               *outcome = LOOKUP_DONE;
11359 +               h->result = CBK_COORD_NOTFOUND;
11360 +               return 1;
11361 +       } else if (result == 1) {
11362 +               /*
11363 +                * this is special case mentioned in the comment on
11364 +                * tree.h:cbk_flags. We have found internal item immediately on
11365 +                * the right of extent, and we are going to insert new item
11366 +                * there. Key of item we are going to insert is smaller than
11367 +                * leftmost key in the node pointed to by said internal item
11368 +                * (otherwise search wouldn't come to the extent in the first
11369 +                * place).
11370 +                *
11371 +                * This is a result of extents being located at the twig
11372 +                * level. For explanation, see comment just above
11373 +                * is_next_item_internal().
11374 +                */
11375 +               h->flags &= ~CBK_TRUST_DK;
11376 +       } else {
11377 +               assert("vs-8", result == 2);
11378 +               *outcome = LOOKUP_REST;
11379 +               return 1;
11380 +       }
11381 +       assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
11382 +       return 0;
11383 +}
11384 +
11385 +/*
11386 + * Local variables:
11387 + * c-indentation-style: "K&R"
11388 + * mode-name: "LC"
11389 + * c-basic-offset: 8
11390 + * tab-width: 8
11391 + * fill-column: 120
11392 + * scroll-step: 1
11393 + * End:
11394 + */
11395 diff -puN /dev/null fs/reiser4/estimate.c
11396 --- /dev/null
11397 +++ a/fs/reiser4/estimate.c
11398 @@ -0,0 +1,129 @@
11399 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
11400 +   reiser4/README */
11401 +
11402 +#include "debug.h"
11403 +#include "dformat.h"
11404 +#include "tree.h"
11405 +#include "carry.h"
11406 +#include "inode.h"
11407 +#include "plugin/cluster.h"
11408 +#include "plugin/item/ctail.h"
11409 +
11410 +/* This returns how many nodes might get dirty and added nodes if @childen
11411 +   nodes are dirtied
11412 +
11413 +   Amount of internals which will get dirty or get allocated we estimate as 10%
11414 +   of the children + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and
11415 +   the current block on the leaf level, 2 neighbour nodes + the current (or 1
11416 +   neighbour and 1 new and the current) on twig level, 2 neighbour nodes on
11417 +   upper levels and 1 for a new root. So 5 for leaf level, 3 for twig level,
11418 +   2 on upper + 1 for root.
11419 +
11420 +   Do not calculate the current node of the lowest level here - this is overhead
11421 +   only.
11422 +
11423 +   @childen is almost always 1 here. Exception is flow insertion
11424 +*/
11425 +static reiser4_block_nr
11426 +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
11427 +{
11428 +       reiser4_block_nr ten_percent;
11429 +
11430 +       ten_percent = ((103 * childen) >> 10); /* 103/1024 ~= 10% */
11431 +
11432 +       /* If we have too many balancings at a time, tree height can grow by
11433 +          more than 1. Assume that if tree_height is 5, it can grow by 1 only.
11434 +       */
11435 +       return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
11436 +}
11437 +
11438 +/* this returns maximal possible number of nodes which can be modified plus
11439 +   number of new nodes which can be required to perform insertion of one item
11440 +   into the tree */
11441 +/* it is only called when tree height changes, or gets initialized */
11442 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
11443 +{
11444 +       return 1 + max_balance_overhead(1, height);
11445 +}
11446 +
11447 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
11448 +{
11449 +       return tree->estimate_one_insert;
11450 +}
11451 +
11452 +/* this returns maximal possible number of nodes which can be modified plus
11453 +   number of new nodes which can be required to perform insertion of one unit
11454 +   into an item in the tree */
11455 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
11456 +{
11457 +       /* estimate insert into item just like item insertion */
11458 +       return tree->estimate_one_insert;
11459 +}
11460 +
11461 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
11462 +{
11463 +       /* on item removal reiser4 does not try to pack nodes more compactly, so,
11464 +          only one node may be dirtied on leaf level */
11465 +       return tree->estimate_one_insert;
11466 +}
11467 +
11468 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and
11469 +   dirty 3 existing nodes (insert point and both its neighbors).
11470 +   Max_balance_overhead should estimate number of blocks which may change/get
11471 +   added on internal levels */
11472 +reiser4_block_nr estimate_insert_flow(tree_level height)
11473 +{
11474 +       return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
11475 +                                                                    CARRY_FLOW_NEW_NODES_LIMIT,
11476 +                                                                    height);
11477 +}
11478 +
11479 +/* returns max number of nodes that can be occupied by a disk cluster */
11480 +static reiser4_block_nr estimate_cluster(struct inode *inode, int unprepped)
11481 +{
11482 +       int per_cluster;
11483 +       per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
11484 +       return 3 + per_cluster +
11485 +               max_balance_overhead(3 + per_cluster,
11486 +                                    REISER4_MAX_ZTREE_HEIGHT);
11487 +}
11488 +
11489 +/* how many nodes might get dirty and added
11490 +   during insertion of a disk cluster */
11491 +reiser4_block_nr estimate_insert_cluster(struct inode *inode)
11492 +{
11493 +       return estimate_cluster(inode, 1); /* 24 */
11494 +}
11495 +
11496 +/* how many nodes might get dirty and added
11497 +   during update of a (prepped or unprepped) disk cluster */
11498 +reiser4_block_nr estimate_update_cluster(struct inode *inode)
11499 +{
11500 +       return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
11501 +}
11502 +
11503 +/* How many nodes occupied by a disk cluster might get dirty.
11504 +   Note that this estimation is not precise (i.e. disk cluster
11505 +   can occupy more nodes).
11506 +   Q: Why we don't use precise estimation?
11507 +   A: 1.Because precise estimation is fairly bad: 65536 nodes
11508 +       for 64K logical cluster, it means 256M of dead space on
11509 +       a partition
11510 +      2.It is a very rare case when disk cluster occupies more
11511 +       nodes than this estimation returns.
11512 +*/
11513 +reiser4_block_nr estimate_dirty_cluster(struct inode *inode)
11514 +{
11515 +       return cluster_nrpages(inode) + 4;
11516 +}
11517 +
11518 +/* Make Linus happy.
11519 +   Local variables:
11520 +   c-indentation-style: "K&R"
11521 +   mode-name: "LC"
11522 +   c-basic-offset: 8
11523 +   tab-width: 8
11524 +   fill-column: 120
11525 +   scroll-step: 1
11526 +   End:
11527 +*/
11528 diff -puN /dev/null fs/reiser4/export_ops.c
11529 --- /dev/null
11530 +++ a/fs/reiser4/export_ops.c
11531 @@ -0,0 +1,328 @@
11532 +/* Copyright 2005 by Hans Reiser, licensing governed by
11533 + * reiser4/README */
11534 +
11535 +#include "inode.h"
11536 +#include "plugin/plugin.h"
11537 +
11538 +/*
11539 + * Supported file-handle types
11540 + */
11541 +typedef enum {
11542 +       FH_WITH_PARENT = 0x10,  /* file handle with parent */
11543 +       FH_WITHOUT_PARENT = 0x11        /* file handle without parent */
11544 +} reiser4_fhtype;
11545 +
11546 +#define NFSERROR (255)
11547 +
11548 +/* initialize place-holder for object */
11549 +static void object_on_wire_init(reiser4_object_on_wire *o)
11550 +{
11551 +       o->plugin = NULL;
11552 +}
11553 +
11554 +/* finish with @o */
11555 +static void object_on_wire_done(reiser4_object_on_wire *o)
11556 +{
11557 +       if (o->plugin != NULL)
11558 +               o->plugin->wire.done(o);
11559 +}
11560 +
11561 +/*
11562 + * read serialized object identity from @addr and store information about
11563 + * object in @obj. This is dual to encode_inode().
11564 + */
11565 +static char *decode_inode(struct super_block *s, char *addr,
11566 +                         reiser4_object_on_wire * obj)
11567 +{
11568 +       file_plugin *fplug;
11569 +
11570 +       /* identifier of object plugin is stored in the first two bytes,
11571 +        * followed by... */
11572 +       fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
11573 +       if (fplug != NULL) {
11574 +               addr += sizeof(d16);
11575 +               obj->plugin = fplug;
11576 +               assert("nikita-3520", fplug->wire.read != NULL);
11577 +               /* plugin specific encoding of object identity. */
11578 +               addr = fplug->wire.read(addr, obj);
11579 +       } else
11580 +               addr = ERR_PTR(RETERR(-EINVAL));
11581 +       return addr;
11582 +}
11583 +
11584 +static struct dentry *reiser4_get_dentry(struct super_block *super,
11585 +                                        void *data);
11586 +/**
11587 + * reiser4_decode_fh: decode on-wire object - helper function
11588 + * for fh_to_dentry, fh_to_parent export operations;
11589 + * @super: super block;
11590 + * @addr: onwire object to be decoded;
11591 + *
11592 + * Returns dentry referring to the object being decoded.
11593 + */
11594 +static struct dentry *reiser4_decode_fh(struct super_block * super,
11595 +                                       char * addr)
11596 +{
11597 +       reiser4_object_on_wire object;
11598 +
11599 +       object_on_wire_init(&object);
11600 +
11601 +       addr = decode_inode(super, addr, &object);
11602 +       if (!IS_ERR(addr)) {
11603 +               struct dentry *d;
11604 +               d = reiser4_get_dentry(super, &object);
11605 +               if (d != NULL && !IS_ERR(d))
11606 +                       /* FIXME check for -ENOMEM */
11607 +                       reiser4_get_dentry_fsdata(d)->stateless = 1;
11608 +               addr = (char *)d;
11609 +       }
11610 +       object_on_wire_done(&object);
11611 +       return (void *)addr;
11612 +}
11613 +
11614 +static struct dentry *reiser4_fh_to_dentry(struct super_block *sb,
11615 +                                          struct fid *fid,
11616 +                                          int fh_len, int fh_type)
11617 +{
11618 +       reiser4_context *ctx;
11619 +       struct dentry *d;
11620 +
11621 +       assert("edward-1536",
11622 +              fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT);
11623 +
11624 +       ctx = reiser4_init_context(sb);
11625 +       if (IS_ERR(ctx))
11626 +               return (struct dentry *)ctx;
11627 +
11628 +       d = reiser4_decode_fh(sb, (char *)fid->raw);
11629 +
11630 +       reiser4_exit_context(ctx);
11631 +       return d;
11632 +}
11633 +
11634 +static struct dentry *reiser4_fh_to_parent(struct super_block *sb,
11635 +                                          struct fid *fid,
11636 +                                          int fh_len, int fh_type)
11637 +{
11638 +       char * addr;
11639 +       struct dentry * d;
11640 +       reiser4_context *ctx;
11641 +       file_plugin *fplug;
11642 +
11643 +       if (fh_type == FH_WITHOUT_PARENT)
11644 +               return NULL;
11645 +       assert("edward-1537", fh_type == FH_WITH_PARENT);
11646 +
11647 +       ctx = reiser4_init_context(sb);
11648 +       if (IS_ERR(ctx))
11649 +               return (struct dentry *)ctx;
11650 +       addr = (char *)fid->raw;
11651 +       /* extract 2-bytes file plugin id */
11652 +       fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr);
11653 +       if (fplug == NULL) {
11654 +               d = ERR_PTR(RETERR(-EINVAL));
11655 +               goto exit;
11656 +       }
11657 +       addr += sizeof(d16);
11658 +       /* skip previously encoded object */
11659 +       addr = fplug->wire.read(addr, NULL /* skip */);
11660 +       if (IS_ERR(addr)) {
11661 +               d = (struct dentry *)addr;
11662 +               goto exit;
11663 +       }
11664 +       /* extract and decode parent object */
11665 +       d = reiser4_decode_fh(sb, addr);
11666 + exit:
11667 +       reiser4_exit_context(ctx);
11668 +       return d;
11669 +}
11670 +
11671 +/*
11672 + * Object serialization support.
11673 + *
11674 + * To support knfsd file system provides export_operations that are used to
11675 + * construct and interpret NFS file handles. As a generalization of this,
11676 + * reiser4 object plugins have serialization support: it provides methods to
11677 + * create on-wire representation of identity of reiser4 object, and
11678 + * re-create/locate object given its on-wire identity.
11679 + *
11680 + */
11681 +
11682 +/*
11683 + * return number of bytes that on-wire representation of @inode's identity
11684 + * consumes.
11685 + */
11686 +static int encode_inode_size(struct inode *inode)
11687 +{
11688 +       assert("nikita-3514", inode != NULL);
11689 +       assert("nikita-3515", inode_file_plugin(inode) != NULL);
11690 +       assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
11691 +
11692 +       return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
11693 +}
11694 +
11695 +/*
11696 + * store on-wire representation of @inode's identity at the area beginning at
11697 + * @start.
11698 + */
11699 +static char *encode_inode(struct inode *inode, char *start)
11700 +{
11701 +       assert("nikita-3517", inode != NULL);
11702 +       assert("nikita-3518", inode_file_plugin(inode) != NULL);
11703 +       assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
11704 +
11705 +       /*
11706 +        * first, store two-byte identifier of object plugin, then
11707 +        */
11708 +       save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
11709 +                      (d16 *) start);
11710 +       start += sizeof(d16);
11711 +       /*
11712 +        * call plugin to serialize object's identity
11713 +        */
11714 +       return inode_file_plugin(inode)->wire.write(inode, start);
11715 +}
11716 +
11717 +/* this returns number of 32 bit long numbers encoded in @lenp. 255 is
11718 + * returned if file handle can not be stored */
11719 +/**
11720 + * reiser4_encode_fh - encode_fh of export operations
11721 + * @dentry:
11722 + * @fh:
11723 + * @lenp:
11724 + * @need_parent:
11725 + *
11726 + */
11727 +static int
11728 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
11729 +                 int need_parent)
11730 +{
11731 +       struct inode *inode;
11732 +       struct inode *parent;
11733 +       char *addr;
11734 +       int need;
11735 +       int delta;
11736 +       int result;
11737 +       reiser4_context *ctx;
11738 +
11739 +       /*
11740 +        * knfsd asks us to serialize object in @dentry, and, optionally its
11741 +        * parent (if need_parent != 0).
11742 +        *
11743 +        * encode_inode() and encode_inode_size() are used to build
11744 +        * representation of object and its parent. All hard work is done by
11745 +        * object plugins.
11746 +        */
11747 +       inode = dentry->d_inode;
11748 +       parent = dentry->d_parent->d_inode;
11749 +
11750 +       addr = (char *)fh;
11751 +
11752 +       need = encode_inode_size(inode);
11753 +       if (need < 0)
11754 +               return NFSERROR;
11755 +       if (need_parent) {
11756 +               delta = encode_inode_size(parent);
11757 +               if (delta < 0)
11758 +                       return NFSERROR;
11759 +               need += delta;
11760 +       }
11761 +
11762 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
11763 +       if (IS_ERR(ctx))
11764 +               return PTR_ERR(ctx);
11765 +
11766 +       if (need <= sizeof(__u32) * (*lenp)) {
11767 +               addr = encode_inode(inode, addr);
11768 +               if (need_parent)
11769 +                       addr = encode_inode(parent, addr);
11770 +
11771 +               /* store in lenp number of 32bit words required for file
11772 +                * handle. */
11773 +               *lenp = (need + sizeof(__u32) - 1) >> 2;
11774 +               result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
11775 +       } else
11776 +               /* not enough space in file handle */
11777 +               result = NFSERROR;
11778 +       reiser4_exit_context(ctx);
11779 +       return result;
11780 +}
11781 +
11782 +/**
11783 + * reiser4_get_dentry_parent - get_parent of export operations
11784 + * @child:
11785 + *
11786 + */
11787 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
11788 +{
11789 +       struct inode *dir;
11790 +       dir_plugin *dplug;
11791 +       struct dentry *result;
11792 +       reiser4_context *ctx;
11793 +
11794 +       assert("nikita-3527", child != NULL);
11795 +
11796 +       dir = child->d_inode;
11797 +       assert("nikita-3529", dir != NULL);
11798 +
11799 +       ctx = reiser4_init_context(dir->i_sb);
11800 +       if (IS_ERR(ctx))
11801 +               return (void *)ctx;
11802 +
11803 +       dplug = inode_dir_plugin(dir);
11804 +       assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
11805 +
11806 +       if (unlikely(dplug == NULL)) {
11807 +               reiser4_exit_context(ctx);
11808 +               return ERR_PTR(RETERR(-ENOTDIR));
11809 +       }
11810 +       result = dplug->get_parent(dir);
11811 +       reiser4_exit_context(ctx);
11812 +       return result;
11813 +}
11814 +
11815 +/**
11816 + * reiser4_get_dentry - get_dentry of export operations
11817 + * @super:
11818 + * @data:
11819 + *
11820 + *
11821 + */
11822 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
11823 +{
11824 +       reiser4_object_on_wire *o;
11825 +
11826 +       assert("nikita-3522", super != NULL);
11827 +       assert("nikita-3523", data != NULL);
11828 +       /*
11829 +        * this is only supposed to be called by
11830 +        *
11831 +        *     reiser4_decode_fh
11832 +        *
11833 +        * so, reiser4_context should be here already.
11834 +        */
11835 +       assert("nikita-3526", is_in_reiser4_context());
11836 +
11837 +       o = (reiser4_object_on_wire *)data;
11838 +       assert("nikita-3524", o->plugin != NULL);
11839 +       assert("nikita-3525", o->plugin->wire.get != NULL);
11840 +
11841 +       return o->plugin->wire.get(super, o);
11842 +}
11843 +
11844 +struct export_operations reiser4_export_operations = {
11845 +       .encode_fh = reiser4_encode_fh,
11846 +       .fh_to_dentry = reiser4_fh_to_dentry,
11847 +       .fh_to_parent = reiser4_fh_to_parent,
11848 +       .get_parent = reiser4_get_dentry_parent,
11849 +};
11850 +
11851 +/*
11852 + * Local variables:
11853 + * c-indentation-style: "K&R"
11854 + * mode-name: "LC"
11855 + * c-basic-offset: 8
11856 + * tab-width: 8
11857 + * fill-column: 79
11858 + * End:
11859 + */
11860 diff -puN /dev/null fs/reiser4/flush.c
11861 --- /dev/null
11862 +++ a/fs/reiser4/flush.c
11863 @@ -0,0 +1,3703 @@
11864 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
11865 +   reiser4/README */
11866 +
11867 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
11868 +
11869 +#include "forward.h"
11870 +#include "debug.h"
11871 +#include "dformat.h"
11872 +#include "key.h"
11873 +#include "coord.h"
11874 +#include "plugin/item/item.h"
11875 +#include "plugin/plugin.h"
11876 +#include "plugin/object.h"
11877 +#include "txnmgr.h"
11878 +#include "jnode.h"
11879 +#include "znode.h"
11880 +#include "block_alloc.h"
11881 +#include "tree_walk.h"
11882 +#include "carry.h"
11883 +#include "tree.h"
11884 +#include "vfs_ops.h"
11885 +#include "inode.h"
11886 +#include "page_cache.h"
11887 +#include "wander.h"
11888 +#include "super.h"
11889 +#include "entd.h"
11890 +#include "reiser4.h"
11891 +#include "flush.h"
11892 +#include "writeout.h"
11893 +
11894 +#include <asm/atomic.h>
11895 +#include <linux/fs.h>          /* for struct super_block  */
11896 +#include <linux/mm.h>          /* for struct page */
11897 +#include <linux/bio.h>         /* for struct bio */
11898 +#include <linux/pagemap.h>
11899 +#include <linux/blkdev.h>
11900 +
11901 +/* IMPLEMENTATION NOTES */
11902 +
11903 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of
11904 +   assigning a total order to the nodes of the tree in which the parent is
11905 +   placed before its children, which are ordered (recursively) in left-to-right
11906 +   order. When we speak of a "parent-first preceder", it describes the node that
11907 +   "came before in forward parent-first order". When we speak of a "parent-first
11908 +   follower", it describes the node that "comes next in parent-first order"
11909 +   (alternatively the node that "came before in reverse parent-first order").
11910 +
11911 +   The following pseudo-code prints the nodes of a tree in forward parent-first
11912 +   order:
11913 +
11914 +   void parent_first (node)
11915 +   {
11916 +     print_node (node);
11917 +     if (node->level > leaf) {
11918 +       for (i = 0; i < num_children; i += 1) {
11919 +        parent_first (node->child[i]);
11920 +       }
11921 +     }
11922 +   }
11923 +*/
11924 +
11925 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block
11926 +   allocation so that a left-to-right scan of the tree's data (i.e., the leaves
11927 +   in left-to-right order) can be accomplished with sequential reads, which
11928 +   results in reading nodes in their parent-first order. This is a
11929 +   read-optimization aspect of the flush algorithm, and there is also a
11930 +   write-optimization aspect, which is that we wish to make large sequential
11931 +   writes to the disk by allocating or reallocating blocks so that they can be
11932 +   written in sequence. Sometimes the read-optimization and write-optimization
11933 +   goals conflict with each other, as we discuss in more detail below.
11934 +*/
11935 +
11936 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers.
11937 +   Here are the relevant jnode->state bits and their relevance to flush:
11938 +
11939 +     JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be
11940 +     written it must be allocated first. In order to be considered allocated,
11941 +     the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These
11942 +     two bits are exclusive, and all dirtied jnodes eventually have one of these
11943 +     bits set during each transaction.
11944 +
11945 +     JNODE_CREATED: The node was freshly created in its transaction and has no
11946 +     previous block address, so it is unconditionally assigned to be relocated,
11947 +     although this is mainly for code-convenience. It is not being 'relocated'
11948 +     from anything, but in almost every regard it is treated as part of the
11949 +     relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is
11950 +     set, so the actual relocate can be distinguished from the
11951 +     created-and-allocated set easily: relocate-set members (belonging to the
11952 +     preserve-set) have (JNODE_RELOC) set and created-set members which have no
11953 +     previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
11954 +
11955 +     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm
11956 +     made the decision to maintain the pre-existing location for this node and
11957 +     it will be written to the wandered-log.
11958 +
11959 +     JNODE_RELOC: The flush algorithm made the decision to relocate this block
11960 +     (if it was not created, see note above). A block with JNODE_RELOC set is
11961 +     eligible for early-flushing and may be submitted during flush_empty_queues.
11962 +     When the JNODE_RELOC bit is set on a znode, the parent node's internal item
11963 +     is modified and the znode is rehashed.
11964 +
11965 +     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm
11966 +     scans the node and calls plugin->f.squeeze() method for its items. By this
11967 +     technology we update disk clusters of cryptcompress objects. Also if
11968 +     leftmost point that was found by flush scan has this flag (races with
11969 +     write(), rare case) the flush algorithm makes the decision to pass it to
11970 +     squalloc() in spite of its flushprepped status for squeezing, not for
11971 +     repeated allocation.
11972 +
11973 +     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode
11974 +     into its flush queue. This means the jnode is not on any clean or dirty
11975 +     list, instead it is moved to one of the flush queue (see flush_queue.h)
11976 +     object private list. This prevents multiple concurrent flushes from
11977 +     attempting to start flushing from the same node.
11978 +
11979 +     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
11980 +     squeeze-and-allocate on a node while its children are actively being
11981 +     squeezed and allocated. This flag was created to avoid submitting a write
11982 +     request for a node while its children are still being allocated and
11983 +     squeezed. Then flush queue was re-implemented to allow unlimited number of
11984 +     nodes be queued. This flag support was commented out in source code because
11985 +     we decided that there was no reason to submit queued nodes before
11986 +     jnode_flush() finishes.  However, current code calls fq_write() during a
11987 +     slum traversal and may submit "busy nodes" to disk. Probably we can
11988 +     re-enable the JNODE_FLUSH_BUSY bit support in future.
11989 +
11990 +   With these state bits, we describe a test used frequently in the code below,
11991 +   jnode_is_flushprepped()(and the spin-lock-taking jnode_check_flushprepped()).
11992 +   The test for "flushprepped" returns true if any of the following are true:
11993 +
11994 +     - The node is not dirty
11995 +     - The node has JNODE_RELOC set
11996 +     - The node has JNODE_OVRWR set
11997 +
11998 +   If either the node is not dirty or it has already been processed by flush
11999 +   (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If
12000 +   jnode_is_flushprepped() returns false then flush has work to do on that node.
12001 +*/
12002 +
12003 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
12004 +   flushprepped twice (unless an explicit call to flush_unprep is made as
12005 +   described in detail below). For example a node is dirtied, allocated, and
12006 +   then early-flushed to disk and set clean. Before the transaction commits, the
12007 +   page is dirtied again and, due to memory pressure, the node is flushed again.
12008 +   The flush algorithm will not relocate the node to a new disk location, it
12009 +   will simply write it to the same, previously relocated position again.
12010 +*/
12011 +
12012 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm
12013 +   where we start at a leaf node and allocate in parent-first order by iterating
12014 +   to the right. At each step of the iteration, we check for the right neighbor.
12015 +   Before advancing to the right neighbor, we check if the current position and
12016 +   the right neighbor share the same parent. If they do not share the same
12017 +   parent, the parent is allocated before the right neighbor.
12018 +
12019 +   This process goes recursively up the tree and squeezes nodes level by level as
12020 +   long as the right neighbor and the current position have different parents,
12021 +   then it allocates the right-neighbors-with-different-parents on the way back
12022 +   down. This process is described in more detail in
12023 +   flush_squalloc_changed_ancestor and the recursive function
12024 +   squalloc_one_changed_ancestor. But the purpose here is not to discuss the
12025 +   specifics of the bottom-up approach as it is to contrast the bottom-up and
12026 +   top-down approaches.
12027 +
12028 +   The top-down algorithm was implemented earlier (April-May 2002). In the
12029 +   top-down approach, we find a starting point by scanning left along each level
12030 +   past dirty nodes, then going up and repeating the process until the left node
12031 +   and the parent node are clean. We then perform a parent-first traversal from
12032 +   the starting point, which makes allocating in parent-first order trivial.
12033 +   After one subtree has been allocated in this manner, we move to the right,
12034 +   try moving upward, then repeat the parent-first traversal.
12035 +
12036 +   Both approaches have problems that need to be addressed. Both are
12037 +   approximately the same amount of code, but the bottom-up approach has
12038 +   advantages in the order it acquires locks which, at the very least, make it
12039 +   the better approach. At first glance each one makes the other one look
12040 +   simpler, so it is important to remember a few of the problems with each one.
12041 +
12042 +   Main problem with the top-down approach: When you encounter a clean child
12043 +   during the parent-first traversal, what do you do? You would like to avoid
12044 +   searching through a large tree of nodes just to find a few dirty leaves at
12045 +   the bottom, and there is not an obvious solution. One of the advantages of
12046 +   the top-down approach is that during the parent-first traversal you check
12047 +   every child of a parent to see if it is dirty. In this way, the top-down
12048 +   approach easily handles the main problem of the bottom-up approach:
12049 +   unallocated children.
12050 +
12051 +   The unallocated children problem is that before writing a node to disk we
12052 +   must make sure that all of its children are allocated. Otherwise, writing
12053 +   the node means extra I/O because the node will have to be written again when
12054 +   the child is finally allocated.
12055 +
12056 +   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
12057 +   this should not cause any file system corruption, it only degrades I/O
12058 +   performance because a node may be written when it is sure to be written at
12059 +   least one more time in the same transaction when the remaining children are
12060 +   allocated. What follows is a description of how we will solve the problem.
12061 +*/
12062 +
12063 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
12064 +   then proceeding in parent first order, allocate some of its left-children,
12065 +   then encounter a clean child in the middle of the parent. We do not allocate
12066 +   the clean child, but there may remain unallocated (dirty) children to the
12067 +   right of the clean child. If we were to stop flushing at this moment and
12068 +   write everything to disk, the parent might still contain unallocated
12069 +   children.
12070 +
12071 +   We could try to allocate all the descendents of every node that we allocate,
12072 +   but this is not necessary. Doing so could result in allocating the entire
12073 +   tree: if the root node is allocated then every unallocated node would have to
12074 +   be allocated before flushing. Actually, we do not have to write a node just
12075 +   because we allocate it. It is possible to allocate but not write a node
12076 +   during flush, when it still has unallocated children. However, this approach
12077 +   is probably not optimal for the following reason.
12078 +
12079 +   The flush algorithm is designed to allocate nodes in parent-first order in an
12080 +   attempt to optimize reads that occur in the same order. Thus we are
12081 +   read-optimizing for a left-to-right scan through all the leaves in the
12082 +   system, and we are hoping to write-optimize at the same time because those
12083 +   nodes will be written together in batch. What happens, however, if we assign
12084 +   a block number to a node in its read-optimized order but then avoid writing
12085 +   it because it has unallocated children? In that situation, we lose out on the
12086 +   write-optimization aspect because a node will have to be written again to
12087 +   its location on the device, later, which likely means seeking back to that
12088 +   location.
12089 +
12090 +   So there are tradeoffs. We can choose either:
12091 +
12092 +   A. Allocate all unallocated children to preserve both write-optimization and
12093 +   read-optimization, but this is not always desirable because it may mean
12094 +   having to allocate and flush very many nodes at once.
12095 +
12096 +   B. Defer writing nodes with unallocated children, keep their read-optimized
12097 +   locations, but sacrifice write-optimization because those nodes will be
12098 +   written again.
12099 +
12100 +   C. Defer writing nodes with unallocated children, but do not keep their
12101 +   read-optimized locations. Instead, choose to write-optimize them later, when
12102 +   they are written. To facilitate this, we "undo" the read-optimized allocation
12103 +   that was given to the node so that later it can be write-optimized, thus
12104 +   "unpreparing" the flush decision. This is a case where we disturb the
12105 +   FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a call to
12106 +   flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
12107 +   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
12108 +   its block location, and set the JNODE_CREATED bit, effectively setting the
12109 +   node back to an unallocated state.
12110 +
12111 +   We will take the following approach in v4.0: for twig nodes we will always
12112 +   finish allocating unallocated children (A).  For nodes with (level > TWIG)
12113 +   we will defer writing and choose write-optimization (C).
12114 +
12115 +   To summarize, there are several parts to a solution that avoids the problem
12116 +   with unallocated children:
12117 +
12118 +   FIXME-ZAM: No approach has been implemented yet to eliminate the
12119 +   "UNALLOCATED CHILDREN" problem, because an experiment that was done
12120 +   showed that we have 1-2 nodes with unallocated children for thousands of
12121 +   written nodes. The experiment was as simple as copying/deleting linux kernel
12122 +   sources. However the problem can arise in more complex tests. I think we have
12123 +   jnode_io_hook to insert a check for unallocated children and see what kind of
12124 +   problem we have.
12125 +
12126 +   1. When flush reaches a stopping point (e.g. a clean node) it should continue
12127 +   calling squeeze-and-allocate on any remaining unallocated children.
12128 +   FIXME: Difficulty to implement: should be simple -- amounts to adding a while
12129 +   loop to jnode_flush, see comments in that function.
12130 +
12131 +   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
12132 +   may still have unallocated children. If the twig level has unallocated
12133 +   children it is an assertion failure. If a higher-level node has unallocated
12134 +   children, then it should be explicitly de-allocated by a call to
12135 +   flush_unprep().
12136 +   FIXME: Difficulty to implement: should be simple.
12137 +
12138 +   3. (CPU-Optimization) Checking whether a node has unallocated children may
12139 +   consume more CPU cycles than we would like, and it is possible (but medium
12140 +   complexity) to optimize this somewhat in the case where large sub-trees are
12141 +   flushed. The following observation helps: if both the left- and
12142 +   right-neighbor of a node are processed by the flush algorithm then the node
12143 +   itself is guaranteed to have all of its children allocated. However, the cost
12144 +   of this check may not be so expensive after all: it is not needed for leaves
12145 +   and flush can guarantee this property for twigs. That leaves only (level >
12146 +   TWIG) nodes that have to be checked, so this optimization only helps if at
12147 +   least three (level > TWIG) nodes are flushed in one pass, and the savings
12148 +   will be very small unless there are many more (level > TWIG) nodes. But if
12149 +   there are many (level > TWIG) nodes then the number of blocks being written
12150 +   will be very large, so the savings may be insignificant. That said, the idea
12151 +   is to maintain both the left and right edges of nodes that are processed in
12152 +   flush.  When flush_empty_queue() is called, a relatively simple test will
12153 +   tell whether the (level > TWIG) node is on the edge. If it is on the edge,
12154 +   the slow check is necessary, but if it is in the interior then it can be
12155 +   assumed to have all of its children allocated. FIXME: medium complexity to
12156 +   implement, but simple to verify given that we must have a slow check anyway.
12157 +
12158 +   4. (Optional) This part is optional, not for v4.0--flush should work
12159 +   independently of whether this option is used or not. Called RAPID_SCAN, the
12160 +   idea is to amend the left-scan operation to take unallocated children into
12161 +   account. Normally, the left-scan operation goes left as long as adjacent
12162 +   nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
12163 +   which point it stops and begins flushing. But scan-left may stop at a
12164 +   position where there are unallocated children to the left with the same
12165 +   parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
12166 +   after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
12167 +   FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
12168 +   all the interior children of a node--if the leftmost child of a twig is
12169 +   dirty, check its left neighbor (the rightmost child of the twig to the left).
12170 +   If the left neighbor of the leftmost child is also dirty, then continue the
12171 +   scan at the left twig and repeat.  This option will cause flush to allocate
12172 +   more twigs in a single pass, but it also has the potential to write many more
12173 +   nodes than would otherwise be written without the RAPID_SCAN option.
12174 +   RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
12175 +*/
12176 +
12177 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
12178 +   the starting point for flush is a leaf node, but actually the flush code
12179 +   cares very little about whether or not this is true.  It is possible that all
12180 +   the leaf nodes are flushed and dirty parent nodes still remain, in which case
12181 +   jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
12182 +   the argument node as if it were a leaf, even when it is not. This is a simple
12183 +   approach, and there may be a more optimal policy but until a problem with
12184 +   this approach is discovered, simplest is probably best.
12185 +
12186 +   NOTE: In this case, the ordering produced by flush is parent-first only if
12187 +   you ignore the leaves. This is done as a matter of simplicity and there is
12188 +   only one (shaky) justification. When an atom commits, it flushes all leaf
12189 +   level nodes first, followed by twigs, and so on. With flushing done in this
12190 +   order, if flush is eventually called on a non-leaf node it means that
12191 +   (somehow) we reached a point where all leaves are clean and only internal
12192 +   nodes need to be flushed. If that is the case, then it means there were no
12193 +   leaves that were the parent-first preceder/follower of the parent. This is
12194 +   expected to be a rare case, which is why we do nothing special about it.
12195 +   However, memory pressure may pass an internal node to flush when there are
12196 +   still dirty leaf nodes that need to be flushed, which could prove our
12197 +   original assumptions "inoperative". If this needs to be fixed, then
12198 +   scan_left/right should have special checks for the non-leaf levels. For
12199 +   example, instead of passing from a node to the left neighbor, it should pass
12200 +   from the node to the left neighbor's rightmost descendent (if dirty).
12201 +
12202 +*/
12203 +
12204 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
12205 +   chunks, dirtying everything and putting it into a transaction. We tell the
12206 +   allocator to allocate the blocks as far as possible towards one end of the
12207 +   logical device--the left (starting) end of the device if we are walking from
12208 +   left to right, the right end of the device if we are walking from right to
12209 +   left.  We then make passes in alternating directions, and as we do this the
12210 +   device becomes sorted such that tree order and block number order fully
12211 +   correlate.
12212 +
12213 +   Resizing is done by shifting everything either all the way to the left or all
12214 +   the way to the right, and then reporting the last block.
12215 +*/
12216 +
12217 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
12218 +   This describes the policy from the highest level:
12219 +
12220 +   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
12221 +   nodes on the leaf level during flush-scan (right, left), then we
12222 +   unconditionally decide to relocate leaf nodes.
12223 +
12224 +   Otherwise, there are two contexts in which we make a decision to relocate:
12225 +
12226 +   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
12227 +   During the initial stages of flush, after scan-right completes, we want to
12228 +   ask the question: should we relocate this leaf node and thus dirty the parent
12229 +   node. Then if the node is a leftmost child its parent is its own parent-first
12230 +   preceder, thus we repeat the question at the next level up, and so on. In
12231 +   these cases we are moving in the reverse-parent first direction.
12232 +
12233 +   There is another case which is considered the reverse direction, which comes
12234 +   at the end of a twig in reverse_relocate_end_of_twig(). As we finish
12235 +   processing a twig we may reach a point where there is a clean twig to the
12236 +   right with a dirty leftmost child. In this case, we may wish to relocate the
12237 +   child by testing if it should be relocated relative to its parent.
12238 +
12239 +   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
12240 +   in allocate_znode. What distinguishes the forward parent-first case from the
12241 +   reverse-parent first case is that the preceder has already been allocated in
12242 +   the forward case, whereas in the reverse case we don't know what the preceder
12243 +   is until we finish "going in reverse". That simplifies the forward case
12244 +   considerably, and there we actually use the block allocator to determine
12245 +   whether, e.g., a block closer to the preceder is available.
12246 +*/
12247 +
12248 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
12249 +   once we finish scan-left and find a starting point, if the parent's left
12250 +   neighbor is dirty then squeeze the parent's left neighbor and the parent.
12251 +   This may change the flush-starting-node's parent. Repeat until the child's
12252 +   parent is stable. If the child is a leftmost child, repeat this left-edge
12253 +   squeezing operation at the next level up. Note that we cannot allocate
12254 +   extents during this or they will be out of parent-first order. There are
12255 +   also some difficult coordinate maintenance issues.  We can't do a tree
12256 +   search to find coordinates again (because we hold locks), we have to
12257 +   determine them from the two nodes being squeezed. Looks difficult, but has
12258 +   potential to increase space utilization. */
12259 +
12260 +/* Flush-scan helper functions. */
12261 +static void scan_init(flush_scan * scan);
12262 +static void scan_done(flush_scan * scan);
12263 +
12264 +/* Flush-scan algorithm. */
12265 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
12266 +                    unsigned limit);
12267 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
12268 +static int scan_common(flush_scan * scan, flush_scan * other);
12269 +static int scan_formatted(flush_scan * scan);
12270 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
12271 +static int scan_by_coord(flush_scan * scan);
12272 +
12273 +/* Initial flush-point ancestor allocation. */
12274 +static int alloc_pos_and_ancestors(flush_pos_t *pos);
12275 +static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
12276 +static int set_preceder(const coord_t *coord_in, flush_pos_t *pos);
12277 +
12278 +/* Main flush algorithm.
12279 +   Note on abbreviation: "squeeze and allocate" == "squalloc". */
12280 +static int squalloc(flush_pos_t *pos);
12281 +
12282 +/* Flush squeeze implementation. */
12283 +static int squeeze_right_non_twig(znode * left, znode * right);
12284 +static int shift_one_internal_unit(znode * left, znode * right);
12285 +
12286 +/* Flush reverse parent-first relocation routines. */
12287 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
12288 +                                           const reiser4_block_nr * nblk);
12289 +static int reverse_relocate_test(jnode * node, const coord_t *parent_coord,
12290 +                                flush_pos_t *pos);
12291 +static int reverse_relocate_check_dirty_parent(jnode * node,
12292 +                                              const coord_t *parent_coord,
12293 +                                              flush_pos_t *pos);
12294 +
12295 +/* Flush allocate write-queueing functions: */
12296 +static int allocate_znode(znode * node, const coord_t *parent_coord,
12297 +                         flush_pos_t *pos);
12298 +static int allocate_znode_update(znode * node, const coord_t *parent_coord,
12299 +                                flush_pos_t *pos);
12300 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
12301 +
12302 +/* Flush helper functions: */
12303 +static int jnode_lock_parent_coord(jnode * node,
12304 +                                  coord_t *coord,
12305 +                                  lock_handle * parent_lh,
12306 +                                  load_count * parent_zh,
12307 +                                  znode_lock_mode mode, int try);
12308 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
12309 +                          znode_lock_mode mode, int check_dirty, int expected);
12310 +static int znode_same_parents(znode * a, znode * b);
12311 +/* Return jnode_check_flushprepped() for the jnode of znode @node. */
12312 +static int znode_check_flushprepped(znode * node)
12313 +{
12314 +       return jnode_check_flushprepped(ZJNODE(node));
12315 +}
12316 +
12317 +/* Flush position functions */
12318 +static void pos_init(flush_pos_t *pos);
12319 +static int pos_valid(flush_pos_t *pos);
12320 +static void pos_done(flush_pos_t *pos);
12321 +static int pos_stop(flush_pos_t *pos);
12322 +
12323 +/* Assert, for left scans over an unallocated extent, that scan->node is the
12324 + * first jnode of its extent unit; all such jnodes are dirty, in one atom. */
12325 +#define checkchild(scan)                                               \
12326 +assert("nikita-3435",                                                  \
12327 +       ergo(scan->direction == LEFT_SIDE &&                            \
12328 +           (scan->parent_coord.node->level == TWIG_LEVEL) &&           \
12329 +           jnode_is_unformatted(scan->node) &&                         \
12330 +           extent_is_unallocated(&scan->parent_coord),                 \
12331 +           extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
12332 +
12333 +/* This flush_cnt variable is used to track the number of concurrent flush
12334 +   operations, useful for debugging. It is initialized in txnmgr.c out of
12335 +   laziness (because flush has no static initializer function...) */
12336 +ON_DEBUG(atomic_t flush_cnt;
12337 +    )
12338 +
12339 +/* tell whether the backing device of the current fs is write-congested */
12340 +static int check_write_congestion(void)
12341 +{
12342 +       struct super_block *super = reiser4_get_current_sb();
12343 +       struct backing_dev_info *device;
12344 +
12345 +       /* the bdi hangs off the mapping of reiser4_get_super_fake(super) */
12346 +       device = reiser4_get_super_fake(super)->i_mapping->backing_dev_info;
12347 +       return bdi_write_congested(device);
12348 +}
12349 +
12350 +/* write the flush queue if that was requested and the device is not busy */
12351 +static int write_prepped_nodes(flush_pos_t *pos)
12352 +{
12353 +       assert("zam-831", pos);
12354 +       assert("zam-832", pos->fq);
12355 +
12356 +       /* writing of blocks was not requested for this flush */
12357 +       if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
12358 +               return 0;
12359 +
12360 +       /* submit nothing while the backing device is write-congested */
12361 +       if (check_write_congestion())
12362 +               return 0;
12363 +
12364 +       return reiser4_write_fq(pos->fq, pos->nr_written,
12365 +                               WRITEOUT_SINGLE_STREAM |
12366 +                               WRITEOUT_FOR_PAGE_RECLAIM);
12367 +}
12368 +
12369 +/* Properly release all resources held by the flush position, then move the
12370 +   flush position to the new locked node */
12371 +static void move_flush_pos(flush_pos_t *pos, lock_handle * new_lock,
12372 +                          load_count * new_load, const coord_t *new_coord)
12373 +{
12374 +       assert("zam-857", new_lock->node == new_load->node);
12375 +
12376 +       if (new_coord == NULL) {
12377 +               coord_init_first_unit(&pos->coord, new_lock->node);
12378 +       } else {
12379 +               assert("zam-858", new_coord->node == new_lock->node);
12380 +               coord_dup(&pos->coord, new_coord);
12381 +       }
12382 +
12383 +       if (pos->child != NULL) {
12384 +               jput(pos->child);
12385 +               pos->child = NULL;
12386 +       }
12387 +
12388 +       move_load_count(&pos->load, new_load);
12389 +       done_lh(&pos->lock);
12390 +       move_lh(&pos->lock, new_lock);
12391 +}
12392 +
12393 +/* Delete an empty node which is still linked from its parent. */
12394 +static int delete_empty_node(znode * node)
12395 +{
12396 +       reiser4_key smallest_removed;
12397 +
12398 +       assert("zam-1019", node != NULL);
12399 +       assert("zam-1020", node_is_empty(node));
12400 +       assert("zam-1023", znode_is_wlocked(node));
12401 +
12402 +       return reiser4_delete_node(node, &smallest_removed, NULL, 1);
12403 +}
12404 +
12405 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
12406 +static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
12407 +{
12408 +       int ret;
12409 +       load_count load;
12410 +       lock_handle lock;
12411 +
12412 +       init_lh(&lock);
12413 +       init_load_count(&load);
12414 +
12415 +       if (jnode_is_znode(org)) {
12416 +               ret = longterm_lock_znode(&lock, JZNODE(org),
12417 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
12418 +               if (ret)
12419 +                       return ret;
12420 +
12421 +               ret = incr_load_count_znode(&load, JZNODE(org));
12422 +               if (ret)
12423 +                       goto done;      /* must drop the lock taken above */
12424 +
12425 +               pos->state =
12426 +                   (jnode_get_level(org) ==
12427 +                    LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
12428 +               move_flush_pos(pos, &lock, &load, NULL);
12429 +       } else {
12430 +               coord_t parent_coord;
12431 +               ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
12432 +                                             &load, ZNODE_WRITE_LOCK, 0);
12433 +               if (ret)
12434 +                       goto done;
12435 +               if (!item_is_extent(&parent_coord)) {
12436 +                       /* file was converted to tail, org became HB, we found
12437 +                          internal item */
12438 +                       ret = -EAGAIN;
12439 +                       goto done;
12440 +               }
12441 +
12442 +               pos->state = POS_ON_EPOINT;
12443 +               move_flush_pos(pos, &lock, &load, &parent_coord);
12444 +               pos->child = jref(org);
12445 +               if (extent_is_unallocated(&parent_coord)
12446 +                   && extent_unit_index(&parent_coord) != index_jnode(org)) {
12447 +                       /* @org is not first child of its parent unit. This may
12448 +                          happen because long term lock of its parent node was
12449 +                          released between scan_left and scan_right. For now
12450 +                          work around this by having flush repeat */
12451 +                       ret = -EAGAIN;
12452 +               }
12453 +       }
12454 +
12455 +done:
12456 +       done_load_count(&load);
12457 +       done_lh(&lock);
12458 +       return ret;
12459 +}
12460 +
12461 +/* TODO LIST (no particular order): */
12462 +/* I have labelled most of the legitimate FIXME comments in this file with
12463 +   letters to indicate which issue they relate to. There are a few miscellaneous
12464 +   FIXMEs with specific names mentioned instead that need to be
12465 +   inspected/resolved. */
12466 +/* B. There is an issue described in reverse_relocate_test having to do with an
12467 +   imprecise is_preceder? check having to do with partially-dirty extents. The
12468 +   code that sets preceder hints and computes the preceder is basically
12469 +   untested. Careful testing needs to be done that preceder calculations are
12470 +   done correctly, since if it doesn't affect correctness we will not catch this
12471 +   stuff during regular testing. */
12472 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
12473 +   these are considered expected but unlikely conditions. Flush currently
12474 +   returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
12475 +   any of these in jnode_flush(). Many of the calls that may produce one of
12476 +   these return values (i.e., longterm_lock_znode, reiser4_get_parent,
12477 +   reiser4_get_neighbor, ...) check some of these values themselves and, for
12478 +   instance, stop flushing instead of resulting in a restart. If any of these
12479 +   results are true error conditions then flush will go into a busy-loop, as we
12480 +   noticed during testing when a corrupt tree caused find_child_ptr to return
12481 +   ENOENT. It needs careful thought and testing of corner conditions.
12482 +*/
12483 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
12484 +   created block is assigned a block number then early-flushed to disk. It is
12485 +   dirtied again and flush is called again. Concurrently, that block is deleted,
12486 +   and the de-allocation of its block number does not need to be deferred, since
12487 +   it is not part of the preserve set (i.e., it didn't exist before the
12488 +   transaction). I think there may be a race condition where flush writes the
12489 +   dirty, created block after the non-deferred deallocated block number is
12490 +   re-allocated, making it possible to write deleted data on top of non-deleted
12491 +   data. It's just a theory, but it needs to be thought out. */
12492 +/* F. bio_alloc() failure is not handled gracefully. */
12493 +/* G. Unallocated children. */
12494 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered
12495 +   blocks. */
12496 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
12497 +
12498 +/* JNODE_FLUSH: MAIN ENTRY POINT */
12499 +/* This is the main entry point for flushing a jnode and its dirty neighborhood
12500 +   (dirty neighborhood is named "slum"). Jnode_flush() is called if reiser4 has
12501 +   to write dirty blocks to disk, it happens when Linux VM decides to reduce
12502 +   number of dirty pages or as a part of transaction commit.
12503 +
12504 +   Our objective here is to prep and flush the slum the jnode belongs to. We
12505 +   want to squish the slum together, and allocate the nodes in it as we squish
12506 +   because allocation of children affects squishing of parents.
12507 +
12508 +   The "argument" @node tells flush where to start. From there, flush finds the
12509 +   left edge of the slum, and calls squalloc (in which nodes are squeezed and
12510 +   allocated). To find a "better place" to start squalloc first we perform a
12511 +   flush_scan.
12512 +
12513 +   Flush-scanning may be performed in both left and right directions, but for
12514 +   different purposes. When scanning to the left, we are searching for a node
12515 +   that precedes a sequence of parent-first-ordered nodes which we will then
12516 +   flush in parent-first order. During flush-scanning, we also take the
12517 +   opportunity to count the number of consecutive leaf nodes. If this number is
12518 +   past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
12519 +   reallocate leaf nodes (thus favoring write-optimization).
12520 +
12521 +   Since the flush argument node can be anywhere in a sequence of dirty leaves,
12522 +   there may also be dirty nodes to the right of the argument. If the scan-left
12523 +   operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
12524 +   follow it with a right-scan operation to see whether there are, in fact,
12525 +   enough nodes to meet the relocate threshold. Each right- and left-scan
12526 +   operation uses a single flush_scan object.
12527 +
12528 +   After left-scan and possibly right-scan, we prepare a flush_position object
12529 +   with the starting flush point or parent coordinate, which was determined
12530 +   using scan-left.
12531 +
12532 +   Next we call the main flush routine, squalloc, which iterates along the leaf
12533 +   level, squeezing and allocating nodes (and placing them into the flush
12534 +   queue).
12535 +
12536 +   After squalloc returns we take extra steps to ensure that all the children
12537 +   of the final twig node are allocated--this involves repeating squalloc
12538 +   until we finish at a twig with no unallocated children.
12539 +
12540 +   Finally, we call flush_empty_queue to submit write-requests to disk. If we
12541 +   encounter any above-twig nodes during flush_empty_queue that still have
12542 +   unallocated children, we flush_unprep them.
12543 +
12544 +   Flush treats several "failure" cases as non-failures, essentially causing
12545 +   them to start over. E_DEADLOCK is one example.
12546 +   FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled
12547 +   properly rather than restarting, but there are a bunch of cases to audit.
12548 +*/
12549 +
12550 +static int
12551 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
12552 +           flush_queue_t *fq, int flags)
12553 +{
12554 +       long ret = 0;
12555 +       flush_scan *right_scan;
12556 +       flush_scan *left_scan;
12557 +       flush_pos_t *flush_pos;
12558 +       int todo;
12559 +       struct super_block *sb;
12560 +       reiser4_super_info_data *sbinfo;
12561 +       jnode *leftmost_in_slum = NULL;
12562 +
12563 +       assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
12564 +       assert("nikita-3022", reiser4_schedulable());
12565 +
12566 +       assert("nikita-3185",
12567 +              get_current_super_private()->delete_mutex_owner != current);
12568 +
12569 +       /* allocate right_scan, left_scan and flush_pos */
12570 +       right_scan =
12571 +           kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
12572 +                   reiser4_ctx_gfp_mask_get());
12573 +       if (right_scan == NULL)
12574 +               return RETERR(-ENOMEM);
12575 +       left_scan = right_scan + 1;
12576 +       flush_pos = (flush_pos_t *) (left_scan + 1);
12577 +
12578 +       sb = reiser4_get_current_sb();
12579 +       sbinfo = get_super_private(sb);
12580 +
12581 +       /* Flush-concurrency debug code */
12582 +#if REISER4_DEBUG
12583 +       atomic_inc(&flush_cnt);
12584 +#endif
12585 +
12586 +       reiser4_enter_flush(sb);
12587 +
12588 +       /* Initialize a flush position. */
12589 +       pos_init(flush_pos);
12590 +
12591 +       flush_pos->nr_written = nr_written;
12592 +       flush_pos->fq = fq;
12593 +       flush_pos->flags = flags;
12594 +       flush_pos->nr_to_write = nr_to_write;
12595 +
12596 +       scan_init(right_scan);
12597 +       scan_init(left_scan);
12598 +
12599 +       /* First scan left and remember the leftmost scan position. If the
12600 +          leftmost position is unformatted we remember its parent_coord. We
12601 +          scan until counting FLUSH_SCAN_MAXNODES.
12602 +
12603 +          If starting @node is unformatted, at the beginning of left scan its
12604 +          parent (twig level node, containing extent item) will be long term
12605 +          locked and lock handle will be stored in the
12606 +          @right_scan->parent_lock. This lock is used to start the rightward
12607 +          scan without redoing the tree traversal (necessary to find parent)
12608 +          and, hence, is kept during leftward scan. As a result, we have to
12609 +          use try-lock when taking long term locks during the leftward scan.
12610 +        */
12611 +       ret = scan_left(left_scan, right_scan,
12612 +                       node, sbinfo->flush.scan_maxnodes);
12613 +       if (ret != 0)
12614 +               goto failed;
12615 +
12616 +       leftmost_in_slum = jref(left_scan->node);
12617 +       scan_done(left_scan);
12618 +
12619 +       /* Then possibly go right to decide if we will use a policy of
12620 +          relocating leaves. This is only done if we did not scan past (and
12621 +          count) enough nodes during the leftward scan. If we do scan right,
12622 +          we only care to go far enough to establish that at least
12623 +          FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan
12624 +          limit is the difference between left_scan.count and the threshold. */
12625 +
12626 +       todo = sbinfo->flush.relocate_threshold - left_scan->count;
12627 +       /* scan right is inherently deadlock prone, because we are
12628 +        * (potentially) holding a lock on the twig node at this moment.
12629 +        * FIXME: this is incorrect comment: lock is not held */
12630 +       if (todo > 0) {
12631 +               ret = scan_right(right_scan, node, (unsigned)todo);
12632 +               if (ret != 0)
12633 +                       goto failed;
12634 +       }
12635 +
12636 +       /* Only the right-scan count is needed, release any rightward locks
12637 +          right away. */
12638 +       scan_done(right_scan);
12639 +
12640 +       /* ... and the answer is: we should relocate leaf nodes if at least
12641 +          FLUSH_RELOCATE_THRESHOLD nodes were found. */
12642 +       flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
12643 +           (left_scan->count + right_scan->count >=
12644 +            sbinfo->flush.relocate_threshold);
12645 +
12646 +       /* Funny business here.  We set the 'point' in the flush_position
12647 +          prior to starting squalloc regardless of whether the first point is
12648 +          formatted or unformatted. Without this there would be an invariant,
12649 +          in the rest of the code, that if the flush_position is unformatted
12650 +          then flush_position->point is NULL and
12651 +          flush_position->parent_{lock,coord} is set, and if the flush_position
12652 +          is formatted then flush_position->point is non-NULL and no parent
12653 +          info is set.
12654 +
12655 +          This seems lazy, but it makes the initial calls to
12656 +          reverse_relocate_test (which ask "is it the pos->point the leftmost
12657 +          child of its parent") much easier because we know the first child
12658 +          already.  Nothing is broken by this, but the reasoning is subtle.
12659 +          Holding an extra reference on a jnode during flush can cause us to
12660 +          see nodes with HEARD_BANSHEE during squalloc, because nodes are not
12661 +          removed from sibling lists until they have zero reference count.
12662 +          Flush would never observe a HEARD_BANSHEE node on the left-edge of
12663 +          flush, nodes are only deleted to the right. So if nothing is broken,
12664 +          why fix it?
12665 +
12666 +          NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
12667 +          point and in any moment, because of the concurrent file system
12668 +          activity (for example, truncate). */
12669 +
12670 +       /* Check jnode state after flush_scan completed. Having a lock on this
12671 +          node or its parent (in case of unformatted) helps us in case of
12672 +          concurrent flushing. */
12673 +       if (jnode_check_flushprepped(leftmost_in_slum)
12674 +           && !jnode_convertible(leftmost_in_slum)) {
12675 +               ret = 0;
12676 +               goto failed;
12677 +       }
12678 +
12679 +       /* Now setup flush_pos using scan_left's endpoint. */
12680 +       ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
12681 +       if (ret)
12682 +               goto failed;
12683 +
12684 +       if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
12685 +           && node_is_empty(flush_pos->coord.node)) {
12686 +               znode *empty = flush_pos->coord.node;
12687 +
12688 +               assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
12689 +               ret = delete_empty_node(empty);
12690 +               goto failed;
12691 +       }
12692 +
12693 +       if (jnode_check_flushprepped(leftmost_in_slum)
12694 +           && !jnode_convertible(leftmost_in_slum)) {
12695 +               ret = 0;
12696 +               goto failed;
12697 +       }
12698 +
12699 +       /* Set pos->preceder and (re)allocate pos and its ancestors if it is
12700 +          needed  */
12701 +       ret = alloc_pos_and_ancestors(flush_pos);
12702 +       if (ret)
12703 +               goto failed;
12704 +
12705 +       /* Do the main rightward-bottom-up squeeze and allocate loop. */
12706 +       ret = squalloc(flush_pos);
12707 +       pos_stop(flush_pos);
12708 +       if (ret)
12709 +               goto failed;
12710 +
12711 +       /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
12712 +          children. First, the pos_stop() and pos_valid() routines should be
12713 +          modified so that pos_stop() sets a flush_position->stop flag to 1
12714 +          without releasing the current position immediately--instead release
12715 +          it in pos_done(). This is a better implementation than the current
12716 +          one anyway.
12717 +
12718 +          It is not clear that all fields of the flush_position should not be
12719 +          released, but at the very least the parent_lock, parent_coord, and
12720 +          parent_load should remain held because they hold the last twig
12721 +          when pos_stop() is called.
12722 +
12723 +          When we reach this point in the code, if the parent_coord is set to
12724 +          after the last item then we know that flush reached the end of a twig
12725 +          (and according to the new flush queueing design, we will return now).
12726 +          If parent_coord is not past the last item, we should check if the
12727 +          current twig has any unallocated children to the right (we are not
12728 +          concerned with unallocated children to the left--in that case the
12729 +          twig itself should not have been allocated). If the twig has
12730 +          unallocated children to the right, set the parent_coord to that
12731 +          position and then repeat the call to squalloc.
12732 +
12733 +          Testing for unallocated children may be defined in two ways: if any
12734 +          internal item has a fake block number, it is unallocated; if any
12735 +          extent item is unallocated then all of its children are unallocated.
12736 +          But there is a more aggressive approach: if there are any dirty
12737 +          children of the twig to the right of the current position, we may
12738 +          wish to relocate those nodes now. Checking for potential relocation
12739 +          is more expensive as it requires knowing whether there are any dirty
12740 +          children that are not unallocated. The extent_needs_allocation should
12741 +          be used after setting the correct preceder.
12742 +
12743 +          When we reach the end of a twig at this point in the code, if the
12744 +          flush can continue (when the queue is ready) it will need some
12745 +          information on the future starting point. That should be stored away
12746 +          in the flush_handle using a seal, I believe. Holding a jref() on the
12747 +          future starting point may break other code that deletes that node.
12748 +        */
12749 +
12750 +       /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
12751 +          called above the twig level.  If the VM calls flush above the twig
12752 +          level, do nothing and return (but figure out why this happens). The
12753 +          txnmgr should be modified to only flush its leaf-level dirty list.
12754 +          This will do all the necessary squeeze and allocate steps but leave
12755 +          unallocated branches and possibly unallocated twigs (when the twig's
12756 +          leftmost child is not dirty). After flushing the leaf level, the
12757 +          remaining unallocated nodes should be given write-optimized
12758 +          locations. (Possibly, the remaining unallocated twigs should be
12759 +          allocated just before their leftmost child.)
12760 +        */
12761 +
12762 +       /* Any failure reaches this point. */
12763 +failed:
12764 +
12765 +       switch (ret) {
12766 +       case -E_REPEAT:
12767 +       case -EINVAL:
12768 +       case -E_DEADLOCK:
12769 +       case -E_NO_NEIGHBOR:
12770 +       case -ENOENT:
12771 +               /* FIXME(C): Except for E_DEADLOCK, these should probably be
12772 +                  handled properly in each case. They already are handled in
12773 +                  many cases. */
12774 +               /* Something bad happened, but difficult to avoid... Try again!
12775 +               */
12776 +               ret = 0;
12777 +       }
12778 +
12779 +       if (leftmost_in_slum)
12780 +               jput(leftmost_in_slum);
12781 +
12782 +       pos_done(flush_pos);
12783 +       scan_done(left_scan);
12784 +       scan_done(right_scan);
12785 +       kfree(right_scan);
12786 +
12787 +       ON_DEBUG(atomic_dec(&flush_cnt));
12788 +
12789 +       reiser4_leave_flush(sb);
12790 +
12791 +       return ret;
12792 +}
12793 +
12794 +/* The reiser4 flush subsystem can be switched into "rapid flush mode", which
12795 + * means that the flusher should submit all prepped nodes immediately without
12796 + * keeping them in flush queues for a long time.  The reason for rapid flush
12797 + * mode is to free memory as fast as possible. */
12798 +
12799 +#if REISER4_USE_RAPID_FLUSH
12800 +
12801 +/**
12802 + * Submit prepped nodes through write_prepped_nodes() if wbq_available();
12803 + * otherwise return 0.  NOTE(review): no mode flag is reset here -- confirm.
12804 + */
12805 +
12806 +static int rapid_flush(flush_pos_t *pos)
12807 +{
12808 +       if (!wbq_available())
12809 +               return 0;
12810 +
12811 +       return write_prepped_nodes(pos);
12812 +}
12813 +
12814 +#else
12815 +
12816 +#define rapid_flush(pos) (0)
12817 +
12818 +#endif                         /* REISER4_USE_RAPID_FLUSH */
12819 +
12820 +static jnode *find_flush_start_jnode(jnode *start, txn_atom * atom,
12821 +                                    flush_queue_t *fq, int *nr_queued,
12822 +                                    int flags)
12823 +{
12824 +       jnode * node;
12825 +
12826 +       if (start != NULL) {
12827 +               spin_lock_jnode(start);
12828 +               if (!jnode_is_flushprepped(start)) {
12829 +                       assert("zam-1056", start->atom == atom);
12830 +                       node = start;
12831 +                       goto enter;
12832 +               }
12833 +               spin_unlock_jnode(start);
12834 +       }
12835 +       /*
12836 +        * In this loop we process all already prepped (RELOC or OVRWR) and
12837 +        * dirtied again nodes. The atom spin lock is not released until all
12838 +        * dirty nodes are processed or a not-prepped node is found in the
12839 +        * atom dirty lists; that node is returned with its jnode lock held.
12840 +        */
12841 +       while ((node = find_first_dirty_jnode(atom, flags))) {
12842 +               spin_lock_jnode(node);
12843 +enter:
12844 +               assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
12845 +               assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
12846 +
12847 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
12848 +                       /* move node to the end of atom's writeback list */
12849 +                       list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
12850 +
12851 +                       /*
12852 +                        * jnode is not necessarily on dirty list: if it was
12853 +                        * dirtied when it was on flush queue - it does not get
12854 +                        * moved to dirty list
12855 +                        */
12856 +                       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
12857 +                                            WB_LIST, 1));
12858 +
12859 +               } else if (jnode_is_znode(node)
12860 +                          && znode_above_root(JZNODE(node))) {
12861 +                       /*
12862 +                        * A special case for znode-above-root. The above-root
12863 +                        * (fake) znode is captured and dirtied when the tree
12864 +                        * height changes or when the root node is relocated.
12865 +                        * This causes atoms to fuse so that changes at the root
12866 +                        * are serialized.  However, this node is never flushed.
12867 +                        * This special case used to be in lock.c to prevent the
12868 +                        * above-root node from ever being captured, but now
12869 +                        * that it is captured we simply prevent it from
12870 +                        * flushing. The log-writer code relies on this to
12871 +                        * properly log superblock modifications of the tree
12872 +                        * height.
12873 +                        */
12874 +                       jnode_make_wander_nolock(node);
12875 +               } else if (JF_ISSET(node, JNODE_RELOC)) {
12876 +                       queue_jnode(fq, node);
12877 +                       ++(*nr_queued);
12878 +               } else
12879 +                       break;
12880 +
12881 +               spin_unlock_jnode(node);
12882 +       }
12883 +       return node;
12884 +}
12885 +
12886 +/* Flush some nodes of current atom, usually a slum. Return -E_REPEAT if there
12887 + * are more nodes to flush, return 0 if atom's dirty lists are empty (keeping
12888 + * current atom locked), return other errors as they are. */
12889 +int
12890 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
12891 +                  txn_atom ** atom, jnode *start)
12892 +{
12893 +       reiser4_super_info_data *sinfo = get_current_super_private();
12894 +       flush_queue_t *fq = NULL;
12895 +       jnode *node;
12896 +       int nr_queued;
12897 +       int ret;
12898 +
12899 +       assert("zam-889", atom != NULL && *atom != NULL);
12900 +       assert_spin_locked(&((*atom)->alock));
12901 +       assert("zam-892", get_current_context()->trans->atom == *atom);
12902 +
12903 +       nr_to_write = LONG_MAX; /* NOTE(review): caller's value is discarded */
12904 +       while (1) {
12905 +               ret = reiser4_fq_by_atom(*atom, &fq);
12906 +               if (ret != -E_REPEAT)
12907 +                       break;
12908 +               *atom = get_current_atom_locked();
12909 +       }
12910 +       if (ret)
12911 +               return ret;
12912 +
12913 +       assert_spin_locked(&((*atom)->alock));
12914 +
12915 +       /* parallel flushers limit */
12916 +       if (sinfo->tmgr.atom_max_flushers != 0) {
12917 +               while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
12918 +                       /* A reiser4_atom_send_event() call is inside
12919 +                          reiser4_fq_put_nolock() which is called when flush is
12920 +                          finished and nr_flushers is decremented. */
12921 +                       reiser4_atom_wait_event(*atom);
12922 +                       *atom = get_current_atom_locked();
12923 +               }
12924 +       }
12925 +
12926 +       /* count ourselves as a flusher */
12927 +       (*atom)->nr_flushers++;
12928 +
12929 +       writeout_mode_enable();
12930 +
12931 +       nr_queued = 0;
12932 +       node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
12933 +
12934 +       if (node == NULL) {
12935 +               if (nr_queued == 0) {
12936 +                       (*atom)->nr_flushers--;
12937 +                       reiser4_fq_put_nolock(fq);
12938 +                       reiser4_atom_send_event(*atom);
12939 +                       /* current atom remains locked */
12940 +                       writeout_mode_disable();
12941 +                       return 0;
12942 +               }
12943 +               spin_unlock_atom(*atom);
12944 +       } else {
12945 +               jref(node);
12946 +               BUG_ON((*atom)->super != node->tree->super);
12947 +               spin_unlock_atom(*atom);
12948 +               spin_unlock_jnode(node);
12949 +               BUG_ON(nr_to_write == 0);
12950 +               ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
12951 +               jput(node);
12952 +       }
12953 +       /* NOTE(review): ret from jnode_flush() is overwritten just below. */
12954 +       ret =
12955 +           reiser4_write_fq(fq, nr_submitted,
12956 +                    WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
12957 +
12958 +       *atom = get_current_atom_locked();
12959 +       (*atom)->nr_flushers--;
12960 +       reiser4_fq_put_nolock(fq);
12961 +       reiser4_atom_send_event(*atom);
12962 +       spin_unlock_atom(*atom);
12963 +
12964 +       writeout_mode_disable();
12965 +
12966 +       if (ret == 0)
12967 +               ret = -E_REPEAT;
12968 +
12969 +       return ret;
12970 +}
12971 +
12972 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
12973 +
12974 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation
12975 +   in the reverse parent-first relocate context. Here all we know is the
12976 +   preceder and the block number. Since we are going in reverse, the preceder
12977 +   may still be relocated as well, so we can't ask the block allocator "is there
12978 +   a closer block available to relocate?" here. In the _forward_ parent-first
12979 +   relocate context (not here) we actually call the block allocator to try and
12980 +   find a closer location. */
12981 +static int
12982 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
12983 +                                const reiser4_block_nr * nblk)
12984 +{
12985 +       reiser4_block_nr dist;
12986 +
12987 +       assert("jmacd-7710", *pblk != 0 && *nblk != 0);
12988 +       assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
12989 +       assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
12990 +
12991 +       /* Distance is the absolute value. */
12992 +       dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
12993 +
12994 +       /* If the block is at most flush.relocate_distance blocks away from
12995 +          its preceder block, do not relocate (return 0); else return 1. */
12996 +       if (dist <= get_current_super_private()->flush.relocate_distance)
12997 +               return 0;
12998 +
12999 +       return 1;
13000 +}
13001 +
13002 +/* Predicate that tests for relocation: returns 1 if @node should be relocated,
13003 +   0 otherwise. Always called in the reverse-parent-first context, when we ask
13004 +   whether the current node should be relocated in order to expand the flush by
13005 +   dirtying the parent level (and thus proceeding to flush that level). In the
13006 +   forward parent-first direction (not here), relocation decisions are handled
13007 +   in two places: allocate_znode() and extent_needs_allocation(). */
13008 +static int
13009 +reverse_relocate_test(jnode * node, const coord_t *parent_coord,
13010 +                     flush_pos_t *pos)
13011 +{
13012 +       reiser4_block_nr pblk = 0;
13013 +       reiser4_block_nr nblk = 0;
13014 +
13015 +       assert("jmacd-8989", !jnode_is_root(node));
13016 +
13017 +       /*
13018 +        * This function is called only from the
13019 +        * reverse_relocate_check_dirty_parent() and only if the parent
13020 +        * node is clean. This implies that the parent has the real (i.e., not
13021 +        * fake) block number, and, so does the child, because otherwise the
13022 +        * parent would be dirty.
13023 +        */
13024 +
13025 +       /* New nodes are treated as if they are being relocated. */
13026 +       if (JF_ISSET(node, JNODE_CREATED) ||
13027 +           (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL))
13028 +               return 1;
13029 +
13030 +       /* Find the preceder. FIXME(B): When the child is an unformatted,
13031 +          previously existing node, the coord may be leftmost even though the
13032 +          child is not the parent-first preceder of the parent. If the first
13033 +          dirty node appears somewhere in the middle of the first extent unit,
13034 +          this preceder calculation is wrong.
13035 +          Needs more logic in here. */
13036 +       if (coord_is_leftmost_unit(parent_coord)) {
13037 +               pblk = *znode_get_block(parent_coord->node);
13038 +       } else {
13039 +               pblk = pos->preceder.blk;
13040 +       }
13041 +       check_preceder(pblk);
13042 +
13043 +       /* If (pblk == 0) then the preceder isn't allocated or isn't known:
13044 +          relocate. */
13045 +       if (pblk == 0)
13046 +               return 1;
13047 +
13048 +       nblk = *jnode_get_block(node);
13049 +
13050 +       if (reiser4_blocknr_is_fake(&nblk))
13051 +               /* child is unallocated, mark parent dirty */
13052 +               return 1;
13053 +
13054 +       return reverse_relocate_if_close_enough(&pblk, &nblk);
13055 +}
13056 +
13057 +/* If the parent is not already dirty, call reverse_relocate_test to make a
13058 +   reverse-parent-first relocation decision; if yes, mark the parent dirty. */
13059 +static int
13060 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t *parent_coord,
13061 +                                   flush_pos_t *pos)
13062 +{
13063 +       int ret;
13064 +
13065 +       if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
13066 +
13067 +               ret = reverse_relocate_test(node, parent_coord, pos);
13068 +               if (ret < 0)
13069 +                       return ret;
13070 +
13071 +               /* FIXME-ZAM
13072 +                 if parent is already relocated - we do not want to grab space,
13073 +                 right? */
13074 +               if (ret == 1) {
13075 +                       int grabbed;
13076 +
13077 +                       grabbed = get_current_context()->grabbed_blocks;
13078 +                       if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
13079 +                           0)
13080 +                               reiser4_panic("umka-1250",
13081 +                                             "No space left during flush.");
13082 +
13083 +                       assert("jmacd-18923",
13084 +                              znode_is_write_locked(parent_coord->node));
13085 +                       znode_make_dirty(parent_coord->node);
13086 +                       grabbed2free_mark(grabbed);
13087 +               }
13088 +       }
13089 +
13090 +       return 0;
13091 +}
13092 +
13093 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE
13094 +   FORWARD PARENT-FIRST LOOP BEGINS) */
13095 +
13096 +/* Get the leftmost child for given coord; it is stored in *@child. */
13097 +static int get_leftmost_child_of_unit(const coord_t *coord, jnode ** child)
13098 +{
13099 +       int ret;
13100 +
13101 +       ret = item_utmost_child(coord, LEFT_SIDE, child);
13102 +
13103 +       if (ret)
13104 +               return ret;
13105 +       /* guard against an ERR_PTR stored in *child despite ret == 0 */
13106 +       if (IS_ERR(*child))
13107 +               return PTR_ERR(*child);
13108 +
13109 +       return 0;
13110 +}
13111 +
13112 +/* This step occurs after the left- and right-scans are completed, before
13113 +   starting the forward parent-first traversal. Here we attempt to allocate
13114 +   ancestors of the starting flush point, which means continuing in the reverse
13115 +   parent-first direction to the parent, grandparent, and so on (as long as the
13116 +   child is a leftmost child). This routine calls a recursive process,
13117 +   alloc_one_ancestor, which does the real work, except there is special-case
13118 +   handling here for the first ancestor, which may be a twig. At each level
13119 +   (here and alloc_one_ancestor), we check for relocation and then, if the child
13120 +   is a leftmost child, repeat at the next level. On the way back down (the
13121 +   recursion), we allocate the ancestors in parent-first order. */
13122 +static int alloc_pos_and_ancestors(flush_pos_t *pos)
13123 +{
13124 +       int ret = 0;
13125 +       lock_handle plock;
13126 +       load_count pload;
13127 +       coord_t pcoord;
13128 +
13129 +       if (znode_check_flushprepped(pos->lock.node))
13130 +               return 0;
13131 +
13132 +       coord_init_invalid(&pcoord, NULL);
13133 +       init_lh(&plock);
13134 +       init_load_count(&pload);
13135 +
13136 +       if (pos->state == POS_ON_EPOINT) {
13137 +               /* a special case for pos on twig level, where we already have
13138 +                  a lock on parent node. */
13139 +               /* The parent may not be dirty, in which case we should decide
13140 +                  whether to relocate the child now. If decision is made to
13141 +                  relocate the child, the parent is marked dirty. */
13142 +               ret =
13143 +                   reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
13144 +                                                       pos);
13145 +               if (ret)
13146 +                       goto exit;
13147 +
13148 +               /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
13149 +                  is leftmost) and the leaf/child, so recursion is not needed.
13150 +                  Levels above the twig will be allocated for
13151 +                  write-optimization before the transaction commits.  */
13152 +
13153 +               /* Do the recursive step, allocating zero or more of our
13154 +                * ancestors. */
13155 +               ret = alloc_one_ancestor(&pos->coord, pos);
13156 +
13157 +       } else {
13158 +               if (!znode_is_root(pos->lock.node)) {
13159 +                       /* all formatted nodes except tree root */
13160 +                       ret =
13161 +                           reiser4_get_parent(&plock, pos->lock.node,
13162 +                                              ZNODE_WRITE_LOCK);
13163 +                       if (ret)
13164 +                               goto exit;
13165 +
13166 +                       ret = incr_load_count_znode(&pload, plock.node);
13167 +                       if (ret)
13168 +                               goto exit;
13169 +
13170 +                       ret =
13171 +                           find_child_ptr(plock.node, pos->lock.node, &pcoord);
13172 +                       if (ret)
13173 +                               goto exit;
13174 +
13175 +                       ret =
13176 +                           reverse_relocate_check_dirty_parent(ZJNODE
13177 +                                                               (pos->lock.
13178 +                                                                node), &pcoord,
13179 +                                                               pos);
13180 +                       if (ret)
13181 +                               goto exit;
13182 +
13183 +                       ret = alloc_one_ancestor(&pcoord, pos);
13184 +                       if (ret)
13185 +                               goto exit;
13186 +               }
13187 +               /* NOTE: for the tree root pcoord is still invalid here. */
13188 +               ret = allocate_znode(pos->lock.node, &pcoord, pos);
13189 +       }
13190 +exit:
13191 +       done_load_count(&pload);
13192 +       done_lh(&plock);
13193 +       return ret;
13194 +}
13195 +
13196 +/* This is the recursive step described in alloc_pos_and_ancestors, above.
13197 +   Ignoring the call to set_preceder (the next function described), it returns
13198 +   0 if coord->node is flushprepped or the child is not a leftmost child.
13199 +   Otherwise, below the root, it checks for relocation (possibly dirtying the
13200 +   parent) and may recurse; then it calls allocate_znode() on coord->node. */
13201 +static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos)
13202 +{
13203 +       int ret = 0;
13204 +       lock_handle alock;
13205 +       load_count aload;
13206 +       coord_t acoord;
13207 +
13208 +       /* As we ascend at the left-edge of the region to flush, take this
13209 +          opportunity at the twig level to find our parent-first preceder
13210 +          unless we have already set it. */
13211 +       if (pos->preceder.blk == 0) {
13212 +               ret = set_preceder(coord, pos);
13213 +               if (ret != 0)
13214 +                       return ret;
13215 +       }
13216 +
13217 +       /* If the ancestor is clean or already allocated, or if the child is not
13218 +          a leftmost child, stop going up, even leaving coord->node not
13219 +          flushprepped. */
13220 +       if (znode_check_flushprepped(coord->node)
13221 +           || !coord_is_leftmost_unit(coord))
13222 +               return 0;
13223 +
13224 +       init_lh(&alock);
13225 +       init_load_count(&aload);
13226 +       coord_init_invalid(&acoord, NULL);
13227 +
13228 +       /* Only ascend to the next level if it is a leftmost child, but
13229 +          write-lock the parent in case we will relocate the child. */
13230 +       if (!znode_is_root(coord->node)) {
13231 +
13232 +               ret =
13233 +                   jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
13234 +                                           &alock, &aload, ZNODE_WRITE_LOCK,
13235 +                                           0);
13236 +               if (ret != 0) {
13237 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
13238 +                       goto exit;
13239 +               }
13240 +
13241 +               ret =
13242 +                   reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
13243 +                                                       &acoord, pos);
13244 +               if (ret != 0)
13245 +                       goto exit;
13246 +
13247 +               /* Recursive call. */
13248 +               if (!znode_check_flushprepped(acoord.node)) {
13249 +                       ret = alloc_one_ancestor(&acoord, pos);
13250 +                       if (ret)
13251 +                               goto exit;
13252 +               }
13253 +       }
13254 +
13255 +       /* Note: we call allocate with the parent write-locked (except at the
13256 +          root) in case we relocate the child, in which case it will modify the
13257 +          parent during this call. */
13258 +       ret = allocate_znode(coord->node, &acoord, pos);
13259 +
13260 +exit:
13261 +       done_load_count(&aload);
13262 +       done_lh(&alock);
13263 +       return ret;
13264 +}
13265 +
13266 +/* During the reverse parent-first alloc_pos_and_ancestors process described
13267 +   above there is a call to this function at the twig level. During
13268 +   alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse
13269 +   parent-first context)?  We repeat this process as long as the child is the
13270 +   leftmost child, eventually reaching an ancestor of the flush point that is
13271 +   not a leftmost child. The preceder of that ancestor, which is not a leftmost
13272 +   child, is actually on the leaf level. The preceder of that block is the
13273 +   left-neighbor of the flush point. The preceder of that block is the rightmost
13274 +   child of the twig on the left. So, when alloc_pos_and_ancestors passes upward
13275 +   through the twig level, it stops momentarily to remember the block of the
13276 +   rightmost child of the twig on the left and saves it in the flush_position's
13277 +   preceder hint (pos->preceder.blk).
13278 +
13279 +   There is one other place where we may set the flush_position's preceder hint,
13280 +   which is during scan-left.
13281 +*/
13282 +static int set_preceder(const coord_t *coord_in, flush_pos_t *pos)
13283 +{
13284 +       int ret;
13285 +       coord_t coord;
13286 +       lock_handle left_lock;
13287 +       load_count left_load;
13288 +
13289 +       coord_dup(&coord, coord_in);
13290 +
13291 +       init_lh(&left_lock);
13292 +       init_load_count(&left_load);
13293 +
13294 +       /* FIXME(B): Same FIXME as in "Find the preceder" in
13295 +          reverse_relocate_test. coord_is_leftmost_unit is not the right test
13296 +          if the unformatted child is in the middle of the first extent unit.*/
13297 +       if (!coord_is_leftmost_unit(&coord)) {
13298 +               coord_prev_unit(&coord);
13299 +       } else {
13300 +               ret =
13301 +                   reiser4_get_left_neighbor(&left_lock, coord.node,
13302 +                                             ZNODE_READ_LOCK, GN_SAME_ATOM);
13303 +               if (ret) {
13304 +                       /* Failing for one of these reasons doesn't matter as
13305 +                          the preceder is only a hint. We are low-priority at
13306 +                          this point, so this must be the case. */
13307 +                       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
13308 +                           ret == -ENOENT || ret == -EINVAL
13309 +                           || ret == -E_DEADLOCK)
13310 +                               ret = 0;
13311 +                       goto exit;
13312 +               }
13313 +
13314 +               ret = incr_load_count_znode(&left_load, left_lock.node);
13315 +               if (ret)
13316 +                       goto exit;
13317 +
13318 +               coord_init_last_unit(&coord, left_lock.node);
13319 +       }
13320 +
13321 +       ret =
13322 +           item_utmost_child_real_block(&coord, RIGHT_SIDE,
13323 +                                        &pos->preceder.blk);
13324 +exit:
13325 +       check_preceder(pos->preceder.blk);
13326 +       done_load_count(&left_load);
13327 +       done_lh(&left_lock);
13328 +       return ret;
13329 +}
13330 +
13331 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
13332 +
13333 +/* This procedure implements the outer loop of the flush algorithm. To put this
13334 +   in context, here is the general list of steps taken by the flush routine as a
13335 +   whole:
13336 +
13337 +   1. Scan-left
13338 +   2. Scan-right (maybe)
13339 +   3. Allocate initial flush position and its ancestors
13340 +   4. <handle extents>
13341 +   5. <squeeze and next position and its ancestors to-the-right,
13342 +       then update position to-the-right>
13343 +   6. <repeat from #4 until flush is stopped>
13344 +
13345 +   This procedure implements the loop in steps 4 through 6 in the above listing.
13346 +
13347 +   Step 4: if the current flush position is an extent item (position on the twig
13348 +   level), it allocates the extent (allocate_extent_item_in_place) then shifts
13349 +   to the next coordinate. If the next coordinate's leftmost child needs
13350 +   flushprep, we will continue. If the next coordinate is an internal item, we
13351 +   descend back to the leaf level, otherwise we repeat a step #4 (labeled
13352 +   ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
13353 +   twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
13354 +   next (right) twig, prior to step #5 which moves to the right.
13355 +
13356 +   Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
13357 +   the tree to allocate any ancestors of the next-right flush position that are
13358 +   not also ancestors of the current position. Those ancestors (in top-down
13359 +   order) are the next in parent-first order. We squeeze adjacent nodes on the
13360 +   way up until the right node and current node share the same parent, then
13361 +   allocate on the way back down. Finally, this step sets the flush position to
13362 +   the next-right node.  Then repeat steps 4 and 5.
13363 +*/
13364 +
13365 +/* SQUEEZE CODE */
13366 +
13367 +/* squeeze_right_twig helper function: cut a range of extent items from node
13368 +   to->node, from its first unit up to coord @to. @left is not used here. */
13369 +static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key,
13370 +                                  znode * left)
13371 +{
13372 +       coord_t from;
13373 +       reiser4_key from_key;
13374 +
13375 +       coord_init_first_unit(&from, to->node);
13376 +       item_key_by_coord(&from, &from_key);
13377 +
13378 +       return cut_node_content(&from, to, &from_key, to_key, NULL);
13379 +}
13380 +
13381 +/* Copy as many of the leading extents from @right to @left as possible,
13382 +   allocating unallocated extents as they are copied.  Returns
13383 +   SQUEEZE_TARGET_FULL or SQUEEZE_SOURCE_EMPTY when no more can be shifted.
13384 +   If the next item is an internal item it calls shift_one_internal_unit and
13385 +   may then return SUBTREE_MOVED.  A negative return value is an error. */
13386 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos)
13387 +{
13388 +       int ret = SUBTREE_MOVED;
13389 +       coord_t coord;          /* used to iterate over items */
13390 +       reiser4_key stop_key;
13391 +
13392 +       assert("jmacd-2008", !node_is_empty(right));
13393 +       coord_init_first_unit(&coord, right);
13394 +
13395 +       /* FIXME: can be optimized to cut once */
13396 +       while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
13397 +               ON_DEBUG(void *vp);
13398 +
13399 +               assert("vs-1468", coord_is_leftmost_unit(&coord));
13400 +               ON_DEBUG(vp = shift_check_prepare(left, coord.node));
13401 +
13402 +               /* stop_key is used to find what was copied and what to cut */
13403 +               stop_key = *reiser4_min_key();
13404 +               ret = squalloc_extent(left, &coord, pos, &stop_key);
13405 +               if (ret != SQUEEZE_CONTINUE) {
13406 +                       ON_DEBUG(kfree(vp));
13407 +                       break;
13408 +               }
13409 +               assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
13410 +
13411 +               /* Cut what was copied: from the first unit up to stop_key - 1 */
13412 +               set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
13413 +               check_me("vs-1466",
13414 +                        squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
13415 +
13416 +               ON_DEBUG(shift_check(vp, left, coord.node));
13417 +       }
13418 +
13419 +       if (node_is_empty(coord.node))
13420 +               ret = SQUEEZE_SOURCE_EMPTY;
13421 +
13422 +       if (ret == SQUEEZE_TARGET_FULL)
13423 +               goto out;
13424 +
13425 +       if (node_is_empty(right)) {
13426 +               /* The whole right node was copied into @left. */
13427 +               assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
13428 +               goto out;
13429 +       }
13430 +
13431 +       coord_init_first_unit(&coord, right);
13432 +
13433 +       if (!item_is_internal(&coord)) {
13434 +               /* "slum" is over: squeeze nothing else to the left neighbor.
13435 +                  NOTE(review): a ret < 0 from the loop above is lost here - ok? */
13436 +               ret = SQUEEZE_TARGET_FULL;
13437 +               goto out;
13438 +       }
13439 +       assert("jmacd-433", item_is_internal(&coord));
13440 +
13441 +       /* Shift an internal unit.  The child must be allocated before shifting
13442 +          any more extents, so we stop here. */
13443 +       ret = shift_one_internal_unit(left, right);
13444 +
13445 +out:
13446 +       assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
13447 +              || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
13448 +
13449 +       if (ret == SQUEEZE_TARGET_FULL) {
13450 +               /* We submit prepped nodes here and expect that this @left twig
13451 +                * will not be modified again during this jnode_flush() call. */
13452 +               int ret1;
13453 +
13454 +               /* NOTE: seems like io is done under long term locks. */
13455 +               ret1 = write_prepped_nodes(pos);
13456 +               if (ret1 < 0)
13457 +                       return ret1;
13458 +       }
13459 +
13460 +       return ret;
13461 +}
13462 +
13463 +#if REISER4_DEBUG
13464 +/* Debug check: chaining data, if any, must match the current item's plugin */
13465 +static void item_convert_invariant(flush_pos_t *pos)
13466 +{
13467 +       item_plugin *iplug;
13468 +
13469 +       assert("edward-1225", coord_is_existing_item(&pos->coord));
13470 +       if (!chaining_data_present(pos)) {
13471 +               assert("edward-1226", pos->child == NULL);
13472 +               return;
13473 +       }
13474 +       iplug = item_convert_plug(pos);
13475 +       assert("edward-1000", iplug == item_plugin_by_coord(&pos->coord));
13476 +       assert("edward-1001", iplug->f.convert != NULL);
13477 +}
13478 +#else
13479 +#define item_convert_invariant(pos) noop
13480 +#endif
13481 +
13482 +/* Scan items of leaf @node starting from the first one and apply to each
13483 +   item its flush ->convert() method (if any). This method may resize/kill
13484 +   the item so the tree will be changed. Non-leaf nodes are left untouched.
13485 +*/
13486 +static int convert_node(flush_pos_t *pos, znode * node)
13487 +{
13488 +       int ret = 0;
13489 +       item_plugin *iplug;
13490 +
13491 +       assert("edward-304", pos != NULL);
13492 +       assert("edward-305", pos->child == NULL);
13493 +       assert("edward-475", znode_convertible(node));
13494 +       assert("edward-669", znode_is_wlocked(node));
13495 +       assert("edward-1210", !node_is_empty(node));
13496 +
13497 +       if (znode_get_level(node) != LEAF_LEVEL)
13498 +               /* conversion of non-leaf nodes is unsupported */
13499 +               goto exit;
13500 +
13501 +       coord_init_first_unit(&pos->coord, node);
13502 +
13503 +       while (1) {
13504 +               ret = 0;
13505 +               coord_set_to_left(&pos->coord);
13506 +               item_convert_invariant(pos);
13507 +
13508 +               iplug = item_plugin_by_coord(&pos->coord);
13509 +               assert("edward-844", iplug != NULL);
13510 +
13511 +               if (iplug->f.convert) {
13512 +                       ret = iplug->f.convert(pos);
13513 +                       if (ret)
13514 +                               goto exit;
13515 +               }
13516 +               assert("edward-307", pos->child == NULL);
13517 +
13518 +               if (coord_next_item(&pos->coord)) {
13519 +                       /* no more items in this node */
13520 +
13521 +                       if (!chaining_data_present(pos))
13522 +                               /* finished this node */
13523 +                               break;
13524 +                       if (should_chain_next_node(pos)) {
13525 +                               /* go to next node */
13526 +                               move_chaining_data(pos, 0/* to next node */);
13527 +                               break;
13528 +                       }
13529 +                       /* repeat this node */
13530 +                       move_chaining_data(pos, 1/* this node */);
13531 +                       continue;
13532 +               }
13533 +               /* Node is not over.
13534 +                  Check if there is attached convert data.
13535 +                  If so, roll one item position back and repeat
13536 +                  on this node
13537 +                */
13538 +               if (chaining_data_present(pos)) {
13539 +
13540 +                       if (iplug != item_plugin_by_coord(&pos->coord))
13541 +                               set_item_convert_count(pos, 0);
13542 +
13543 +                       ret = coord_prev_item(&pos->coord);
13544 +                       assert("edward-1003", !ret);
13545 +
13546 +                       move_chaining_data(pos, 1/* this node */);
13547 +               }
13548 +       }
13549 +       JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
13550 +       znode_make_dirty(node);
13551 +exit:
13552 +       assert("edward-1004", !ret);
13553 +       return ret;
13554 +}
13555 +
13556 +/* Squeeze and allocate the right neighbor.  This is called after @left and
13557 +   its current children have been squeezed and allocated already.  This
13558 +   procedure's job is to squeeze items from @right to @left.
13559 +
13560 +   Both nodes must be on the same level and @right must not be empty (both
13561 +   are asserted).
13562 +
13563 +   If at the twig level, extents are allocated as they are shifted from @right
13564 +   to @left (squeeze_right_twig).
13565 +
13566 +   At any other level the shifting is delegated to squeeze_right_non_twig,
13567 +   no matter whether that level is the leaf level or an internal one.
13568 +
13569 +   Return value:
13570 +     SUBTREE_MOVED        - a unit of an internal item was moved and
13571 +                            squeezing stopped;
13572 +     SQUEEZE_SOURCE_EMPTY - all content of @right was squeezed;
13573 +     SQUEEZE_TARGET_FULL  - nothing can be moved into @left anymore;
13574 +     negative value       - error.
13575 +*/
13576 +
13577 +static int squeeze_right_neighbor(flush_pos_t *pos, znode * left,
13578 +                                 znode * right)
13579 +{
13580 +       int ret;
13581 +
13582 +       /* FIXME it is possible to see empty hasn't-heard-banshee node in a
13583 +        * tree owing to error (for example, ENOSPC) in write */
13584 +       /* assert("jmacd-9321", !node_is_empty(left)); */
13585 +       assert("jmacd-9322", !node_is_empty(right));
13586 +       assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
13587 +
13588 +       if (znode_get_level(left) == TWIG_LEVEL) {
13589 +               /* Shift with extent allocating until either an internal item
13590 +                  is encountered or everything is shifted or no free space
13591 +                  left in @left */
13592 +               ret = squeeze_right_twig(left, right, pos);
13593 +       } else {
13594 +               /* All other levels can use shift_everything until we implement
13595 +                  per-item flush plugins. */
13596 +               ret = squeeze_right_non_twig(left, right);
13597 +       }
13598 +
13599 +       /* Whatever level was handled, only an error code or one of the
13600 +          three squeeze results documented above may come back. */
13601 +       assert("jmacd-2011", (ret < 0 ||
13602 +                             ret == SQUEEZE_SOURCE_EMPTY
13603 +                             || ret == SQUEEZE_TARGET_FULL
13604 +                             || ret == SUBTREE_MOVED));
13605 +
13606 +       return ret;
13607 +}
13608 +
13609 +/* Squeeze @right into the twig locked by @pos and reposition pos->coord in
13610 +   that twig: after the last item when squeeze_right_twig() returned a
13611 +   positive value, on the last unit when it returned 0.  On error (negative
13612 +   value) pos->coord is left as is.  Returns squeeze_right_twig()'s result. */
13613 +static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
13614 +                                               znode * right)
13615 +{
13616 +       int ret = squeeze_right_twig(pos->lock.node, right, pos);
13617 +
13618 +       if (ret > 0)
13619 +               coord_init_after_last_item(&pos->coord, pos->lock.node);
13620 +       else if (ret == 0)
13621 +               coord_init_last_unit(&pos->coord, pos->lock.node);
13622 +
13623 +       return ret;
13624 +}
13625 +
13626 +/* forward declaration */
13627 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
13628 +
13629 +/* Fast path in front of squalloc_upper_levels(): return 0 right away when
13630 + * @left and @right already have the same parents. */
13631 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
13632 +                                                         znode * left,
13633 +                                                         znode * right)
13634 +{
13635 +       if (!znode_same_parents(left, right))
13636 +               return squalloc_upper_levels(pos, left, right);
13637 +
13638 +       return 0;
13639 +}
13640 +
13641 +/* Check whether the parent of the given @right node needs to be processed
13642 +   ((re)allocated) prior to processing of the child.  If @left and @right do
13643 +   not share a parent, the parent of @right comes after @left but before
13644 +   @right in parent-first order, so we have to (re)allocate it before @right
13645 +   gets (re)allocated. */
13646 +static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
13647 +{
13648 +       int ret;
13649 +
13650 +       lock_handle left_parent_lock;
13651 +       lock_handle right_parent_lock;
13652 +
13653 +       load_count left_parent_load;
13654 +       load_count right_parent_load;
13655 +
13656 +       init_lh(&left_parent_lock);
13657 +       init_lh(&right_parent_lock);
13658 +
13659 +       init_load_count(&left_parent_load);
13660 +       init_load_count(&right_parent_load);
13661 +
13662 +       ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
13663 +       if (ret)
13664 +               goto out;
13665 +
13666 +       ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
13667 +       if (ret)
13668 +               goto out;
13669 +
13670 +       /* Check for same parents */
13671 +       if (left_parent_lock.node == right_parent_lock.node)
13672 +               goto out;
13673 +
13674 +       if (znode_check_flushprepped(right_parent_lock.node)) {
13675 +               /* Keep parent-first order.  In the order, the right parent node
13676 +                  stands before the @right node.  If it is already allocated,
13677 +                  we set the preceder (next block search start point) to its
13678 +                  block number, @right node should be allocated after it.
13679 +
13680 +                  However, preceder is set only if the right parent is on twig
13681 +                  level. The explanation is the following: new branch nodes are
13682 +                  allocated over already allocated children while the tree
13683 +                  grows, it is difficult to keep tree ordered, we assume that
13684 +                  only leaves and twigs are correctly allocated. So, only
13685 +                  twigs are used as a preceder for allocating of the rest of
13686 +                  the slum. */
13687 +               if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
13688 +                       pos->preceder.blk =
13689 +                           *znode_get_block(right_parent_lock.node);
13690 +                       check_preceder(pos->preceder.blk);
13691 +               }
13692 +               goto out;
13693 +       }
13694 +
13695 +       ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
13696 +       if (ret)
13697 +               goto out;
13698 +
13699 +       ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
13700 +       if (ret)
13701 +               goto out;
13702 +
13703 +       ret =
13704 +           squeeze_right_neighbor(pos, left_parent_lock.node,
13705 +                                  right_parent_lock.node);
13706 +       /* We stop on error.  We also stop if some items/units were shifted
13707 +        * (ret == 0) and thus @right changed its parent: then right_parent
13708 +        * need not be processed prior to processing of @right.  Positive
13709 +        * return values say that shifting did not happen because of "empty
13710 +        * source" or "target full" conditions. */
13711 +       if (ret <= 0)
13712 +               goto out;
13713 +
13714 +       /* parent(@left) and parent(@right) may have different parents also. We
13715 +        * do a recursive call for checking that. */
13716 +       ret =
13717 +           check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
13718 +                                                   right_parent_lock.node);
13719 +       if (ret)
13720 +               goto out;
13721 +
13722 +       /* allocate znode when going down */
13723 +       ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
13724 +
13725 +out:
13726 +       done_load_count(&left_parent_load);
13727 +       done_load_count(&right_parent_load);
13728 +
13729 +       done_lh(&left_parent_lock);
13730 +       done_lh(&right_parent_lock);
13731 +
13732 +       return ret;
13733 +}
13734 +
13735 +/* Check the "flushprepped" status of the leftmost child of the unit at
13736 + * @coord.  Returns a non-zero error code if looking the child up fails and
13737 + * 1 (true) if the child node was not found in cache.  */
13738 +static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
13739 +{
13740 +       jnode *child;
13741 +       int prepped;
13742 +       int ret;
13743 +
13744 +       ret = get_leftmost_child_of_unit(coord, &child);
13745 +       if (ret)
13746 +               return ret;
13747 +
13748 +       if (child == NULL) {
13749 +               /* We consider not existing child as a node which slum
13750 +                  processing should not continue to.  Not cached node is clean,
13751 +                  so it is flushprepped. */
13752 +               return 1;
13753 +       }
13754 +
13755 +       /* the child is in cache: ask the jnode itself, then release it */
13756 +       prepped = jnode_check_flushprepped(child);
13757 +       jput(child);
13758 +
13759 +       return prepped;
13760 +}
13761 +
13762 +/* (Re)allocate znode @node, getting its parent automatically: the parent is
13763 +   write-locked and loaded, the parent's pointer to @node is looked up and
13764 +   passed to allocate_znode().  The parent load count and lock are dropped
13765 +   before returning.  Returns the first non-zero step result, 0 otherwise. */
13766 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
13767 +{
13768 +       coord_t pcoord;
13769 +       load_count parent_load;
13770 +       lock_handle parent_lock;
13771 +       int ret;
13772 +
13773 +       assert("zam-851", znode_is_write_locked(node));
13774 +
13775 +       init_lh(&parent_lock);
13776 +       init_load_count(&parent_load);
13777 +
13778 +       /* A chain of dependent steps: each one runs only if all the previous
13779 +          ones returned 0. */
13780 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
13781 +       if (ret == 0)
13782 +               ret = incr_load_count_znode(&parent_load, parent_lock.node);
13783 +       if (ret == 0)
13784 +               ret = find_child_ptr(parent_lock.node, node, &pcoord);
13785 +       if (ret == 0)
13786 +               ret = allocate_znode(node, &pcoord, pos);
13787 +
13788 +       /* common exit path for success and failure */
13789 +       done_load_count(&parent_load);
13790 +       done_lh(&parent_lock);
13791 +
13792 +       return ret;
13793 +}
13794 +
13795 +/* Process formatted nodes to the right of pos->lock.node, one right neighbor
13796 + * per iteration, until the slum ends, squalloc stops or an error occurs.  */
13797 +static int handle_pos_on_formatted(flush_pos_t *pos)
13798 +{
13799 +       int ret;
13800 +       lock_handle right_lock;
13801 +       load_count right_load;
13802 +
13803 +       init_lh(&right_lock);
13804 +       init_load_count(&right_load);
13805 +
13806 +       if (should_convert_node(pos, pos->lock.node)) {
13807 +               ret = convert_node(pos, pos->lock.node);
13808 +               if (ret)
13809 +                       return ret;
13810 +       }
13811 +
13812 +       while (1) {
13813 +               int expected;
13814 +               expected = should_convert_next_node(pos);
13815 +               ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
13816 +                                      ZNODE_WRITE_LOCK, !expected, expected);
13817 +               if (ret) {
13818 +                       if (expected)
13819 +                               warning("edward-1495",
13820 +                               "Expected neighbor not found (ret = %d). Fsck?",
13821 +                                       ret);
13822 +                       break;
13823 +               }
13824 +
13825 +               /* we don't prep(allocate) nodes for flushing twice. This can be
13826 +                * suboptimal, or it can be optimal. For now we choose to live
13827 +                * with the risk that it will be suboptimal because it would be
13828 +                * quite complex to code it to be smarter. */
13829 +               if (znode_check_flushprepped(right_lock.node)
13830 +                   && !znode_convertible(right_lock.node)) {
13831 +                       assert("edward-1005", !should_convert_next_node(pos));
13832 +                       pos_stop(pos);
13833 +                       break;
13834 +               }
13835 +
13836 +               ret = incr_load_count_znode(&right_load, right_lock.node);
13837 +               if (ret)
13838 +                       break;
13839 +               if (should_convert_node(pos, right_lock.node)) {
13840 +                       ret = convert_node(pos, right_lock.node);
13841 +                       if (ret)
13842 +                               break;
13843 +                       if (node_is_empty(right_lock.node)) {
13844 +                               /* node became empty after converting, repeat */
13845 +                               done_load_count(&right_load);
13846 +                               done_lh(&right_lock);
13847 +                               continue;
13848 +                       }
13849 +               }
13850 +
13851 +               /* squeeze _before_ going upward. */
13852 +               ret =
13853 +                   squeeze_right_neighbor(pos, pos->lock.node,
13854 +                                          right_lock.node);
13855 +               if (ret < 0)
13856 +                       break;
13857 +
13858 +               if (znode_check_flushprepped(right_lock.node)) {
13859 +                       if (should_convert_next_node(pos)) {
13860 +                               /* in spite of flushprepped status of the node,
13861 +                                  its right slum neighbor should be converted */
13862 +                               assert("edward-953", convert_data(pos));
13863 +                               assert("edward-954", item_convert_data(pos));
13864 +
13865 +                               if (node_is_empty(right_lock.node)) {
13866 +                                       done_load_count(&right_load);
13867 +                                       done_lh(&right_lock);
13868 +                               } else
13869 +                                       move_flush_pos(pos, &right_lock,
13870 +                                                      &right_load, NULL);
13871 +                               continue;
13872 +                       }
13873 +                       pos_stop(pos);
13874 +                       break;
13875 +               }
13876 +
13877 +               if (node_is_empty(right_lock.node)) {
13878 +                       /* repeat if right node was squeezed completely */
13879 +                       done_load_count(&right_load);
13880 +                       done_lh(&right_lock);
13881 +                       continue;
13882 +               }
13883 +
13884 +               /* parent(right_lock.node) has to be processed before
13885 +                * (right_lock.node) due to "parent-first" allocation order. */
13886 +               ret =
13887 +                   check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
13888 +                                                           right_lock.node);
13889 +               if (ret)
13890 +                       break;
13891 +               /* (re)allocate _after_ going upward */
13892 +               ret = lock_parent_and_allocate_znode(right_lock.node, pos);
13893 +               if (ret)
13894 +                       break;
13895 +               if (should_terminate_squalloc(pos)) {
13896 +                       set_item_convert_count(pos, 0);
13897 +                       break;
13898 +               }
13899 +
13900 +               /* advance the flush position to the right neighbor */
13901 +               move_flush_pos(pos, &right_lock, &right_load, NULL);
13902 +
13903 +               ret = rapid_flush(pos);
13904 +               if (ret)
13905 +                       break;
13906 +       }
13907 +       check_convert_info(pos);
13908 +       done_load_count(&right_load);
13909 +       done_lh(&right_lock);
13910 +
13911 +       /* This function indicates via pos whether to stop or go to twig or
13912 +        * continue on current level. */
13913 +       return ret;
13914 +
13915 +}
13916 +
13917 +/* Process nodes on leaf level until unformatted node or rightmost node in the
13918 + * slum is reached.  When handle_pos_on_formatted() reports -E_NO_NEIGHBOR the
13919 + * position is switched to POS_TO_TWIG and 0 is returned instead.  */
13920 +static int handle_pos_on_leaf(flush_pos_t *pos)
13921 +{
13922 +       int result;
13923 +
13924 +       assert("zam-845", pos->state == POS_ON_LEAF);
13925 +
13926 +       result = handle_pos_on_formatted(pos);
13927 +       if (result != -E_NO_NEIGHBOR)
13928 +               return result;
13929 +
13930 +       /* cannot get right neighbor, go process extents. */
13931 +       pos->state = POS_TO_TWIG;
13932 +
13933 +       return 0;
13934 +}
13935 +
13936 +/* Process slum on level > 1 by delegating to handle_pos_on_formatted() */
13937 +static int handle_pos_on_internal(flush_pos_t *pos)
13938 +{
13939 +       assert("zam-850", pos->state == POS_ON_INTERNAL);
13940 +       return handle_pos_on_formatted(pos);
13941 +}
13942 +
13943 +/* Check whether squalloc should stop before processing the extent at
13944 +   pos->coord.  Also sets pos->pos_in_unit and consumes pos->child. */
13945 +static int squalloc_extent_should_stop(flush_pos_t *pos)
13946 +{
13947 +       int stop;
13948 +
13949 +       assert("zam-869", item_is_extent(&pos->coord));
13950 +
13951 +       if (pos->child == NULL) {
13952 +               /* no explicit start node: begin at the start of the unit;
13953 +                * for an unallocated extent 0 is returned without a lookup */
13954 +               pos->pos_in_unit = 0;
13955 +               if (extent_is_unallocated(&pos->coord))
13956 +                       return 0;
13957 +
13958 +               return leftmost_child_of_unit_check_flushprepped(&pos->coord);
13959 +       }
13960 +
13961 +       /* pos->child is a jnode handle_pos_on_extent() should start with
13962 +        * instead of the first child of the first extent unit. */
13963 +       assert("vs-1383", jnode_is_unformatted(pos->child));
13964 +       stop = jnode_check_flushprepped(pos->child);
13965 +       pos->pos_in_unit =
13966 +           jnode_get_index(pos->child) - extent_unit_index(&pos->coord);
13967 +       assert("vs-1470", pos->pos_in_unit < extent_unit_width(&pos->coord));
13968 +       assert("nikita-3434",
13969 +              ergo(extent_is_unallocated(&pos->coord), pos->pos_in_unit == 0));
13970 +       jput(pos->child);
13971 +       pos->child = NULL;
13972 +
13973 +       return stop;
13974 +}
13975 +
13976 +/* Handle the case when the regular reiser4 tree (znodes connected to their
13977 + * neighbors by sibling pointers) is interrupted on leaf level by one or more
13978 + * unformatted nodes.  By holding a lock on twig level and using extent code
13979 + * routines to process unformatted nodes we swim around an irregular part of
13980 + * the reiser4 tree. */
13981 +static int handle_pos_on_twig(flush_pos_t *pos)
13982 +{
13983 +       int ret;
13984 +
13985 +       assert("zam-844", pos->state == POS_ON_EPOINT);
13986 +       assert("zam-843", item_is_extent(&pos->coord));
13987 +
13988 +       /* Decide whether to continue slum processing with the current extent
13989 +          unit: if leftmost child of current extent unit is flushprepped
13990 +          (i.e. clean or already processed by flush) we stop squalloc().  There
13991 +          is a fast check for unallocated extents which we assume contain all
13992 +          not flushprepped nodes. */
13993 +       /* FIXME: Here we implement a simple check, we are only looking at the
13994 +          leftmost child. */
13995 +       ret = squalloc_extent_should_stop(pos);
13996 +       if (ret != 0) {
13997 +               pos_stop(pos);
13998 +               return ret;
13999 +       }
14000 +
14001 +       while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
14002 +              && item_is_extent(&pos->coord)) {
14003 +               ret = reiser4_alloc_extent(pos);
14004 +               if (ret)
14005 +                       break;
14006 +               coord_next_unit(&pos->coord);
14007 +       }
14008 +
14009 +       if (coord_is_after_rightmost(&pos->coord)) {
14010 +               pos->state = POS_END_OF_TWIG;
14011 +               return 0;
14012 +       }
14013 +       if (item_is_internal(&pos->coord)) {
14014 +               pos->state = POS_TO_LEAF;
14015 +               return 0;
14016 +       }
14017 +
14018 +       assert("zam-860", item_is_extent(&pos->coord));
14019 +
14020 +       /* "slum" is over */
14021 +       pos->state = POS_INVALID;
14022 +       return 0;
14023 +}
14024 +
14025 +/* When we are about to return flush position from twig to leaf level we can
14026 + * process the right twig node or move position to the leaf.  This processes
14027 + * right twig if it is possible and jumps to leaf level if not. */
14028 +static int handle_pos_end_of_twig(flush_pos_t *pos)
14029 +{
14030 +       int ret;
14031 +       lock_handle right_lock;
14032 +       load_count right_load;
14033 +       coord_t at_right;
14034 +       jnode *child = NULL;
14035 +
14036 +       assert("zam-848", pos->state == POS_END_OF_TWIG);
14037 +       assert("zam-849", coord_is_after_rightmost(&pos->coord));
14038 +
14039 +       init_lh(&right_lock);
14040 +       init_load_count(&right_load);
14041 +
14042 +       /* We lock the right twig node even if it is not dirty because slum
14043 +        * continues or discontinues on leaf level, not on next twig. This
14044 +        * lock on the right twig is needed for getting its leftmost child. */
14045 +       ret =
14046 +           reiser4_get_right_neighbor(&right_lock, pos->lock.node,
14047 +                                      ZNODE_WRITE_LOCK, GN_SAME_ATOM);
14048 +       if (ret)
14049 +               goto out;
14050 +
14051 +       ret = incr_load_count_znode(&right_load, right_lock.node);
14052 +       if (ret)
14053 +               goto out;
14054 +
14055 +       /* right twig could be not dirty */
14056 +       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
14057 +               /* If right twig node is dirty we always attempt to squeeze its
14058 +                * content to the left... */
14059 +became_dirty:
14060 +               ret =
14061 +                   squeeze_right_twig_and_advance_coord(pos, right_lock.node);
14062 +               if (ret <= 0) {
14063 +                       /* pos->coord is on internal item, go to leaf level, or
14064 +                        * we have an error which will be caught in squalloc()
14065 +                        */
14066 +                       pos->state = POS_TO_LEAF;
14067 +                       goto out;
14068 +               }
14069 +
14070 +               /* If right twig was squeezed completely we have to re-lock
14071 +                * right twig. Now it is done through the top-level squalloc
14072 +                * routine. */
14073 +               if (node_is_empty(right_lock.node))
14074 +                       goto out;
14075 +
14076 +               /* ... and prep it if it is not yet prepped */
14077 +               if (!znode_check_flushprepped(right_lock.node)) {
14078 +                       /* As usual, process parent before ... */
14079 +                       ret =
14080 +                           check_parents_and_squalloc_upper_levels(pos,
14081 +                                                                   pos->lock.
14082 +                                                                   node,
14083 +                                                                   right_lock.
14084 +                                                                   node);
14085 +                       if (ret)
14086 +                               goto out;
14087 +
14088 +                       /* ... processing the child */
14089 +                       ret =
14090 +                           lock_parent_and_allocate_znode(right_lock.node,
14091 +                                                          pos);
14092 +                       if (ret)
14093 +                               goto out;
14094 +               }
14095 +       } else {
14096 +               coord_init_first_unit(&at_right, right_lock.node);
14097 +
14098 +               /* check first child of next twig, should we continue there ? */
14099 +               ret = get_leftmost_child_of_unit(&at_right, &child);
14100 +               if (ret || child == NULL || jnode_check_flushprepped(child)) {
14101 +                       pos_stop(pos);
14102 +                       goto out;
14103 +               }
14104 +
14105 +               /* check clean twig for possible relocation */
14106 +               if (!znode_check_flushprepped(right_lock.node)) {
14107 +                       ret =
14108 +                           reverse_relocate_check_dirty_parent(child,
14109 +                                                               &at_right, pos);
14110 +                       if (ret)
14111 +                               goto out;
14112 +                       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
14113 +                               goto became_dirty;
14114 +               }
14115 +       }
14116 +
14117 +       assert("zam-875", znode_check_flushprepped(right_lock.node));
14118 +
14119 +       /* Update the preceder by a block number of just processed right twig
14120 +        * node. The code above could miss the preceder updating because
14121 +        * allocate_znode() could not be called for this node. */
14122 +       pos->preceder.blk = *znode_get_block(right_lock.node);
14123 +       check_preceder(pos->preceder.blk);
14124 +
14125 +       coord_init_first_unit(&at_right, right_lock.node);
14126 +       assert("zam-868", coord_is_existing_unit(&at_right));
14127 +
14128 +       pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
14129 +       move_flush_pos(pos, &right_lock, &right_load, &at_right);
14130 +
14131 +out:
14132 +       done_load_count(&right_load);
14133 +       done_lh(&right_lock);
14134 +
14135 +       if (child)
14136 +               jput(child);
14137 +
14138 +       return ret;
14139 +}
14140 +
14141 +/* Move pos->lock to the child of the internal item at pos->coord (write-lock,
14142 + * load and allocate it), or stop/invalidate pos if we should not continue. */
14143 +static int handle_pos_to_leaf(flush_pos_t *pos)
14144 +{
14145 +       int ret;
14146 +       lock_handle child_lock;
14147 +       load_count child_load;
14148 +       jnode *child;
14149 +
14150 +       assert("zam-846", pos->state == POS_TO_LEAF);
14151 +       assert("zam-847", item_is_internal(&pos->coord));
14152 +
14153 +       init_lh(&child_lock);
14154 +       init_load_count(&child_load);
14155 +
14156 +       ret = get_leftmost_child_of_unit(&pos->coord, &child);
14157 +       if (ret)
14158 +               return ret;     /* no lock or load taken yet, skip cleanup */
14159 +       if (child == NULL) {
14160 +               pos_stop(pos);
14161 +               return 0;
14162 +       }
14163 +
14164 +       if (jnode_check_flushprepped(child)) {
14165 +               pos->state = POS_INVALID;       /* child needs no processing */
14166 +               goto out;
14167 +       }
14168 +
14169 +       ret =
14170 +           longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
14171 +                               ZNODE_LOCK_LOPRI);
14172 +       if (ret)
14173 +               goto out;
14174 +
14175 +       ret = incr_load_count_znode(&child_load, JZNODE(child));
14176 +       if (ret)
14177 +               goto out;
14178 +
14179 +       ret = allocate_znode(JZNODE(child), &pos->coord, pos);
14180 +       if (ret)
14181 +               goto out;
14182 +
14183 +       /* move flush position to leaf level (no coord is passed for a leaf) */
14184 +       pos->state = POS_ON_LEAF;
14185 +       move_flush_pos(pos, &child_lock, &child_load, NULL);
14186 +
14187 +       if (node_is_empty(JZNODE(child))) {
14188 +               ret = delete_empty_node(JZNODE(child));
14189 +               pos->state = POS_INVALID;       /* set even if deletion failed */
14190 +       }
14191 +out:
14192 +       done_load_count(&child_load);
14193 +       done_lh(&child_lock);
14194 +       jput(child);
14195 +
14196 +       return ret;
14197 +}
14198 +
14199 +/* Move pos->lock from the current node up to its write-locked parent (twig) and
14200 + * choose the next state from the item that follows the child pointer there. */
14201 +static int handle_pos_to_twig(flush_pos_t *pos)
14202 +{
14203 +       int ret;
14204 +
14205 +       lock_handle parent_lock;
14206 +       load_count parent_load;
14207 +       coord_t pcoord;
14208 +
14209 +       assert("zam-852", pos->state == POS_TO_TWIG);
14210 +
14211 +       init_lh(&parent_lock);
14212 +       init_load_count(&parent_load);
14213 +
14214 +       ret =
14215 +           reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
14216 +       if (ret)
14217 +               goto out;
14218 +
14219 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
14220 +       if (ret)
14221 +               goto out;
14222 +
14223 +       ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
14224 +       if (ret)
14225 +               goto out;
14226 +
14227 +       assert("zam-870", item_is_internal(&pcoord));
14228 +       coord_next_item(&pcoord);       /* look at the item after the child pointer */
14229 +
14230 +       if (coord_is_after_rightmost(&pcoord))
14231 +               pos->state = POS_END_OF_TWIG;
14232 +       else if (item_is_extent(&pcoord))
14233 +               pos->state = POS_ON_EPOINT;
14234 +       else {
14235 +               /* The next item is not an extent, so the -E_NO_NEIGHBOR seen
14236 +                * in handle_pos_on_leaf() just meant that the edge of the
14237 +                * slum was reached */
14238 +               pos_stop(pos);
14239 +               goto out;
14240 +       }
14241 +
14242 +       move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
14243 +
14244 +out:
14245 +       done_load_count(&parent_load);
14246 +       done_lh(&parent_lock);
14247 +
14248 +       return ret;
14249 +}
14250 +
14251 +typedef int (*pos_state_handle_t) (flush_pos_t *);      /* handler of one state */
14252 +static pos_state_handle_t flush_pos_handlers[] = {      /* indexed by pos->state */
14253 +       /* process formatted nodes on leaf level, keep lock on a leaf node */
14254 +       [POS_ON_LEAF] = handle_pos_on_leaf,
14255 +       /* process unformatted nodes, keep lock on twig node, pos->coord points
14256 +        * to extent currently being processed */
14257 +       [POS_ON_EPOINT] = handle_pos_on_twig,
14258 +       /* move a lock from leaf node to its parent for further processing of
14259 +          unformatted nodes */
14260 +       [POS_TO_TWIG] = handle_pos_to_twig,
14261 +       /* move a lock from twig to leaf level when processing of unformatted
14262 +        * nodes finishes, pos->coord points to the leaf node we jump to */
14263 +       [POS_TO_LEAF] = handle_pos_to_leaf,
14264 +       /* after processing last extent in the twig node, attempt to shift
14265 +        * items from the twig's right neighbor and process them while shifting */
14266 +       [POS_END_OF_TWIG] = handle_pos_end_of_twig,
14267 +       /* process formatted nodes on internal level, keep lock on an internal
14268 +          node */
14269 +       [POS_ON_INTERNAL] = handle_pos_on_internal
14270 +};
14271 +
14272 +/* Advance flush position horizontally, prepare for flushing ((re)allocate,
14273 + * squeeze, encrypt) nodes and their ancestors in "parent-first" order */
14274 +static int squalloc(flush_pos_t *pos)
14275 +{
14276 +       int ret = 0;
14277 +
14278 +       /* maybe needs to be made a case statement with handle_pos_on_leaf as
14279 +        * first case, for greater CPU efficiency? Measure and see.... -Hans */
14280 +       while (pos_valid(pos)) {
14281 +               ret = flush_pos_handlers[pos->state] (pos);
14282 +               if (ret < 0)    /* error, or -E_NO_NEIGHBOR (handled below) */
14283 +                       break;
14284 +
14285 +               ret = rapid_flush(pos);
14286 +               if (ret)        /* any non-zero result ends the loop */
14287 +                       break;
14288 +       }
14289 +
14290 +       /* any positive value or -E_NO_NEIGHBOR are legal return codes for
14291 +          handle_pos* routines, -E_NO_NEIGHBOR means that slum edge was
14292 +          reached; both are reported to the caller as success */
14293 +       if (ret > 0 || ret == -E_NO_NEIGHBOR)
14294 +               ret = 0;
14295 +
14296 +       return ret;     /* 0 or a negative error code */
14297 +}
14298 +
14299 +static void update_ldkey(znode * node)
14300 +{
14301 +       reiser4_key ldkey;
14302 +       /* caller must hold the dk_lock of @node's tree for writing */
14303 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
14304 +       if (node_is_empty(node))
14305 +               return;         /* empty node: left delimiting key is left as is */
14306 +       /* set the left delimiting key of @node from leftmost_key_in_node() */
14307 +       znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
14308 +}
14309 +
14310 +/* this is to be called after calling of shift node's method to shift data from
14311 +   @right to @left. It sets left delimiting keys of @left and @right to keys of
14312 +   first items of @left and @right correspondingly and sets right delimiting key
14313 +   of @left to first key of @right (or to @right's rd key if @right is empty) */
14314 +static void update_znode_dkeys(znode * left, znode * right)
14315 +{
14316 +       assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
14317 +       assert("vs-1629", (znode_is_write_locked(left) &&
14318 +                          znode_is_write_locked(right)));
14319 +
14320 +       /* we need to update left delimiting key of @left if it was empty
14321 +          before shift */
14322 +       update_ldkey(left);
14323 +       update_ldkey(right);
14324 +       if (node_is_empty(right))
14325 +               znode_set_rd_key(left, znode_get_rd_key(right));
14326 +       else
14327 +               znode_set_rd_key(left, znode_get_ld_key(right));
14328 +}
14329 +
14330 +/* try to shift everything from @right to @left. If everything was shifted -
14331 +   @right is removed from the tree.  Result is the number of bytes shifted. */
14332 +static int
14333 +shift_everything_left(znode * right, znode * left, carry_level * todo)
14334 +{
14335 +       coord_t from;
14336 +       node_plugin *nplug;
14337 +       carry_plugin_info info;
14338 +
14339 +       coord_init_after_last_item(&from, right);
14340 +
14341 +       nplug = node_plugin_by_node(right);
14342 +       info.doing = NULL;
14343 +       info.todo = todo;       /* carry operations are queued on caller's level */
14344 +       return nplug->shift(&from, left, SHIFT_LEFT,
14345 +                           1 /* delete @right if it becomes empty */ ,
14346 +                           1
14347 +                           /* move coord @from to node @left if everything will
14348 +                              be shifted */
14349 +                           ,
14350 +                           &info);
14351 +}
14352 +
14353 +/* Shift as much as possible from @right to @left using the memcpy-optimized
14354 +   shift_everything_left.  @left and @right are formatted neighboring nodes on
14355 +   the same level.  Nothing is shifted unless both nodes are dirty. */
14356 +static int squeeze_right_non_twig(znode * left, znode * right)
14357 +{
14358 +       int ret;
14359 +       carry_pool *pool;
14360 +       carry_level *todo;
14361 +
14362 +       assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
14363 +
14364 +       if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
14365 +           !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
14366 +               return SQUEEZE_TARGET_FULL;
14367 +
14368 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
14369 +       if (IS_ERR(pool))
14370 +               return PTR_ERR(pool);
14371 +       todo = (carry_level *) (pool + 1);      /* levels follow the pool header */
14372 +       init_carry_level(todo, pool);
14373 +
14374 +       ret = shift_everything_left(right, left, todo);
14375 +       if (ret > 0) {
14376 +               /* something was shifted */
14377 +               reiser4_tree *tree;
14378 +               __u64 grabbed;
14379 +
14380 +               znode_make_dirty(left);
14381 +               znode_make_dirty(right);
14382 +
14383 +               /* update delimiting keys of nodes which participated in
14384 +                  shift. FIXME: it would be better to have this in shift
14385 +                  node's operation. But it can not be done there. Nobody
14386 +                  remembers why, though */
14387 +               tree = znode_get_tree(left);
14388 +               write_lock_dk(tree);
14389 +               update_znode_dkeys(left, right);
14390 +               write_unlock_dk(tree);
14391 +
14392 +               /* Carry is called to update delimiting key and, maybe, to
14393 +                  remove empty node. */
14394 +               grabbed = get_current_context()->grabbed_blocks;
14395 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
14396 +               assert("nikita-3003", ret == 0);        /* reserved space is
14397 +                                                       exhausted. Ask Hans. */
14398 +               ret = reiser4_carry(todo, NULL/* previous level */);
14399 +               grabbed2free_mark(grabbed);
14400 +       } else {
14401 +               /* Nothing shifted (ret <= 0), return appropriate result code.
14402 +                  NOTE(review): a negative ret is overwritten here - confirm */
14403 +               ret =
14404 +                   node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
14405 +                   SQUEEZE_TARGET_FULL;
14406 +       }
14407 +
14408 +       done_carry_pool(pool);
14409 +
14410 +       return ret;
14411 +}
14411 +
14412 +#if REISER4_DEBUG
14413 +static int sibling_link_is_ok(const znode *left, const znode *right)
14414 +{
14415 +       int result;
14416 +       /* non-zero iff both sibling pointers agree; checked under tree read lock */
14417 +       read_lock_tree(znode_get_tree(left));
14418 +       result = (left->right == right && left == right->left);
14419 +       read_unlock_tree(znode_get_tree(left));
14420 +       return result;
14421 +}
14422 +#endif /* REISER4_DEBUG */
14423 +
14424 +/* Shift first unit of first item if it is an internal one.  Return
14425 +   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
14426 +   SUBTREE_MOVED. */
14427 +static int shift_one_internal_unit(znode * left, znode * right)
14428 +{
14429 +       int ret;
14430 +       carry_pool *pool;
14431 +       carry_level *todo;
14432 +       coord_t *coord;
14433 +       carry_plugin_info *info;
14434 +       int size, moved;
14435 +
14436 +       assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
14437 +       assert("nikita-2435", znode_is_write_locked(left));
14438 +       assert("nikita-2436", znode_is_write_locked(right));
14439 +       assert("nikita-2434", sibling_link_is_ok(left, right));
14440 +
14441 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
14442 +                              sizeof(*coord) + sizeof(*info)
14443 +#if REISER4_DEBUG
14444 +                              + sizeof(*coord) + 2 * sizeof(reiser4_key)
14445 +#endif
14446 +           );
14447 +       if (IS_ERR(pool))
14448 +               return PTR_ERR(pool);
14449 +       todo = (carry_level *) (pool + 1);
14450 +       init_carry_level(todo, pool);
14451 +
14452 +       coord = (coord_t *) (todo + 3);
14453 +       coord_init_first_unit(coord, right);
14454 +       info = (carry_plugin_info *) (coord + 1);
14455 +
14456 +#if REISER4_DEBUG
14457 +       if (!node_is_empty(left)) {
14458 +               coord_t *last;
14459 +               reiser4_key *right_key;
14460 +               reiser4_key *left_key;
14461 +
14462 +               last = (coord_t *) (info + 1);
14463 +               right_key = (reiser4_key *) (last + 1);
14464 +               left_key = right_key + 1;
14465 +               coord_init_last_unit(last, left);
14466 +
14467 +               assert("nikita-2463",
14468 +                      keyle(item_key_by_coord(last, left_key),
14469 +                            item_key_by_coord(coord, right_key)));
14470 +       }
14471 +#endif
14472 +
14473 +       assert("jmacd-2007", item_is_internal(coord));
14474 +
14475 +       size = item_length_by_coord(coord);
14476 +       info->todo = todo;
14477 +       info->doing = NULL;
14478 +
14479 +       ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
14480 +                                              1
14481 +                                              /* delete @right if it becomes
14482 +                                                 empty */
14483 +                                              ,
14484 +                                              0
14485 +                                              /* do not move coord @coord to
14486 +                                                 node @left */
14487 +                                              ,
14488 +                                              info);
14489 +
14490 +       /* If shift returns positive, then we shifted the item. */
14491 +       assert("vs-423", ret <= 0 || size == ret);
14492 +       moved = (ret > 0);
14493 +
14494 +       if (moved) {
14495 +               /* something was moved */
14496 +               reiser4_tree *tree;
14497 +               int grabbed;
14498 +
14499 +               znode_make_dirty(left);
14500 +               znode_make_dirty(right);
14501 +               tree = znode_get_tree(left);
14502 +               write_lock_dk(tree);
14503 +               update_znode_dkeys(left, right);
14504 +               write_unlock_dk(tree);
14505 +
14506 +               /* reserve space for delimiting keys after shifting */
14507 +               grabbed = get_current_context()->grabbed_blocks;
14508 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
14509 +               assert("nikita-3003", ret == 0);        /* reserved space is
14510 +                                                       exhausted. Ask Hans. */
14511 +
14512 +               ret = reiser4_carry(todo, NULL/* previous level */);
14513 +               grabbed2free_mark(grabbed);
14514 +       }
14515 +
14516 +       done_carry_pool(pool);
14517 +
14518 +       if (ret != 0) {
14519 +               /* Shift or carry operation failed. */
14520 +               assert("jmacd-7325", ret < 0);
14521 +               return ret;
14522 +       }
14523 +
14524 +       return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
14525 +}
14526 +
14527 +/* Make the final relocate/wander decision during forward parent-first squalloc
14528 +   for a znode. For unformatted nodes this is done in
14529 +   plugin/item/extent.c:extent_needs_allocation(). */
14530 +static int
14531 +allocate_znode_loaded(znode * node,
14532 +                     const coord_t *parent_coord, flush_pos_t *pos)
14533 +{
14534 +       int ret;
14535 +       reiser4_super_info_data *sbinfo = get_current_super_private();
14536 +       /* FIXME(D): We have the node write-locked and should have checked for !
14537 +          allocated() somewhere before reaching this point, but there can be a
14538 +          race, so this assertion is bogus. */
14539 +       assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
14540 +       assert("jmacd-7988", znode_is_write_locked(node));
14541 +       assert("jmacd-7989", coord_is_invalid(parent_coord)
14542 +              || znode_is_write_locked(parent_coord->node));
14543 +
14544 +       if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
14545 +           znode_is_root(node) ||
14546 +           /* We have enough nodes to relocate no matter what. */
14547 +           (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
14548 +               /* No need to decide with new nodes, they are treated the same
14549 +                  as relocate. If the root node is dirty, relocate. */
14550 +               if (pos->preceder.blk == 0) {
14551 +                       /* preceder is unknown and we have decided to relocate
14552 +                          node -- using of default value for search start is
14553 +                          better than search from block #0. */
14554 +                       get_blocknr_hint_default(&pos->preceder.blk);
14555 +                       check_preceder(pos->preceder.blk);
14556 +               }
14557 +
14558 +               goto best_reloc;
14559 +
14560 +       } else if (pos->preceder.blk == 0) {
14561 +               /* If we don't know the preceder, leave it where it is. */
14562 +               jnode_make_wander(ZJNODE(node));
14563 +       } else {
14564 +               /* Make a decision based on block distance. */
14565 +               reiser4_block_nr dist;
14566 +               reiser4_block_nr nblk = *znode_get_block(node);
14567 +
14568 +               assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
14569 +               assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
14570 +               assert("jmacd-6174", pos->preceder.blk != 0);
14571 +
14572 +               if (pos->preceder.blk == nblk - 1) {
14573 +                       /* Ideal. */
14574 +                       jnode_make_wander(ZJNODE(node));
14575 +               } else {
14576 +                       /* dist = |nblk - pos->preceder.blk| */
14577 +                       dist =
14578 +                           (nblk <
14579 +                            pos->preceder.blk) ? (pos->preceder.blk -
14580 +                                                  nblk) : (nblk -
14581 +                                                           pos->preceder.blk);
14582 +
14583 +                       /* See if we can find a closer block
14584 +                          (forward direction only). */
14585 +                       pos->preceder.max_dist =
14586 +                           min((reiser4_block_nr) sbinfo->flush.
14587 +                               relocate_distance, dist);
14588 +                       pos->preceder.level = znode_get_level(node);
14589 +
14590 +                       ret = allocate_znode_update(node, parent_coord, pos);
14591 +
14592 +                       pos->preceder.max_dist = 0;
14593 +                       /* -ENOSPC is not fatal here: fall through to the checks below */
14594 +                       if (ret && (ret != -ENOSPC))
14595 +                               return ret;
14596 +
14597 +                       if (ret == 0) {
14598 +                               /* Got a better allocation. */
14599 +                               znode_make_reloc(node, pos->fq);
14600 +                       } else if (dist < sbinfo->flush.relocate_distance) {
14601 +                               /* The present allocation is good enough. */
14602 +                               jnode_make_wander(ZJNODE(node));
14603 +                       } else {
14604 +                               /* Otherwise, try to relocate to the best
14605 +                                  position. */
14606 +best_reloc:
14607 +                               ret =
14608 +                                   allocate_znode_update(node, parent_coord,
14609 +                                                         pos);
14610 +                               if (ret != 0)
14611 +                                       return ret;
14612 +
14613 +                               /* set JNODE_RELOC bit _after_ node gets
14614 +                                  allocated */
14615 +                               znode_make_reloc(node, pos->fq);
14616 +                       }
14617 +               }
14618 +       }
14619 +
14620 +       /* This is the new preceder. */
14621 +       pos->preceder.blk = *znode_get_block(node);
14622 +       check_preceder(pos->preceder.blk);
14623 +       pos->alloc_cnt += 1;
14624 +
14625 +       assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
14626 +
14627 +       return 0;       /* allocation errors were already returned above */
14628 +}
14629 +
14630 +static int
14631 +allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos)
14632 +{
14633 +       /*
14634 +        * Perform znode allocation with the znode pinned in memory to avoid
14635 +        * races with asynchronous emergency flush (which plays with the
14636 +        * JNODE_FLUSH_RESERVED bit).
14637 +        */
14638 +       return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
14639 +}
14640 +
14641 +/* A subroutine of allocate_znode, this is called first to see if there is a
14642 +   close position to relocate to. It may return -ENOSPC if there is no close
14643 +   position. If there is no close position it may not relocate. This takes care
14644 +   of updating the parent node with the relocated block address. */
14645 +static int
14646 +allocate_znode_update(znode * node, const coord_t *parent_coord,
14647 +                     flush_pos_t *pos)
14648 +{
14649 +       int ret;
14650 +       reiser4_block_nr blk;
14651 +       lock_handle uber_lock;
14652 +       int flush_reserved_used = 0;
14653 +       __u64 grabbed;  /* ctx->grabbed_blocks on entry, for grabbed2free_mark() */
14654 +       reiser4_context *ctx;
14655 +       reiser4_super_info_data *sbinfo;
14656 +
14657 +       init_lh(&uber_lock);
14658 +
14659 +       ctx = get_current_context();
14660 +       sbinfo = get_super_private(ctx->super);
14661 +
14662 +       grabbed = ctx->grabbed_blocks;
14663 +
14664 +       /* discard e-flush allocation */
14665 +       ret = zload(node);
14666 +       if (ret)
14667 +               return ret;
14668 +
14669 +       if (ZF_ISSET(node, JNODE_CREATED)) {
14670 +               assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
14671 +               pos->preceder.block_stage = BLOCK_UNALLOCATED;
14672 +       } else {
14673 +               pos->preceder.block_stage = BLOCK_GRABBED;
14674 +
14675 +               /* The disk space for relocating the @node is already reserved
14676 +                * in "flush reserved" counter if @node is leaf, otherwise we
14677 +                * grab space using BA_RESERVED (means grab space from whole
14678 +                * disk not from only 95%). */
14679 +               if (znode_get_level(node) == LEAF_LEVEL) {
14680 +                       /*
14681 +                        * earlier (during do_jnode_make_dirty()) we decided
14682 +                        * that @node can possibly go into overwrite set and
14683 +                        * reserved block for its wandering location.
14684 +                        */
14685 +                       txn_atom *atom = get_current_atom_locked();
14686 +                       assert("nikita-3449",
14687 +                              ZF_ISSET(node, JNODE_FLUSH_RESERVED));
14688 +                       flush_reserved2grabbed(atom, (__u64) 1);
14689 +                       spin_unlock_atom(atom);
14690 +                       /*
14691 +                        * we are trying to move node into relocate
14692 +                        * set. Allocation of relocated position "uses"
14693 +                        * reserved block.
14694 +                        */
14695 +                       ZF_CLR(node, JNODE_FLUSH_RESERVED);
14696 +                       flush_reserved_used = 1;
14697 +               } else {
14698 +                       ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
14699 +                       if (ret != 0)
14700 +                               goto exit;
14701 +               }
14702 +       }
14703 +
14704 +       /* We may not use 5% of reserved disk space here and flush will not
14705 +          pack tightly. */
14706 +       ret = reiser4_alloc_block(&pos->preceder, &blk,
14707 +                                 BA_FORMATTED | BA_PERMANENT);
14708 +       if (ret)
14709 +               goto exit;
14710 +
14711 +       if (!ZF_ISSET(node, JNODE_CREATED) &&
14712 +           (ret =
14713 +            reiser4_dealloc_block(znode_get_block(node), 0,
14714 +                                  BA_DEFER | BA_FORMATTED)))
14715 +               goto exit;
14716 +
14717 +       if (likely(!znode_is_root(node))) {
14718 +               item_plugin *iplug;
14719 +
14720 +               iplug = item_plugin_by_coord(parent_coord);
14721 +               assert("nikita-2954", iplug->f.update != NULL);
14722 +               iplug->f.update(parent_coord, &blk);
14723 +
14724 +               znode_make_dirty(parent_coord->node);
14725 +
14726 +       } else {
14727 +               reiser4_tree *tree = znode_get_tree(node);
14728 +               znode *uber;
14729 +
14730 +               /* We take a longterm lock on the fake node in order to change
14731 +                  the root block number.  This may cause atom fusion. */
14732 +               ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
14733 +                                    &uber_lock);
14734 +               /* The fake node cannot be deleted, and we must have priority
14735 +                  here, and may not be confused with ENOSPC. */
14736 +               assert("jmacd-74412",
14737 +                      ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
14738 +
14739 +               if (ret)
14740 +                       goto exit;
14741 +
14742 +               uber = uber_lock.node;
14743 +
14744 +               write_lock_tree(tree);
14745 +               tree->root_block = blk;
14746 +               write_unlock_tree(tree);
14747 +
14748 +               znode_make_dirty(uber);
14749 +       }
14750 +
14751 +       ret = znode_rehash(node, &blk);
14752 +exit:
14753 +       if (ret) {
14754 +               /* Get flush reserved block back if something fails, because
14755 +                * callers assume that on error block wasn't relocated and its
14756 +                * flush reserved block wasn't used. */
14757 +               if (flush_reserved_used) {
14758 +                       /*
14759 +                        * ok, we failed to move node into relocate
14760 +                        * set. Restore status quo.
14761 +                        */
14762 +                       grabbed2flush_reserved((__u64) 1);
14763 +                       ZF_SET(node, JNODE_FLUSH_RESERVED);
14764 +               }
14765 +       }
14766 +       zrelse(node);
14767 +       done_lh(&uber_lock);
14768 +       grabbed2free_mark(grabbed);
14769 +       return ret;
14770 +}
14771 +
14772 +/* JNODE INTERFACE */
14773 +
14774 +/* Lock a node (if formatted) and then get its parent locked, set the child's
14775 +   coordinate in the parent.  If the child is the root node, the above_root
14776 +   znode is returned but the coord is not set.  This function may cause atom
14777 +   fusion, but it is only used for read locks (at this point) and therefore
14778 +   fusion only occurs when the parent is already dirty. */
14779 +/* Hans adds this note: remember to ask how expensive this operation is vs.
14780 +   storing parent pointer in jnodes. */
14781 +static int
14782 +jnode_lock_parent_coord(jnode * node,
14783 +                       coord_t *coord,
14784 +                       lock_handle * parent_lh,
14785 +                       load_count * parent_zh,
14786 +                       znode_lock_mode parent_mode, int try)
14787 +{
14788 +       int ret;
14789 +
14790 +       assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
14791 +       assert("edward-54", jnode_is_unformatted(node)
14792 +              || znode_is_any_locked(JZNODE(node)));
14793 +
14794 +       if (!jnode_is_znode(node)) {
14795 +               reiser4_key key;
14796 +               tree_level stop_level = TWIG_LEVEL;
14797 +               lookup_bias bias = FIND_EXACT;
14798 +
14799 +               assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
14800 +
14801 +               /* The case when node is not znode, but can have parent coord
14802 +                  (unformatted node, node which represents cluster page,
14803 +                  etc..).  Generate a key for the appropriate entry, search
14804 +                  in the tree using coord_by_key, which handles locking for
14805 +                  us. */
14806 +
14807 +               /*
14808 +                * nothing is locked at this moment, so, nothing prevents
14809 +                * concurrent truncate from removing jnode from inode. To
14810 +                * prevent this spin-lock jnode. jnode can be truncated just
14811 +                * after call to the jnode_build_key(), but this is ok,
14812 +                * because coord_by_key() will just fail to find appropriate
14813 +                * extent.
14814 +                */
14815 +               spin_lock_jnode(node);
14816 +               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
14817 +                       jnode_build_key(node, &key);
14818 +                       ret = 0;
14819 +               } else
14820 +                       ret = RETERR(-ENOENT);
14821 +               spin_unlock_jnode(node);
14822 +
14823 +               if (ret != 0)
14824 +                       return ret;
14825 +
14826 +               if (jnode_is_cluster_page(node))
14827 +                       stop_level = LEAF_LEVEL;
14828 +
14829 +               assert("jmacd-1812", coord != NULL);
14830 +
14831 +               ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
14832 +                                  parent_mode, bias, stop_level, stop_level,
14833 +                                  CBK_UNIQUE, NULL/*ra_info */);
14834 +               switch (ret) {
14835 +               case CBK_COORD_NOTFOUND:
14836 +                       assert("edward-1038",
14837 +                              ergo(jnode_is_cluster_page(node),
14838 +                                   JF_ISSET(node, JNODE_HEARD_BANSHEE)));
14839 +                       if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
14840 +                               warning("nikita-3177", "Parent not found");
14841 +                       return ret;
14842 +               case CBK_COORD_FOUND:
14843 +                       if (coord->between != AT_UNIT) {
14844 +                               /* FIXME: comment needed */
14845 +                               done_lh(parent_lh);
14846 +                               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
14847 +                                       warning("nikita-3178",
14848 +                                               "Found but not happy: %i",
14849 +                                               coord->between);
14850 +                               }
14851 +                               return RETERR(-ENOENT);
14852 +                       }
14853 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
14854 +                       if (ret != 0)
14855 +                               return ret;
14856 +                       /* if (jnode_is_cluster_page(node)) {
14857 +                          races with write() are possible
14858 +                          check_child_cluster (parent_lh->node);
14859 +                          }
14860 +                        */
14861 +                       break;
14862 +               default:
14863 +                       return ret;
14864 +               }
14865 +
14866 +       } else {
14867 +               int flags;
14868 +               znode *z;
14869 +
14870 +               z = JZNODE(node);
14871 +               /* Formatted node case: */
14872 +               assert("jmacd-2061", !znode_is_root(z));
14873 +
14874 +               flags = GN_ALLOW_NOT_CONNECTED;
14875 +               if (try)
14876 +                       flags |= GN_TRY_LOCK;
14877 +
14878 +               ret =
14879 +                   reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
14880 +               if (ret != 0)
14881 +                       /* -E_REPEAT is ok here, it is handled by the caller. */
14882 +                       return ret;
14883 +
14884 +               /* Make the child's position "hint" up-to-date.  (Unless above
14885 +                  root, which caller must check.) */
14886 +               if (coord != NULL) {
14887 +
14888 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
14889 +                       if (ret != 0) {
14890 +                               warning("jmacd-976812386",
14891 +                                       "incr_load_count_znode failed: %d",
14892 +                                       ret);
14893 +                               return ret;
14894 +                       }
14895 +
14896 +                       ret = find_child_ptr(parent_lh->node, z, coord);
14897 +                       if (ret != 0) {
14898 +                               warning("jmacd-976812",
14899 +                                       "find_child_ptr failed: %d", ret);
14900 +                               return ret;
14901 +                       }
14902 +               }
14903 +       }
14904 +
14905 +       return 0;
14906 +}
14907 +
14908 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the
14909 +   same atom. If there is no next neighbor or the neighbor is not in memory or
14910 +   if there is a neighbor but it is not dirty or not in the same atom,
14911 +   -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which
14912 +   are not dirty, if so @check_dirty should be 0 */
14913 +static int neighbor_in_slum(znode * node,      /* starting point */
14914 +                           lock_handle * lock, /* lock on starting point */
14915 +                           sideof side,        /* left or right direction we
14916 +                                                  seek the next node in */
14917 +                           znode_lock_mode mode, /* kind of lock we want */
14918 +                           int check_dirty,    /* true if the neighbor should
14919 +                                                  be dirty */
14920 +                           int use_upper_levels /* get neighbor by going through
14921 +                                                   upper levels */)
14922 +{
14923 +       int ret;
14924 +       int flags;
14925 +
14926 +       assert("jmacd-6334", znode_is_connected(node));
14927 +
14928 +       flags =  GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
14929 +       if (use_upper_levels)
14930 +               flags |= GN_CAN_USE_UPPER_LEVELS;
14931 +
14932 +       ret = reiser4_get_neighbor(lock, node, mode, flags);
14933 +       if (ret) {
14934 +               /* May return -ENOENT or -E_NO_NEIGHBOR. */
14935 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
14936 +               if (ret == -ENOENT)
14937 +                       ret = RETERR(-E_NO_NEIGHBOR);
14938 +               return ret;
14939 +       }
14940 +       if (!check_dirty)
14941 +               return 0;
14942 +       /* Check dirty bit of locked znode, no races here */
14943 +       if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
14944 +               return 0;
14945 +
14946 +       done_lh(lock);
14947 +       return RETERR(-E_NO_NEIGHBOR);
14948 +}
14949 +
14950 +/* Return true if two znodes have the same parent.  This is called with both
14951 +   nodes write-locked (for squeezing); the tree read lock is taken here. */
14952 +static int znode_same_parents(znode * a, znode * b)
14953 +{
14954 +       int result;
14955 +
14956 +       assert("jmacd-7011", znode_is_write_locked(a));
14957 +       assert("jmacd-7012", znode_is_write_locked(b));
14958 +
14959 +       /* We lock the whole tree for this check.... I really don't like whole
14960 +        * tree locks... -Hans */
14961 +       read_lock_tree(znode_get_tree(a));
14962 +       result = (znode_parent(a) == znode_parent(b));
14963 +       read_unlock_tree(znode_get_tree(a));
14964 +       return result;
14965 +}
14966 +
14967 +/* FLUSH SCAN */
14968 +
14969 +/* Initialize the flush_scan data structure. */
14970 +static void scan_init(flush_scan * scan)
14971 +{
14972 +       memset(scan, 0, sizeof(*scan));
14973 +       init_lh(&scan->node_lock);
14974 +       init_lh(&scan->parent_lock);
14975 +       init_load_count(&scan->parent_load);
14976 +       init_load_count(&scan->node_load);
14977 +       coord_init_invalid(&scan->parent_coord, NULL);
14978 +}
14979 +
14980 +/* Release any resources held by the flush scan, e.g. release locks,
14981 +   free memory, etc. */
14982 +static void scan_done(flush_scan * scan)
14983 +{
14984 +       done_load_count(&scan->node_load);
14985 +       if (scan->node != NULL) {
14986 +               jput(scan->node);
14987 +               scan->node = NULL;
14988 +       }
14989 +       done_load_count(&scan->parent_load);
14990 +       done_lh(&scan->parent_lock);
14991 +       done_lh(&scan->node_lock);
14992 +}
14993 +
14994 +/* Returns true if flush scanning is finished. */
14995 +int reiser4_scan_finished(flush_scan * scan)
14996 +{
14997 +       return scan->stop || (scan->direction == RIGHT_SIDE &&
14998 +                             scan->count >= scan->max_count);
14999 +}
15000 +
15001 +/* Return true if the scan should continue to the @tonode. True if the node
15002 +   meets the same_slum_check condition. If not, deref the "left" node and stop
15003 +   the scan. */
15004 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
15005 +{
15006 +       int go = same_slum_check(scan->node, tonode, 1, 0);
15007 +
15008 +       if (!go) {
15009 +               scan->stop = 1;
15010 +               jput(tonode);
15011 +       }
15012 +
15013 +       return go;
15014 +}
15015 +
15016 +/* Set the current scan->node, refcount it, increment count by the @add_count
15017 +   (number to count, e.g., skipped unallocated nodes), deref previous current,
15018 +   and copy the current parent coordinate. */
15019 +int
15020 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
15021 +                const coord_t *parent)
15022 +{
15023 +       /* Release the old references, take the new reference. */
15024 +       done_load_count(&scan->node_load);
15025 +
15026 +       if (scan->node != NULL)
15027 +               jput(scan->node);
15028 +       scan->node = node;
15029 +       scan->count += add_count;
15030 +
15031 +       /* This next stmt is somewhat inefficient.  The reiser4_scan_extent()
15032 +          code could delay this update step until it finishes and update the
15033 +          parent_coord only once. It did that before, but there was a bug and
15034 +          this was the easiest way to make it correct. */
15035 +       if (parent != NULL)
15036 +               coord_dup(&scan->parent_coord, parent);
15037 +
15038 +       /* Failure may happen at the incr_load_count call, but the caller can
15039 +          assume the reference is safely taken. */
15040 +       return incr_load_count_jnode(&scan->node_load, node);
15041 +}
15042 +
15043 +/* Return true if scanning in the leftward direction. */
15044 +int reiser4_scanning_left(flush_scan * scan)
15045 +{
15046 +       return scan->direction == LEFT_SIDE;
15047 +}
15048 +
15049 +/* Performs leftward scanning starting from either kind of node. Counts the
15050 +   starting node. The right-scan object is passed in for the left-scan in order
15051 +   to copy the parent of an unformatted starting position. This way we avoid
15052 +   searching for the unformatted node's parent when scanning in each direction.
15053 +   If we search for the parent once it is set in both scan objects. The limit
15054 +   parameter tells flush-scan when to stop.
15055 +
15056 +   Rapid scanning is used only during scan_left, where we are interested in
15057 +   finding the 'leftpoint' where we begin flushing. We are interested in
15058 +   stopping at the left child of a twig that does not have a dirty left
15059 +   neighbour. THIS IS A SPECIAL CASE. The problem is finding a way to flush only
15060 +   those nodes without unallocated children, and it is difficult to solve in the
15061 +   bottom-up flushing algorithm we are currently using. The problem can be
15062 +   solved by scanning left at every level as we go upward, but this would
15063 +   basically bring us back to using a top-down allocation strategy, which we
15064 +   already tried (see BK history from May 2002), and has a different set of
15065 +   problems. The top-down strategy makes avoiding unallocated children easier,
15066 +   but makes it difficult to properly flush dirty children with clean parents
15067 +   that would otherwise stop the top-down flush, only later to dirty the parent
15068 +   once the children are flushed. So we solve the problem in the bottom-up
15069 +   algorithm with a special case for twigs and leaves only.
15070 +
15071 +   The first step in solving the problem is this rapid leftward scan.  After we
15072 +   determine that there are at least enough nodes counted to qualify for
15073 +   FLUSH_RELOCATE_THRESHOLD we are no longer interested in the exact count, we
15074 +   are only interested in finding the best place to start the flush.
15075 +
15076 +   We could choose one of two possibilities:
15077 +
15078 +   1. Stop at the leftmost child (of a twig) that does not have a dirty left
15079 +   neighbor. This requires checking one leaf per rapid-scan twig
15080 +
15081 +   2. Stop at the leftmost child (of a twig) where there are no dirty children
15082 +   of the twig to the left. This requires checking possibly all of the in-memory
15083 +   children of each twig during the rapid scan.
15084 +
15085 +   For now we implement the first policy.
15086 +*/
15087 +static int
15088 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
15089 +{
15090 +       int ret = 0;
15091 +
15092 +       scan->max_count = limit;
15093 +       scan->direction = LEFT_SIDE;
15094 +
15095 +       ret = scan_set_current(scan, jref(node), 1, NULL);
15096 +       if (ret != 0)
15097 +               return ret;
15098 +
15099 +       ret = scan_common(scan, right);
15100 +       if (ret != 0)
15101 +               return ret;
15102 +
15103 +       /* Before rapid scanning, we need a lock on scan->node so that we can
15104 +          get its parent, only if formatted. */
15105 +       if (jnode_is_znode(scan->node)) {
15106 +               ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
15107 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
15108 +       }
15109 +
15110 +       /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD)
15111 +       */
15112 +       return ret;
15113 +}
15114 +
15115 +/* Performs rightward scanning... Does not count the starting node. The limit
15116 +   parameter is described in scan_left. If the starting node is unformatted then
15117 +   the parent_coord was already set during scan_left. The rapid_after parameter
15118 +   is not used during right-scanning.
15119 +
15120 +   scan_right is only called if the scan_left operation does not count at least
15121 +   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter
15122 +   is set to the difference between scan-left's count and
15123 +   FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as
15124 +   FLUSH_RELOCATE_THRESHOLD and then stops. */
15125 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
15126 +{
15127 +       int ret;
15128 +
15129 +       scan->max_count = limit;
15130 +       scan->direction = RIGHT_SIDE;
15131 +
15132 +       ret = scan_set_current(scan, jref(node), 0, NULL);
15133 +       if (ret != 0)
15134 +               return ret;
15135 +
15136 +       return scan_common(scan, NULL);
15137 +}
15138 +
15139 +/* Common code to perform left or right scanning. */
15140 +static int scan_common(flush_scan * scan, flush_scan * other)
15141 +{
15142 +       int ret;
15143 +
15144 +       assert("nikita-2376", scan->node != NULL);
15145 +       assert("edward-54", jnode_is_unformatted(scan->node)
15146 +              || jnode_is_znode(scan->node));
15147 +
15148 +       /* Special case for starting at an unformatted node. Optimization: we
15149 +          only want to search for the parent (which requires a tree traversal)
15150 +          once. Obviously, we shouldn't have to call it once for the left scan
15151 +          and once for the right scan. For this reason, if we search for the
15152 +          parent during scan-left we then duplicate the coord/lock/load into
15153 +          the scan-right object. */
15154 +       if (jnode_is_unformatted(scan->node)) {
15155 +               ret = scan_unformatted(scan, other);
15156 +               if (ret != 0)
15157 +                       return ret;
15158 +       }
15159 +       /* This loop expects to start at a formatted position and performs
15160 +          chaining of formatted regions */
15161 +       while (!reiser4_scan_finished(scan)) {
15162 +
15163 +               ret = scan_formatted(scan);
15164 +               if (ret != 0)
15165 +                       return ret;
15166 +       }
15167 +
15168 +       return 0;
15169 +}
15170 +
15171 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
15172 +{
15173 +       int ret = 0;
15174 +       int try = 0;
15175 +
15176 +       if (!coord_is_invalid(&scan->parent_coord))
15177 +               goto scan;
15178 +
15179 +       /* set parent coord from */
15180 +       if (!jnode_is_unformatted(scan->node)) {
15181 +               /* formatted position */
15182 +
15183 +               lock_handle lock;
15184 +               assert("edward-301", jnode_is_znode(scan->node));
15185 +               init_lh(&lock);
15186 +
15187 +               /*
15188 +                * when flush starts from unformatted node, first thing it
15189 +                * does is tree traversal to find formatted parent of starting
15190 +                * node. This parent is then kept locked across scans to the
15191 +                * left and to the right. This means that during scan to the
15192 +                * left we cannot take left-ward lock, because this is
15193 +                * dead-lock prone. So, if we are scanning to the left and
15194 +                * there is already lock held by this thread,
15195 +                * jnode_lock_parent_coord() should use try-lock.
15196 +                */
15197 +               try = reiser4_scanning_left(scan)
15198 +                   && !lock_stack_isclean(get_current_lock_stack());
15199 +               /* Need the node locked to get the parent lock. We have to
15200 +                  take write lock since there is at least one call path
15201 +                  where this znode is already write-locked by us. */
15202 +               ret =
15203 +                   longterm_lock_znode(&lock, JZNODE(scan->node),
15204 +                                       ZNODE_WRITE_LOCK,
15205 +                                       reiser4_scanning_left(scan) ?
15206 +                                       ZNODE_LOCK_LOPRI :
15207 +                                       ZNODE_LOCK_HIPRI);
15208 +               if (ret != 0)
15209 +                       /* EINVAL or E_DEADLOCK here mean... try again!  At this
15210 +                          point we've scanned too far and can't back out, just
15211 +                          start over. */
15212 +                       return ret;
15213 +
15214 +               ret = jnode_lock_parent_coord(scan->node,
15215 +                                             &scan->parent_coord,
15216 +                                             &scan->parent_lock,
15217 +                                             &scan->parent_load,
15218 +                                             ZNODE_WRITE_LOCK, try);
15219 +
15220 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
15221 +               done_lh(&lock);
15222 +               if (ret == -E_REPEAT) {
15223 +                       scan->stop = 1;
15224 +                       return 0;
15225 +               }
15226 +               if (ret)
15227 +                       return ret;
15228 +
15229 +       } else {
15230 +               /* unformatted position */
15231 +
15232 +               ret =
15233 +                   jnode_lock_parent_coord(scan->node, &scan->parent_coord,
15234 +                                           &scan->parent_lock,
15235 +                                           &scan->parent_load,
15236 +                                           ZNODE_WRITE_LOCK, try);
15237 +
15238 +               if (IS_CBKERR(ret))
15239 +                       return ret;
15240 +
15241 +               if (ret == CBK_COORD_NOTFOUND)
15242 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
15243 +                       return ret;
15244 +
15245 +               /* parent was found */
15246 +               assert("jmacd-8661", other != NULL);
15247 +               /* Duplicate the reference into the other flush_scan. */
15248 +               coord_dup(&other->parent_coord, &scan->parent_coord);
15249 +               copy_lh(&other->parent_lock, &scan->parent_lock);
15250 +               copy_load_count(&other->parent_load, &scan->parent_load);
15251 +       }
15252 +scan:
15253 +       return scan_by_coord(scan);
15254 +}
15255 +
15256 +/* Performs left- or rightward scanning starting from a formatted node. Follow
15257 +   left pointers under tree lock as long as:
15258 +
15259 +   - node->left/right is non-NULL
15260 +   - node->left/right is connected, dirty
15261 +   - node->left/right belongs to the same atom
15262 +   - scan has not reached maximum count
15263 +*/
15264 +static int scan_formatted(flush_scan * scan)
15265 +{
15266 +       int ret;
15267 +       znode *neighbor = NULL;
15268 +
15269 +       assert("jmacd-1401", !reiser4_scan_finished(scan));
15270 +
15271 +       do {
15272 +               znode *node = JZNODE(scan->node);
15273 +
15274 +               /* Node should be connected, but if not stop the scan. */
15275 +               if (!znode_is_connected(node)) {
15276 +                       scan->stop = 1;
15277 +                       break;
15278 +               }
15279 +
15280 +               /* Lock the tree, check-for and reference the next sibling. */
15281 +               read_lock_tree(znode_get_tree(node));
15282 +
15283 +               /* It may be that a node is inserted or removed between a node
15284 +                  and its left sibling while the tree lock is released, but the
15285 +                  flush-scan count does not need to be precise. Thus, we
15286 +                  release the tree lock as soon as we get the neighboring node.
15287 +               */
15288 +               neighbor =
15289 +                       reiser4_scanning_left(scan) ? node->left : node->right;
15290 +               if (neighbor != NULL)
15291 +                       zref(neighbor);
15292 +
15293 +               read_unlock_tree(znode_get_tree(node));
15294 +
15295 +               /* If neighbor is NULL at the leaf level, need to check for an
15296 +                  unformatted sibling using the parent--break in any case. */
15297 +               if (neighbor == NULL)
15298 +                       break;
15299 +
15300 +               /* Check the condition for going left, break if it is not met.
15301 +                  This also releases (jputs) the neighbor if false. */
15302 +               if (!reiser4_scan_goto(scan, ZJNODE(neighbor)))
15303 +                       break;
15304 +
15305 +               /* Advance the flush_scan state to the left, repeat. */
15306 +               ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
15307 +               if (ret != 0)
15308 +                       return ret;
15309 +
15310 +       } while (!reiser4_scan_finished(scan));
15311 +
15312 +       /* If neighbor is NULL then we reached the end of a formatted region, or
15313 +          else the sibling is out of memory, now check for an extent to the
15314 +          left (as long as LEAF_LEVEL). */
15315 +       if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
15316 +           || reiser4_scan_finished(scan)) {
15317 +               scan->stop = 1;
15318 +               return 0;
15319 +       }
15320 +       /* Otherwise, calls scan_by_coord for the right(left)most item of the
15321 +          left(right) neighbor on the parent level, then possibly continue. */
15322 +
15323 +       coord_init_invalid(&scan->parent_coord, NULL);
15324 +       return scan_unformatted(scan, NULL);
15325 +}
15326 +
15327 +/* NOTE-EDWARD:
15328 +   This scans adjacent items of the same type and calls scan flush plugin for
15329 +   each one. Performs left(right)ward scanning starting from a (possibly)
15330 +   unformatted node. If we start from unformatted node, then we continue only if
15331 +   the next neighbor is also unformatted. When called from scan_formatted, we
15332 +   skip first iteration (to make sure that right(left)most item of the
15333 +   left(right) neighbor on the parent level is of the same type and set
15334 +   appropriate coord). */
15335 +static int scan_by_coord(flush_scan * scan)
15336 +{
15337 +       int ret = 0;
15338 +       int scan_this_coord;
15339 +       lock_handle next_lock;
15340 +       load_count next_load;
15341 +       coord_t next_coord;
15342 +       jnode *child;
15343 +       item_plugin *iplug;
15344 +
15345 +       init_lh(&next_lock);
15346 +       init_load_count(&next_load);
15347 +       scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
15348 +
15349 +       /* set initial item id */
15350 +       iplug = item_plugin_by_coord(&scan->parent_coord);
15351 +
15352 +       for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
15353 +               if (scan_this_coord) {
15354 +                       /* Here we expect that unit is scannable. It may not
15355 +                        * be so due to a race with extent->tail conversion. */
15356 +                       if (iplug->f.scan == NULL) {
15357 +                               scan->stop = 1;
15358 +                               ret = -E_REPEAT;
15359 +                               /* skip the check at the end. */
15360 +                               goto race;
15361 +                       }
15362 +
15363 +                       ret = iplug->f.scan(scan);
15364 +                       if (ret != 0)
15365 +                               goto exit;
15366 +
15367 +                       if (reiser4_scan_finished(scan)) {
15368 +                               checkchild(scan);
15369 +                               break;
15370 +                       }
15371 +               } else {
15372 +                       /* the same race against truncate as above is possible
15373 +                        * here, it seems */
15374 +
15375 +                       /* NOTE-JMACD: In this case, apply the same end-of-node
15376 +                          logic but don't scan the first coordinate. */
15377 +                       assert("jmacd-1231",
15378 +                              item_is_internal(&scan->parent_coord));
15379 +               }
15380 +
15381 +               if (iplug->f.utmost_child == NULL
15382 +                   || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
15383 +                       /* stop this coord and continue on parent level */
15384 +                       ret =
15385 +                           scan_set_current(scan,
15386 +                                            ZJNODE(zref
15387 +                                                   (scan->parent_coord.node)),
15388 +                                            1, NULL);
15389 +                       if (ret != 0)
15390 +                               goto exit;
15391 +                       break;
15392 +               }
15393 +
15394 +               /* Either way, the invariant is that scan->parent_coord is set
15395 +                  to the parent of scan->node. Now get the next unit. */
15396 +               coord_dup(&next_coord, &scan->parent_coord);
15397 +               coord_sideof_unit(&next_coord, scan->direction);
15398 +
15399 +               /* If off-the-end of the twig, try the next twig. */
15400 +               if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
15401 +                       /* We take the write lock because we may start flushing
15402 +                        * from this coordinate. */
15403 +                       ret = neighbor_in_slum(next_coord.node,
15404 +                                              &next_lock,
15405 +                                              scan->direction,
15406 +                                              ZNODE_WRITE_LOCK,
15407 +                                              1 /* check dirty */,
15408 +                                              0 /* don't go through upper
15409 +                                                   levels */);
15410 +                       if (ret == -E_NO_NEIGHBOR) {
15411 +                               scan->stop = 1;
15412 +                               ret = 0;
15413 +                               break;
15414 +                       }
15415 +
15416 +                       if (ret != 0)
15417 +                               goto exit;
15418 +
15419 +                       ret = incr_load_count_znode(&next_load, next_lock.node);
15420 +                       if (ret != 0)
15421 +                               goto exit;
15422 +
15423 +                       coord_init_sideof_unit(&next_coord, next_lock.node,
15424 +                                              sideof_reverse(scan->direction));
15425 +               }
15426 +
15427 +               iplug = item_plugin_by_coord(&next_coord);
15428 +
15429 +               /* Get the next child. */
15430 +               ret =
15431 +                   iplug->f.utmost_child(&next_coord,
15432 +                                         sideof_reverse(scan->direction),
15433 +                                         &child);
15434 +               if (ret != 0)
15435 +                       goto exit;
15436 +               /* If the next child is not in memory, or, item_utmost_child
15437 +                  failed (due to race with unlink, most probably), stop
15438 +                  here. */
15439 +               if (child == NULL || IS_ERR(child)) {
15440 +                       scan->stop = 1;
15441 +                       checkchild(scan);
15442 +                       break;
15443 +               }
15444 +
15445 +               assert("nikita-2374", jnode_is_unformatted(child)
15446 +                      || jnode_is_znode(child));
15447 +
15448 +               /* See if it is dirty, part of the same atom. */
15449 +               if (!reiser4_scan_goto(scan, child)) {
15450 +                       checkchild(scan);
15451 +                       break;
15452 +               }
15453 +
15454 +               /* If so, make this child current. */
15455 +               ret = scan_set_current(scan, child, 1, &next_coord);
15456 +               if (ret != 0)
15457 +                       goto exit;
15458 +
15459 +               /* Now continue.  If formatted we release the parent lock and
15460 +                  return, then proceed. */
15461 +               if (jnode_is_znode(child))
15462 +                       break;
15463 +
15464 +               /* Otherwise, repeat the above loop with next_coord. */
15465 +               if (next_load.node != NULL) {
15466 +                       done_lh(&scan->parent_lock);
15467 +                       move_lh(&scan->parent_lock, &next_lock);
15468 +                       move_load_count(&scan->parent_load, &next_load);
15469 +               }
15470 +       }
15471 +
15472 +       assert("jmacd-6233",
15473 +              reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
15474 +exit:
15475 +       checkchild(scan);
15476 +race:                  /* skip the above check  */
15477 +       if (jnode_is_znode(scan->node)) {
15478 +               done_lh(&scan->parent_lock);
15479 +               done_load_count(&scan->parent_load);
15480 +       }
15481 +
15482 +       done_load_count(&next_load);
15483 +       done_lh(&next_lock);
15484 +       return ret;
15485 +}
15486 +
15487 +/* FLUSH POS HELPERS */
15488 +
15489 +/* Initialize the fields of a flush_position. */
15490 +static void pos_init(flush_pos_t *pos)
15491 +{
15492 +       memset(pos, 0, sizeof *pos);
15493 +
15494 +       pos->state = POS_INVALID;
15495 +       coord_init_invalid(&pos->coord, NULL);
15496 +       init_lh(&pos->lock);
15497 +       init_load_count(&pos->load);
15498 +
15499 +       reiser4_blocknr_hint_init(&pos->preceder);
15500 +}
15501 +
15502 +/* The flush loop inside squalloc periodically checks pos_valid to determine
15503 +   when "enough flushing" has been performed. This will return true until one
15504 +   of the following conditions is met:
15505 +
15506 +   1. the number of flush-queued nodes has reached the kernel-supplied
15507 +   "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the
15508 +   kernel requested. When flushing to commit, this parameter is NULL.
15509 +
15510 +   2. pos_stop() is called because squalloc discovers that the "next" node in
15511 +   the flush order is either nonexistent, not dirty, or not in the same atom.
15512 +*/
15513 +
15514 +static int pos_valid(flush_pos_t *pos)
15515 +{
15516 +       return pos->state != POS_INVALID;
15517 +}
15518 +
15519 +/* Release any resources of a flush_position. Called when jnode_flush
15520 +   finishes. */
15521 +static void pos_done(flush_pos_t *pos)
15522 +{
15523 +       pos_stop(pos);
15524 +       reiser4_blocknr_hint_done(&pos->preceder);
15525 +       if (convert_data(pos))
15526 +               free_convert_data(pos);
15527 +}
15528 +
15529 +/* Reset the point and parent.  Called during flush subroutines to terminate the
15530 +   squalloc loop. */
15531 +static int pos_stop(flush_pos_t *pos)
15532 +{
15533 +       pos->state = POS_INVALID;
15534 +       done_lh(&pos->lock);
15535 +       done_load_count(&pos->load);
15536 +       coord_init_invalid(&pos->coord, NULL);
15537 +
15538 +       if (pos->child) {
15539 +               jput(pos->child);
15540 +               pos->child = NULL;
15541 +       }
15542 +
15543 +       return 0;
15544 +}
15545 +
15546 +/* Return the flush_position's block allocator hint. */
15547 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos)
15548 +{
15549 +       return &pos->preceder;
15550 +}
15551 +
15552 +flush_queue_t *reiser4_pos_fq(flush_pos_t *pos)
15553 +{
15554 +       return pos->fq;
15555 +}
15556 +
15557 +/* Make Linus happy.
15558 +   Local variables:
15559 +   c-indentation-style: "K&R"
15560 +   mode-name: "LC"
15561 +   c-basic-offset: 8
15562 +   tab-width: 8
15563 +   fill-column: 90
15564 +   LocalWords:  preceder
15565 +   End:
15566 +*/
15567 diff -puN /dev/null fs/reiser4/flush.h
15568 --- /dev/null
15569 +++ a/fs/reiser4/flush.h
15570 @@ -0,0 +1,300 @@
15571 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15572 +
15573 +/* DECLARATIONS: */
15574 +
15575 +#if !defined(__REISER4_FLUSH_H__)
15576 +#define __REISER4_FLUSH_H__
15577 +
15578 +#include "plugin/cluster.h"
15579 +
15580 +/* The flush_scan data structure maintains the state of an in-progress
15581 +   flush-scan on a single level of the tree. A flush-scan is used for counting
15582 +   the number of adjacent nodes to flush, which is used to determine whether we
15583 +   should relocate, and it is also used to find a starting point for flush. A
15584 +   flush-scan object can scan in both right and left directions via the
15585 +   scan_left() and scan_right() interfaces. The right- and left-variations are
15586 +   similar but perform different functions. When scanning left we (optionally
15587 +   perform rapid scanning and then) longterm-lock the endpoint node. When
15588 +   scanning right we are simply counting the number of adjacent, dirty nodes. */
15589 +struct flush_scan {
15590 +
15591 +       /* The current number of nodes scanned on this level. */
15592 +       unsigned count;
15593 +
15594 +       /* There may be a maximum number of nodes for a scan on any single
15595 +          level. When going leftward, max_count is determined by
15596 +          FLUSH_SCAN_MAXNODES (see reiser4.h) */
15597 +       unsigned max_count;
15598 +
15599 +       /* Direction: Set to one of the sideof enumeration:
15600 +          { LEFT_SIDE, RIGHT_SIDE }. */
15601 +       sideof direction;
15602 +
15603 +       /* Initially @stop is set to false then set true once some condition
15604 +          stops the search (e.g., we found a clean node before reaching
15605 +          max_count or we found a node belonging to another atom). */
15606 +       int stop;
15607 +
15608 +       /* The current scan position.  If @node is non-NULL then its reference
15609 +          count has been incremented to reflect this reference. */
15610 +       jnode *node;
15611 +
15612 +       /* A handle for zload/zrelse of current scan position node. */
15613 +       load_count node_load;
15614 +
15615 +       /* During left-scan, if the final position (a.k.a. endpoint node) is
15616 +          formatted the node is locked using this lock handle. The endpoint
15617 +          needs to be locked for transfer to the flush_position object after
15618 +          scanning finishes. */
15619 +       lock_handle node_lock;
15620 +
15621 +       /* When the position is unformatted, its parent, coordinate, and parent
15622 +          zload/zrelse handle. */
15623 +       lock_handle parent_lock;
15624 +       coord_t parent_coord;
15625 +       load_count parent_load;
15626 +
15627 +       /* The block allocator preceder hint.  Sometimes flush_scan determines
15628 +          what the preceder is and if so it sets it here, after which it is
15629 +          copied into the flush_position. Otherwise, the preceder is computed
15630 +          later. */
15631 +       reiser4_block_nr preceder_blk;
15632 +};
15633 +
15634 +struct convert_item_info {
15635 +       dc_item_stat d_cur;     /* disk cluster state of the current item */
15636 +       dc_item_stat d_next;    /* disk cluster state of the next slum item */
15637 +       int cluster_shift;      /* disk cluster shift */
15638 +       flow_t flow;            /* disk cluster data */
15639 +};
15640 +
15641 +struct convert_info {
15642 +       int count;              /* for squalloc terminating */
15643 +       item_plugin *iplug;     /* current item plugin */
15644 +       struct convert_item_info *itm;  /* current item info */
15645 +       struct cluster_handle clust;    /* transform cluster */
15646 +};
15647 +
15648 +typedef enum flush_position_state {
15649 +       POS_INVALID,            /* Invalid or stopped pos, do not continue slum
15650 +                                * processing */
15651 +       POS_ON_LEAF,            /* pos points to already prepped, locked
15652 +                                * formatted node at leaf level */
15653 +       POS_ON_EPOINT,          /* pos keeps a lock on twig level, "coord" field
15654 +                                * is used to traverse unformatted nodes */
15655 +       POS_TO_LEAF,            /* pos is being moved to leaf level */
15656 +       POS_TO_TWIG,            /* pos is being moved to twig level */
15657 +       POS_END_OF_TWIG,        /* special case of POS_ON_EPOINT, when coord is
15658 +                                * after rightmost unit of the current twig */
15659 +       POS_ON_INTERNAL         /* same as POS_ON_LEAF, but points to internal
15660 +                                * node */
15661 +} flushpos_state_t;
15662 +
15663 +/* An encapsulation of the current flush point and all the parameters that are
15664 +   passed through the entire squeeze-and-allocate stage of the flush routine.
15665 +   A single flush_position object is constructed after left- and right-scanning
15666 +   finishes. */
15667 +struct flush_position {
15668 +       flushpos_state_t state;
15669 +
15670 +       coord_t coord;          /* coord to traverse unformatted nodes */
15671 +       lock_handle lock;       /* current lock we hold */
15672 +       load_count load;        /* load status for current locked formatted node
15673 +                               */
15674 +       jnode *child;           /* for passing a reference to unformatted child
15675 +                                * across pos state changes */
15676 +
15677 +       reiser4_blocknr_hint preceder;  /* The flush 'hint' state. */
15678 +       int leaf_relocate;      /* True if enough leaf-level nodes were
15679 +                                * found to suggest a relocate policy. */
15680 +       int alloc_cnt;          /* The number of nodes allocated during squeeze
15681 +                                  and allocate. */
15682 +       int prep_or_free_cnt;   /* The number of nodes prepared for write
15683 +                                  (allocate) or squeezed and freed. */
15684 +       flush_queue_t *fq;
15685 +       long *nr_written;       /* number of nodes submitted to disk */
15686 +       int flags;              /* a copy of jnode_flush flags argument */
15687 +
15688 +       znode *prev_twig;       /* previous parent pointer value, used to catch
15689 +                                * processing of new twig node */
15690 +       struct convert_info *sq;        /* convert info */
15691 +
15692 +       unsigned long pos_in_unit;      /* for extents only. Position
15693 +                                          within an extent unit of first
15694 +                                          jnode of slum */
15695 +       long nr_to_write;       /* number of unformatted nodes to handle on
15696 +                                  flush */
15697 +};
15698 +
15699 +static inline int item_convert_count(flush_pos_t *pos)
15700 +{
15701 +       return pos->sq->count;
15702 +}
15703 +static inline void inc_item_convert_count(flush_pos_t *pos)
15704 +{
15705 +       pos->sq->count++;
15706 +}
15707 +static inline void set_item_convert_count(flush_pos_t *pos, int count)
15708 +{
15709 +       pos->sq->count = count;
15710 +}
15711 +static inline item_plugin *item_convert_plug(flush_pos_t *pos)
15712 +{
15713 +       return pos->sq->iplug;
15714 +}
15715 +
15716 +static inline struct convert_info *convert_data(flush_pos_t *pos)
15717 +{
15718 +       return pos->sq;
15719 +}
15720 +
15721 +static inline struct convert_item_info *item_convert_data(flush_pos_t *pos)
15722 +{
15723 +       assert("edward-955", convert_data(pos));
15724 +       return pos->sq->itm;
15725 +}
15726 +
15727 +static inline struct tfm_cluster *tfm_cluster_sq(flush_pos_t *pos)
15728 +{
15729 +       return &pos->sq->clust.tc;
15730 +}
15731 +
15732 +static inline struct tfm_stream *tfm_stream_sq(flush_pos_t *pos,
15733 +                                               tfm_stream_id id)
15734 +{
15735 +       assert("edward-854", pos->sq != NULL);
15736 +       return get_tfm_stream(tfm_cluster_sq(pos), id);
15737 +}
15738 +
15739 +static inline int chaining_data_present(flush_pos_t *pos)
15740 +{
15741 +       return convert_data(pos) && item_convert_data(pos);
15742 +}
15743 +
15744 +/* Returns true if next node contains next item of the disk cluster
15745 +   so item convert data should be moved to the right slum neighbor.
15746 +*/
15747 +static inline int should_chain_next_node(flush_pos_t *pos)
15748 +{
15749 +       int result = 0;
15750 +
15751 +       assert("edward-1007", chaining_data_present(pos));
15752 +
15753 +       switch (item_convert_data(pos)->d_next) {
15754 +       case DC_CHAINED_ITEM:
15755 +               result = 1;
15756 +               break;
15757 +       case DC_AFTER_CLUSTER:
15758 +               break;
15759 +       default:
15760 +               impossible("edward-1009", "bad state of next slum item");
15761 +       }
15762 +       return result;
15763 +}
15764 +
15765 +/* update item state in a disk cluster to assign conversion mode */
15766 +static inline void
15767 +move_chaining_data(flush_pos_t *pos, int this_node/* where is next item */)
15768 +{
15769 +
15770 +       assert("edward-1010", chaining_data_present(pos));
15771 +
15772 +       if (this_node == 0) {
15773 +               /* next item is on the right neighbor */
15774 +               assert("edward-1011",
15775 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
15776 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
15777 +               assert("edward-1012",
15778 +                      item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
15779 +
15780 +               item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
15781 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
15782 +       } else {
15783 +               /* next item is on the same node */
15784 +               assert("edward-1013",
15785 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
15786 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
15787 +               assert("edward-1227",
15788 +                      item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
15789 +                      item_convert_data(pos)->d_next == DC_INVALID_STATE);
15790 +
15791 +               item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
15792 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
15793 +       }
15794 +}
15795 +
15796 +static inline int should_convert_node(flush_pos_t *pos, znode * node)
15797 +{
15798 +       return znode_convertible(node);
15799 +}
15800 +
15801 +/* true if there is attached convert item info */
15802 +static inline int should_convert_next_node(flush_pos_t *pos)
15803 +{
15804 +       return convert_data(pos) && item_convert_data(pos);
15805 +}
15806 +
15807 +#define SQUALLOC_THRESHOLD 256
15808 +
15809 +static inline int should_terminate_squalloc(flush_pos_t *pos)
15810 +{
15811 +       return convert_data(pos) &&
15812 +           !item_convert_data(pos) &&
15813 +           item_convert_count(pos) >= SQUALLOC_THRESHOLD;
15814 +}
15815 +
15816 +#if 1
15817 +#define check_convert_info(pos)                                                \
15818 +do {                                                                   \
15819 +       if (unlikely(should_convert_next_node(pos))) {                  \
15820 +               warning("edward-1006", "unprocessed chained data");     \
15821 +               printk("d_cur = %d, d_next = %d, flow.len = %llu\n",    \
15822 +                      item_convert_data(pos)->d_cur,                   \
15823 +                      item_convert_data(pos)->d_next,                  \
15824 +                      item_convert_data(pos)->flow.length);            \
15825 +       }                                                               \
15826 +} while (0)
15827 +#else
15828 +#define check_convert_info(pos)
15829 +#endif /* REISER4_DEBUG */
15830 +
15831 +void free_convert_data(flush_pos_t *pos);
15832 +/* used in extent.c */
15833 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
15834 +                    const coord_t *parent);
15835 +int reiser4_scan_finished(flush_scan * scan);
15836 +int reiser4_scanning_left(flush_scan * scan);
15837 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
15838 +txn_atom *atom_locked_by_fq(flush_queue_t *fq);
15839 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
15840 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
15841 +                              reiser4_key *stop_key);
15842 +extern int reiser4_init_fqs(void);
15843 +extern void reiser4_done_fqs(void);
15844 +
15845 +#if REISER4_DEBUG
15846 +
15847 +extern void reiser4_check_fq(const txn_atom *atom);
15848 +extern atomic_t flush_cnt;
15849 +
15850 +#define check_preceder(blk) \
15851 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
15852 +extern void check_pos(flush_pos_t *pos);
15853 +#else
15854 +#define check_preceder(b) noop
15855 +#define check_pos(pos) noop
15856 +#endif
15857 +
15858 +/* __REISER4_FLUSH_H__ */
15859 +#endif
15860 +
15861 +/* Make Linus happy.
15862 +   Local variables:
15863 +   c-indentation-style: "K&R"
15864 +   mode-name: "LC"
15865 +   c-basic-offset: 8
15866 +   tab-width: 8
15867 +   fill-column: 90
15868 +   LocalWords:  preceder
15869 +   End:
15870 +*/
15871 diff -puN /dev/null fs/reiser4/flush_queue.c
15872 --- /dev/null
15873 +++ a/fs/reiser4/flush_queue.c
15874 @@ -0,0 +1,678 @@
15875 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
15876 +   reiser4/README */
15877 +
15878 +#include "debug.h"
15879 +#include "super.h"
15880 +#include "txnmgr.h"
15881 +#include "jnode.h"
15882 +#include "znode.h"
15883 +#include "page_cache.h"
15884 +#include "wander.h"
15885 +#include "vfs_ops.h"
15886 +#include "writeout.h"
15887 +#include "flush.h"
15888 +
15889 +#include <linux/bio.h>
15890 +#include <linux/mm.h>
15891 +#include <linux/pagemap.h>
15892 +#include <linux/blkdev.h>
15893 +#include <linux/writeback.h>
15894 +
15895 +/* A flush queue object is an accumulator for keeping jnodes prepared
15896 +   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
15897 +   kept on the flush queue until memory pressure or atom commit asks
15898 +   flush queues to write some or all of their jnodes. */
15899 +
15900 +/*
15901 +   LOCKING:
15902 +
15903 +   fq->guard spin lock protects fq->atom pointer and nothing else.  fq->prepped
15904 +   list is protected by atom spin lock.  fq->prepped list uses the following
15905 +   locking:
15906 +
15907 +   two ways to protect fq->prepped list for read-only list traversal:
15908 +
15909 +   1. atom is spin-locked.
15910 +   2. fq is IN_USE, atom->nr_running_queues increased.
15911 +
15912 +   and one for list modification:
15913 +
15914 +   1. atom is spin-locked and one condition is true: fq is IN_USE or
15915 +      atom->nr_running_queues == 0.
15916 +
15917 +   The deadlock-safe order for flush queues and atoms is: first lock atom, then
15918 +   lock flush queue, then lock jnode.
15919 +*/
15920 +
15921 +#define fq_in_use(fq)          ((fq)->state & FQ_IN_USE)
15922 +#define fq_ready(fq)           (!fq_in_use(fq))
15923 +
15924 +#define mark_fq_in_use(fq)     do { (fq)->state |= FQ_IN_USE;    } while (0)
15925 +#define mark_fq_ready(fq)      do { (fq)->state &= ~FQ_IN_USE;   } while (0)
15926 +
15927 +/* get lock on atom from locked flush queue object */
15928 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t *fq)
15929 +{
15930 +       /* This code is similar to jnode_get_atom(), look at it for the
15931 +        * explanation. */
15932 +       txn_atom *atom;
15933 +
15934 +       assert_spin_locked(&(fq->guard));
15935 +
15936 +       while (1) {
15937 +               atom = fq->atom;
15938 +               if (atom == NULL)
15939 +                       break;
15940 +
15941 +               if (spin_trylock_atom(atom))
15942 +                       break;
15943 +
15944 +               atomic_inc(&atom->refcount);
15945 +               spin_unlock(&(fq->guard));
15946 +               spin_lock_atom(atom);
15947 +               spin_lock(&(fq->guard));
15948 +
15949 +               if (fq->atom == atom) {
15950 +                       atomic_dec(&atom->refcount);
15951 +                       break;
15952 +               }
15953 +
15954 +               spin_unlock(&(fq->guard));
15955 +               atom_dec_and_unlock(atom);
15956 +               spin_lock(&(fq->guard));
15957 +       }
15958 +
15959 +       return atom;
15960 +}
15961 +
15962 +txn_atom *atom_locked_by_fq(flush_queue_t *fq)
15963 +{
15964 +       txn_atom *atom;
15965 +
15966 +       spin_lock(&(fq->guard));
15967 +       atom = atom_locked_by_fq_nolock(fq);
15968 +       spin_unlock(&(fq->guard));
15969 +       return atom;
15970 +}
15971 +
15972 +static void init_fq(flush_queue_t *fq)
15973 +{
15974 +       memset(fq, 0, sizeof *fq);
15975 +
15976 +       atomic_set(&fq->nr_submitted, 0);
15977 +
15978 +       INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
15979 +
15980 +       init_waitqueue_head(&fq->wait);
15981 +       spin_lock_init(&fq->guard);
15982 +}
15983 +
15984 +/* slab for flush queues */
15985 +static struct kmem_cache *fq_slab;
15986 +
15987 +/**
15988 + * reiser4_init_fqs - create flush queue cache
15989 + *
15990 + * Initializes slab cache of flush queues. It is part of reiser4 module
15991 + * initialization.
15992 + */
15993 +int reiser4_init_fqs(void)
15994 +{
15995 +       fq_slab = kmem_cache_create("fq",
15996 +                                   sizeof(flush_queue_t),
15997 +                                   0, SLAB_HWCACHE_ALIGN, NULL);
15998 +       if (fq_slab == NULL)
15999 +               return RETERR(-ENOMEM);
16000 +       return 0;
16001 +}
16002 +
16003 +/**
16004 + * reiser4_done_fqs - delete flush queue cache
16005 + *
16006 + * This is called on reiser4 module unloading or system shutdown.
16007 + */
16008 +void reiser4_done_fqs(void)
16009 +{
16010 +       destroy_reiser4_cache(&fq_slab);
16011 +}
16012 +
16013 +/* create new flush queue object */
16014 +static flush_queue_t *create_fq(gfp_t gfp)
16015 +{
16016 +       flush_queue_t *fq;
16017 +
16018 +       fq = kmem_cache_alloc(fq_slab, gfp);
16019 +       if (fq)
16020 +               init_fq(fq);
16021 +
16022 +       return fq;
16023 +}
16024 +
16025 +/* adjust atom's and flush queue's counters of queued nodes */
16026 +static void count_enqueued_node(flush_queue_t *fq)
16027 +{
16028 +       ON_DEBUG(fq->atom->num_queued++);
16029 +}
16030 +
16031 +static void count_dequeued_node(flush_queue_t *fq)
16032 +{
16033 +       assert("zam-993", fq->atom->num_queued > 0);
16034 +       ON_DEBUG(fq->atom->num_queued--);
16035 +}
16036 +
16037 +/* attach flush queue object to the atom */
16038 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
16039 +{
16040 +       assert_spin_locked(&(atom->alock));
16041 +       list_add(&fq->alink, &atom->flush_queues);
16042 +       fq->atom = atom;
16043 +       ON_DEBUG(atom->nr_flush_queues++);
16044 +}
16045 +
16046 +static void detach_fq(flush_queue_t *fq)
16047 +{
16048 +       assert_spin_locked(&(fq->atom->alock));
16049 +
16050 +       spin_lock(&(fq->guard));
16051 +       list_del_init(&fq->alink);
16052 +       assert("vs-1456", fq->atom->nr_flush_queues > 0);
16053 +       ON_DEBUG(fq->atom->nr_flush_queues--);
16054 +       fq->atom = NULL;
16055 +       spin_unlock(&(fq->guard));
16056 +}
16057 +
16058 +/* destroy flush queue object */
16059 +static void done_fq(flush_queue_t *fq)
16060 +{
16061 +       assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
16062 +       assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
16063 +
16064 +       kmem_cache_free(fq_slab, fq);
16065 +}
16066 +
16067 +/* set JNODE_FLUSH_QUEUED on @node and count it as enqueued via @fq */
16068 +static void mark_jnode_queued(flush_queue_t *fq, jnode * node)
16069 +{
16070 +       JF_SET(node, JNODE_FLUSH_QUEUED);
16071 +       count_enqueued_node(fq);
16072 +}
16073 +
16074 +/* Putting jnode into the flush queue. Both atom and jnode should be
16075 +   spin-locked. */
16076 +void queue_jnode(flush_queue_t *fq, jnode * node)
16077 +{
16078 +       assert_spin_locked(&(node->guard));
16079 +       assert("zam-713", node->atom != NULL);
16080 +       assert_spin_locked(&(node->atom->alock));
16081 +       assert("zam-716", fq->atom != NULL);
16082 +       assert("zam-717", fq->atom == node->atom);
16083 +       assert("zam-907", fq_in_use(fq));
16084 +
16085 +       assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
16086 +       assert("zam-826", JF_ISSET(node, JNODE_RELOC));
16087 +       assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
16088 +       assert("vs-1481", NODE_LIST(node) != FQ_LIST);
16089 +
16090 +       mark_jnode_queued(fq, node);
16091 +       list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
16092 +
16093 +       ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
16094 +                            FQ_LIST, 1));
16095 +}
16096 +
16097 +/* repeatable process for waiting io completion on a flush queue object */
16098 +static int wait_io(flush_queue_t *fq, int *nr_io_errors)
16099 +{
16100 +       assert("zam-738", fq->atom != NULL);
16101 +       assert_spin_locked(&(fq->atom->alock));
16102 +       assert("zam-736", fq_in_use(fq));
16103 +       assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
16104 +
16105 +       if (atomic_read(&fq->nr_submitted) != 0) {
16106 +               struct super_block *super;
16107 +
16108 +               spin_unlock_atom(fq->atom);
16109 +
16110 +               assert("nikita-3013", reiser4_schedulable());
16111 +
16112 +               super = reiser4_get_current_sb();
16113 +
16114 +               /* FIXME: this is instead of blk_run_queues() */
16115 +               blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
16116 +
16117 +               if (!(super->s_flags & MS_RDONLY))
16118 +                       wait_event(fq->wait,
16119 +                                  atomic_read(&fq->nr_submitted) == 0);
16120 +
16121 +               /* Ask the caller to re-acquire the locks and call this
16122 +                  function again. Note: this technique is commonly used in
16123 +                  the txnmgr code. */
16124 +               return -E_REPEAT;
16125 +       }
16126 +
16127 +       *nr_io_errors += atomic_read(&fq->nr_errors);
16128 +       return 0;
16129 +}
16130 +
16131 +/* wait on I/O completion; once it is done, detach and free the flush queue */
16132 +static int finish_fq(flush_queue_t *fq, int *nr_io_errors)
16133 +{
16134 +       int ret;
16135 +       txn_atom *atom = fq->atom;
16136 +
16137 +       assert("zam-801", atom != NULL);
16138 +       assert_spin_locked(&(atom->alock));
16139 +       assert("zam-762", fq_in_use(fq));
16140 +
16141 +       ret = wait_io(fq, nr_io_errors);
16142 +       if (ret)
16143 +               return ret;
16144 +
16145 +       detach_fq(fq);
16146 +       done_fq(fq);
16147 +
16148 +       reiser4_atom_send_event(atom);
16149 +
16150 +       return 0;
16151 +}
16152 +
16153 +/* wait for all i/o for given atom to be completed, actually do one iteration
16154 +   on that and return -E_REPEAT if more iterations are needed */
16155 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
16156 +{
16157 +       flush_queue_t *fq;
16158 +
16159 +       assert_spin_locked(&(atom->alock));
16160 +
16161 +       if (list_empty_careful(&atom->flush_queues))
16162 +               return 0;
16163 +
16164 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
16165 +               if (fq_ready(fq)) {
16166 +                       int ret;
16167 +
16168 +                       mark_fq_in_use(fq);
16169 +                       assert("vs-1247", fq->owner == NULL);
16170 +                       ON_DEBUG(fq->owner = current);
16171 +                       ret = finish_fq(fq, nr_io_errors);
16172 +
16173 +                       if (*nr_io_errors)
16174 +                               reiser4_handle_error();
16175 +
16176 +                       if (ret) {
16177 +                               reiser4_fq_put(fq);
16178 +                               return ret;
16179 +                       }
16180 +
16181 +                       spin_unlock_atom(atom);
16182 +
16183 +                       return -E_REPEAT;
16184 +               }
16185 +       }
16186 +
16187 +       /* All flush queues are in use; atom remains locked */
16188 +       return -EBUSY;
16189 +}
16190 +
16191 +/* wait all i/o for current atom */
16192 +int current_atom_finish_all_fq(void)
16193 +{
16194 +       txn_atom *atom;
16195 +       int nr_io_errors = 0;
16196 +       int ret = 0;
16197 +
16198 +       do {
16199 +               while (1) {
16200 +                       atom = get_current_atom_locked();
16201 +                       ret = finish_all_fq(atom, &nr_io_errors);
16202 +                       if (ret != -EBUSY)
16203 +                               break;
16204 +                       reiser4_atom_wait_event(atom);
16205 +               }
16206 +       } while (ret == -E_REPEAT);
16207 +
16208 +       /* we do not need locked atom after this function finishes, SUCCESS or
16209 +          -EBUSY are two return codes when atom remains locked after
16210 +          finish_all_fq */
16211 +       if (!ret)
16212 +               spin_unlock_atom(atom);
16213 +
16214 +       assert_spin_not_locked(&(atom->alock));
16215 +
16216 +       if (ret)
16217 +               return ret;
16218 +
16219 +       if (nr_io_errors)
16220 +               return RETERR(-EIO);
16221 +
16222 +       return 0;
16223 +}
16224 +
16225 +/* change node->atom field for all jnode from given list */
16226 +static void
16227 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
16228 +{
16229 +       jnode *cur;
16230 +
16231 +       list_for_each_entry(cur, list, capture_link) {
16232 +               spin_lock_jnode(cur);
16233 +               cur->atom = atom;
16234 +               spin_unlock_jnode(cur);
16235 +       }
16236 +}
16237 +
16238 +/* support for atom fusion operation */
16239 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
16240 +{
16241 +       flush_queue_t *fq;
16242 +
16243 +       assert_spin_locked(&(to->alock));
16244 +       assert_spin_locked(&(from->alock));
16245 +
16246 +       list_for_each_entry(fq, &from->flush_queues, alink) {
16247 +               scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
16248 +               spin_lock(&(fq->guard));
16249 +               fq->atom = to;
16250 +               spin_unlock(&(fq->guard));
16251 +       }
16252 +
16253 +       list_splice_init(&from->flush_queues, to->flush_queues.prev);
16254 +
16255 +#if REISER4_DEBUG
16256 +       to->num_queued += from->num_queued;
16257 +       to->nr_flush_queues += from->nr_flush_queues;
16258 +       from->nr_flush_queues = 0;
16259 +#endif
16260 +}
16261 +
16262 +#if REISER4_DEBUG
16263 +int atom_fq_parts_are_clean(txn_atom * atom)
16264 +{
16265 +       assert("zam-915", atom != NULL);
16266 +       return list_empty_careful(&atom->flush_queues);
16267 +}
16268 +#endif
16269 +/* Bio i/o completion routine for reiser4 write operations. */
16270 +static void
16271 +end_io_handler(struct bio *bio, int err)
16272 +{
16273 +       int i;
16274 +       int nr_errors = 0;
16275 +       flush_queue_t *fq;
16276 +
16277 +       assert("zam-958", bio->bi_rw & WRITE);
16278 +
16279 +       if (err == -EOPNOTSUPP)
16280 +               set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
16281 +
16282 +       /* we expect that bio->bi_private is set to NULL or fq object which is
16283 +        * used for synchronization and error counting. */
16284 +       fq = bio->bi_private;
16285 +       /* Check all elements of io_vec for correct write completion. */
16286 +       for (i = 0; i < bio->bi_vcnt; i += 1) {
16287 +               struct page *pg = bio->bi_io_vec[i].bv_page;
16288 +
16289 +               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
16290 +                       SetPageError(pg);
16291 +                       nr_errors++;
16292 +               }
16293 +
16294 +               {
16295 +                       /* jnode WRITEBACK ("write is in progress bit") is
16296 +                        * atomically cleared here. */
16297 +                       jnode *node;
16298 +
16299 +                       assert("zam-736", pg != NULL);
16300 +                       assert("zam-736", PagePrivate(pg));
16301 +                       node = jprivate(pg);
16302 +
16303 +                       JF_CLR(node, JNODE_WRITEBACK);
16304 +               }
16305 +
16306 +               end_page_writeback(pg);
16307 +               page_cache_release(pg);
16308 +       }
16309 +
16310 +       if (fq) {
16311 +               /* count i/o error in fq object */
16312 +               atomic_add(nr_errors, &fq->nr_errors);
16313 +
16314 +               /* If all write requests registered in this "fq" are done we up
16315 +                * the waiter. */
16316 +               if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
16317 +                       wake_up(&fq->wait);
16318 +       }
16319 +
16320 +       bio_put(bio);
16321 +}
16322 +
16323 +/* Count I/O requests which will be submitted by @bio in given flush queues
16324 +   @fq */
16325 +void add_fq_to_bio(flush_queue_t *fq, struct bio *bio)
16326 +{
16327 +       bio->bi_private = fq;
16328 +       bio->bi_end_io = end_io_handler;
16329 +
16330 +       if (fq)
16331 +               atomic_add(bio->bi_vcnt, &fq->nr_submitted);
16332 +}
16333 +
16334 +/* Move every jnode from @fq's list to its atom's dirty or clean list. */
16335 +static void release_prepped_list(flush_queue_t *fq)
16336 +{
16337 +       txn_atom *atom;
16338 +
16339 +       assert("zam-904", fq_in_use(fq));
16340 +       atom = atom_locked_by_fq(fq);   /* unlocked at the end */
16341 +
16342 +       while (!list_empty(ATOM_FQ_LIST(fq))) {
16343 +               jnode *cur;
16344 +
16345 +               cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
16346 +               list_del_init(&cur->capture_link);
16347 +
16348 +               count_dequeued_node(fq);
16349 +               spin_lock_jnode(cur);
16350 +               assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
16351 +               assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
16352 +               assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
16353 +               JF_CLR(cur, JNODE_FLUSH_QUEUED);
16354 +               /* NOTE(review): the three asserts above share one label */
16355 +               if (JF_ISSET(cur, JNODE_DIRTY)) {       /* still dirty */
16356 +                       list_add_tail(&cur->capture_link,
16357 +                                     ATOM_DIRTY_LIST(atom,
16358 +                                                     jnode_get_level(cur)));
16359 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
16360 +                                            DIRTY_LIST, 1));
16361 +               } else {        /* not dirty: it joins the clean list */
16362 +                       list_add_tail(&cur->capture_link,
16363 +                                     ATOM_CLEAN_LIST(atom));
16364 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
16365 +                                            CLEAN_LIST, 1));
16366 +               }
16367 +
16368 +               spin_unlock_jnode(cur);
16369 +       }
16370 +       /* this queue is done; send atom event if no queue is running */
16371 +       if (--atom->nr_running_queues == 0)
16372 +               reiser4_atom_send_event(atom);
16373 +
16374 +       spin_unlock_atom(atom);
16375 +}
16376 +
16377 +/* Submit write requests for nodes on the already filled flush queue @fq.
16378 +   With WRITEOUT_SINGLE_STREAM in @flags, first wait until the atom has no
16379 +   running queues. @nr_submitted and @flags are passed to write_jnode_list().
16380 +   @fq: flush queue object which contains jnodes we can (and will) write.
16381 +   @return: value of write_jnode_list(); presumably <0 means an error. */
16382 +int reiser4_write_fq(flush_queue_t *fq, long *nr_submitted, int flags)
16383 +{
16384 +       int ret;
16385 +       txn_atom *atom;
16386 +
16387 +       while (1) {
16388 +               atom = atom_locked_by_fq(fq);
16389 +               assert("zam-924", atom);
16390 +               /* single-stream mode: do not write fqs in parallel */
16391 +               if (atom->nr_running_queues == 0
16392 +                   || !(flags & WRITEOUT_SINGLE_STREAM))
16393 +                       break;
16394 +               reiser4_atom_wait_event(atom);
16395 +       }
16396 +
16397 +       atom->nr_running_queues++;      /* undone in release_prepped_list() */
16398 +       spin_unlock_atom(atom);
16399 +
16400 +       ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
16401 +       release_prepped_list(fq);
16402 +
16403 +       return ret;
16404 +}
16405 +
16406 +/* Getting flush queue object for exclusive use by one thread. May require
16407 +   several iterations which is indicated by -E_REPEAT return code: then the
16408 +   atom lock has been dropped and *@new_fq holds a newly created queue to be
16409 +   passed back in. Returns 0 with the atom still locked and *@new_fq set, or
16410 +   -ENOMEM. This function does not obtain the atom lock itself because an
16411 +   atom lock is obtained in different ways: usually it is the current atom,
16412 +   but an fq may also be needed for the atom of a given jnode. */
16413 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
16414 +{
16415 +       flush_queue_t *fq;
16416 +
16417 +       assert_spin_locked(&(atom->alock));
16418 +       /* first look for a ready queue already on @atom's list */
16419 +       fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
16420 +       while (&atom->flush_queues != &fq->alink) {
16421 +               spin_lock(&(fq->guard));
16422 +
16423 +               if (fq_ready(fq)) {
16424 +                       mark_fq_in_use(fq);
16425 +                       assert("vs-1246", fq->owner == NULL);
16426 +                       ON_DEBUG(fq->owner = current);
16427 +                       spin_unlock(&(fq->guard));
16428 +                       /* the caller's spare fq, if any, is not needed */
16429 +                       if (*new_fq)
16430 +                               done_fq(*new_fq);
16431 +
16432 +                       *new_fq = fq;
16433 +
16434 +                       return 0;
16435 +               }
16436 +
16437 +               spin_unlock(&(fq->guard));
16438 +
16439 +               fq = list_entry(fq->alink.next, flush_queue_t, alink);
16440 +       }
16441 +
16442 +       /* No ready queue: use the fq passed in by the caller, if any */
16443 +       if (*new_fq) {
16444 +               mark_fq_in_use(*new_fq);
16445 +               assert("vs-1248", (*new_fq)->owner == 0);
16446 +               ON_DEBUG((*new_fq)->owner = current);
16447 +               attach_fq(atom, *new_fq);
16448 +
16449 +               return 0;
16450 +       }
16451 +       /* nothing to reuse: drop the atom lock before allocating a new fq */
16452 +       spin_unlock_atom(atom);
16453 +
16454 +       *new_fq = create_fq(gfp);
16455 +
16456 +       if (*new_fq == NULL)
16457 +               return RETERR(-ENOMEM);
16458 +
16459 +       return RETERR(-E_REPEAT);
16460 +}
16461 +
16462 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t **new_fq)
16463 +{      /* fq_by_atom_gfp() with reiser4_ctx_gfp_mask_get() as gfp mask */
16464 +       return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
16465 +}
16466 +
16467 +/* Get a flush queue for the current atom, retrying on -E_REPEAT. On success
16468 +   fq->atom remains locked; on failure ERR_PTR(error) is returned. */
16469 +flush_queue_t *get_fq_for_current_atom(void)
16470 +{
16471 +       flush_queue_t *fq = NULL;
16472 +       txn_atom *atom;
16473 +       int ret;
16474 +
16475 +       do {
16476 +               atom = get_current_atom_locked();
16477 +               ret = reiser4_fq_by_atom(atom, &fq);
16478 +       } while (ret == -E_REPEAT);     /* atom lock was dropped: relock */
16479 +
16480 +       if (ret)
16481 +               return ERR_PTR(ret);
16482 +       return fq;
16483 +}
16484 +
16485 +/* Release @fq after exclusive use. Takes no locks; cf. reiser4_fq_put(). */
16486 +void reiser4_fq_put_nolock(flush_queue_t *fq)
16487 +{
16488 +       assert("zam-747", fq->atom != NULL);
16489 +       assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
16490 +       mark_fq_ready(fq);      /* queue's list must be empty by now */
16491 +       assert("vs-1245", fq->owner == current);
16492 +       ON_DEBUG(fq->owner = NULL);
16493 +}
16494 +
16495 +void reiser4_fq_put(flush_queue_t *fq)
16496 +{      /* release @fq under fq->guard and the atom lock */
16497 +       txn_atom *atom;
16498 +
16499 +       spin_lock(&(fq->guard));
16500 +       atom = atom_locked_by_fq_nolock(fq);
16501 +
16502 +       assert("zam-746", atom != NULL);
16503 +       /* mark @fq ready, then send the atom event, both locks held */
16504 +       reiser4_fq_put_nolock(fq);
16505 +       reiser4_atom_send_event(atom);
16506 +
16507 +       spin_unlock(&(fq->guard));
16508 +       spin_unlock_atom(atom);
16509 +}
16510 +
16511 +/* Part of atom object initialization: make the embedded list head of the
16512 +   atom's flush queues an empty list. */
16513 +
16514 +void init_atom_fq_parts(txn_atom *atom)
16515 +{
16516 +       INIT_LIST_HEAD(&atom->flush_queues);
16517 +}
16518 +
16519 +#if REISER4_DEBUG
16520 +/* Debug only: warn when atom->fq differs from the jnodes actually queued. */
16521 +void reiser4_check_fq(const txn_atom *atom)
16522 +{
16523 +       /* count the nodes on all of the atom's flush queues */
16524 +       flush_queue_t *fq;
16525 +       int count;
16526 +       struct list_head *pos;
16527 +
16528 +       count = 0;
16529 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
16530 +               spin_lock(&(fq->guard));
16531 +               /* add the number of jnodes on this fq's list */
16532 +               list_for_each(pos, ATOM_FQ_LIST(fq))
16533 +                       count++;
16534 +               spin_unlock(&(fq->guard));
16535 +       }
16536 +       if (count != atom->fq)
16537 +               warning("", "fq counter %d, real %d\n", atom->fq, count);
16538 +
16539 +}
16540 +
16541 +#endif /* REISER4_DEBUG */
16542 +
16543 +/*
16544 + * Local variables:
16545 + * c-indentation-style: "K&R"
16546 + * mode-name: "LC"
16547 + * c-basic-offset: 8
16548 + * tab-width: 8
16549 + * fill-column: 79
16550 + * scroll-step: 1
16551 + * End:
16552 + */
16553 diff -puN /dev/null fs/reiser4/forward.h
16554 --- /dev/null
16555 +++ a/fs/reiser4/forward.h
16556 @@ -0,0 +1,256 @@
16557 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16558 +   reiser4/README */
16559 +
16560 +/* Forward declarations. Thank you Kernighan. */
16561 +
16562 +#if !defined(__REISER4_FORWARD_H__)
16563 +#define __REISER4_FORWARD_H__
16564 +
16565 +#include <asm/errno.h>
16566 +#include <linux/types.h>
16567 +
16568 +typedef struct zlock zlock;
16569 +typedef struct lock_stack lock_stack;
16570 +typedef struct lock_handle lock_handle;
16571 +typedef struct znode znode;
16572 +typedef struct flow flow_t;
16573 +typedef struct coord coord_t;
16574 +typedef struct tree_access_pointer tap_t;
16575 +typedef struct reiser4_object_create_data reiser4_object_create_data;
16576 +typedef union reiser4_plugin reiser4_plugin;
16577 +typedef __u16 reiser4_plugin_id;
16578 +typedef __u64 reiser4_plugin_groups;
16579 +typedef struct item_plugin item_plugin;
16580 +typedef struct jnode_plugin jnode_plugin;
16581 +typedef struct reiser4_item_data reiser4_item_data;
16582 +typedef union reiser4_key reiser4_key;
16583 +typedef struct reiser4_tree reiser4_tree;
16584 +typedef struct carry_cut_data carry_cut_data;
16585 +typedef struct carry_kill_data carry_kill_data;
16586 +typedef struct carry_tree_op carry_tree_op;
16587 +typedef struct carry_tree_node carry_tree_node;
16588 +typedef struct carry_plugin_info carry_plugin_info;
16589 +typedef struct reiser4_journal reiser4_journal;
16590 +typedef struct txn_atom txn_atom;
16591 +typedef struct txn_handle txn_handle;
16592 +typedef struct txn_mgr txn_mgr;
16593 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
16594 +typedef struct reiser4_context reiser4_context;
16595 +typedef struct carry_level carry_level;
16596 +typedef struct blocknr_set_entry blocknr_set_entry;
16597 +/* super_block->s_fs_info points to this */
16598 +typedef struct reiser4_super_info_data reiser4_super_info_data;
16599 +/* next two objects are fields of reiser4_super_info_data */
16600 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
16601 +typedef struct reiser4_space_allocator reiser4_space_allocator;
16602 +
16603 +typedef struct flush_scan flush_scan;
16604 +typedef struct flush_position flush_pos_t;
16605 +
16606 +typedef unsigned short pos_in_node_t;
16607 +#define MAX_POS_IN_NODE 65535
16608 +
16609 +typedef struct jnode jnode;
16610 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
16611 +
16612 +typedef struct uf_coord uf_coord_t;
16613 +typedef struct hint hint_t;
16614 +
16615 +typedef struct ktxnmgrd_context ktxnmgrd_context;
16616 +
16617 +struct inode;
16618 +struct page;
16619 +struct file;
16620 +struct dentry;
16621 +struct super_block;
16622 +
16623 +/* return values of coord_by_key(). cbk == coord_by_key */
16624 +typedef enum {
16625 +       CBK_COORD_FOUND = 0,
16626 +       CBK_COORD_NOTFOUND = -ENOENT,
16627 +} lookup_result;
16628 +
16629 +/* results of lookup with directory file: 0, or a negated errno on failure */
16630 +typedef enum {
16631 +       FILE_NAME_FOUND = 0,
16632 +       FILE_NAME_NOTFOUND = -ENOENT,
16633 +       FILE_IO_ERROR = -EIO,   /* FIXME: it seems silly to have special OOM,
16634 +                                  IO_ERROR return codes for each search. */
16635 +       FILE_OOM = -ENOMEM      /* FIXME: it seems silly to have special OOM,
16636 +                                  IO_ERROR return codes for each search. */
16637 +} file_lookup_result;
16638 +
16639 +/* behaviors of lookup. If coord we are looking for is actually in a tree,
16640 +    both coincide. */
16641 +typedef enum {
16642 +       /* search exactly for the coord with key given */
16643 +       FIND_EXACT,
16644 +       /* search for coord with the maximal key not greater than one
16645 +          given */
16646 +       FIND_MAX_NOT_MORE_THAN  /*LEFT_SLANT_BIAS */
16647 +} lookup_bias;
16648 +
16649 +typedef enum {
16650 +       /* number of leaf level of the tree
16651 +          The fake root has (tree_level=0). */
16652 +       LEAF_LEVEL = 1,
16653 +
16654 +       /* number of level one above leaf level of the tree.
16655 +
16656 +          It is supposed that internal tree used by reiser4 to store file
16657 +          system data and meta data will have height 2 initially (when
16658 +          created by mkfs).
16659 +        */
16660 +       TWIG_LEVEL = 2,
16661 +} tree_level;
16662 +
16663 +/* The "real" maximum ztree height is the 0-origin size of any per-level
16664 +   array, since the zero'th level is not used. */
16665 +#define REAL_MAX_ZTREE_HEIGHT     (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
16666 +
16667 +/* enumeration of possible mutual positions of an item and a coord. This enum
16668 +    is the return type of the ->is_in_item() item plugin method (which see). */
16669 +typedef enum {
16670 +       /* coord is on the left of an item */
16671 +       IP_ON_THE_LEFT,
16672 +       /* coord is inside item */
16673 +       IP_INSIDE,
16674 +       /* coord is inside item, but to the right of the rightmost unit of
16675 +          this item */
16676 +       IP_RIGHT_EDGE,
16677 +       /* coord is on the right of an item */
16678 +       IP_ON_THE_RIGHT
16679 +} interposition;
16680 +
16681 +/* type of lock to acquire on znode before returning it to caller */
16682 +typedef enum {
16683 +       ZNODE_NO_LOCK = 0,
16684 +       ZNODE_READ_LOCK = 1,
16685 +       ZNODE_WRITE_LOCK = 2,
16686 +} znode_lock_mode;
16687 +
16688 +/* type of lock request */
16689 +typedef enum {
16690 +       ZNODE_LOCK_LOPRI = 0,
16691 +       ZNODE_LOCK_HIPRI = (1 << 0),
16692 +
16693 +       /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to
16694 +          longterm_lock_znode will not sleep waiting for the lock to become
16695 +          available.  If the lock is unavailable, reiser4_znode_lock will
16696 +          immediately return the value -E_REPEAT. */
16697 +       ZNODE_LOCK_NONBLOCK = (1 << 1),
16698 +       /* An option for longterm_lock_znode which prevents atom fusion */
16699 +       ZNODE_LOCK_DONT_FUSE = (1 << 2)
16700 +} znode_lock_request;
16701 +
16702 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
16703 +
16704 +/* used to specify direction of shift. These must be -1 and 1 */
16705 +typedef enum {
16706 +       SHIFT_LEFT = 1,
16707 +       SHIFT_RIGHT = -1
16708 +} shift_direction;
16709 +
16710 +typedef enum {
16711 +       LEFT_SIDE,
16712 +       RIGHT_SIDE
16713 +} sideof;
16714 +
16715 +#define round_up(value, order)                                         \
16716 +       ((typeof(value))(((long) (value) + (order) - 1U) &      \
16717 +                            ~((order) - 1)))
16718 +
16719 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
16720 +typedef enum {
16721 +       /* unit of internal item is moved */
16722 +       SUBTREE_MOVED = 0,
16723 +       /* nothing else can be squeezed into left neighbor */
16724 +       SQUEEZE_TARGET_FULL = 1,
16725 +       /* all content of node is squeezed into its left neighbor */
16726 +       SQUEEZE_SOURCE_EMPTY = 2,
16727 +       /* one more item is copied (this is only returned by
16728 +          allocate_and_copy_extent to squalloc_twig)) */
16729 +       SQUEEZE_CONTINUE = 3
16730 +} squeeze_result;
16731 +
16732 +/* Do not change item ids: that would be a format change. 0x4 is unassigned. */
16733 +typedef enum {
16734 +       STATIC_STAT_DATA_ID = 0x0,
16735 +       SIMPLE_DIR_ENTRY_ID = 0x1,
16736 +       COMPOUND_DIR_ID = 0x2,
16737 +       NODE_POINTER_ID = 0x3,
16738 +       EXTENT_POINTER_ID = 0x5,
16739 +       FORMATTING_ID = 0x6,
16740 +       CTAIL_ID = 0x7,
16741 +       BLACK_BOX_ID = 0x8,
16742 +       LAST_ITEM_ID = 0x9
16743 +} item_id;
16744 +
16745 +/* Flags passed to jnode_flush() to allow it to distinguish default settings
16746 +   based on whether commit() was called or VM memory pressure was applied. */
16747 +typedef enum {
16748 +       /* submit flush queue to disk at jnode_flush completion */
16749 +       JNODE_FLUSH_WRITE_BLOCKS = 1,
16750 +
16751 +       /* flush is called for commit */
16752 +       JNODE_FLUSH_COMMIT = 2,
16753 +       /* not implemented */
16754 +       JNODE_FLUSH_MEMORY_FORMATTED = 4,
16755 +
16756 +       /* not implemented */
16757 +       JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
16758 +} jnode_flush_flags;
16759 +
16760 +/* Flags to insert/paste carry operations. Currently they only used in
16761 +   flushing code, but in future, they can be used to optimize for repetitive
16762 +   accesses.  */
16763 +typedef enum {
16764 +       /* carry is not allowed to shift data to the left when trying to find
16765 +          free space  */
16766 +       COPI_DONT_SHIFT_LEFT = (1 << 0),
16767 +       /* carry is not allowed to shift data to the right when trying to find
16768 +          free space  */
16769 +       COPI_DONT_SHIFT_RIGHT = (1 << 1),
16770 +       /* carry is not allowed to allocate new node(s) when trying to find
16771 +          free space */
16772 +       COPI_DONT_ALLOCATE = (1 << 2),
16773 +       /* try to load left neighbor if its not in a cache */
16774 +       COPI_LOAD_LEFT = (1 << 3),
16775 +       /* try to load right neighbor if its not in a cache */
16776 +       COPI_LOAD_RIGHT = (1 << 4),
16777 +       /* shift insertion point to the left neighbor */
16778 +       COPI_GO_LEFT = (1 << 5),
16779 +       /* shift insertion point to the right neighbor */
16780 +       COPI_GO_RIGHT = (1 << 6),
16781 +       /* try to step back into original node if insertion into new node
16782 +          fails after shifting data there. */
16783 +       COPI_STEP_BACK = (1 << 7)
16784 +} cop_insert_flag;
16785 +
16786 +typedef enum {
16787 +       SAFE_UNLINK,            /* safe-link for unlink */
16788 +       SAFE_TRUNCATE           /* safe-link for truncate */
16789 +} reiser4_safe_link_t;
16790 +
16791 +/* tells which of the atom's lists a jnode is on */
16792 +typedef enum {
16793 +       NOT_CAPTURED,
16794 +       DIRTY_LIST,
16795 +       CLEAN_LIST,
16796 +       FQ_LIST,
16797 +       WB_LIST,
16798 +       OVRWR_LIST
16799 +} atom_list;
16800 +
16801 +/* __REISER4_FORWARD_H__ */
16802 +#endif
16803 +
16804 +/* Make Linus happy.
16805 +   Local variables:
16806 +   c-indentation-style: "K&R"
16807 +   mode-name: "LC"
16808 +   c-basic-offset: 8
16809 +   tab-width: 8
16810 +   fill-column: 120
16811 +   End:
16812 +*/
16813 diff -puN /dev/null fs/reiser4/fsdata.c
16814 --- /dev/null
16815 +++ a/fs/reiser4/fsdata.c
16816 @@ -0,0 +1,804 @@
16817 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
16818 + * reiser4/README */
16819 +
16820 +#include "fsdata.h"
16821 +#include "inode.h"
16822 +
16823 +
16824 +/* slab cache of dir_cursors */
16825 +static struct kmem_cache *d_cursor_cache;
16826 +
16827 +/* list of unused (ref == 0) cursors, linked through ->alist */
16828 +static LIST_HEAD(cursor_cache);
16829 +
16830 +/* number of cursors on the list of unused cursors */
16831 +static unsigned long d_cursor_unused = 0;
16832 +
16833 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
16834 +DEFINE_SPINLOCK(d_lock);
16835 +
16836 +static reiser4_file_fsdata *create_fsdata(struct file *file);
16837 +static int file_is_stateless(struct file *file);
16838 +static void free_fsdata(reiser4_file_fsdata *fsdata);
16839 +static void kill_cursor(dir_cursor *);
16840 +
16841 +/**
16842 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
16843 + * @nr: number of objects to free; 0 means only report the count
16844 + * @mask: GFP mask (not used here)
16845 + *
16846 + * Shrinks d_cursor_cache. Scans the list of unused cursors from its head,
16847 + * killing up to @nr of them. Returns number of still freeable cursors.
16848 + */
16849 +static int d_cursor_shrink(int nr, gfp_t mask)
16850 +{
16851 +       if (nr != 0) {
16852 +               dir_cursor *scan;
16853 +               int killed;     /* counted below but never read */
16854 +
16855 +               killed = 0;
16856 +               spin_lock(&d_lock);
16857 +               while (!list_empty(&cursor_cache)) {
16858 +                       scan = list_entry(cursor_cache.next, dir_cursor, alist);
16859 +                       assert("nikita-3567", scan->ref == 0);
16860 +                       kill_cursor(scan);
16861 +                       ++killed;
16862 +                       --nr;
16863 +                       if (nr == 0)
16864 +                               break;
16865 +               }
16866 +               spin_unlock(&d_lock);
16867 +       }
16868 +       return d_cursor_unused; /* read without d_lock */
16869 +}
16870 +
16871 +/*
16872 + * Actually, d_cursors are "priceless", because there is no way to
16873 + * recover the information stored in them. On the other hand, we don't
16874 + * want them to consume all kernel memory. As a compromise, just
16875 + * assign a higher "seeks" value to the d_cursor cache, so that it will be
16876 + * shrunk only if the system is really tight on memory.
16877 + */
16878 +static struct shrinker d_cursor_shrinker = {
16879 +       .shrink = d_cursor_shrink,
16880 +       .seeks = DEFAULT_SEEKS << 3,    /* 8 * DEFAULT_SEEKS */
16881 +};
16882 +
16883 +/**
16884 + * reiser4_init_d_cursor - create d_cursor cache and register its shrinker
16885 + *
16886 + * It is part of reiser4 module initialization. Returns 0 on success or
16887 + * -ENOMEM if the slab cache of d_cursors cannot be created.
16888 + */
16889 +int reiser4_init_d_cursor(void)
16890 +{
16891 +       d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
16892 +                                          SLAB_HWCACHE_ALIGN, NULL);
16893 +       if (d_cursor_cache == NULL)
16894 +               return RETERR(-ENOMEM);
16895 +
16896 +       register_shrinker(&d_cursor_shrinker);
16897 +       return 0;
16898 +}
16899 +
16900 +/**
16901 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
16902 + *
16903 + * This is called on reiser4 module unloading or system shutdown.
16904 + */
16905 +void reiser4_done_d_cursor(void)
16906 +{
16907 +       unregister_shrinker(&d_cursor_shrinker);        /* shrinker first */
16908 +
16909 +       destroy_reiser4_cache(&d_cursor_cache);
16910 +}
16911 +
16912 +#define D_CURSOR_TABLE_SIZE (256)       /* must be a power of 2 */
16913 +/* Hash a cursor key into a bucket index; @table itself is not used. */
16914 +static inline unsigned long
16915 +d_cursor_hash(d_cursor_hash_table * table, const struct d_cursor_key *key)
16916 +{
16917 +       assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
16918 +       return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
16919 +}
16920 +
16921 +static inline int d_cursor_eq(const struct d_cursor_key *k1,
16922 +                             const struct d_cursor_key *k2)
16923 +{      /* keys are equal iff both cid and oid match */
16924 +       return k1->cid == k2->cid && k1->oid == k2->oid;
16925 +}
16926 +
16927 +/*
16928 + * define functions to manipulate reiser4 super block's hash table of
16929 + * dir_cursors; KMALLOC and KFREE are defined only around the macro below
16930 + */
16931 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
16932 +#define KFREE(ptr, size) kfree(ptr)     /* size is ignored */
16933 +TYPE_SAFE_HASH_DEFINE(d_cursor,
16934 +                     dir_cursor,
16935 +                     struct d_cursor_key,
16936 +                     key, hash, d_cursor_hash, d_cursor_eq);
16937 +#undef KFREE
16938 +#undef KMALLOC
16939 +
16940 +/**
16941 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
16942 + * @super: super block to initialize
16943 + *
16944 + * Initializes per-super-block d_cursor's hash table and radix tree as part
16945 + * of mount. Returns the result of d_cursor_hash_init().
16946 + */
16947 +int reiser4_init_super_d_info(struct super_block *super)
16948 +{
16949 +       struct d_cursor_info *p;
16950 +
16951 +       p = &get_super_private(super)->d_info;
16952 +
16953 +       INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
16954 +       return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
16955 +}
16956 +
16957 +/**
16958 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
16959 + * @super: super block being umounted
16960 + *
16961 + * It is called on umount. Kills all directory cursors attached to super block.
16962 + */
16963 +void reiser4_done_super_d_info(struct super_block *super)
16964 +{
16965 +       struct d_cursor_info *d_info;
16966 +       dir_cursor *cursor, *next;
16967 +
16968 +       d_info = &get_super_private(super)->d_info;
16969 +       for_all_in_htable(&d_info->table, d_cursor, cursor, next)
16970 +               kill_cursor(cursor);
16971 +
16972 +       BUG_ON(d_info->tree.rnode != NULL);     /* tree must be empty now */
16973 +       d_cursor_hash_done(&d_info->table);
16974 +}
16975 +
16976 +/**
16977 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
16978 + * @cursor: cursor to free; must be unreferenced and have fsdata attached
16979 + *
16980 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
16981 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
16982 + * indices, hash table, list of unused cursors and frees it.
16983 + */
16984 +static void kill_cursor(dir_cursor *cursor)
16985 +{
16986 +       unsigned long index;
16987 +
16988 +       assert("nikita-3566", cursor->ref == 0);
16989 +       assert("nikita-3572", cursor->fsdata != NULL);
16990 +       /* NOTE(review): d_lock held in d_cursor_shrink(), not at umount */
16991 +       index = (unsigned long)cursor->key.oid;
16992 +       list_del_init(&cursor->fsdata->dir.linkage);
16993 +       free_fsdata(cursor->fsdata);
16994 +       cursor->fsdata = NULL;
16995 +
16996 +       if (list_empty_careful(&cursor->list))
16997 +               /* this is the last cursor for a file: kill radix-tree entry */
16998 +               radix_tree_delete(&cursor->info->tree, index);
16999 +       else {
17000 +               void **slot;
17001 +
17002 +               /*
17003 +                * there are other cursors for the same oid.
17004 +                */
17005 +
17006 +               /*
17007 +                * if the radix tree points to the cursor being removed,
17008 +                * re-target the radix tree slot to the next cursor on the
17009 +                * circular list of all cursors for this oid (the list was
17010 +                * checked to be non-empty above).
17011 +                */
17012 +               slot = radix_tree_lookup_slot(&cursor->info->tree, index);
17013 +               assert("nikita-3571", *slot != NULL);
17014 +               if (*slot == cursor)
17015 +                       *slot = list_entry(cursor->list.next, dir_cursor, list);
17016 +               /* remove cursor from circular list */
17017 +               list_del_init(&cursor->list);
17018 +       }
17019 +       /* remove cursor from the list of unused cursors */
17020 +       list_del_init(&cursor->alist);
17021 +       /* remove cursor from the hash table */
17022 +       d_cursor_hash_remove(&cursor->info->table, cursor);
17023 +       /* and free it */
17024 +       kmem_cache_free(d_cursor_cache, cursor);
17025 +       --d_cursor_unused;      /* presumably it was counted as unused */
17026 +}
17027 +
17028 +/* possible actions that can be performed on all cursors for the given file */
17029 +enum cursor_action {
17030 +       /*
17031 +        * load all detached state: this is called when stat-data is loaded
17032 +        * from the disk to recover information about all pending readdirs
17033 +        */
17034 +       CURSOR_LOAD,
17035 +       /*
17036 +        * detach all state from inode, leaving it in the cache. This is called
17037 +        * when inode is removed from memory by memory pressure
17038 +        */
17039 +       CURSOR_DISPOSE,
17040 +       /*
17041 +        * detach cursors from the inode, and free them. This is called when
17042 +        * inode is destroyed
17043 +        */
17044 +       CURSOR_KILL
17045 +};
17046 +
17047 +/*
17048 + * return per-super-block d_cursor data of the file system @inode is in.
17049 + */
17050 +static inline struct d_cursor_info *d_info(struct inode *inode)
17051 +{
17052 +       return &get_super_private(inode->i_sb)->d_info;
17053 +}
17054 +
17055 +/*
17056 + * find the cursor stored under @index in the per-super-block radix tree.
17057 + */
17058 +static inline dir_cursor *lookup(struct d_cursor_info *info,
17059 +                                unsigned long index)
17060 +{
17061 +       return (dir_cursor *) radix_tree_lookup(&info->tree, index);
17062 +}
17063 +
17064 +/*
17065 + * attach @cursor to the radix tree. There may be multiple cursors for the
17066 + * same oid; they are chained into a circular list.
17067 + */
17068 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
17069 +{
17070 +       dir_cursor *head;
17071 +
17072 +       head = lookup(cursor->info, index);
17073 +       if (head == NULL) {
17074 +               /* first cursor for this index; insert result is not checked */
17075 +               INIT_LIST_HEAD(&cursor->list);
17076 +               radix_tree_insert(&cursor->info->tree, index, cursor);
17077 +       } else {
17078 +               /* some cursor already exists. Chain ours */
17079 +               list_add(&cursor->list, &head->list);
17080 +       }
17081 +}
17082 +
17083 +/*
17084 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
17085 + * "unused" list. Called when file descriptor is no longer in active use.
17086 + */
17087 +static void clean_fsdata(struct file *file)
17088 +{
17089 +       dir_cursor *cursor;
17090 +       reiser4_file_fsdata *fsdata;
17091 +
17092 +       assert("nikita-3570", file_is_stateless(file));
17093 +
17094 +       fsdata = (reiser4_file_fsdata *) file->private_data;
17095 +       if (fsdata != NULL) {
17096 +               cursor = fsdata->cursor;
17097 +               if (cursor != NULL) {
17098 +                       spin_lock(&d_lock);
17099 +                       --cursor->ref;
17100 +                       if (cursor->ref == 0) { /* last reference dropped */
17101 +                               list_add_tail(&cursor->alist, &cursor_cache);
17102 +                               ++d_cursor_unused;
17103 +                       }
17104 +                       spin_unlock(&d_lock);
17105 +                       file->private_data = NULL; /* fsdata is not freed */
17106 +               }
17107 +       }
17108 +}
17109 +
17110 +/*
17111 + * global counter used to generate "client ids". These ids are encoded into
17112 + * the bits of fpos above CID_SHIFT.
17113 + */
17114 +static __u32 cid_counter = 0;
17115 +#define CID_SHIFT (20)         /* f_pos = cid << CID_SHIFT */
17116 +#define CID_MASK  (0xfffffull) /* the low CID_SHIFT (20) bits */
17117 +
17118 +static void free_file_fsdata_nolock(struct file *);
17119 +
17120 +/**
17121 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
17122 + * @cursor:
17123 + * @file:
17124 + * @inode:
17125 + *
17126 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
17127 + * reiser4 super block's hash table and radix tree.
17128 + add detachable readdir
17129 + * state to the @f
17130 + */
17131 +static int insert_cursor(dir_cursor *cursor, struct file *file,
17132 +                        struct inode *inode)
17133 +{
17134 +       int result;
17135 +       reiser4_file_fsdata *fsdata;
17136 +
17137 +       memset(cursor, 0, sizeof *cursor);
17138 +
17139 +       /* this is either first call to readdir, or rewind. Anyway, create new
17140 +        * cursor. */
17141 +       fsdata = create_fsdata(NULL);
17142 +       if (fsdata != NULL) {
17143 +               result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
17144 +               if (result == 0) {
17145 +                       struct d_cursor_info *info;
17146 +                       oid_t oid;
17147 +
17148 +                       info = d_info(inode);
17149 +                       oid = get_inode_oid(inode);
17150 +                       /* cid occupies higher 12 bits of f->f_pos. Don't
17151 +                        * allow it to become negative: this confuses
17152 +                        * nfsd_readdir() */
17153 +                       cursor->key.cid = (++cid_counter) & 0x7ff;
17154 +                       cursor->key.oid = oid;
17155 +                       cursor->fsdata = fsdata;
17156 +                       cursor->info = info;
17157 +                       cursor->ref = 1;
17158 +
17159 +                       spin_lock_inode(inode);
17160 +                       /* install cursor as @f's private_data, discarding old
17161 +                        * one if necessary */
17162 +#if REISER4_DEBUG
17163 +                       if (file->private_data)
17164 +                               warning("", "file has fsdata already");
17165 +#endif
17166 +                       clean_fsdata(file);
17167 +                       free_file_fsdata_nolock(file);
17168 +                       file->private_data = fsdata;
17169 +                       fsdata->cursor = cursor;
17170 +                       spin_unlock_inode(inode);
17171 +                       spin_lock(&d_lock);
17172 +                       /* insert cursor into hash table */
17173 +                       d_cursor_hash_insert(&info->table, cursor);
17174 +                       /* and chain it into radix-tree */
17175 +                       bind_cursor(cursor, (unsigned long)oid);
17176 +                       spin_unlock(&d_lock);
17177 +                       radix_tree_preload_end();
17178 +                       file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
17179 +               }
17180 +       } else
17181 +               result = RETERR(-ENOMEM);
17182 +       return result;
17183 +}
17184 +
17185 +/**
17186 + * process_cursors - do action on each cursor attached to inode
17187 + * @inode: inode whose cursors are processed
17188 + * @act: action to do (CURSOR_DISPOSE, CURSOR_LOAD or CURSOR_KILL)
17189 + *
17190 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
17191 + * and performs action specified by @act on each of cursors.
17192 + */
17193 +static void process_cursors(struct inode *inode, enum cursor_action act)
17194 +{
17195 +       oid_t oid;
17196 +       dir_cursor *start;
17197 +       struct list_head *head;
17198 +       reiser4_context *ctx;
17199 +       struct d_cursor_info *info;
17200 +
17201 +       /* this can be called by
17202 +        *
17203 +        * kswapd->...->prune_icache->..reiser4_destroy_inode
17204 +        *
17205 +        * without reiser4_context, so a context is initialized here
17206 +        */
17207 +       ctx = reiser4_init_context(inode->i_sb);
17208 +       if (IS_ERR(ctx)) {
17209 +               warning("vs-23", "failed to init context");
17210 +               return;
17211 +       }
17212 +
17213 +       assert("nikita-3558", inode != NULL);
17214 +
17215 +       info = d_info(inode);
17216 +       oid = get_inode_oid(inode);
17217 +       spin_lock_inode(inode);
17218 +       head = get_readdir_list(inode);
17219 +       spin_lock(&d_lock);
17220 +       /* find any cursor for this oid: reference to it is hanging off radix
17221 +        * tree */
17222 +       start = lookup(info, (unsigned long)oid);
17223 +       if (start != NULL) {
17224 +               dir_cursor *scan;
17225 +               reiser4_file_fsdata *fsdata;
17226 +
17227 +               /* process circular list of cursors for this oid */
17228 +               scan = start;
17229 +               do {
17230 +                       dir_cursor *next;
17231 +
17232 +                       next = list_entry(scan->list.next, dir_cursor, list);
17233 +                       fsdata = scan->fsdata;
17234 +                       assert("nikita-3557", fsdata != NULL);
17235 +                       if (scan->key.oid == oid) {
17236 +                               switch (act) {
17237 +                               case CURSOR_DISPOSE:
17238 +                                       list_del_init(&fsdata->dir.linkage);
17239 +                                       break;
17240 +                               case CURSOR_LOAD:
17241 +                                       list_add(&fsdata->dir.linkage, head);
17242 +                                       break;
17243 +                               case CURSOR_KILL:
17244 +                                       kill_cursor(scan);
17245 +                                       break;
17246 +                               }
17247 +                       }
17248 +                       if (scan == next)
17249 +                               /* last cursor was just killed */
17250 +                               break;
17251 +                       scan = next;
17252 +               } while (scan != start);
17253 +       }
17254 +       spin_unlock(&d_lock);
17255 +       /* check that we killed them all */
17256 +       assert("nikita-3568",
17257 +              ergo(act == CURSOR_KILL,
17258 +                   list_empty_careful(get_readdir_list(inode))));
17259 +       assert("nikita-3569",
17260 +              ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
17261 +       spin_unlock_inode(inode);
17262 +       reiser4_exit_context(ctx);
17263 +}
17264 +
17265 +/**
17266 + * reiser4_dispose_cursors - removes cursors from inode's list
17267 + * @inode: inode to dispose cursors of
17268 + *
17269 + * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
17270 + * attached to that cursor from inode's readdir list. This is called when
17271 + * inode is removed from memory due to memory pressure.
17272 + */
17273 +void reiser4_dispose_cursors(struct inode *inode)
17274 +{
17275 +       process_cursors(inode, CURSOR_DISPOSE);
17276 +}
17277 +
17278 +/**
17279 + * reiser4_load_cursors - attach cursors to inode
17280 + * @inode: inode to load cursors to
17281 + *
17282 + * For each cursor corresponding to @inode, adds the reiser4_file_fsdata
17283 + * attached to that cursor to inode's readdir list. This is done when inode
17284 + * is loaded into memory.
17285 + */
17286 +void reiser4_load_cursors(struct inode *inode)
17287 +{
17288 +       process_cursors(inode, CURSOR_LOAD);
17289 +}
17290 +
17291 +/**
17292 + * reiser4_kill_cursors - kill all inode cursors
17293 + * @inode: inode to kill cursors of
17294 + *
17295 + * Kills all cursors of this inode. This is called when inode is destroyed.
17296 + */
17297 +void reiser4_kill_cursors(struct inode *inode)
17298 +{
17299 +       process_cursors(inode, CURSOR_KILL);
17300 +}
17301 +
17302 +/**
17303 + * file_is_stateless - check ->stateless flag of file's dentry
17304 + * @file: file to check
17305 + *
17306 + * True if @file was created by NFS server "on demand" to serve one file
17307 + * system operation (so "detached state" may exist for underlying inode).
17308 + * NOTE(review): ERR_PTR from reiser4_get_dentry_fsdata() is not checked.
17309 + */
17310 +static int file_is_stateless(struct file *file)
17311 +{
17312 +       return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
17313 +}
17314 +
17315 +/**
17316 + * reiser4_get_dir_fpos - get directory position from file's f_pos
17317 + * @dir: directory file to take f_pos from
17318 + *
17319 + * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
17320 + * in the case of stateless directory operation (readdir-over-nfs), client id
17321 + * was encoded in the high bits of cookie and should be masked off.
17322 + */
17323 +loff_t reiser4_get_dir_fpos(struct file *dir)
17324 +{
17325 +       if (file_is_stateless(dir))
17326 +               return dir->f_pos & CID_MASK;
17327 +       else
17328 +               return dir->f_pos;
17329 +}
17330 +
17331 +/**
17332 + * reiser4_attach_fsdata - try to attach fsdata
17333 + * @file: file to attach fsdata to; nothing is done unless it is stateless
17334 + * @inode: inode whose oid is used as part of the cursor key
17335 + *
17336 + * Finds or creates cursor for readdir-over-nfs. Returns 0 or -ENOMEM.
17337 + */
17338 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
17339 +{
17340 +       loff_t pos;
17341 +       int result;
17342 +       dir_cursor *cursor;
17343 +
17344 +       /*
17345 +        * we are serialized by inode->i_mutex
17346 +        */
17347 +       if (!file_is_stateless(file))
17348 +               return 0;
17349 +
17350 +       pos = file->f_pos;
17351 +       result = 0;
17352 +       if (pos == 0) {
17353 +               /*
17354 +                * first call to readdir (or rewind to the beginning of
17355 +                * directory): allocate a new cursor and insert it
17356 +                */
17357 +               cursor = kmem_cache_alloc(d_cursor_cache,
17358 +                                         reiser4_ctx_gfp_mask_get());
17359 +               if (cursor != NULL)
17360 +                       result = insert_cursor(cursor, file, inode);
17361 +               else
17362 +                       result = RETERR(-ENOMEM);
17363 +       } else {
17364 +               /* try to find existing cursor by (client id, oid) key */
17365 +               struct d_cursor_key key;
17366 +
17367 +               key.cid = pos >> CID_SHIFT;
17368 +               key.oid = get_inode_oid(inode);
17369 +               spin_lock(&d_lock);
17370 +               cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
17371 +               if (cursor != NULL) {
17372 +                       /* cursor was found */
17373 +                       if (cursor->ref == 0) {
17374 +                               /* move it from unused list */
17375 +                               list_del_init(&cursor->alist);
17376 +                               --d_cursor_unused;
17377 +                       }
17378 +                       ++cursor->ref;
17379 +               }
17380 +               spin_unlock(&d_lock);
17381 +               if (cursor != NULL) {
17382 +                       spin_lock_inode(inode);
17383 +                       assert("nikita-3556", cursor->fsdata->back == NULL);
17384 +                       clean_fsdata(file);
17385 +                       free_file_fsdata_nolock(file);
17386 +                       file->private_data = cursor->fsdata;
17387 +                       spin_unlock_inode(inode);
17388 +               }
17389 +       }
17390 +       return result;
17391 +}
17392 +
17393 +/**
17394 + * reiser4_detach_fsdata - detach fsdata from stateless file
17395 + * @file: file to detach fsdata from
17396 + *
17397 + * Calls clean_fsdata() under inode spinlock if @file is stateless.
17398 + */
17399 +void reiser4_detach_fsdata(struct file *file)
17400 +{
17401 +       struct inode *inode;
17402 +
17403 +       if (!file_is_stateless(file))
17404 +               return;
17405 +
17406 +       inode = file->f_dentry->d_inode;
17407 +       spin_lock_inode(inode);
17408 +       clean_fsdata(file);
17409 +       spin_unlock_inode(inode);
17410 +}
17411 +
17412 +/* slab for reiser4_dentry_fsdata */
17413 +static struct kmem_cache *dentry_fsdata_cache;
17414 +
17415 +/**
17416 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
17417 + *
17418 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
17419 + * part of reiser4 module initialization. Returns 0 or -ENOMEM.
17420 + */
17421 +int reiser4_init_dentry_fsdata(void)
17422 +{
17423 +       dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
17424 +                                          sizeof(struct reiser4_dentry_fsdata),
17425 +                                          0,
17426 +                                          SLAB_HWCACHE_ALIGN |
17427 +                                          SLAB_RECLAIM_ACCOUNT,
17428 +                                          NULL);
17429 +       if (dentry_fsdata_cache == NULL)
17430 +               return RETERR(-ENOMEM);
17431 +       return 0;
17432 +}
17433 +
17434 +/**
17435 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
17436 + *
17437 + * Called on reiser4 module unloading or system shutdown; destroys the slab.
17438 + */
17439 +void reiser4_done_dentry_fsdata(void)
17440 +{
17441 +       destroy_reiser4_cache(&dentry_fsdata_cache);
17442 +}
17443 +
17444 +/**
17445 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
17446 + * @dentry: queried dentry
17447 + *
17448 + * Allocates (zero-filled) if necessary and returns per-dentry data that we
17449 + * attach to each dentry. Returns ERR_PTR(-ENOMEM) if allocation fails.
17450 + */
17451 +struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
17452 +{
17453 +       assert("nikita-1365", dentry != NULL);
17454 +
17455 +       if (dentry->d_fsdata == NULL) {
17456 +               dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
17457 +                                                   reiser4_ctx_gfp_mask_get());
17458 +               if (dentry->d_fsdata == NULL)
17459 +                       return ERR_PTR(RETERR(-ENOMEM));
17460 +               memset(dentry->d_fsdata, 0,
17461 +                      sizeof(struct reiser4_dentry_fsdata));
17462 +       }
17463 +       return dentry->d_fsdata;
17464 +}
17465 +
17466 +/**
17467 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
17468 + * @dentry: dentry to free fsdata of
17469 + *
17470 + * Detaches and frees fs-specific dentry data; no-op if none is attached.
17471 + */
17472 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
17473 +{
17474 +       if (dentry->d_fsdata != NULL) {
17475 +               kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
17476 +               dentry->d_fsdata = NULL;
17477 +       }
17478 +}
17479 +
17480 +/* slab for reiser4_file_fsdata */
17481 +static struct kmem_cache *file_fsdata_cache;
17482 +
17483 +/**
17484 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
17485 + *
17486 + * Initializes slab cache of structures attached to file->private_data. It is
17487 + * part of reiser4 module initialization. Returns 0 or -ENOMEM.
17488 + */
17489 +int reiser4_init_file_fsdata(void)
17490 +{
17491 +       file_fsdata_cache = kmem_cache_create("file_fsdata",
17492 +                                             sizeof(reiser4_file_fsdata),
17493 +                                             0,
17494 +                                             SLAB_HWCACHE_ALIGN |
17495 +                                             SLAB_RECLAIM_ACCOUNT, NULL);
17496 +       if (file_fsdata_cache == NULL)
17497 +               return RETERR(-ENOMEM);
17498 +       return 0;
17499 +}
17500 +
17501 +/**
17502 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
17503 + *
17504 + * Called on reiser4 module unloading or system shutdown; destroys the slab.
17505 + */
17506 +void reiser4_done_file_fsdata(void)
17507 +{
17508 +       destroy_reiser4_cache(&file_fsdata_cache);
17509 +}
17510 +
17511 +/**
17512 + * create_fsdata - allocate and initialize reiser4_file_fsdata
17513 + * @file: what to create file_fsdata for, may be NULL; stored in ->back
17514 + *
17515 + * Allocates and zero-initializes reiser4_file_fsdata; NULL if out of memory.
17516 + */
17517 +static reiser4_file_fsdata *create_fsdata(struct file *file)
17518 +{
17519 +       reiser4_file_fsdata *fsdata;
17520 +
17521 +       fsdata = kmem_cache_alloc(file_fsdata_cache,
17522 +                                 reiser4_ctx_gfp_mask_get());
17523 +       if (fsdata != NULL) {
17524 +               memset(fsdata, 0, sizeof *fsdata);
17525 +               fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
17526 +               fsdata->back = file;
17527 +               INIT_LIST_HEAD(&fsdata->dir.linkage);
17528 +       }
17529 +       return fsdata;
17530 +}
17531 +
17532 +/**
17533 + * free_fsdata - free reiser4_file_fsdata
17534 + * @fsdata: object to free, must not be NULL (BUG_ON otherwise)
17535 + *
17536 + * Dual to create_fsdata(). Returns reiser4_file_fsdata to its slab cache.
17537 + */
17538 +static void free_fsdata(reiser4_file_fsdata *fsdata)
17539 +{
17540 +       BUG_ON(fsdata == NULL);
17541 +       kmem_cache_free(file_fsdata_cache, fsdata);
17542 +}
17543 +
17544 +/**
17545 + * reiser4_get_file_fsdata - get fs-specific file data
17546 + * @file: queried file
17547 + *
17548 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
17549 + * to @file. Returns ERR_PTR(-ENOMEM) if allocation fails.
17550 + */
17551 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
17552 +{
17553 +       assert("nikita-1603", file != NULL);
17554 +
17555 +       if (file->private_data == NULL) {
17556 +               reiser4_file_fsdata *fsdata;
17557 +               struct inode *inode;
17558 +
17559 +               fsdata = create_fsdata(file);
17560 +               if (fsdata == NULL)
17561 +                       return ERR_PTR(RETERR(-ENOMEM));
17562 +
17563 +               inode = file->f_dentry->d_inode;
17564 +               spin_lock_inode(inode);
17565 +               if (file->private_data == NULL) {
17566 +                       file->private_data = fsdata;
17567 +                       fsdata = NULL;
17568 +               }
17569 +               spin_unlock_inode(inode);
17570 +               if (fsdata != NULL)
17571 +                       /* other thread set ->private_data first: drop ours */
17572 +                       kmem_cache_free(file_fsdata_cache, fsdata);
17573 +       }
17574 +       assert("nikita-2665", file->private_data != NULL);
17575 +       return file->private_data;
17576 +}
17577 +
17578 +/**
17579 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
17580 + * @file: file to detach fsdata from; its inode spinlock must be held
17581 + *
17582 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
17583 + * readdir list, frees it if it is not linked to a d_cursor object.
17584 + */
17585 +static void free_file_fsdata_nolock(struct file *file)
17586 +{
17587 +       reiser4_file_fsdata *fsdata;
17588 +
17589 +       assert("", spin_inode_is_locked(file->f_dentry->d_inode));
17590 +       fsdata = file->private_data;
17591 +       if (fsdata != NULL) {
17592 +               list_del_init(&fsdata->dir.linkage);
17593 +               if (fsdata->cursor == NULL)
17594 +                       free_fsdata(fsdata);
17595 +       }
17596 +       file->private_data = NULL;
17597 +}
17598 +
17599 +/**
17600 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
17601 + * @file: file to detach and free fsdata of
17602 + *
17603 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
17604 + */
17605 +void reiser4_free_file_fsdata(struct file *file)
17606 +{
17607 +       spin_lock_inode(file->f_dentry->d_inode);
17608 +       free_file_fsdata_nolock(file);
17609 +       spin_unlock_inode(file->f_dentry->d_inode);
17610 +}
17611 +
17612 +/*
17613 + * Local variables:
17614 + * c-indentation-style: "K&R"
17615 + * mode-name: "LC"
17616 + * c-basic-offset: 8
17617 + * tab-width: 8
17618 + * fill-column: 79
17619 + * End:
17620 + */
17621 diff -puN /dev/null fs/reiser4/fsdata.h
17622 --- /dev/null
17623 +++ a/fs/reiser4/fsdata.h
17624 @@ -0,0 +1,205 @@
17625 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
17626 + * reiser4/README */
17627 +
17628 +#if !defined(__REISER4_FSDATA_H__)
17629 +#define __REISER4_FSDATA_H__
17630 +
17631 +#include "debug.h"
17632 +#include "kassign.h"
17633 +#include "seal.h"
17634 +#include "type_safe_hash.h"
17635 +#include "plugin/file/file.h"
17636 +#include "readahead.h"
17637 +
17638 +/*
17639 + * comment about reiser4_dentry_fsdata
17640 + *
17641 + *
17642 + */
17643 +
17644 +/*
17645 + * locking: fields of per file descriptor readdir_pos and ->f_pos are
17646 + * protected by ->i_mutex on inode. Under this lock following invariant
17647 + * holds:
17648 + *
17649 + *     file descriptor is "looking" at the entry_no-th directory entry from
17650 + *     the beginning of directory. This entry has key dir_entry_key and is
17651 + *     pos-th entry within duplicate-key sequence.
17652 + *
17653 + */
17654 +
17655 +/* logical position within directory */
17656 +struct dir_pos {
17657 +       /* key of directory entry (actually, part of a key sufficient to
17658 +        * identify directory entry) */
17659 +       de_id dir_entry_key;
17660 +       /* ordinal number of directory entry among all entries with the same
17661 +        * key. (Starting from 0.) */
17662 +       unsigned pos;
17663 +};
17664 +
17665 +struct readdir_pos {
17666 +       /* f_pos corresponding to this readdir position */
17667 +       __u64 fpos;
17668 +       /* logical position within directory */
17669 +       struct dir_pos position;
17670 +       /* ordinal number of directory entry, counted from the
17671 +        * beginning of directory */
17672 +       __u64 entry_no;
17673 +};
17674 +
17675 +/*
17676 + * this is used to speed up lookups for directory entry: on initial call to
17677 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
17678 + * in struct dentry and reused later to avoid tree traversals.
17679 + */
17680 +struct de_location {
17681 +       /* seal covering directory entry */
17682 +       seal_t entry_seal;
17683 +       /* coord of directory entry */
17684 +       coord_t entry_coord;
17685 +       /* ordinal number of directory entry among all entries with the same
17686 +        * key. (Starting from 0.) */
17687 +       int pos;
17688 +};
17689 +
17690 +/**
17691 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
17692 + *
17693 + * This is allocated dynamically and released in d_op->d_release()
17694 + *
17695 + * Currently it contains cached location (hint) of directory entry and the
17696 + * ->stateless flag; other information is expected to be accumulated here.
17697 + */
17698 +struct reiser4_dentry_fsdata {
17699 +       /*
17700 +        * here will go fields filled by ->lookup() to speedup next
17701 +        * create/unlink, like blocknr of znode with stat-data, or key of
17702 +        * stat-data.
17703 +        */
17704 +       struct de_location dec;
17705 +       int stateless;          /* created through reiser4_decode_fh, needs
17706 +                                * special treatment in readdir. */
17707 +};
17708 +
17709 +extern int reiser4_init_dentry_fsdata(void);
17710 +extern void reiser4_done_dentry_fsdata(void);
17711 +extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
17712 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
17713 +
17714 +/**
17715 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
17716 + *
17717 + * This is allocated dynamically and released in inode->i_fop->release
17718 + */
17719 +typedef struct reiser4_file_fsdata {
17720 +       /*
17721 +        * pointer back to the struct file which this reiser4_file_fsdata is
17722 +        * part of
17723 +        */
17724 +       struct file *back;
17725 +       /* detached cursor for stateless readdir; NULL if there is none. */
17726 +       struct dir_cursor *cursor;
17727 +       /*
17728 +        * We need both directory and regular file parts here, because there
17729 +        * are file system objects that are files and directories.
17730 +        */
17731 +       struct {
17732 +               /*
17733 +                * position in directory. It is updated each time directory is
17734 +                * modified
17735 +                */
17736 +               struct readdir_pos readdir;
17737 +               /* head of this list is reiser4_inode->lists.readdir_list */
17738 +               struct list_head linkage;
17739 +       } dir;
17740 +       /* hints to speed up operations with regular files: read and write. */
17741 +       struct {
17742 +               hint_t hint;
17743 +       } reg;
17744 +       struct reiser4_file_ra_state ra1; /* max_window_size set on creation */
17745 +
17746 +} reiser4_file_fsdata;
17747 +
17748 +extern int reiser4_init_file_fsdata(void);
17749 +extern void reiser4_done_file_fsdata(void);
17750 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
17751 +extern void reiser4_free_file_fsdata(struct file *);
17752 +
17753 +/*
17754 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
17755 + * used to address problem reiser4 has with readdir accesses via NFS. See
17756 + * plugin/file_ops_readdir.c for more details.
17757 + */
17758 +struct d_cursor_key{
17759 +       __u16 cid;      /* client id, taken from the high bits of f_pos */
17760 +       __u64 oid;      /* object id of the inode the cursor belongs to */
17761 +};
17762 +
17763 +/*
17764 + * define structures d_cursor_hash_table d_cursor_hash_link which are used to
17765 + * maintain hash table of dir_cursor-s in reiser4's super block
17766 + */
17767 +typedef struct dir_cursor dir_cursor;
17768 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
17769 +
17770 +struct dir_cursor {
17771 +       int ref;        /* reference count; 0 while cursor is on unused list */
17772 +       reiser4_file_fsdata *fsdata;    /* becomes file->private_data on attach */
17773 +
17774 +       /* link to reiser4 super block hash table of cursors */
17775 +       d_cursor_hash_link hash;
17776 +
17777 +       /*
17778 +        * this is to link cursors to reiser4 super block's radix tree of
17779 +        * cursors if there is more than one cursor of the same objectid
17780 +        */
17781 +       struct list_head list;
17782 +       struct d_cursor_key key;
17783 +       struct d_cursor_info *info;
17784 +       /* list of unused cursors */
17785 +       struct list_head alist;
17786 +};
17787 +
17788 +extern int reiser4_init_d_cursor(void);
17789 +extern void reiser4_done_d_cursor(void);
17790 +
17791 +extern int reiser4_init_super_d_info(struct super_block *);
17792 +extern void reiser4_done_super_d_info(struct super_block *);
17793 +
17794 +extern loff_t reiser4_get_dir_fpos(struct file *);
17795 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
17796 +extern void reiser4_detach_fsdata(struct file *);
17797 +
17798 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
17799 +   more details */
17800 +void reiser4_dispose_cursors(struct inode *inode);
17801 +void reiser4_load_cursors(struct inode *inode);
17802 +void reiser4_kill_cursors(struct inode *inode);
17803 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
17804 +                            int offset, int adj);
17805 +
17806 +/*
17807 + * this structure is embedded to reiser4_super_info_data. It maintains d_cursors
17808 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
17809 + */
17810 +struct d_cursor_info {
17811 +       d_cursor_hash_table table;      /* cursors hashed by struct d_cursor_key */
17812 +       struct radix_tree_root tree;    /* cursors looked up by oid */
17813 +};
17814 +
17815 +/* spinlock protecting readdir cursors */
17816 +extern spinlock_t d_lock;
17817 +
17818 +/* __REISER4_FSDATA_H__ */
17819 +#endif
17820 +
17821 +/*
17822 + * Local variables:
17823 + * c-indentation-style: "K&R"
17824 + * mode-name: "LC"
17825 + * c-basic-offset: 8
17826 + * tab-width: 8
17827 + * fill-column: 120
17828 + * End:
17829 + */
17830 diff -puN /dev/null fs/reiser4/init_super.c
17831 --- /dev/null
17832 +++ a/fs/reiser4/init_super.c
17833 @@ -0,0 +1,751 @@
17834 +/* Copyright by Hans Reiser, 2003 */
17835 +
17836 +#include "super.h"
17837 +#include "inode.h"
17838 +#include "plugin/plugin_set.h"
17839 +
17840 +#include <linux/swap.h>
17841 +
17842 +/**
17843 + * reiser4_init_fs_info - allocate reiser4 specific super block
17844 + * @super: super block of filesystem
17845 + *
17846 + * Allocates zeroed reiser4_super_info_data, attaches it to super->s_fs_info
17847 + * and initializes d_cursor structures. Returns 0, or -ENOMEM if kzalloc fails.
17848 + */
17849 +int reiser4_init_fs_info(struct super_block *super)
17850 +{
17851 +       reiser4_super_info_data *sbinfo;
17852 +
17853 +       sbinfo = kzalloc(sizeof(reiser4_super_info_data),
17854 +                        reiser4_ctx_gfp_mask_get());
17855 +       if (!sbinfo)
17856 +               return RETERR(-ENOMEM);
17857 +
17858 +       super->s_fs_info = sbinfo;
17859 +       super->s_op = NULL;
17860 +
17861 +       ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
17862 +       ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
17863 +
17864 +       mutex_init(&sbinfo->delete_mutex);
17865 +       spin_lock_init(&(sbinfo->guard));
17866 +
17867 +       /* init per-super-block d_cursor resources. NOTE(review): the int
17868 +        * result of this call is ignored - confirm it cannot fail */
17868 +       reiser4_init_super_d_info(super);
17869 +
17870 +       return 0;
17871 +}
17872 +
17873 +/**
17874 + * reiser4_done_fs_info - free reiser4 specific super block
17875 + * @super: super block of filesystem
17876 + *
17877 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
17878 + * frees reiser4_super_info_data and clears super->s_fs_info.
17879 + */
17880 +void reiser4_done_fs_info(struct super_block *super)
17881 +{
17882 +       assert("zam-990", super->s_fs_info != NULL);
17883 +
17884 +       /* release per-super-block d_cursor resources */
17885 +       reiser4_done_super_d_info(super);
17886 +
17887 +       /* make sure that there are no jnodes left */
17888 +       assert("", list_empty(&get_super_private(super)->all_jnodes));
17889 +       assert("", get_current_context()->trans->atom == NULL);
17890 +       reiser4_check_block_counters(super);
17891 +       kfree(super->s_fs_info);
17892 +       super->s_fs_info = NULL;
17893 +}
17894 +
17895 +/* type of option parseable by parse_option() */
17896 +typedef enum {
17897 +       /* value of option is arbitrary string */
17898 +       OPT_STRING,
17899 +
17900 +       /*
17901 +        * option specifies bit in a bitmask. When option is set - bit in
17902 +        * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
17903 +        * dont_load_bitmap, atomic_write.
17904 +        */
17905 +       OPT_BIT,
17906 +
17907 +       /*
17908 +        * value of option is parsed by sscanf() with given format. Examples
17909 +        * are tmgr.atom_max_size=N, tmgr.atom_max_age=N
17910 +        */
17911 +       OPT_FORMAT,
17912 +
17913 +       /*
17914 +        * option can take one of predefined values. Example is onerror=panic or
17915 +        * onerror=remount-ro
17916 +        */
17917 +       OPT_ONEOF,
17918 +} opt_type_t;
17919 +
17920 +#if 0
17921 +struct opt_bitmask_bit {
17922 +       const char *bit_name;
17923 +       int bit_nr;
17924 +};
17925 +#endif
17926 +
17927 +/* description of option parseable by parse_option() */
17928 +struct opt_desc {
17929 +       /*
17930 +        * option name. Parsed portion of string has a form "name=value"
17931 +        * (or just "name" for OPT_BIT options).
17932 +        */
17933 +       const char *name;
17934 +       /* type of option */
17935 +       opt_type_t type;
17936 +       union {
17937 +               /* where to store value of string option (type == OPT_STRING) */
17938 +               char **string;
17939 +               /* description of bits for bit option (type == OPT_BIT) */
17940 +               struct {
17941 +                       int nr;
17942 +                       void *addr;
17943 +               } bit;
17944 +               /* description of format and targets for format option (type
17945 +                * == OPT_FORMAT) */
17946 +               struct {
17947 +                       const char *format;
17948 +                       int nr_args;
17949 +                       void *arg1;
17950 +                       void *arg2;
17951 +                       void *arg3;
17952 +                       void *arg4;
17953 +               } f;
17954 +               struct {        /* allowed values (type == OPT_ONEOF) */
17955 +                       int *result;    /* index of matched ->list entry */
17956 +                       const char *list[10];   /* NULL-terminated */
17957 +               } oneof;
17958 +               struct {        /* not handled by parse_option() */
17959 +                       void *addr;
17960 +                       int nr_bits;
17961 +                       /* struct opt_bitmask_bit *bits; */
17962 +               } bitmask;
17963 +       } u;
17964 +};
17965 +
17966 +/**
17967 + * parse_option - parse one option
17968 + * @opt_string: starting point of parsing
17969 + * @opt: option description
17970 + *
17971 + * foo=bar,
17972 + * ^   ^  ^
17973 + * |   |  +-- replaced to '\0'
17974 + * |   +-- val_start
17975 + * +-- opt_string
17976 + * Handles option according to opt->type. Returns 0, or -EINVAL on bad value.
17977 + */
17978 +static int parse_option(char *opt_string, struct opt_desc *opt)
17979 +{
17980 +       char *val_start;
17981 +       int result;
17982 +       const char *err_msg;
17983 +
17984 +       /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
17985 +
17986 +       val_start = strchr(opt_string, '=');
17987 +       if (val_start != NULL) {
17988 +               *val_start = '\0';
17989 +               ++val_start;
17990 +       }
17991 +
17992 +       err_msg = NULL;
17993 +       result = 0;
17994 +       switch (opt->type) {
17995 +       case OPT_STRING:
17996 +               if (val_start == NULL) {
17997 +                       err_msg = "String arg missing";
17998 +                       result = RETERR(-EINVAL);
17999 +               } else
18000 +                       *opt->u.string = val_start;
18001 +               break;
18002 +       case OPT_BIT:
18003 +               if (val_start != NULL)
18004 +                       err_msg = "Value ignored"; /* NOTE(review): bit not set */
18005 +               else
18006 +                       set_bit(opt->u.bit.nr, opt->u.bit.addr);
18007 +               break;
18008 +       case OPT_FORMAT:
18009 +               if (val_start == NULL) {
18010 +                       err_msg = "Formatted arg missing";
18011 +                       result = RETERR(-EINVAL);
18012 +                       break;
18013 +               }
18014 +               if (sscanf(val_start, opt->u.f.format,
18015 +                          opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
18016 +                          opt->u.f.arg4) != opt->u.f.nr_args) {
18017 +                       err_msg = "Wrong conversion";
18018 +                       result = RETERR(-EINVAL);
18019 +               }
18020 +               break;
18021 +       case OPT_ONEOF:
18022 +               {
18023 +                       int i = 0;
18024 +
18025 +                       if (val_start == NULL) {
18026 +                               err_msg = "Value is missing";
18027 +                               result = RETERR(-EINVAL);
18028 +                               break;
18029 +                       }
18030 +                       err_msg = "Wrong option value";
18031 +                       result = RETERR(-EINVAL);
18032 +                       while (opt->u.oneof.list[i]) {
18033 +                               if (!strcmp(opt->u.oneof.list[i], val_start)) {
18034 +                                       result = 0;
18035 +                                       err_msg = NULL;
18036 +                                       *opt->u.oneof.result = i;
18037 +                                       break;
18038 +                               }
18039 +                               i++;
18040 +                       }
18041 +                       break;
18042 +               }
18043 +       default:
18044 +               wrong_return_value("nikita-2100", "opt -> type");
18045 +               break;
18046 +       }
18047 +       if (err_msg != NULL) {
18048 +               warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
18049 +                       err_msg, opt->name, val_start ? "=" : "",
18050 +                       val_start ? : "");
18051 +       }
18052 +       return result;
18053 +}
18054 +
18055 +/**
18056 + * parse_options - parse reiser4 mount options
18057 + * @opt_string: comma separated option list, split in place (may be NULL)
18058 + * @opts: array of option descriptions
18059 + * @nr_opts: number of elements in @opts
18060 + *
18061 + * Parses comma separated list of mount options; returns 0 or the first error.
18062 + */
18063 +static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
18064 +{
18065 +       int result;
18066 +
18067 +       result = 0;
18068 +       while ((result == 0) && opt_string && *opt_string) {
18069 +               int j;
18070 +               char *next;
18071 +
18072 +               next = strchr(opt_string, ',');
18073 +               if (next != NULL) {
18074 +                       *next = '\0';   /* cut the current option off */
18075 +                       ++next;
18076 +               }
18077 +               for (j = 0; j < nr_opts; ++j) {
18078 +                       if (!strncmp(opt_string, opts[j].name,
18079 +                                    strlen(opts[j].name))) { /* prefix match */
18080 +                               result = parse_option(opt_string, &opts[j]);
18081 +                               break;
18082 +                       }
18083 +               }
18084 +               if (j == nr_opts) {
18085 +                       warning("nikita-2307", "Unrecognized option: \"%s\"",
18086 +                               opt_string);
18087 +                       /* traditionally, -EINVAL is returned on wrong mount
18088 +                          option */
18089 +                       result = RETERR(-EINVAL);
18090 +               }
18091 +               opt_string = next;
18092 +       }
18093 +       return result;
18094 +}
18095 +
18096 +#define NUM_OPT(label, fmt, addr)                              \
18097 +               {                                               \
18098 +                       .name = (label),                        \
18099 +                       .type = OPT_FORMAT,                     \
18100 +                       .u = {                                  \
18101 +                               .f = {                          \
18102 +                                       .format  = (fmt),       \
18103 +                                       .nr_args = 1,           \
18104 +                                       .arg1 = (addr),         \
18105 +                                       .arg2 = NULL,           \
18106 +                                       .arg3 = NULL,           \
18107 +                                       .arg4 = NULL            \
18108 +                               }                               \
18109 +                       }                                       \
18110 +               }
18111 +
18112 +#define SB_FIELD_OPT(field, fmt) NUM_OPT(#field, fmt, &sbinfo->field) /* option name is the field name; needs a local "sbinfo" */
18113 +
18114 +#define BIT_OPT(label, bitnr)                                  \
18115 +       {                                                       \
18116 +               .name = label,                                  \
18117 +               .type = OPT_BIT,                                \
18118 +               .u = {                                          \
18119 +                       .bit = {                                \
18120 +                               .nr = bitnr,                    \
18121 +                               .addr = &sbinfo->fs_flags       \
18122 +                       }                                       \
18123 +               }                                               \
18124 +       }
18125 +
18126 +#define MAX_NR_OPTIONS (30) /* capacity of the opts array in reiser4_init_super_data() */
18127 +
18128 +/**
18129 + * reiser4_init_super_data - initialize reiser4 private super block
18130 + * @super: super block to initialize
18131 + * @opt_string: list of reiser4 mount options
18132 + *
18133 + * Sets various reiser4 parameters to default values. Parses mount options and
18134 + * overwrites default settings.
18135 + */
18136 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
18137 +{
18138 +       int result;
18139 +       struct opt_desc *opts, *p;
18140 +       reiser4_super_info_data *sbinfo = get_super_private(super);
18141 +
18142 +       /* initialize super, export, dentry operations */
18143 +       sbinfo->ops.super = reiser4_super_operations;
18144 +       sbinfo->ops.export = reiser4_export_operations;
18145 +       sbinfo->ops.dentry = reiser4_dentry_operations;
18146 +       super->s_op = &sbinfo->ops.super;
18147 +       super->s_export_op = &sbinfo->ops.export;
18148 +
18149 +       /* initialize transaction manager parameters to default values */
18150 +       sbinfo->tmgr.atom_max_size = totalram_pages / 4;
18151 +       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
18152 +       sbinfo->tmgr.atom_min_size = 256;
18153 +       sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
18154 +
18155 +       /* initialize cbk cache parameter */
18156 +       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
18157 +
18158 +       /* initialize flush parameters */
18159 +       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
18160 +       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
18161 +       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
18162 +       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
18163 +
18164 +       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
18165 +
18166 +       /* preliminary tree initializations */
18167 +       sbinfo->tree.super = super;
18168 +       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
18169 +       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
18170 +       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
18171 +       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
18172 +       rwlock_init(&(sbinfo->tree.tree_lock));
18173 +       spin_lock_init(&(sbinfo->tree.epoch_lock));
18174 +
18175 +       /* initialize default readahead params */
18176 +       sbinfo->ra_params.max = num_physpages / 4;
18177 +       sbinfo->ra_params.flags = 0;
18178 +
18179 +       /* allocate memory for structure describing reiser4 mount options */
18180 +       opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
18181 +                      reiser4_ctx_gfp_mask_get());
18182 +       if (opts == NULL)
18183 +               return RETERR(-ENOMEM);
18184 +
18185 +       /* initialize structure describing reiser4 mount options */
18186 +       p = opts;
18187 +
18188 +#if REISER4_DEBUG
18189 +#  define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) {         \
18190 +               warning("zam-1046", "opt array is overloaded"); break;  \
18191 +       }
18192 +#else
18193 +#   define OPT_ARRAY_CHECK noop
18194 +#endif
18195 +
18196 +#define PUSH_OPT(...)                          \
18197 +do {                                           \
18198 +       struct opt_desc o = __VA_ARGS__;        \
18199 +       OPT_ARRAY_CHECK;                        \
18200 +       *p ++ = o;                              \
18201 +} while (0)
18202 +
18203 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
18204 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
18205 +
18206 +       /*
18207 +        * tmgr.atom_max_size=N
18208 +        * Atoms containing more than N blocks will be forced to commit. N is
18209 +        * decimal.
18210 +        */
18211 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
18212 +       /*
18213 +        * tmgr.atom_max_age=N
18214 +        * Atoms older than N seconds will be forced to commit. N is decimal.
18215 +        */
18216 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
18217 +       /*
18218 +        * tmgr.atom_min_size=N
18219 +        * In committing an atom to free dirty pages, force the atom less than
18220 +        * N in size to fuse with another one.
18221 +        */
18222 +       PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
18223 +       /*
18224 +        * tmgr.atom_max_flushers=N
18225 +        * limit of concurrent flushers for one atom. 0 means no limit.
18226 +        */
18227 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
18228 +       /*
18229 +        * tree.cbk_cache_slots=N
18230 +        * Number of slots in the cbk cache.
18231 +        */
18232 +       PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
18233 +       /*
18234 +        * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
18235 +        * leaf-level blocks it will force them to be relocated.
18236 +        */
18237 +       PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
18238 +       /*
18239 +        * If flush finds can find a block allocation closer than at most
18240 +        * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
18241 +        * position.
18242 +        */
18243 +       PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
18244 +       /*
18245 +        * If we have written this much or more blocks before encountering busy
18246 +        * jnode in flush list - abort flushing hoping that next time we get
18247 +        * called this jnode will be clean already, and we will save some
18248 +        * seeks.
18249 +        */
18250 +       PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
18251 +       /* The maximum number of nodes to scan left on a level during flush. */
18252 +       PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
18253 +       /* preferred IO size */
18254 +       PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
18255 +       /* carry flags used for insertion of new nodes */
18256 +       PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
18257 +       /* carry flags used for insertion of new extents */
18258 +       PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
18259 +       /* carry flags used for paste operations */
18260 +       PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
18261 +       /* carry flags used for insert operations */
18262 +       PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
18263 +
18264 +#ifdef CONFIG_REISER4_BADBLOCKS
18265 +       /*
18266 +        * Alternative master superblock location in case if it's original
18267 +        * location is not writeable/accessable. This is offset in BYTES.
18268 +        */
18269 +       PUSH_SB_FIELD_OPT(altsuper, "%lu");
18270 +#endif
18271 +
18272 +       /* turn on BSD-style gid assignment */
18273 +       PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
18274 +       /* turn on 32 bit times */
18275 +       PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
18276 +       /*
18277 +        * Don't load all bitmap blocks at mount time, it is useful for
18278 +        * machines with tiny RAM and large disks.
18279 +        */
18280 +       PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
18281 +       /* disable transaction commits during write() */
18282 +       PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
18283 +       /* disable use of write barriers in the reiser4 log writer. */
18284 +       PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
18285 +
18286 +       PUSH_OPT(
18287 +       {
18288 +               /*
18289 +                * tree traversal readahead parameters:
18290 +                * -o readahead:MAXNUM:FLAGS
18291 +                * MAXNUM - max number fo nodes to request readahead for: -1UL
18292 +                * will set it to max_sane_readahead()
18293 +                * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
18294 +                * CONTINUE_ON_PRESENT
18295 +                */
18296 +               .name = "readahead",
18297 +               .type = OPT_FORMAT,
18298 +               .u = {
18299 +                       .f = {
18300 +                               .format = "%u:%u",
18301 +                               .nr_args = 2,
18302 +                               .arg1 = &sbinfo->ra_params.max,
18303 +                               .arg2 = &sbinfo->ra_params.flags,
18304 +                               .arg3 = NULL,
18305 +                               .arg4 = NULL
18306 +                       }
18307 +               }
18308 +       }
18309 +       );
18310 +
18311 +       /* What to do in case of fs error */
18312 +       PUSH_OPT(
18313 +       {
18314 +               .name = "onerror",
18315 +               .type = OPT_ONEOF,
18316 +               .u = {
18317 +                       .oneof = {
18318 +                               .result = &sbinfo->onerror,
18319 +                               .list = {
18320 +                                       "panic", "remount-ro", NULL
18321 +                               },
18322 +                       }
18323 +               }
18324 +       }
18325 +       );
18326 +
18327 +       /* modify default settings to values set by mount options */
18328 +       result = parse_options(opt_string, opts, p - opts);
18329 +       kfree(opts);
18330 +       if (result != 0)
18331 +               return result;
18332 +
18333 +       /* correct settings to sanity values */
18334 +       sbinfo->tmgr.atom_max_age *= HZ;
18335 +       if (sbinfo->tmgr.atom_max_age <= 0)
18336 +               /* overflow */
18337 +               sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
18338 +
18339 +       /* round optimal io size up to 512 bytes */
18340 +       sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
18341 +       sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
18342 +       if (sbinfo->optimal_io_size == 0) {
18343 +               warning("nikita-2497", "optimal_io_size is too small");
18344 +               return RETERR(-EINVAL);
18345 +       }
18346 +       return result;
18347 +}
18348 +
18349 +/**
18350 + * reiser4_init_read_super - read reiser4 master super block
18351 + * @super: super block to fill
18352 + * @silent: if 0 - print warnings
18353 + *
18354 + * Reads master super block from the predefined or altsuper location, sets the
18355 + * disk format plugin and diskmap block. Returns 0, -EIO or -EINVAL.
18356 + */
18357 +int reiser4_init_read_super(struct super_block *super, int silent)
18358 +{
18359 +       struct buffer_head *super_bh;
18360 +       struct reiser4_master_sb *master_sb;
18361 +       reiser4_super_info_data *sbinfo = get_super_private(super);
18362 +       unsigned long blocksize;
18363 +
18364 + read_super_block:
18365 +#ifdef CONFIG_REISER4_BADBLOCKS
18366 +       if (sbinfo->altsuper)
18367 +               /*
18368 +                * read reiser4 master super block at position specified by
18369 +                * mount option
18370 +                */
18371 +               super_bh = sb_bread(super,
18372 +                                   (sector_t)(sbinfo->altsuper / super->s_blocksize));
18373 +       else
18374 +#endif
18375 +               /* read reiser4 master super block at 16-th 4096 block */
18376 +               super_bh = sb_bread(super,
18377 +                                   (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
18378 +       if (!super_bh)
18379 +               return RETERR(-EIO);
18380 +
18381 +       master_sb = (struct reiser4_master_sb *)super_bh->b_data;
18382 +       /* check reiser4 magic string */
18383 +       if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
18384 +                    sizeof(REISER4_SUPER_MAGIC_STRING))) {
18385 +               /* reiser4 master super block contains filesystem blocksize */
18386 +               blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
18387 +
18388 +               if (blocksize != PAGE_CACHE_SIZE) {
18389 +                       /*
18390 +                        * currently reiser4's blocksize must be equal to
18391 +                        * pagesize
18392 +                        */
18393 +                       if (!silent)
18394 +                               warning("nikita-2609",
18395 +                                       "%s: wrong block size %lu\n", super->s_id,
18396 +                                       blocksize);
18397 +                       brelse(super_bh);
18398 +                       return RETERR(-EINVAL);
18399 +               }
18400 +               if (blocksize != super->s_blocksize) {
18401 +                       /*
18402 +                        * filesystem uses different blocksize. Reread master
18403 +                        * super block with correct blocksize
18404 +                        */
18405 +                       brelse(super_bh);
18406 +                       if (!sb_set_blocksize(super, (int)blocksize))
18407 +                               return RETERR(-EINVAL);
18408 +                       goto read_super_block;
18409 +               }
18410 +
18411 +               sbinfo->df_plug =
18412 +                       disk_format_plugin_by_id(
18413 +                               le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
18414 +               if (sbinfo->df_plug == NULL) {
18415 +                       if (!silent)
18416 +                               warning("nikita-26091",
18417 +                                       "%s: unknown disk format plugin %d\n",
18418 +                                       super->s_id,
18419 +                                       le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
18420 +                       brelse(super_bh);
18421 +                       return RETERR(-EINVAL);
18422 +               }
18423 +               sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
18424 +               brelse(super_bh);
18425 +               return 0;
18426 +       }
18427 +
18428 +       /* there is no reiser4 on the device */
18429 +       if (!silent)
18430 +               warning("nikita-2608",
18431 +                       "%s: wrong master super block magic", super->s_id);
18432 +       brelse(super_bh);
18433 +       return RETERR(-EINVAL);
18434 +}
18435 +
18436 +static struct { /* one default plugin, named by (type, id), per plugin set member */
18437 +       reiser4_plugin_type type;
18438 +       reiser4_plugin_id id;
18439 +} default_plugins[PSET_LAST] = { /* indexed by pset member, see get_default_plugin() */
18440 +       [PSET_FILE] = {
18441 +               .type = REISER4_FILE_PLUGIN_TYPE,
18442 +               .id = UNIX_FILE_PLUGIN_ID
18443 +       },
18444 +       [PSET_DIR] = {
18445 +               .type = REISER4_DIR_PLUGIN_TYPE,
18446 +               .id = HASHED_DIR_PLUGIN_ID
18447 +       },
18448 +       [PSET_HASH] = {
18449 +               .type = REISER4_HASH_PLUGIN_TYPE,
18450 +               .id = R5_HASH_ID
18451 +       },
18452 +       [PSET_FIBRATION] = {
18453 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
18454 +               .id = FIBRATION_DOT_O
18455 +       },
18456 +       [PSET_PERM] = {
18457 +               .type = REISER4_PERM_PLUGIN_TYPE,
18458 +               .id = NULL_PERM_ID
18459 +       },
18460 +       [PSET_FORMATTING] = {
18461 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
18462 +               .id = SMALL_FILE_FORMATTING_ID
18463 +       },
18464 +       [PSET_SD] = {
18465 +               .type = REISER4_ITEM_PLUGIN_TYPE,
18466 +               .id = STATIC_STAT_DATA_ID
18467 +       },
18468 +       [PSET_DIR_ITEM] = {
18469 +               .type = REISER4_ITEM_PLUGIN_TYPE,
18470 +               .id = COMPOUND_DIR_ID
18471 +       },
18472 +       [PSET_CIPHER] = {
18473 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
18474 +               .id = NONE_CIPHER_ID
18475 +       },
18476 +       [PSET_DIGEST] = {
18477 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
18478 +               .id = SHA256_32_DIGEST_ID
18479 +       },
18480 +       [PSET_COMPRESSION] = {
18481 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
18482 +               .id = LZO1_COMPRESSION_ID
18483 +       },
18484 +       [PSET_COMPRESSION_MODE] = {
18485 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
18486 +               .id = CONVX_COMPRESSION_MODE_ID
18487 +       },
18488 +       [PSET_CLUSTER] = {
18489 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
18490 +               .id = CLUSTER_64K_ID
18491 +       },
18492 +       [PSET_CREATE] = {
18493 +               .type = REISER4_FILE_PLUGIN_TYPE,
18494 +               .id = UNIX_FILE_PLUGIN_ID
18495 +       }
18496 +};
18497 +
18498 +/* look up the default plugin for pset member @memb; @memb is not range-checked */
18499 +reiser4_plugin *get_default_plugin(pset_member memb)
18500 +{
18501 +       return plugin_by_id(default_plugins[memb].type,
18502 +                           default_plugins[memb].id);
18503 +}
18504 +
18505 +/**
18506 + * reiser4_init_root_inode - obtain inode of root directory
18507 + * @super: super block of filesystem
18508 + *
18509 + * Obtains inode of root directory (reading it from disk), initializes plugin
18510 + * set if it was not initialized. Returns 0 or a negative error code.
18511 + */
18512 +int reiser4_init_root_inode(struct super_block *super)
18513 +{
18514 +       reiser4_super_info_data *sbinfo = get_super_private(super);
18515 +       struct inode *inode;
18516 +       int result = 0;
18517 +
18518 +       inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
18519 +       if (IS_ERR(inode))
18520 +               return RETERR(PTR_ERR(inode));
18521 +
18522 +       super->s_root = d_alloc_root(inode);
18523 +       if (!super->s_root) {
18524 +               iput(inode);
18525 +               return RETERR(-ENOMEM);
18526 +       }
18527 +
18528 +       super->s_root->d_op = &sbinfo->ops.dentry;
18529 +
18530 +       if (!is_inode_loaded(inode)) {
18531 +               pset_member memb;
18532 +               plugin_set *pset;
18533 +
18534 +               pset = reiser4_inode_data(inode)->pset;
18535 +               for (memb = 0; memb < PSET_LAST; ++memb) {
18536 +
18537 +                       if (aset_get(pset, memb) != NULL)
18538 +                               continue;
18539 +
18540 +                       result = grab_plugin_pset(inode, NULL, memb);
18541 +                       if (result != 0)
18542 +                               break;
18543 +
18544 +                       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18545 +               }
18546 +
18547 +               if (result == 0) {
18548 +                       if (REISER4_DEBUG) {
18549 +                               for (memb = 0; memb < PSET_LAST; ++memb)
18550 +                                       assert("nikita-3500",
18551 +                                              aset_get(pset, memb) != NULL);
18552 +                       }
18553 +               } else
18554 +                       warning("nikita-3448", "Cannot set plugins of root: %i",
18555 +                               result);
18556 +               reiser4_iget_complete(inode);
18557 +
18558 +               /* As the default pset kept in the root dir may have been changed
18559 +                  (length is unknown), call update_sd. */
18560 +               if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
18561 +                       result = reiser4_grab_space( /* NOTE(review): may overwrite a grab_plugin_pset() error - confirm */
18562 +                               inode_file_plugin(inode)->estimate.update(inode),
18563 +                               BA_CAN_COMMIT);
18564 +
18565 +                       if (result == 0)
18566 +                               result = reiser4_update_sd(inode);
18567 +
18568 +                       all_grabbed2free();
18569 +               }
18570 +       }
18571 +
18572 +       super->s_maxbytes = MAX_LFS_FILESIZE;
18573 +       return result;
18574 +}
18575 +
18576 +/*
18577 + * Local variables:
18578 + * c-indentation-style: "K&R"
18579 + * mode-name: "LC"
18580 + * c-basic-offset: 8
18581 + * tab-width: 8
18582 + * fill-column: 79
18583 + * End:
18584 + */
18585 diff -puN /dev/null fs/reiser4/inode.c
18586 --- /dev/null
18587 +++ a/fs/reiser4/inode.c
18588 @@ -0,0 +1,711 @@
18589 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18590 +   reiser4/README */
18591 +
18592 +/* Inode specific operations. */
18593 +
18594 +#include "forward.h"
18595 +#include "debug.h"
18596 +#include "key.h"
18597 +#include "kassign.h"
18598 +#include "coord.h"
18599 +#include "seal.h"
18600 +#include "dscale.h"
18601 +#include "plugin/item/item.h"
18602 +#include "plugin/security/perm.h"
18603 +#include "plugin/plugin.h"
18604 +#include "plugin/object.h"
18605 +#include "znode.h"
18606 +#include "vfs_ops.h"
18607 +#include "inode.h"
18608 +#include "super.h"
18609 +#include "reiser4.h"
18610 +
18611 +#include <linux/fs.h>          /* for struct super_block,  address_space */
18612 +
18613 +/* return the reiser4 tree of the super block (inode->i_sb) @inode belongs to */
18614 +/* Audited by: green(2002.06.17) */
18615 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode/* inode queried*/)
18616 +{
18617 +       assert("nikita-256", inode != NULL);
18618 +       assert("nikita-257", inode->i_sb != NULL);
18619 +       return reiser4_get_tree(inode->i_sb);
18620 +}
18621 +
18622 +/* return pointer to the reiser4-specific flags word kept in @inode's reiser4 data */
18623 +static inline unsigned long *inode_flags(const struct inode *const inode)
18624 +{
18625 +       assert("nikita-2842", inode != NULL);
18626 +       return &reiser4_inode_data(inode)->flags;
18627 +}
18628 +
18629 +/* set reiser4-specific flag @f in @inode: set_bit() on the word from inode_flags() */
18630 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
18631 +{
18632 +       assert("nikita-2248", inode != NULL);
18633 +       set_bit((int)f, inode_flags(inode));
18634 +}
18635 +
18636 +/* clear reiser4-specific flag @f in @inode: clear_bit() on the word from inode_flags() */
18637 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
18638 +{
18639 +       assert("nikita-2250", inode != NULL);
18640 +       clear_bit((int)f, inode_flags(inode));
18641 +}
18642 +
18643 +/* true if reiser4-specific flag @f is set in @inode; returns the test_bit() result */
18644 +int reiser4_inode_get_flag(const struct inode *inode,
18645 +                          reiser4_file_plugin_flags f)
18646 +{
18647 +       assert("nikita-2251", inode != NULL);
18648 +       return test_bit((int)f, inode_flags(inode));
18649 +}
18650 +
18651 +/* convert oid to inode number: a plain cast of @oid to ino_t, nothing else */
18652 +ino_t oid_to_ino(oid_t oid)
18653 +{
18654 +       return (ino_t) oid;
18655 +}
18656 +
18657 +/* convert oid to user visible inode number */
18658 +ino_t oid_to_uino(oid_t oid)
18659 +{
18660 +       /* reiser4 object is uniquely identified by oid which is 64 bit
18661 +          quantity. Kernel in-memory inode is indexed (in the hash table) by
18662 +          32 bit i_ino field, but this is not a problem, because there is a
18663 +          way to further distinguish inodes with identical inode numbers
18664 +          (find_actor supplied to iget()).
18665 +
18666 +          But user space expects unique 32 bit inode number. Obviously this
18667 +          is impossible. Work-around is to somehow hash oid into user visible
18668 +          inode number.
18669 +        */
18670 +       oid_t max_ino = (ino_t) ~0; /* all-ones ino_t: its maximum, assuming ino_t is unsigned */
18671 +
18672 +       if (REISER4_INO_IS_OID || (oid <= max_ino))
18673 +               return oid;
18674 +       else
18675 +               /* this is remotely similar to algorithm used to find next pid
18676 +                  to use for process: after wrap-around start from some
18677 +                  offset rather than from 0. Idea is that there are some long
18678 +                  living objects with which we don't want to collide.
18679 +                */
18680 +               return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
18681 +}
18682 +
18683 +/* check that "inode" is on reiser4 file-system; a NULL inode yields 0 */
18684 +int is_reiser4_inode(const struct inode *inode/* inode queried */)
18685 +{
18686 +       return inode != NULL && is_reiser4_super(inode->i_sb);
18687 +}
18688 +
18689 +/* Maximal length of a name that can be stored in directory @inode.
18690 +   Uses the directory item plugin's max_name_len method if set, else 255.
18691 +   This is used in check during file creation and lookup. */
18692 +int reiser4_max_filename_len(const struct inode *inode/* inode queried */)
18693 +{
18694 +       assert("nikita-287", is_reiser4_inode(inode));
18695 +       assert("nikita-1710", inode_dir_item_plugin(inode));
18696 +       if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
18697 +               return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
18698 +       else
18699 +               return 255;
18700 +}
18701 +
18702 +#if REISER4_USE_COLLISION_LIMIT
18703 +/* Maximal number of hash collisions for directory @dir (plugin.max_collisions). */
18704 +int max_hash_collisions(const struct inode *dir/* inode queried */)
18705 +{
18706 +       assert("nikita-1711", dir != NULL);
18707 +       return reiser4_inode_data(dir)->plugin.max_collisions;
18708 +}
18709 +#endif  /*  REISER4_USE_COLLISION_LIMIT  */
18710 +
18711 +/* Install inode, file and address_space operations on @inode according to
18712 +   its mode. Returns 0, or -EINVAL (inode is made bad) for an unknown type. */
18713 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
18714 +                   reiser4_object_create_data * data   /* parameters to create
18715 +                                                        * object */ )
18716 +{
18717 +       reiser4_super_info_data *sinfo;
18718 +       file_plugin *fplug;
18719 +       dir_plugin *dplug;
18720 +
18721 +       fplug = inode_file_plugin(inode);
18722 +       dplug = inode_dir_plugin(inode);
18723 +
18724 +       sinfo = get_super_private(inode->i_sb); /* NOTE(review): sinfo is not used below */
18725 +
18726 +       switch (inode->i_mode & S_IFMT) {
18727 +       case S_IFSOCK:
18728 +       case S_IFBLK:
18729 +       case S_IFCHR:
18730 +       case S_IFIFO:
18731 +               {
18732 +                       dev_t rdev;     /* to keep gcc happy */
18733 +
18734 +                       assert("vs-46", fplug != NULL);
18735 +                       /* ugly hack with rdev: taken from the inode when no
18736 +                          create data is given, else from @data */
18737 +                       if (data == NULL) {
18738 +                               rdev = inode->i_rdev;
18739 +                               inode->i_rdev = 0;
18740 +                       } else
18741 +                               rdev = data->rdev;
18742 +                       inode->i_blocks = 0;
18743 +                       assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
18744 +                       inode->i_op = file_plugins[fplug->h.id].inode_ops;
18745 +                       /* initialize inode->i_fop and inode->i_rdev for block
18746 +                          and char devices */
18747 +                       init_special_inode(inode, inode->i_mode, rdev);
18748 +                       /* all address space operations are null */
18749 +                       inode->i_mapping->a_ops =
18750 +                           file_plugins[fplug->h.id].as_ops;
18751 +                       break;
18752 +               }
18753 +       case S_IFLNK:
18754 +               assert("vs-46", fplug != NULL);
18755 +               assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
18756 +               inode->i_op = file_plugins[fplug->h.id].inode_ops;
18757 +               inode->i_fop = NULL;
18758 +               /* all address space operations are null */
18759 +               inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
18760 +               break;
18761 +       case S_IFDIR:
18762 +               assert("vs-46", dplug != NULL);
18763 +               assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
18764 +                                dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
18765 +               inode->i_op = dir_plugins[dplug->h.id].inode_ops;
18766 +               inode->i_fop = dir_plugins[dplug->h.id].file_ops;
18767 +               inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
18768 +               break;
18769 +       case S_IFREG:
18770 +               assert("vs-46", fplug != NULL);
18771 +               assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
18772 +                                fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
18773 +               inode->i_op = file_plugins[fplug->h.id].inode_ops;
18774 +               inode->i_fop = file_plugins[fplug->h.id].file_ops;
18775 +               inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
18776 +               break;
18777 +       default:
18778 +               warning("nikita-291", "wrong file mode: %o for %llu",
18779 +                       inode->i_mode,
18780 +                       (unsigned long long)get_inode_oid(inode));
18781 +               reiser4_make_bad_inode(inode);
18782 +               return RETERR(-EINVAL);
18783 +       }
18784 +       return 0;
18785 +}
18785 +
18786 +/* Initialize inode from the stat-data item at @coord and install its ops.
18787 +   Called with inode locked. Returns 0 or an error; inode stays locked. */
18788 +static int init_inode(struct inode *inode /* inode to initialise */ ,
18789 +                     coord_t *coord/* coord of stat data */)
18790 +{
18791 +       int result;
18792 +       item_plugin *iplug;
18793 +       void *body;
18794 +       int length;
18795 +       reiser4_inode *state;
18796 +
18797 +       assert("nikita-292", coord != NULL);
18798 +       assert("nikita-293", inode != NULL);
18799 +
18800 +       coord_clear_iplug(coord);
18801 +       result = zload(coord->node);
18802 +       if (result)
18803 +               return result;
18804 +       iplug = item_plugin_by_coord(coord);
18805 +       body = item_body_by_coord(coord);
18806 +       length = item_length_by_coord(coord);
18807 +
18808 +       assert("nikita-295", iplug != NULL);
18809 +       assert("nikita-296", body != NULL);
18810 +       assert("nikita-297", length > 0);
18811 +
18812 +       /* inode is under I_LOCK now */
18813 +
18814 +       state = reiser4_inode_data(inode);
18815 +       /* load sd content via sd plugin; PSET_SD is recorded even on error */
18816 +       result = iplug->s.sd.init_inode(inode, body, length);
18817 +       set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
18818 +       if (result == 0) {
18819 +               result = setup_inode_ops(inode, NULL);
18820 +               if (result == 0 && inode->i_sb->s_root &&
18821 +                   inode->i_sb->s_root->d_inode)
18822 +                       result = finish_pset(inode);
18823 +       }
18824 +       zrelse(coord->node);    /* pairs with zload() above */
18825 +       return result;
18826 +}
18827 +
18828 +/* Read `inode' from the disk (this was previously reiserfs_read_inode2()):
18829 +   find its stat-data by @key (@silent is passed on to lookup_sd()), load it
18830 +   with init_inode() and run the plugin hooks. On error the inode is made bad.
18831 +   Must be called with inode locked. Return inode still locked.
18832 +*/
18833 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
18834 +                     const reiser4_key * key /* key of stat data */ ,
18835 +                     int silent)
18836 +{
18837 +       int result;
18838 +       lock_handle lh;
18839 +       reiser4_inode *info;
18840 +       coord_t coord;
18841 +
18842 +       assert("nikita-298", inode != NULL);
18843 +       assert("nikita-1945", !is_inode_loaded(inode));
18844 +
18845 +       info = reiser4_inode_data(inode);
18846 +       assert("nikita-300", info->locality_id != 0);
18847 +
18848 +       coord_init_zero(&coord);
18849 +       init_lh(&lh);
18850 +       /* locate stat-data in a tree and return znode locked */
18851 +       result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
18852 +       assert("nikita-301", !is_inode_loaded(inode));
18853 +       if (result == 0) {
18854 +               /* use stat-data plugin to load sd into inode. */
18855 +               result = init_inode(inode, &coord);
18856 +               if (result == 0) {
18857 +                       /* initialize stat-data seal, remember sd coord */
18858 +                       spin_lock_inode(inode);
18859 +                       reiser4_seal_init(&info->sd_seal, &coord, key);
18860 +                       info->sd_coord = coord;
18861 +                       spin_unlock_inode(inode);
18862 +
18863 +                       /* call file plugin's method (if it has one) to
18864 +                        * initialize plugin specific part of inode */
18865 +                       if (inode_file_plugin(inode)->init_inode_data)
18866 +                               inode_file_plugin(inode)->init_inode_data(inode,
18867 +                                                                         NULL,
18868 +                                                                         0);
18869 +                       /* load detached directory cursors for stateless
18870 +                        * directory readers (NFS). */
18871 +                       reiser4_load_cursors(inode);
18872 +
18873 +                       /* Check the opened inode for consistency. */
18874 +                       result =
18875 +                           get_super_private(inode->i_sb)->df_plug->
18876 +                           check_open(inode);
18877 +               }
18878 +       }
18879 +       /* lookup_sd() does not release the coord: the znode has to stay
18880 +          read-locked while init_inode() accesses the stat-data fields.
18881 +          Release the lock handle only now. */
18882 +       done_lh(&lh);
18883 +
18884 +       if (result != 0)
18885 +               reiser4_make_bad_inode(inode);
18886 +       return result;
18887 +}
18888 +
18889 +/* iget5_locked() "set" callback: seed a new inode with oid and locality. */
18890 +static int init_locked_inode(struct inode *inode /* new inode */ ,
18891 +                            void *opaque       /* key of stat data passed to
18892 +                                               * the iget5_locked as cookie */)
18893 +{
18894 +       reiser4_key *sd_key = opaque;
18895 +
18896 +       assert("nikita-1995", inode != NULL);
18897 +       assert("nikita-1996", opaque != NULL);
18898 +       /* these two values are what reiser4_inode_find_actor() compares */
18899 +       set_inode_oid(inode, get_key_objectid(sd_key));
18900 +       reiser4_inode_data(inode)->locality_id = get_key_locality(sd_key);
18901 +       return 0;
18902 +}
18903 +
18904 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to
18905 +   iget5_locked().
18906 +
18907 +   This function is called by iget5_locked() to distinguish reiser4 inodes
18908 +   having the same inode numbers. Such inodes can only exist due to some error
18909 +   condition. One of them should be bad. Inodes with identical inode numbers
18910 +   (objectids) are distinguished by their packing locality.
18911 +   Returns non-zero if @inode matches the stat-data key passed in @opaque.
18912 +*/
18913 +static int reiser4_inode_find_actor(struct inode *inode        /* inode from hash table
18914 +                                                        * to check */ ,
18915 +                                   void *opaque        /* "cookie" passed to
18916 +                                                        * iget5_locked(). This
18917 +                                                        * is stat-data key */)
18918 +{
18919 +       reiser4_key *key;
18920 +
18921 +       key = opaque;
18922 +       return
18923 +           /* oid is normally unique, so first term is usually enough. */
18924 +           get_inode_oid(inode) == get_key_objectid(key) &&
18925 +           /*
18926 +            * also, locality should be checked, but locality is stored in
18927 +            * the reiser4-specific part of the inode, and actor can be
18928 +            * called against arbitrary inode that happened to be in this
18929 +            * hash chain. Hence we first have to check that this is
18930 +            * reiser4 inode at least. is_reiser4_inode() is probably too
18931 +            * early to call, as inode may have ->i_op not yet
18932 +            * initialised.
18933 +            */
18934 +           is_reiser4_super(inode->i_sb) &&
18935 +           /*
18936 +            * usually objectid is unique, but pseudo files use counter to
18937 +            * generate objectid. All pseudo files are placed into special
18938 +            * (otherwise unused) locality.
18939 +            */
18940 +           reiser4_inode_data(inode)->locality_id == get_key_locality(key);
18941 +}
18942 +
18943 +/* hook for kmem_cache_create: initialises the ->loading mutex of @info */
18944 +void loading_init_once(reiser4_inode * info)
18945 +{
18946 +       mutex_init(&info->loading);
18947 +}
18948 +
18949 +/* for reiser4_alloc_inode: debug check that ->loading is not locked */
18950 +void loading_alloc(reiser4_inode * info)
18951 +{
18952 +       assert("vs-1717", !mutex_is_locked(&info->loading));
18953 +}
18954 +
18955 +/* for reiser4_destroy: debug check that ->loading is not locked */
18956 +void loading_destroy(reiser4_inode * info)
18957 +{
18958 +       assert("vs-1717a", !mutex_is_locked(&info->loading));
18959 +}
18960 +
18961 +static void loading_begin(reiser4_inode * info)
18962 +{
18963 +       mutex_lock(&info->loading);     /* released by loading_end() */
18964 +}
18965 +
18966 +static void loading_end(reiser4_inode * info)
18967 +{
18968 +       mutex_unlock(&info->loading);   /* taken by loading_begin() */
18969 +}
18970 +
18971 +/**
18972 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
18973 + * @super: super block of filesystem
18974 + * @key: key of inode's stat-data
18975 + * @silent: passed through unchanged to read_inode() and on to lookup_sd()
18976 + *
18977 + * This is our helper function a la iget(). It is called by
18978 + * lookup_common() and reiser4_read_super(). Returns the inode or an ERR_PTR();
18979 + * a freshly read inode keeps ->loading held until reiser4_iget_complete().
18980 + */
18981 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
18982 +                          int silent)
18983 +{
18984 +       struct inode *inode;
18985 +       int result;
18986 +       reiser4_inode *info;
18987 +
18988 +       assert("nikita-302", super != NULL);
18989 +       assert("nikita-303", key != NULL);
18990 +
18991 +       result = 0;
18992 +
18993 +       /* call iget5_locked(): this will either find the inode in cache or
18994 +          return a new one that only init_locked_inode() has touched */
18995 +       inode = iget5_locked(super,
18996 +                            (unsigned long)get_key_objectid(key),
18997 +                            reiser4_inode_find_actor,
18998 +                            init_locked_inode, (reiser4_key *) key);
18999 +       if (inode == NULL)
19000 +               return ERR_PTR(RETERR(-ENOMEM));
19001 +       if (is_bad_inode(inode)) {
19002 +               warning("nikita-304", "Bad inode found");
19003 +               reiser4_print_key("key", key);
19004 +               iput(inode);
19005 +               return ERR_PTR(RETERR(-EIO));
19006 +       }
19007 +
19008 +       info = reiser4_inode_data(inode);
19009 +
19010 +       /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
19011 +          loaded and initialized inode from just allocated inode. If
19012 +          REISER4_LOADED bit is not set, reiser4_iget() completes loading under
19013 +          info->loading.  The place in reiser4 which uses not initialized inode
19014 +          is the reiser4 repacker, see repacker-related functions in
19015 +          plugin/item/extent.c */
19016 +       if (!is_inode_loaded(inode)) {
19017 +               loading_begin(info);
19018 +               if (!is_inode_loaded(inode)) {
19019 +                       /* locking: iget5_locked returns locked inode */
19020 +                       assert("nikita-1941", !is_inode_loaded(inode));
19021 +                       assert("nikita-1949",
19022 +                              reiser4_inode_find_actor(inode,
19023 +                                                       (reiser4_key *) key));
19024 +                       /* now, inode has objectid as ->i_ino and locality in
19025 +                          reiser4-specific part. This is enough for
19026 +                          read_inode() to read stat data from the disk */
19027 +                       result = read_inode(inode, key, silent);
19028 +               } else
19029 +                       loading_end(info);
19030 +       }
19031 +
19032 +       if (inode->i_state & I_NEW)
19033 +               unlock_new_inode(inode);
19034 +
19035 +       if (is_bad_inode(inode)) {
19036 +               assert("vs-1717", result != 0);
19037 +               loading_end(info);
19038 +               iput(inode);
19039 +               inode = ERR_PTR(result);
19040 +       } else if (REISER4_DEBUG) {
19041 +               reiser4_key found_key;
19042 +
19043 +               assert("vs-1717", result == 0);
19044 +               build_sd_key(inode, &found_key);
19045 +               if (!keyeq(&found_key, key)) {
19046 +                       warning("nikita-305", "Wrong key in sd");
19047 +                       reiser4_print_key("sought for", key);
19048 +                       reiser4_print_key("found", &found_key);
19049 +               }
19050 +               if (inode->i_nlink == 0) {
19051 +                       warning("nikita-3559", "Unlinked inode found: %llu\n",
19052 +                               (unsigned long long)get_inode_oid(inode));
19053 +               }
19054 +       }
19055 +       return inode;
19056 +}
19057 +
19058 +/* Finish what reiser4_iget() started: once the caller has completed inode
19059 + * initialization, mark it REISER4_LOADED and drop the ->loading mutex. */
19060 +void reiser4_iget_complete(struct inode *inode)
19061 +{
19062 +       assert("zam-988", is_reiser4_inode(inode));
19063 +
19064 +       if (is_inode_loaded(inode))
19065 +               return;
19066 +       reiser4_inode_set_flag(inode, REISER4_LOADED);
19067 +       loading_end(reiser4_inode_data(inode));
19068 +}
19069 +
19070 +void reiser4_make_bad_inode(struct inode *inode)
19071 +{
19072 +       assert("nikita-1934", inode != NULL);
19073 +
19074 +       /* a bad inode must not look loaded: clear LOADED bit first, */
19075 +       reiser4_inode_clr_flag(inode, REISER4_LOADED);
19076 +       /* then hand it to the generic VFS helper */
19077 +       make_bad_inode(inode);
19078 +}
19079 +
19080 +file_plugin *inode_file_plugin(const struct inode *inode)
19081 +{
19082 +       assert("nikita-1997", inode != NULL);
19083 +       return reiser4_inode_data(inode)->pset->file;
19084 +}
19085 +
19086 +dir_plugin *inode_dir_plugin(const struct inode *inode)
19087 +{
19088 +       assert("nikita-1998", inode != NULL);
19089 +       return reiser4_inode_data(inode)->pset->dir;
19090 +}
19091 +
19092 +formatting_plugin *inode_formatting_plugin(const struct inode *inode)
19093 +{
19094 +       assert("nikita-2000", inode != NULL);
19095 +       return reiser4_inode_data(inode)->pset->formatting;
19096 +}
19097 +
19098 +hash_plugin *inode_hash_plugin(const struct inode *inode)
19099 +{
19100 +       assert("nikita-2001", inode != NULL);
19101 +       return reiser4_inode_data(inode)->pset->hash;
19102 +}
19103 +
19104 +fibration_plugin *inode_fibration_plugin(const struct inode *inode)
19105 +{
19106 +       assert("nikita-2001", inode != NULL);
19107 +       return reiser4_inode_data(inode)->pset->fibration;
19108 +}
19109 +
19110 +cipher_plugin *inode_cipher_plugin(const struct inode *inode)
19111 +{
19112 +       assert("edward-36", inode != NULL);
19113 +       return reiser4_inode_data(inode)->pset->cipher;
19114 +}
19115 +
19116 +compression_plugin *inode_compression_plugin(const struct inode *inode)
19117 +{
19118 +       assert("edward-37", inode != NULL);
19119 +       return reiser4_inode_data(inode)->pset->compression;
19120 +}
19121 +
19122 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
19123 +                                                      inode)
19124 +{
19125 +       assert("edward-1330", inode != NULL);
19126 +       return reiser4_inode_data(inode)->pset->compression_mode;
19127 +}
19128 +
19129 +cluster_plugin *inode_cluster_plugin(const struct inode *inode)
19130 +{
19131 +       assert("edward-1328", inode != NULL);
19132 +       return reiser4_inode_data(inode)->pset->cluster;
19133 +}
19134 +
19135 +file_plugin *inode_create_plugin(const struct inode *inode)
19136 +{
19137 +       assert("edward-1329", inode != NULL);
19138 +       return reiser4_inode_data(inode)->pset->create;
19139 +}
19140 +
19141 +digest_plugin *inode_digest_plugin(const struct inode *inode)
19142 +{
19143 +       assert("edward-86", inode != NULL);
19144 +       return reiser4_inode_data(inode)->pset->digest;
19145 +}
19146 +
19147 +item_plugin *inode_sd_plugin(const struct inode *inode)
19148 +{
19149 +       assert("vs-534", inode != NULL);
19150 +       return reiser4_inode_data(inode)->pset->sd;
19151 +}
19152 +
19153 +item_plugin *inode_dir_item_plugin(const struct inode *inode)
19154 +{
19155 +       assert("vs-534", inode != NULL);
19156 +       return reiser4_inode_data(inode)->pset->dir_item;
19157 +}
19158 +
19159 +file_plugin *child_create_plugin(const struct inode *inode)
19160 +{
19161 +       assert("edward-1329", inode != NULL);
19162 +       return reiser4_inode_data(inode)->hset->create;
19163 +}
19164 +
19165 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
19166 +{
19167 +       /* Mark stat-data extension @ext as used by @inode. Caller must hold
19168 +          the inode spin lock (asserted below). */
19169 +       assert("nikita-2716", inode != NULL);
19170 +       assert("nikita-2717", ext < LAST_SD_EXTENSION);
19171 +       assert("nikita-3491", spin_inode_is_locked(inode));
19172 +
19173 +       /* ->extmask is __u64: shift a 64-bit one, an int breaks at bit 31 */
19174 +       reiser4_inode_data(inode)->extmask |= (__u64)1 << ext;
19175 +       /* force re-calculation of stat-data length on next call to
19176 +          update_sd(). */
19177 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
19178 +}
19179 +
19180 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
19181 +{
19182 +       /* Mark stat-data extension @ext as no longer used by @inode. Caller
19183 +          must hold the inode spin lock (asserted below). */
19184 +       assert("vpf-1926", inode != NULL);
19185 +       assert("vpf-1927", ext < LAST_SD_EXTENSION);
19186 +       assert("vpf-1928", spin_inode_is_locked(inode));
19187 +
19188 +       /* ->extmask is __u64: build the mask from a 64-bit one, not an int */
19189 +       reiser4_inode_data(inode)->extmask &= ~((__u64)1 << ext);
19190 +       /* force re-calculation of stat-data length on next call to
19191 +          update_sd(). */
19192 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
19193 +}
19194 +
19195 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
19196 +{
19197 +       assert("edward-1287", inode != NULL);
19198 +       /* when dscale_fit() fails, stop trusting the cached sd length */
19199 +       if (!dscale_fit(old, new))
19200 +               reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
19201 +}
19202 +
19203 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
19204 +{
19205 +       assert("nikita-2875", inode != NULL);
19206 +       spin_lock_inode(inode);
19207 +       inode_check_scale_nolock(inode, old, new);
19208 +       spin_unlock_inode(inode);
19209 +}
19210 +
19211 +/*
19212 + * Initialize ->ordering of @inode, which defines how its stat-data and body
19213 + * sort among objects of the same parent directory. On @create the key comes
19214 + * from the parent's build_entry_key() for @crd's dentry, else from ->sd_coord.
19215 + */
19216 +void
19217 +init_inode_ordering(struct inode *inode,
19218 +                   reiser4_object_create_data * crd, int create)
19219 +{
19220 +       reiser4_key key;
19221 +
19222 +       if (create) {
19223 +               struct inode *parent;
19224 +
19225 +               parent = crd->parent;
19226 +               assert("nikita-3224", inode_dir_plugin(parent) != NULL);
19227 +               inode_dir_plugin(parent)->build_entry_key(parent,
19228 +                                                         &crd->dentry->d_name,
19229 +                                                         &key);
19230 +       } else {
19231 +               coord_t *coord;
19232 +
19233 +               coord = &reiser4_inode_data(inode)->sd_coord;
19234 +               coord_clear_iplug(coord);
19235 +               /* safe to use ->sd_coord: node is under long term lock.
19236 +                * NOTE(review): WITH_DATA() result is ignored - confirm */
19237 +               WITH_DATA(coord->node, item_key_by_coord(coord, &key));
19238 +       }
19239 +
19240 +       set_inode_ordering(inode, get_key_ordering(&key));
19241 +}
19242 +
19243 +znode *inode_get_vroot(struct inode *inode)
19244 +{
19245 +       reiser4_block_nr vroot_blk;
19246 +
19247 +       /* take a snapshot of ->vroot under the inode guard */
19248 +       spin_lock_inode(inode);
19249 +       vroot_blk = reiser4_inode_data(inode)->vroot;
19250 +       spin_unlock_inode(inode);
19251 +
19252 +       /* a vroot equal to UBER_TREE_ADDR yields no znode */
19253 +       if (disk_addr_eq(&UBER_TREE_ADDR, &vroot_blk))
19254 +               return NULL;
19255 +       return zlook(reiser4_tree_by_inode(inode), &vroot_blk);
19256 +}
19257 +
19258 +void inode_set_vroot(struct inode *inode, znode *vroot)
19259 +{
19260 +       spin_lock_inode(inode);
19261 +       reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
19262 +       spin_unlock_inode(inode);
19263 +}
19264 +
19265 +#if REISER4_DEBUG
19266 +
19267 +void reiser4_inode_invariant(const struct inode *inode)
19268 +{
19269 +       assert("nikita-3077", spin_inode_is_locked(inode));
19270 +}
19271 +
19272 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
19273 +{
19274 +       return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
19275 +               r4_inode->nr_jnodes == 0;
19276 +}
19277 +
19278 +#endif
19279 +
19280 +/* return 0 if @dir is empty (only contains dot and dotdot), otherwise
19281 +   RETERR(-ENOTEMPTY). FIXME: shouldn't it be dir plugin method? */
19282 +int is_dir_empty(const struct inode *dir)
19283 +{
19284 +       assert("nikita-1976", dir != NULL);
19285 +
19286 +       /* rely on our method to maintain directory i_size being equal to the
19287 +          number of entries. */
19288 +       return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
19289 +}
19290 +
19291 +/* Make Linus happy.
19292 +   Local variables:
19293 +   c-indentation-style: "K&R"
19294 +   mode-name: "LC"
19295 +   c-basic-offset: 8
19296 +   tab-width: 8
19297 +   fill-column: 120
19298 +   End:
19299 +*/
19300 diff -puN /dev/null fs/reiser4/inode.h
19301 --- /dev/null
19302 +++ a/fs/reiser4/inode.h
19303 @@ -0,0 +1,453 @@
19304 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
19305 +   reiser4/README */
19306 +
19307 +/* Inode functions. */
19308 +
19309 +#if !defined(__REISER4_INODE_H__)
19310 +#define __REISER4_INODE_H__
19311 +
19312 +#include "forward.h"
19313 +#include "debug.h"
19314 +#include "key.h"
19315 +#include "seal.h"
19316 +#include "plugin/plugin.h"
19317 +#include "plugin/file/cryptcompress.h"
19318 +#include "plugin/file/file.h"
19319 +#include "plugin/dir/dir.h"
19320 +#include "plugin/plugin_set.h"
19321 +#include "plugin/security/perm.h"
19322 +#include "vfs_ops.h"
19323 +#include "jnode.h"
19324 +#include "fsdata.h"
19325 +
19326 +#include <linux/types.h>       /* for __u?? , ino_t */
19327 +#include <linux/fs.h>          /* for struct super_block, struct
19328 +                                * rw_semaphore, etc  */
19329 +#include <linux/spinlock.h>
19330 +#include <asm/types.h>
19331 +
19332 +/* reiser4-specific inode flags. They are "transient" and are not
19333 +   supposed to be stored on disk. Used to track the "state" of an
19334 +   inode; values are kept in reiser4_inode->flags.
19335 +*/
19336 +typedef enum {
19337 +       /* this is light-weight inode, inheriting some state from its
19338 +          parent  */
19339 +       REISER4_LIGHT_WEIGHT = 0,
19340 +       /* stat data wasn't yet created */
19341 +       REISER4_NO_SD = 1,
19342 +       /* internal immutable flag. Currently is only used
19343 +          to avoid race condition during file creation.
19344 +          See comment in create_object(). */
19345 +       REISER4_IMMUTABLE = 2,
19346 +       /* inode was read from storage */
19347 +       REISER4_LOADED = 3,
19348 +       /* this bit is set for symlinks. inode->i_private points to target
19349 +          name of symlink. */
19350 +       REISER4_GENERIC_PTR_USED = 4,
19351 +       /* set if size of stat-data item for this inode is known. If this is
19352 +        * set we can avoid recalculating size of stat-data on each update. */
19353 +       REISER4_SDLEN_KNOWN = 5,
19354 +       /* reiser4_inode->crypt points to the crypto stat */
19355 +       REISER4_CRYPTO_STAT_LOADED = 6,
19356 +       /* cryptcompress_inode_data points to the secret key */
19357 +       REISER4_SECRET_KEY_INSTALLED = 7,
19358 +       /* File (possibly) has pages corresponding to the tail items, that
19359 +        * were created by ->readpage. It is set by mmap_unix_file() and
19360 +        * sendfile_unix_file(). This bit is inspected by write_unix_file and
19361 +        * kill-hook of tail items. It is never cleared once set. This bit is
19362 +        * modified and inspected under i_mutex. */
19363 +       REISER4_HAS_MMAP = 8,
19364 +       REISER4_PART_MIXED = 9,
19365 +       REISER4_PART_IN_CONV = 10,
19366 +       /* This flag indicates that file plugin conversion is in progress */
19367 +       REISER4_FILE_CONV_IN_PROGRESS = 11
19368 +} reiser4_file_plugin_flags;
19369 +
19370 +/* state associated with each inode.
19371 +   reiser4 inode.
19372 +
19373 +   NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
19374 +   be of the same size. File-system allocates inodes by itself through
19375 +   s_op->allocate_inode() method. So, it is possible to adjust size of inode
19376 +   at the time of its creation.
19377 +
19378 +   Invariants involving parts of this data-type:
19379 +
19380 +      [inode->eflushed]
19381 +
19382 +*/
19383 +
19384 +typedef struct reiser4_inode reiser4_inode;
19385 +/* return pointer to reiser4-specific part of inode */
19386 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
19387 +                                               /* inode queried */ );
19388 +
19389 +#if BITS_PER_LONG == 64
19390 +
19391 +#define REISER4_INO_IS_OID (1)
19392 +typedef struct {;
19393 +} oid_hi_t;
19394 +
19395 +/* BITS_PER_LONG == 64 */
19396 +#else
19397 +
19398 +#define REISER4_INO_IS_OID (0)
19399 +typedef __u32 oid_hi_t;
19400 +
19401 +/* BITS_PER_LONG == 64 */
19402 +#endif
19403 +
19404 +struct reiser4_inode {
19405 +       /* spin lock protecting fields of this structure. */
19406 +       spinlock_t guard;
19407 +       /* main plugin set that controls the file
19408 +          (see comments in plugin/plugin_set.c) */
19409 +       plugin_set *pset;
19410 +       /* plugin set for inheritance
19411 +          (see comments in plugin/plugin_set.c) */
19412 +       plugin_set *hset;
19413 +       /* high 32 bits of object id (empty when BITS_PER_LONG == 64) */
19414 +       oid_hi_t oid_hi;
19415 +       /* seal for stat-data */
19416 +       seal_t sd_seal;
19417 +       /* locality id for this file */
19418 +       oid_t locality_id;
19419 +#if REISER4_LARGE_KEY
19420 +       __u64 ordering;
19421 +#endif
19422 +       /* coord of stat-data in sealed node */
19423 +       coord_t sd_coord;
19424 +       /* bit-mask of stat-data extensions used by this file */
19425 +       __u64 extmask;
19426 +       /* bitmask of non-default plugins for this inode */
19427 +       __u16 plugin_mask;
19428 +       /* bitmask of set heir plugins for this inode. */
19429 +       __u16 heir_mask;
19430 +       union {
19431 +               struct list_head readdir_list;
19432 +               struct list_head not_used;
19433 +       } lists;
19434 +       /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
19435 +       unsigned long flags;
19436 +       union {
19437 +               /* fields specific to unix_file plugin */
19438 +               struct unix_file_info unix_file_info;
19439 +               /* fields specific to cryptcompress file plugin */
19440 +               struct cryptcompress_info cryptcompress_info;
19441 +       } file_plugin_data;
19442 +
19443 +       /* this semaphore is to serialize readers and writers of @pset->file
19444 +        * when file plugin conversion is enabled
19445 +        */
19446 +       struct rw_semaphore conv_sem;
19447 +
19448 +       /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
19449 +          tagged in that tree by EFLUSH_TAG_ANONYMOUS */
19450 +       struct radix_tree_root jnodes_tree;
19451 +#if REISER4_DEBUG
19452 +       /* number of unformatted node jnodes of this file in jnode hash table */
19453 +       unsigned long nr_jnodes;
19454 +#endif
19455 +
19456 +       /* block number of virtual root for this object. See comment above
19457 +        * fs/reiser4/search.c:handle_vroot() */
19458 +       reiser4_block_nr vroot;
19459 +       struct mutex loading;   /* set up by loading_init_once() */
19460 +};
19461 +
19462 +void loading_init_once(reiser4_inode *);
19463 +void loading_alloc(reiser4_inode *);
19464 +void loading_destroy(reiser4_inode *);
19465 +
19466 +struct reiser4_inode_object {
19467 +       /* private part */
19468 +       reiser4_inode p;
19469 +       /* generic fields not specific to reiser4, but used by VFS */
19470 +       struct inode vfs_inode;
19471 +};
19472 +
19473 +/* return pointer to the reiser4 specific portion of @inode */
19474 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
19475 +                                               /* inode queried */ )
19476 +{
19477 +       assert("nikita-254", inode != NULL);
19478 +       return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
19479 +}
19480 +
19481 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
19482 +                                                  r4_inode /* inode queried */
19483 +                                                  )
19484 +{
19485 +       return &container_of(r4_inode, struct reiser4_inode_object,
19486 +                            p)->vfs_inode;
19487 +}
19488 +
19489 +/*
19490 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
19491 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
19492 + * bits.
19493 + *
19494 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
19495 + * of inode, otherwise whole oid is stored in i_ino.
19496 + *
19497 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
19498 + */
19499 +
19500 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
19501 +
19502 +#if REISER4_INO_IS_OID
19503 +
19504 +static inline oid_t get_inode_oid(const struct inode *inode)
19505 +{
19506 +       return inode->i_ino;
19507 +}
19508 +
19509 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
19510 +{
19511 +       inode->i_ino = oid;
19512 +}
19513 +
19514 +/* REISER4_INO_IS_OID */
19515 +#else
19516 +
19517 +static inline oid_t get_inode_oid(const struct inode *inode)
19518 +{
19519 +       return
19520 +           ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
19521 +           inode->i_ino;
19522 +}
19523 +
19524 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
19525 +{
19526 +       assert("nikita-2519", inode != NULL);
19527 +       inode->i_ino = (ino_t) (oid);
19528 +       reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
19529 +       assert("nikita-2521", get_inode_oid(inode) == (oid));
19530 +}
19531 +
19532 +/* REISER4_INO_IS_OID */
19533 +#endif
19534 +
19535 +static inline oid_t get_inode_locality(const struct inode *inode)
19536 +{
19537 +       return reiser4_inode_data(inode)->locality_id;
19538 +}
19539 +
19540 +#if REISER4_LARGE_KEY
19541 +static inline __u64 get_inode_ordering(const struct inode *inode)
19542 +{
19543 +       return reiser4_inode_data(inode)->ordering;
19544 +}
19545 +
19546 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
19547 +{
19548 +       reiser4_inode_data(inode)->ordering = ordering;
19549 +}
19550 +
19551 +#else
19552 +
19553 +#define get_inode_ordering(inode) (0)
19554 +#define set_inode_ordering(inode, val) noop
19555 +
19556 +#endif
19557 +
19558 +/* return inode in which @uf_info is embedded */
19559 +static inline struct inode *
19560 +unix_file_info_to_inode(const struct unix_file_info *uf_info)
19561 +{
19562 +       return &container_of(uf_info, struct reiser4_inode_object,
19563 +                            p.file_plugin_data.unix_file_info)->vfs_inode;
19564 +}
19565 +
19566 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
19567 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
19568 +
19569 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
19570 +
19571 +#if REISER4_DEBUG
19572 +extern void reiser4_inode_invariant(const struct inode *inode);
19573 +extern int inode_has_no_jnodes(reiser4_inode *);
19574 +#else
19575 +#define reiser4_inode_invariant(inode) noop
19576 +#endif
19577 +
19578 +static inline int spin_inode_is_locked(const struct inode *inode)
19579 +{
19580 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
19581 +       return 1;
19582 +}
19583 +
19584 +/**
19585 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
19586 + * @inode: inode to lock
19587 + *
19588 + * In debug mode it checks that lower priority locks are not held and
19589 + * increments reiser4_context's lock counters on which lock ordering checking
19590 + * is based.
19591 + */
19592 +static inline void spin_lock_inode(struct inode *inode)
19593 +{
19594 +       assert("", LOCK_CNT_NIL(spin_locked));
19595 +       /* check lock ordering: d_lock must not be held at this point */
19596 +       assert_spin_not_locked(&d_lock);
19597 +
19598 +       spin_lock(&reiser4_inode_data(inode)->guard);
19599 +
19600 +       LOCK_CNT_INC(spin_locked_inode);
19601 +       LOCK_CNT_INC(spin_locked);
19602 +
19603 +       reiser4_inode_invariant(inode);
19604 +}
19605 +
19606 +/**
19607 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
19608 + * @inode: inode to unlock
19609 + *
19610 + * In debug mode it checks that spinlock is held and decrements
19611 + * reiser4_context's lock counters on which lock ordering checking is based.
19612 + */
19613 +static inline void spin_unlock_inode(struct inode *inode)
19614 +{
19615 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
19616 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
19617 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
19618 +
19619 +       reiser4_inode_invariant(inode);
19620 +
19621 +       LOCK_CNT_DEC(spin_locked_inode);
19622 +       LOCK_CNT_DEC(spin_locked);
19623 +
19624 +       spin_unlock(&reiser4_inode_data(inode)->guard);
19625 +}
19626 +
19627 +extern znode *inode_get_vroot(struct inode *inode);
19628 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
19629 +
19630 +extern int reiser4_max_filename_len(const struct inode *inode);
19631 +extern int max_hash_collisions(const struct inode *dir);
19632 +extern void reiser4_unlock_inode(struct inode *inode);
19633 +extern int is_reiser4_inode(const struct inode *inode);
19634 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
19635 +extern struct inode *reiser4_iget(struct super_block *super,
19636 +                                 const reiser4_key * key, int silent);
19637 +extern void reiser4_iget_complete(struct inode *inode);
19638 +extern void reiser4_inode_set_flag(struct inode *inode,
19639 +                                  reiser4_file_plugin_flags f);
19640 +extern void reiser4_inode_clr_flag(struct inode *inode,
19641 +                                  reiser4_file_plugin_flags f);
19642 +extern int reiser4_inode_get_flag(const struct inode *inode,
19643 +                                 reiser4_file_plugin_flags f);
19644 +
19645 +/*  has inode been initialized? */
19646 +static inline int
19647 +is_inode_loaded(const struct inode *inode/* inode queried */)
19648 +{
19649 +       assert("nikita-1120", inode != NULL);
19650 +       return reiser4_inode_get_flag(inode, REISER4_LOADED);
19651 +}
19652 +
19653 +extern file_plugin *inode_file_plugin(const struct inode *inode);
19654 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
19655 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
19656 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
19657 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
19658 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
19659 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
19660 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
19661 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
19662 +                                                             *inode);
19663 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
19664 +extern file_plugin *inode_create_plugin(const struct inode *inode);
19665 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
19666 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
19667 +extern file_plugin *child_create_plugin(const struct inode *inode);
19668 +
19669 +extern void reiser4_make_bad_inode(struct inode *inode);
19670 +
19671 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
19672 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
19673 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
19674 +extern void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new);
19675 +
19676 +#define INODE_SET_SIZE(i, value)                       \
19677 +({                                                     \
19678 +       struct inode *__i;                              \
19679 +       typeof(value) __v;                              \
19680 +                                                       \
19681 +       __i = (i);                                      \
19682 +       __v = (value);                                  \
19683 +       inode_check_scale(__i, __i->i_size, __v);       \
19684 +       i_size_write(__i, __v);                         \
19685 +})
19686 +
19687 +/*
19688 + * update field @field in inode @i to contain value @value.
19689 + */
19690 +#define INODE_SET_FIELD(i, field, value)               \
19691 +({                                                     \
19692 +       struct inode *__i;                              \
19693 +       typeof(value) __v;                              \
19694 +                                                       \
19695 +       __i = (i);                                      \
19696 +       __v = (value);                                  \
19697 +       inode_check_scale(__i, __i->field, __v);        \
19698 +       __i->field = __v;                               \
19699 +})
19700 +
19701 +#define INODE_INC_FIELD(i, field)                              \
19702 +({                                                             \
19703 +       struct inode *__i;                                      \
19704 +                                                               \
19705 +       __i = (i);                                              \
19706 +       inode_check_scale(__i, __i->field, __i->field + 1);     \
19707 +       ++ __i->field;                                          \
19708 +})
19709 +
19710 +#define INODE_DEC_FIELD(i, field)                              \
19711 +({                                                             \
19712 +       struct inode *__i;                                      \
19713 +                                                               \
19714 +       __i = (i);                                              \
19715 +       inode_check_scale(__i, __i->field, __i->field - 1);     \
19716 +       -- __i->field;                                          \
19717 +})
19718 +
19719 +/* See comment before reiser4_readdir_common() for description. */
19720 +static inline struct list_head *get_readdir_list(const struct inode *inode)
19721 +{
19722 +       return &reiser4_inode_data(inode)->lists.readdir_list;
19723 +}
19724 +
19725 +extern void init_inode_ordering(struct inode *inode,
19726 +                               reiser4_object_create_data * crd, int create);
19727 +
19728 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
19729 +{
19730 +       return &reiser4_inode_data(inode)->jnodes_tree;
19731 +}
19732 +
19733 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
19734 +                                                                 *r4_inode)
19735 +{
19736 +       return &r4_inode->jnodes_tree;
19737 +}
19738 +
19739 +#if REISER4_DEBUG
19740 +extern void print_inode(const char *prefix, const struct inode *i);
19741 +#endif
19742 +
19743 +int is_dir_empty(const struct inode *);
19744 +
19745 +/* __REISER4_INODE_H__ */
19746 +#endif
19747 +
19748 +/* Make Linus happy.
19749 +   Local variables:
19750 +   c-indentation-style: "K&R"
19751 +   mode-name: "LC"
19752 +   c-basic-offset: 8
19753 +   tab-width: 8
19754 +   fill-column: 120
19755 +   End:
19756 +*/
19757 diff -puN /dev/null fs/reiser4/ioctl.h
19758 --- /dev/null
19759 +++ a/fs/reiser4/ioctl.h
19760 @@ -0,0 +1,41 @@
19761 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
19762 + * reiser4/README */
19763 +
19764 +#if !defined(__REISER4_IOCTL_H__)
19765 +#define __REISER4_IOCTL_H__
19766 +
19767 +#include <linux/fs.h>
19768 +
19769 +/*
19770 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
19771 + * extents and fix it in this state. This is used by applications that rely on
19772 + *
19773 + *     . files being block aligned, and
19774 + *
19775 + *     . files never migrating on disk
19776 + *
19777 + * for example, boot loaders (LILO) need this.
19778 + *
19779 + * This ioctl should be used as
19780 + *
19781 + *     result = ioctl(fd, REISER4_IOC_UNPACK);
19782 + *
19783 + * File behind fd descriptor will be converted to the extents (if necessary),
19784 + * and its stat-data will be updated so that it will never be converted back
19785 + * into tails again.
19786 + */
19787 +#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long)
19788 +
19789 +/* __REISER4_IOCTL_H__ */
19790 +#endif
19791 +
19792 +/* Make Linus happy.
19793 +   Local variables:
19794 +   c-indentation-style: "K&R"
19795 +   mode-name: "LC"
19796 +   c-basic-offset: 8
19797 +   tab-width: 8
19798 +   fill-column: 120
19799 +   scroll-step: 1
19800 +   End:
19801 +*/
19802 diff -puN /dev/null fs/reiser4/jnode.c
19803 --- /dev/null
19804 +++ a/fs/reiser4/jnode.c
19805 @@ -0,0 +1,1923 @@
19806 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
19807 + * reiser4/README */
19808 +/* Jnode manipulation functions. */
19809 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
19810 +
19811 +   In particular, jnodes are used to track transactional information
19812 +   associated with each block. Each znode contains jnode as ->zjnode field.
19813 +
19814 +   Jnode stands for either Josh or Journal node.
19815 +*/
19816 +
19817 +/*
19818 + * Taxonomy.
19819 + *
19820 + *     Jnode represents block containing data or meta-data. There are jnodes
19821 + *     for:
19822 + *
19823 + *         unformatted blocks (jnodes proper). There are plans, however to
19824 + *         have a handle per extent unit rather than per each unformatted
19825 + *         block, because there are so many of them.
19826 + *
19827 + *         For bitmaps. Each bitmap is actually represented by two jnodes--one
19828 + *         for working and another for "commit" data, together forming bnode.
19829 + *
19830 + *         For io-heads. These are used by log writer.
19831 + *
19832 + *         For formatted nodes (znode). See comment at the top of znode.c for
19833 + *         details specific to the formatted nodes (znodes).
19834 + *
19835 + * Node data.
19836 + *
19837 + *     Jnode provides access to the data of node it represents. Data are
19838 + *     stored in a page. Page is kept in a page cache. This means, that jnodes
19839 + *     are highly interconnected with page cache and VM internals.
19840 + *
19841 + *     jnode has a pointer to page (->pg) containing its data. Pointer to data
19842 + *     themselves is cached in ->data field to avoid frequent calls to
19843 + *     page_address().
19844 + *
19845 + *     jnode and page are attached to each other by jnode_attach_page(). This
19846 + *     function places pointer to jnode in set_page_private(), sets PG_private
19847 + *     flag and increments page counter.
19848 + *
19849 + *     Opposite operation is performed by page_clear_jnode().
19850 + *
19851 + *     jnode->pg is protected by jnode spin lock, and page->private is
19852 + *     protected by page lock. See comment at the top of page_cache.c for
19853 + *     more.
19854 + *
19855 + *     page can be detached from jnode for two reasons:
19856 + *
19857 + *         . jnode is removed from a tree (file is truncated, or formatted
19858 + *         node is removed by balancing).
19859 + *
19860 + *         . during memory pressure, VM calls ->releasepage() method
19861 + *         (reiser4_releasepage()) to evict page from memory.
19862 + *
19863 + *    (there, of course, is also umount, but this is special case we are not
19864 + *    concerned with here).
19865 + *
19866 + *    To protect jnode page from eviction, one calls jload() function that
19867 + *    "pins" page in memory (loading it if necessary), increments
19868 + *    jnode->d_count, and kmap()s page. Page is unpinned through call to
19869 + *    jrelse().
19870 + *
19871 + * Jnode life cycle.
19872 + *
19873 + *    jnode is created, placed in hash table, and, optionally, in per-inode
19874 + *    radix tree. Page can be attached to jnode, pinned, released, etc.
19875 + *
19876 + *    When jnode is captured into atom its reference counter is
19877 + *    increased. While being part of an atom, jnode can be "early
19878 + *    flushed". This means that as part of flush procedure, jnode is placed
19879 + *    into "relocate set", and its page is submitted to the disk. After io
19880 + *    completes, page can be detached, then loaded again, re-dirtied, etc.
19881 + *
19882 + *    Thread acquires reference to jnode by calling jref() and releases it by
19883 + *    jput(). When last reference is removed, jnode is still retained in
19884 + *    memory (cached) if it has page attached, _unless_ it is scheduled for
19885 + *    destruction (has JNODE_HEARD_BANSHEE bit set).
19886 + *
19887 + *    Tree read-write lock was used as "existential" lock for jnodes. That is,
19888 + *    jnode->x_count could be changed from 0 to 1 only under tree write lock,
19889 + *    that is, tree lock protected unreferenced jnodes stored in the hash
19890 + *    table, from recycling.
19891 + *
19892 + *    This resulted in high contention on tree lock, because jref()/jput() is
19893 + *    frequent operation. To ameliorate this problem, RCU is used: when jput()
19894 + *    is just about to release last reference on jnode it sets JNODE_RIP bit
19895 + *    on it, and then proceed with jnode destruction (removing jnode from hash
19896 + *    table, cbk_cache, detaching page, etc.). All places that change jnode
19897 + *    reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
19898 + *    cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
19899 + *    jnode_rip_check() function), and pretend that nothing was found in hash
19900 + *    table if bit is set.
19901 + *
19902 + *    jput defers actual return of jnode into slab cache to some later time
19903 + *    (by call_rcu()), this guarantees that other threads can safely continue
19904 + *    working with JNODE_RIP-ped jnode.
19905 + *
19906 + */
19907 +
19908 +#include "reiser4.h"
19909 +#include "debug.h"
19910 +#include "dformat.h"
19911 +#include "jnode.h"
19912 +#include "plugin/plugin_header.h"
19913 +#include "plugin/plugin.h"
19914 +#include "txnmgr.h"
19915 +/*#include "jnode.h"*/
19916 +#include "znode.h"
19917 +#include "tree.h"
19918 +#include "tree_walk.h"
19919 +#include "super.h"
19920 +#include "inode.h"
19921 +#include "page_cache.h"
19922 +
19923 +#include <asm/uaccess.h>       /* UML needs this for PAGE_OFFSET */
19924 +#include <linux/types.h>
19925 +#include <linux/slab.h>
19926 +#include <linux/pagemap.h>
19927 +#include <linux/swap.h>
19928 +#include <linux/fs.h>          /* for struct address_space  */
19929 +#include <linux/writeback.h>   /* for inode_lock */
19930 +
19931 +static struct kmem_cache *_jnode_slab = NULL;
19932 +
19933 +static void jnode_set_type(jnode * node, jnode_type type);
19934 +static int jdelete(jnode * node);
19935 +static int jnode_try_drop(jnode * node);
19936 +
19937 +#if REISER4_DEBUG
19938 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
19939 +#endif
19940 +
19941 +/* true if valid page is attached to jnode */
19942 +static inline int jnode_is_parsed(jnode * node)
19943 +{
19944 +       return JF_ISSET(node, JNODE_PARSED);
19945 +}
19946 +
19947 +/* hash table support */
19948 +
19949 +/* compare two jnode keys for equality. Used by hash-table macros */
19950 +static inline int jnode_key_eq(const struct jnode_key *k1,
19951 +                              const struct jnode_key *k2)
19952 +{
19953 +       assert("nikita-2350", k1 != NULL);
19954 +       assert("nikita-2351", k2 != NULL);
19955 +
19956 +       return (k1->index == k2->index && k1->objectid == k2->objectid);
19957 +}
19958 +
19959 +/* Hash jnode by its key (objectid plus index). Used by hash-table macros */
19960 +static inline __u32 jnode_key_hashfn(j_hash_table * table,
19961 +                                    const struct jnode_key *key)
19962 +{
19963 +       assert("nikita-2352", key != NULL);
19964 +       assert("nikita-3346", IS_POW(table->_buckets));
19965 +
19966 +       /* remarkably simple (if not stupid) hash; mask needs 2^n ->_buckets */
19967 +       return (key->objectid + key->index) & (table->_buckets - 1);
19968 +}
19969 +
19970 +/* The hash table definition */
19971 +#define KMALLOC(size) reiser4_vmalloc(size)
19972 +#define KFREE(ptr, size) vfree(ptr)
19973 +TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
19974 +                     jnode_key_hashfn, jnode_key_eq);
19975 +#undef KFREE
19976 +#undef KMALLOC
19977 +
19978 +/* call this to initialise jnode hash table */
19979 +int jnodes_tree_init(reiser4_tree * tree/* tree to initialise jnodes for */)
19980 +{
19981 +       assert("nikita-2359", tree != NULL);
19982 +       return j_hash_init(&tree->jhash_table, 16384);
19983 +}
19984 +
19985 +/* call this to destroy jnode hash table. This is called during umount. */
19986 +int jnodes_tree_done(reiser4_tree * tree/* tree to destroy jnodes for */)
19987 +{
19988 +       j_hash_table *jtable;
19989 +       jnode *node;
19990 +       jnode *next;
19991 +
19992 +       assert("nikita-2360", tree != NULL);
19993 +
19994 +       /*
19995 +        * Scan hash table and free all jnodes.
19996 +        */
19997 +       jtable = &tree->jhash_table;
19998 +       if (jtable->_table) {
19999 +               for_all_in_htable(jtable, j, node, next) {
20000 +                       assert("nikita-2361", !atomic_read(&node->x_count));
20001 +                       jdrop(node);
20002 +               }
20003 +
20004 +               j_hash_done(&tree->jhash_table);
20005 +       }
20006 +       return 0;
20007 +}
20008 +
20009 +/**
20010 + * init_jnodes - create jnode cache
20011 + *
20012 + * Initializes slab cache jnodes. It is part of reiser4 module initialization.
20013 + */
20014 +int init_jnodes(void)
20015 +{
20016 +       assert("umka-168", _jnode_slab == NULL);
20017 +
20018 +       _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
20019 +                                       SLAB_HWCACHE_ALIGN |
20020 +                                       SLAB_RECLAIM_ACCOUNT, NULL);
20021 +       if (_jnode_slab == NULL)
20022 +               return RETERR(-ENOMEM);
20023 +
20024 +       return 0;
20025 +}
20026 +
20027 +/**
20028 + * done_jnodes - delete jnode cache
20029 + *
20030 + * This is called on reiser4 module unloading or system shutdown.
20031 + */
20032 +void done_jnodes(void)
20033 +{
20034 +       destroy_reiser4_cache(&_jnode_slab);
20035 +}
20036 +
20037 +/* Initialize a jnode. */
20038 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
20039 +{
20040 +       assert("umka-175", node != NULL);
20041 +
20042 +       memset(node, 0, sizeof(jnode));
20043 +       ON_DEBUG(node->magic = JMAGIC);
20044 +       jnode_set_type(node, type);
20045 +       atomic_set(&node->d_count, 0);
20046 +       atomic_set(&node->x_count, 0);
20047 +       spin_lock_init(&node->guard);
20048 +       spin_lock_init(&node->load);
20049 +       node->atom = NULL;
20050 +       node->tree = tree;
20051 +       INIT_LIST_HEAD(&node->capture_link);
20052 +
20053 +       ASSIGN_NODE_LIST(node, NOT_CAPTURED);
20054 +
20055 +       INIT_RCU_HEAD(&node->rcu);
20056 +
20057 +#if REISER4_DEBUG
20058 +       {
20059 +               reiser4_super_info_data *sbinfo;
20060 +
20061 +               sbinfo = get_super_private(tree->super);
20062 +               spin_lock_irq(&sbinfo->all_guard);
20063 +               list_add(&node->jnodes, &sbinfo->all_jnodes);
20064 +               spin_unlock_irq(&sbinfo->all_guard);
20065 +       }
20066 +#endif
20067 +}
20068 +
20069 +#if REISER4_DEBUG
20070 +/*
20071 + * Remove jnode from ->all_jnodes list.
20072 + */
20073 +static void jnode_done(jnode * node, reiser4_tree * tree)
20074 +{
20075 +       reiser4_super_info_data *sbinfo;
20076 +
20077 +       sbinfo = get_super_private(tree->super);
20078 +
20079 +       spin_lock_irq(&sbinfo->all_guard);
20080 +       assert("nikita-2422", !list_empty(&node->jnodes));
20081 +       list_del_init(&node->jnodes);
20082 +       spin_unlock_irq(&sbinfo->all_guard);
20083 +}
20084 +#endif
20085 +
20086 +/* return already existing jnode of page */
20087 +jnode *jnode_by_page(struct page *pg)
20088 +{
20089 +       assert("nikita-2066", pg != NULL);
20090 +       assert("nikita-2400", PageLocked(pg));
20091 +       assert("nikita-2068", PagePrivate(pg));
20092 +       assert("nikita-2067", jprivate(pg) != NULL);
20093 +       return jprivate(pg);
20094 +}
20095 +
20096 +/* exported functions to allocate/free jnode objects outside this file */
20097 +jnode *jalloc(void)
20098 +{
20099 +       jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
20100 +       return jal;
20101 +}
20102 +
20103 +/* return jnode back to the slab allocator */
20104 +inline void jfree(jnode * node)
20105 +{
20106 +       assert("zam-449", node != NULL);
20107 +
20108 +       assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
20109 +                              NODE_LIST(node) == NOT_CAPTURED));
20110 +       assert("nikita-3222", list_empty(&node->jnodes));
20111 +       assert("nikita-3221", jnode_page(node) == NULL);
20112 +
20113 +       /* not yet phash_jnode_destroy(node); */
20114 +
20115 +       kmem_cache_free(_jnode_slab, node);
20116 +}
20117 +
20118 +/*
20119 + * This function is supplied as RCU callback. It actually frees jnode when
20120 + * last reference to it is gone.
20121 + */
20122 +static void jnode_free_actor(struct rcu_head *head)
20123 +{
20124 +       jnode *node;
20125 +       jnode_type jtype;
20126 +
20127 +       node = container_of(head, jnode, rcu);
20128 +       jtype = jnode_get_type(node);
20129 +
20130 +       ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
20131 +
20132 +       switch (jtype) {
20133 +       case JNODE_IO_HEAD:
20134 +       case JNODE_BITMAP:
20135 +       case JNODE_UNFORMATTED_BLOCK:
20136 +               jfree(node);
20137 +               break;
20138 +       case JNODE_FORMATTED_BLOCK:
20139 +               zfree(JZNODE(node));
20140 +               break;
20141 +       case JNODE_INODE:
20142 +       default:
20143 +               wrong_return_value("nikita-3197", "Wrong jnode type");
20144 +       }
20145 +}
20146 +
20147 +/*
20148 + * Free a jnode. Post a callback to be executed later through RCU when all
20149 + * references to @node are released.
20150 + */
20151 +static inline void jnode_free(jnode * node, jnode_type jtype)
20152 +{
20153 +       if (jtype != JNODE_INODE) {
20154 +               /*assert("nikita-3219", list_empty(&node->rcu.list)); */
20155 +               call_rcu(&node->rcu, jnode_free_actor);
20156 +       } else
20157 +               jnode_list_remove(node);
20158 +}
20159 +
20160 +/* allocate new unformatted jnode */
20161 +static jnode *jnew_unformatted(void)
20162 +{
20163 +       jnode *jal;
20164 +
20165 +       jal = jalloc();
20166 +       if (jal == NULL)
20167 +               return NULL;
20168 +
20169 +       jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
20170 +       jal->key.j.mapping = NULL;
20171 +       jal->key.j.index = (unsigned long)-1;
20172 +       jal->key.j.objectid = 0;
20173 +       return jal;
20174 +}
20175 +
20176 +/* look up jnode with given objectid and index in @tree's jnode hash table */
20177 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
20178 +{
20179 +       struct jnode_key jkey;
20180 +       jnode *node;
20181 +
20182 +       assert("nikita-2353", tree != NULL);
20183 +       /* jkey.mapping stays unset: key hashing and comparison ignore it */
20184 +       jkey.objectid = objectid;
20185 +       jkey.index = index;
20186 +
20187 +       /*
20188 +        * hash table is _not_ protected by any lock during lookups. All we
20189 +        * have to do is to disable preemption to keep RCU happy.
20190 +        */
20191 +
20192 +       rcu_read_lock();
20193 +       node = j_hash_find(&tree->jhash_table, &jkey);
20194 +       if (node != NULL) {
20195 +               /* protect @node from recycling */
20196 +               jref(node);
20197 +               assert("nikita-2955", jnode_invariant(node, 0, 0));
20198 +               node = jnode_rip_check(tree, node);
20199 +       }
20200 +       rcu_read_unlock();
20201 +       return node;
20202 +}
20203 +
20204 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
20205 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
20206 +{
20207 +       assert("vs-1694", mapping->host != NULL);
20208 +
20209 +       return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
20210 +}
20211 +
20212 +jnode *jfind(struct address_space *mapping, unsigned long index)
20213 +{
20214 +       reiser4_tree *tree;
20215 +       jnode *node;
20216 +
20217 +       assert("vs-1694", mapping->host != NULL);
20218 +       tree = reiser4_tree_by_inode(mapping->host);
20219 +
20220 +       read_lock_tree(tree);
20221 +       node = jfind_nolock(mapping, index);
20222 +       if (node != NULL)
20223 +               jref(node);
20224 +       read_unlock_tree(tree);
20225 +       return node;
20226 +}
20227 +
20228 +static void inode_attach_jnode(jnode * node)
20229 +{
20230 +       struct inode *inode;
20231 +       reiser4_inode *info;
20232 +       struct radix_tree_root *rtree;
20233 +
20234 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
20235 +       assert("zam-1043", node->key.j.mapping != NULL);
20236 +       inode = node->key.j.mapping->host;
20237 +       info = reiser4_inode_data(inode);
20238 +       rtree = jnode_tree_by_reiser4_inode(info);
20239 +       if (rtree->rnode == NULL) {
20240 +               /* prevent inode from being pruned when it has jnodes attached
20241 +                  to it */
20242 +               spin_lock_irq(&inode->i_data.tree_lock);
20243 +               inode->i_data.nrpages++;
20244 +               spin_unlock_irq(&inode->i_data.tree_lock);
20245 +       }
20246 +       assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
20247 +       check_me("zam-1045",
20248 +                !radix_tree_insert(rtree, node->key.j.index, node));
20249 +       ON_DEBUG(info->nr_jnodes++);
20250 +}
20251 +
20252 +static void inode_detach_jnode(jnode * node)
20253 +{
20254 +       struct inode *inode;
20255 +       reiser4_inode *info;
20256 +       struct radix_tree_root *rtree;
20257 +
20258 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
20259 +       assert("zam-1044", node->key.j.mapping != NULL);
20260 +       inode = node->key.j.mapping->host;
20261 +       info = reiser4_inode_data(inode);
20262 +       rtree = jnode_tree_by_reiser4_inode(info);
20263 +
20264 +       assert("zam-1051", info->nr_jnodes != 0);
20265 +       assert("zam-1052", rtree->rnode != NULL);
20266 +       ON_DEBUG(info->nr_jnodes--);
20267 +
20268 +       /* delete jnode from inode's radix tree of jnodes */
20269 +       check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
20270 +       if (rtree->rnode == NULL) {
20271 +               /* inode can be pruned now */
20272 +               spin_lock_irq(&inode->i_data.tree_lock);
20273 +               inode->i_data.nrpages--;
20274 +               spin_unlock_irq(&inode->i_data.tree_lock);
20275 +       }
20276 +}
20277 +
20278 +/* put jnode into hash table (where they can be found by flush who does not know
20279 +   mapping) and to inode's tree of jnodes (where they can be found (hopefully
20280 +   faster) in places where mapping is known). Currently it is used by
20281 +   fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
20282 +   created */
20283 +static void
20284 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
20285 +                      unsigned long index)
20286 +{
20287 +       j_hash_table *jtable;
20288 +
20289 +       assert("vs-1446", jnode_is_unformatted(node));
20290 +       assert("vs-1442", node->key.j.mapping == 0);
20291 +       assert("vs-1443", node->key.j.objectid == 0);
20292 +       assert("vs-1444", node->key.j.index == (unsigned long)-1);
20293 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
20294 +
20295 +       node->key.j.mapping = mapping;
20296 +       node->key.j.objectid = get_inode_oid(mapping->host);
20297 +       node->key.j.index = index;
20298 +
20299 +       jtable = &jnode_get_tree(node)->jhash_table;
20300 +
20301 +       /* race with some other thread inserting jnode into the hash table is
20302 +        * impossible, because we keep the page lock. */
20303 +       /*
20304 +        * following assertion no longer holds because of RCU: it is possible
20305 +        * jnode is in the hash table, but with JNODE_RIP bit set.
20306 +        */
20307 +       /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
20308 +       j_hash_insert_rcu(jtable, node);
20309 +       inode_attach_jnode(node);
20310 +}
20311 +
20312 +static void unhash_unformatted_node_nolock(jnode * node)
20313 +{
20314 +       assert("vs-1683", node->key.j.mapping != NULL);
20315 +       assert("vs-1684",
20316 +              node->key.j.objectid ==
20317 +              get_inode_oid(node->key.j.mapping->host));
20318 +
20319 +       /* remove jnode from hash-table */
20320 +       j_hash_remove_rcu(&node->tree->jhash_table, node);
20321 +       inode_detach_jnode(node);
20322 +       node->key.j.mapping = NULL;
20323 +       node->key.j.index = (unsigned long)-1;
20324 +       node->key.j.objectid = 0;
20325 +
20326 +}
20327 +
20328 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
20329 +   reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
20330 +   reiser4_uncapture_jnode */
20331 +void unhash_unformatted_jnode(jnode * node)
20332 +{
20333 +       assert("vs-1445", jnode_is_unformatted(node));
20334 +
20335 +       write_lock_tree(node->tree);
20336 +       unhash_unformatted_node_nolock(node);
20337 +       write_unlock_tree(node->tree);
20338 +}
20339 +
20340 +/*
20341 + * Look up the jnode for (@mapping, @index) in the inode's radix tree of jnodes.
20342 + * If there is none, insert a freshly allocated jnode into the hash table and
20343 + * the radix tree. Returns a referenced jnode, or ERR_PTR() on failure.
20344 + */
20345 +static jnode *find_get_jnode(reiser4_tree * tree,
20346 +                            struct address_space *mapping,
20347 +                            oid_t oid, unsigned long index)
20348 +{
20349 +       jnode *result;
20350 +       jnode *shadow;
20351 +       int preload;
20352 +
20353 +       result = jnew_unformatted();
20354 +       if (unlikely(result == NULL))
20355 +               return ERR_PTR(RETERR(-ENOMEM));
20356 +
20357 +       preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
20358 +       if (preload != 0) {
20359 +               /* @result is neither hashed nor referenced: don't leak it */
20360 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
20361 +               return ERR_PTR(preload);
20362 +       }
20363 +
20364 +       write_lock_tree(tree);
20365 +       shadow = jfind_nolock(mapping, index);
20366 +       if (likely(shadow == NULL)) {
20367 +               /* add new jnode to hash table and inode's radix tree */
20368 +               jref(result);
20369 +               hash_unformatted_jnode(result, mapping, index);
20370 +       } else {
20371 +               /* jnode is found in inode's radix tree of jnodes */
20372 +               jref(shadow);
20373 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
20374 +               assert("vs-1498", shadow->key.j.mapping == mapping);
20375 +               result = shadow;
20376 +       }
20377 +       write_unlock_tree(tree);
20378 +       assert("nikita-2955",
20379 +              ergo(result != NULL, jnode_invariant(result, 0, 0)));
20380 +       radix_tree_preload_end();
20381 +       return result;
20382 +}
20383 +
20384 +/* jget() (a la zget(), but for unformatted nodes). Returns the jnode for the
20385 +   locked page @pg, creating, hashing and attaching one when needed; errors
20386 +   from find_get_jnode() are passed on. @tree is overwritten before use. */
20387 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
20388 +{
20389 +       /*
20390 +        * There are two ways to create a jnode: starting from a pre-existing
20391 +        * page, and without a page.
20392 +        *
20393 +        * When the page already exists, the jnode is created
20394 +        * (jnode_of_page()->do_jget()) under the page lock. This is done in
20395 +        * ->writepage(), or when capturing an anonymous page dirtied through
20396 +        * mmap.
20397 +        *
20398 +        * A jnode without a page is created by index_extent_jnode().
20399 +        * NOTE(review): @pg is dereferenced below before its NULL assertion.
20400 +        */
20401 +
20402 +       jnode *result;
20403 +       oid_t oid = get_inode_oid(pg->mapping->host);
20404 +
20405 +       assert("umka-176", pg != NULL);
20406 +       assert("nikita-2394", PageLocked(pg));
20407 +
20408 +       result = jprivate(pg);
20409 +       if (likely(result != NULL))
20410 +               return jref(result);
20411 +
20412 +       tree = reiser4_tree_by_page(pg);
20413 +
20414 +       /* page has no jnode attached yet: check the hash table first */
20415 +       result = jfind(pg->mapping, pg->index);
20416 +       if (unlikely(result != NULL)) {
20417 +               spin_lock_jnode(result);
20418 +               jnode_attach_page(result, pg);
20419 +               spin_unlock_jnode(result);
20420 +               result->key.j.mapping = pg->mapping;
20421 +               return result;
20422 +       }
20423 +
20424 +       /* since page is locked, jnode should be allocated with GFP_NOFS flag */
20425 +       reiser4_ctx_gfp_mask_force(GFP_NOFS);
20426 +       result = find_get_jnode(tree, pg->mapping, oid, pg->index);
20427 +       if (unlikely(IS_ERR(result)))
20428 +               return result;
20429 +       /* attach jnode to page (jnode spin lock is required for that) */
20430 +       spin_lock_jnode(result);
20431 +       jnode_attach_page(result, pg);
20432 +       spin_unlock_jnode(result);
20433 +       return result;
20434 +}
20435 +
20436 +/*
20437 + * Return the jnode for locked page @pg via do_jget(); may be an ERR_PTR().
20438 + */
20439 +jnode *jnode_of_page(struct page *pg)
20440 +{
20441 +       jnode *result;
20442 +
20443 +       assert("umka-176", pg != NULL);
20444 +       assert("nikita-2394", PageLocked(pg));
20445 +
20446 +       result = do_jget(reiser4_tree_by_page(pg), pg);
20447 +
20448 +       if (REISER4_DEBUG && !IS_ERR(result)) {
20449 +               assert("nikita-3210", result == jprivate(pg));
20450 +               assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
20451 +               if (jnode_is_unformatted(jprivate(pg))) {
20452 +                       assert("nikita-2364",
20453 +                              jprivate(pg)->key.j.index == pg->index);
20454 +                       assert("nikita-2367",
20455 +                              jprivate(pg)->key.j.mapping == pg->mapping);
20456 +                       assert("nikita-2365",
20457 +                              jprivate(pg)->key.j.objectid ==
20458 +                              get_inode_oid(pg->mapping->host));
20459 +                       assert("vs-1200",
20460 +                              jprivate(pg)->key.j.objectid ==
20461 +                              pg->mapping->host->i_ino);
20462 +                       assert("nikita-2356",
20463 +                              jnode_is_unformatted(jnode_by_page(pg)));
20464 +               }
20465 +               assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
20466 +       }
20467 +       return result;
20468 +}
20469 +
20470 +/* Attach locked page @pg to @node (jnode spin lock held): take a page
20471 + * reference, store @node in the page's private field and set node->pg. */
20472 +void jnode_attach_page(jnode * node, struct page *pg)
20473 +{
20474 +       assert("nikita-2060", node != NULL);
20475 +       assert("nikita-2061", pg != NULL);
20476 +
20477 +       assert("nikita-2050", jprivate(pg) == 0ul);
20478 +       assert("nikita-2393", !PagePrivate(pg));
20479 +       assert("vs-1741", node->pg == NULL);
20480 +
20481 +       assert("nikita-2396", PageLocked(pg));
20482 +       assert_spin_locked(&(node->guard));
20483 +
20484 +       page_cache_get(pg);
20485 +       set_page_private(pg, (unsigned long)node);
20486 +       node->pg = pg;
20487 +       SetPagePrivate(pg);
20488 +}
20489 +
20490 +/* Dual to jnode_attach_page(): unbind @page and @node, drop the page ref */
20491 +void page_clear_jnode(struct page *page, jnode * node)
20492 +{
20493 +       assert("nikita-2424", page != NULL);
20494 +       assert("nikita-2425", PageLocked(page));
20495 +       assert("nikita-2426", node != NULL);
20496 +       assert_spin_locked(&(node->guard));
20497 +       assert("nikita-2428", PagePrivate(page));
20498 +
20499 +       assert("nikita-3551", !PageWriteback(page));
20500 +
20501 +       JF_CLR(node, JNODE_PARSED);
20502 +       set_page_private(page, 0ul);
20503 +       ClearPagePrivate(page);
20504 +       node->pg = NULL;
20505 +       page_cache_release(page);
20506 +}
20507 +
20508 +#if 0
20509 +/* compiled out; it was only used in one place, to handle an error */
20510 +void
20511 +page_detach_jnode(struct page *page, struct address_space *mapping,
20512 +                 unsigned long index)
20513 +{
20514 +       assert("nikita-2395", page != NULL);
20515 +
20516 +       lock_page(page);
20517 +       if ((page->mapping == mapping) && (page->index == index)
20518 +           && PagePrivate(page)) {
20519 +               jnode *node;
20520 +
20521 +               node = jprivate(page);
20522 +               spin_lock_jnode(node);
20523 +               page_clear_jnode(page, node);
20524 +               spin_unlock_jnode(node);
20525 +       }
20526 +       unlock_page(page);
20527 +}
20528 +#endif  /*  0  */
20529 +
20530 +/* Return @node's page locked, or NULL if it has none. In both cases the
20531 +   jnode spin lock is held on return.
20532 +   Lock ordering requires taking the page lock first and the spin lock of the
20533 +   jnode attached to that page second. Sometimes it is necessary to go in the
20534 +   opposite direction; that is done here with a trylock-and-retry loop that
20535 +   waits for the page to be unlocked without holding the jnode spin lock.
20536 +*/
20537 +static struct page *jnode_lock_page(jnode * node)
20538 +{
20539 +       struct page *page;
20540 +
20541 +       assert("nikita-2052", node != NULL);
20542 +       assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
20543 +
20544 +       while (1) {
20545 +
20546 +               spin_lock_jnode(node);
20547 +               page = jnode_page(node);
20548 +               if (page == NULL)
20549 +                       break;
20550 +
20551 +               /* no need to page_cache_get( page ) here, because page cannot
20552 +                  be evicted from memory without detaching it from jnode and
20553 +                  this requires spin lock on jnode that we already hold.
20554 +                */
20555 +               if (trylock_page(page)) {
20556 +                       /* We won a lock on jnode page, proceed. */
20557 +                       break;
20558 +               }
20559 +
20560 +               /* Page is locked by someone else: pin it, then wait. */
20561 +               page_cache_get(page);
20562 +               spin_unlock_jnode(node);
20563 +               wait_on_page_locked(page);
20564 +               /* it is possible that page was detached from jnode and
20565 +                  returned to the free pool, or re-assigned while we were
20566 +                  waiting on locked bit. This will be rechecked on the next
20567 +                  loop iteration.
20568 +                */
20569 +               page_cache_release(page);
20570 +
20571 +               /* try again */
20572 +       }
20573 +       return page;
20574 +}
20575 +
20576 +/*
20577 + * If JNODE_PARSED is not set, call the jnode's ->parse() method under its spin
20578 + * lock and set the bit on success. Returns 0, or the error from ->parse().
20579 + */
20580 +static inline int jparse(jnode * node)
20581 +{
20582 +       int result;
20583 +
20584 +       assert("nikita-2466", node != NULL);
20585 +
20586 +       spin_lock_jnode(node);
20587 +       if (likely(!jnode_is_parsed(node))) {
20588 +               result = jnode_ops(node)->parse(node);
20589 +               if (likely(result == 0))
20590 +                       JF_SET(node, JNODE_PARSED);
20591 +       } else
20592 +               result = 0;
20593 +       spin_unlock_jnode(node);
20594 +       return result;
20595 +}
20596 +
20597 +/* Return @node's page locked, finding or creating it in the page cache and
20598 + * attaching it if @node had none. ERR_PTR(RETERR(-ENOMEM)) if that fails. */
20599 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
20600 +{
20601 +       struct page *page;
20602 +
20603 +       spin_lock_jnode(node);
20604 +       page = jnode_page(node);
20605 +
20606 +       if (page == NULL) {
20607 +               spin_unlock_jnode(node);
20608 +               page = find_or_create_page(jnode_get_mapping(node),
20609 +                                          jnode_get_index(node), gfp_flags);
20610 +               if (page == NULL)
20611 +                       return ERR_PTR(RETERR(-ENOMEM));
20612 +       } else {
20613 +               if (trylock_page(page)) {
20614 +                       spin_unlock_jnode(node);
20615 +                       return page;
20616 +               }
20617 +               page_cache_get(page);
20618 +               spin_unlock_jnode(node);
20619 +               lock_page(page);
20620 +               assert("nikita-3134", page->mapping == jnode_get_mapping(node));
20621 +       }
20622 +
20623 +       spin_lock_jnode(node);
20624 +       if (!jnode_page(node))
20625 +               jnode_attach_page(node, page);
20626 +       spin_unlock_jnode(node);
20627 +
20628 +       page_cache_release(page);
20629 +       assert("zam-894", jnode_page(node) == page);
20630 +       return page;
20631 +}
20632 +
20633 +/* Unlock an up-to-date @page and return 0; otherwise submit a READ for it. */
20634 +static int jnode_start_read(jnode * node, struct page *page)
20635 +{
20636 +       assert("zam-893", PageLocked(page));
20637 +
20638 +       if (PageUptodate(page)) {
20639 +               unlock_page(page);
20640 +               return 0;
20641 +       }
20642 +       return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
20643 +}
20644 +
20645 +#if REISER4_DEBUG
20646 +static void check_jload(jnode * node, struct page *page)
20647 +{
20648 +       if (jnode_is_znode(node)) {
20649 +               node40_header *nh;
20650 +               znode *z;
20651 +
20652 +               z = JZNODE(node);
20653 +               if (znode_is_any_locked(z)) {
20654 +                       nh = (node40_header *) kmap(page);
20655 +                       /* debugging aid: reads the page as a node40 header,
20656 +                        * so it only works for node40-only file systems. */
20657 +                       assert("nikita-3253",
20658 +                              z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
20659 +                       kunmap(page);
20660 +               }
20661 +               assert("nikita-3565", znode_invariant(z));
20662 +       }
20663 +}
20664 +#else
20665 +#define check_jload(node, page) noop
20666 +#endif
20667 +
20668 +/* Prefetch (for write) @node's x_count ahead of an upcoming jload(). Call
20669 + * this when you are going to call jload() shortly, so that the relevant part
20670 + * of the jnode is brought into the CPU cache. */
20671 +void jload_prefetch(jnode * node)
20672 +{
20673 +       prefetchw(&node->x_count);
20674 +}
20675 +
20676 +/* load jnode's data into memory; returns 0 on success or an error code */
20677 +int jload_gfp(jnode * node /* node to load */ ,
20678 +             gfp_t gfp_flags /* allocation flags */ ,
20679 +             int do_kmap/* true if page should be kmapped */)
20680 +{
20681 +       struct page *page;
20682 +       int result = 0;
20683 +       int parsed;
20684 +
20685 +       assert("nikita-3010", reiser4_schedulable());
20686 +
20687 +       prefetchw(&node->pg);
20688 +
20689 +       /* taking d-reference implies taking x-reference. */
20690 +       jref(node);
20691 +
20692 +       /*
20693 +        * acquiring d-reference to @jnode and check for JNODE_PARSED bit
20694 +        * should be atomic, otherwise there is a race against
20695 +        * reiser4_releasepage().
20696 +        */
20697 +       spin_lock(&(node->load));
20698 +       add_d_ref(node);
20699 +       parsed = jnode_is_parsed(node);
20700 +       spin_unlock(&(node->load));
20701 +
20702 +       if (unlikely(!parsed)) {
20703 +               page = jnode_get_page_locked(node, gfp_flags);
20704 +               if (unlikely(IS_ERR(page))) {
20705 +                       result = PTR_ERR(page);
20706 +                       goto failed;
20707 +               }
20708 +
20709 +               result = jnode_start_read(node, page);
20710 +               if (unlikely(result != 0))
20711 +                       goto failed;
20712 +
20713 +               wait_on_page_locked(page);
20714 +               if (unlikely(!PageUptodate(page))) {
20715 +                       result = RETERR(-EIO);
20716 +                       goto failed;
20717 +               }
20718 +
20719 +               if (do_kmap)
20720 +                       node->data = kmap(page);
20721 +
20722 +               result = jparse(node);
20723 +               if (unlikely(result != 0)) {
20724 +                       if (do_kmap)
20725 +                               kunmap(page);
20726 +                       goto failed;
20727 +               }
20728 +               check_jload(node, page);
20729 +       } else {
20730 +               page = jnode_page(node);
20731 +               check_jload(node, page);
20732 +               if (do_kmap)
20733 +                       node->data = kmap(page);
20734 +       }
20735 +
20736 +       if (!is_writeout_mode())
20737 +               /* We do not mark pages active if jload is called as a part of
20738 +                * jnode_flush() or reiser4_write_logs().  Both jnode_flush()
20739 +                * and write_logs() add no value to cached data, there is no
20740 +                * sense to mark pages as active when they go to disk, it just
20741 +                * confuses vm scanning routines because clean page could be
20742 +                * moved out from inactive list as a result of this
20743 +                * mark_page_accessed() call. */
20744 +               mark_page_accessed(page);
20745 +
20746 +       return 0;
20747 +
20748 +failed:
20749 +       jrelse_tail(node);      /* undo jref() and add_d_ref() above */
20750 +       return result;
20751 +
20752 +}
20753 +
20754 +/* Get @node's page locked and start reading it unless it is up to date. */
20755 +int jstartio(jnode * node)
20756 +{
20757 +       struct page *page;
20758 +
20759 +       page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
20760 +       if (IS_ERR(page))
20761 +               return PTR_ERR(page);
20762 +
20763 +       return jnode_start_read(node, page);
20764 +}
20765 +
20766 +/* Initialize a node by calling the appropriate plugin's ->init() instead of
20767 + * reading it from disk as in jload(). Returns 0 or an error code. */
20768 +int jinit_new(jnode * node, gfp_t gfp_flags)
20769 +{
20770 +       struct page *page;
20771 +       int result;
20772 +
20773 +       jref(node);
20774 +       add_d_ref(node);
20775 +
20776 +       page = jnode_get_page_locked(node, gfp_flags);
20777 +       if (IS_ERR(page)) {
20778 +               result = PTR_ERR(page);
20779 +               goto failed;
20780 +       }
20781 +
20782 +       SetPageUptodate(page);
20783 +       unlock_page(page);
20784 +
20785 +       node->data = kmap(page);
20786 +
20787 +       if (!jnode_is_parsed(node)) {
20788 +               jnode_plugin *jplug = jnode_ops(node);
20789 +               spin_lock_jnode(node);
20790 +               result = jplug->init(node);
20791 +               spin_unlock_jnode(node);
20792 +               if (result) {
20793 +                       kunmap(page);
20794 +                       goto failed;
20795 +               }
20796 +               JF_SET(node, JNODE_PARSED);
20797 +       }
20798 +
20799 +       return 0;
20800 +       /* no kmap is held here: use jrelse_tail(), as jload_gfp() does */
20801 +failed:
20802 +       jrelse_tail(node);
20803 +       return result;
20804 +}
20805 +
20806 +/* drop the d-reference and the x-reference taken by jload_gfp()/jinit_new() */
20807 +void jrelse_tail(jnode * node/* jnode to release references to */)
20808 +{
20809 +       assert("nikita-489", atomic_read(&node->d_count) > 0);
20810 +       atomic_dec(&node->d_count);
20811 +       /* release reference acquired in jload_gfp() or jinit_new() */
20812 +       jput(node);
20813 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
20814 +               LOCK_CNT_DEC(d_refs);
20815 +}
20816 +
20817 +/* drop reference to node data: kunmap() the page if one is attached, then
20818 +   release the d- and x-references through jrelse_tail(). */
20819 +void jrelse(jnode * node/* jnode to release references to */)
20820 +{
20821 +       struct page *page;
20822 +
20823 +       assert("nikita-487", node != NULL);
20824 +       assert_spin_not_locked(&(node->guard));
20825 +
20826 +       page = jnode_page(node);
20827 +       if (likely(page != NULL)) {
20828 +               /*
20829 +                * it is safe not to lock jnode here, because at this point
20830 +                * @node->d_count is greater than zero (if jrelse() is used
20831 +                * correctly, that is). JNODE_PARSED may not be set yet, if,
20832 +                * for example, we got here as a result of error handling path
20833 +                * in jload(). Anyway, page cannot be detached by
20834 +                * reiser4_releasepage(). truncate will invalidate page
20835 +                * regardless, but this should not be a problem.
20836 +                */
20837 +               kunmap(page);
20838 +       }
20839 +       jrelse_tail(node);
20840 +}
20841 +
20842 +/* called from jput_final() to wait for writeback of the jnode's page */
20843 +static void jnode_finish_io(jnode * node)
20844 +{
20845 +       struct page *page;
20846 +
20847 +       assert("nikita-2922", node != NULL);
20848 +
20849 +       spin_lock_jnode(node);
20850 +       page = jnode_page(node);
20851 +       if (page != NULL) {
20852 +               page_cache_get(page);
20853 +               spin_unlock_jnode(node);
20854 +               wait_on_page_writeback(page);
20855 +               page_cache_release(page);
20856 +       } else
20857 +               spin_unlock_jnode(node);
20858 +}
20859 +
20860 +/*
20861 + * Called by jput() when the last reference to a jnode is released. This is a
20862 + * separate function because the fast path of jput() should stay inline and,
20863 + * therefore, small. Every path here leaves via rcu_read_unlock().
20864 + */
20865 +void jput_final(jnode * node)
20866 +{
20867 +       int r_i_p;
20868 +
20869 +       /* A fast check for keeping node in cache. We always keep node in cache
20870 +        * if its page is present and node was not marked for deletion */
20871 +       if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
20872 +               rcu_read_unlock();
20873 +               return;
20874 +       }
20875 +       r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
20876 +       /*
20877 +        * if r_i_p is true, we were first to set JNODE_RIP on this node. In
20878 +        * this case it is safe to access node after unlock.
20879 +        */
20880 +       rcu_read_unlock();
20881 +       if (r_i_p) {
20882 +               jnode_finish_io(node);
20883 +               if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
20884 +                       /* node is removed from the tree. */
20885 +                       jdelete(node);
20886 +               else
20887 +                       jnode_try_drop(node);
20888 +       }
20889 +       /* if !r_i_p some other thread is already killing it */
20890 +}
20891 +
20892 +int jwait_io(jnode * node, int rw)
20893 +{
20894 +       struct page *page;
20895 +       int result;
20896 +
20897 +       assert("zam-447", node != NULL);
20898 +       assert("zam-448", jnode_page(node) != NULL);
20899 +
20900 +       page = jnode_page(node);
20901 +       /* READ: wait until the page is unlocked; WRITE: until writeback ends */
20902 +       result = 0;
20903 +       if (rw == READ) {
20904 +               wait_on_page_locked(page);
20905 +       } else {
20906 +               assert("nikita-2227", rw == WRITE);
20907 +               wait_on_page_writeback(page);
20908 +       }
20909 +       if (PageError(page))
20910 +               result = RETERR(-EIO);
20911 +
20912 +       return result;
20913 +}
20914 +
20915 +/*
20916 + * jnode types and plugins.
20917 + *
20918 + * jnode by itself is a "base type". There are several different jnode
20919 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
20920 + * has to do different things based on jnode type. In the standard reiser4 way
20921 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
20922 + *
20923 + * Functions below deal with jnode types and define methods of jnode plugin.
20924 + *
20925 + */
20926 +
20927 +/* set jnode type bits in ->state; done once, while ->state is still zero */
20928 +static void jnode_set_type(jnode * node, jnode_type type)
20929 +{
20930 +       static const unsigned long type_to_mask[] = {
20931 +               [JNODE_UNFORMATTED_BLOCK] = 1,
20932 +               [JNODE_FORMATTED_BLOCK] = 0,
20933 +               [JNODE_BITMAP] = 2,
20934 +               [JNODE_IO_HEAD] = 6,
20935 +               [JNODE_INODE] = 4
20936 +       };
20937 +
20938 +       assert("zam-647", type < LAST_JNODE_TYPE);
20939 +       assert("nikita-2815", !jnode_is_loaded(node));
20940 +       assert("nikita-3386", node->state == 0);
20941 +
20942 +       node->state |= (type_to_mask[type] << JNODE_TYPE_1);
20943 +}
20944 +
20945 +/* ->init() method of jnode plugin for jnodes that need no plugin-specific
20946 + * initialization: does nothing and returns 0. */
20947 +static int init_noinit(jnode * node UNUSED_ARG)
20948 +{
20949 +       return 0;
20950 +}
20951 +
20952 +/* ->parse() method of jnode plugin for jnodes that need no plugin-specific
20953 + * parsing: does nothing and returns 0. */
20954 +static int parse_noparse(jnode * node UNUSED_ARG)
20955 +{
20956 +       return 0;
20957 +}
20958 +
20959 +/* ->mapping() method for unformatted jnodes: returns node->key.j.mapping */
20960 +struct address_space *mapping_jnode(const jnode * node)
20961 +{
20962 +       struct address_space *map;
20963 +
20964 +       assert("nikita-2713", node != NULL);
20965 +
20966 +       /* mapping is stored in the jnode's key */
20967 +
20968 +       map = node->key.j.mapping;
20969 +       assert("nikita-2714", map != NULL);
20970 +       assert("nikita-2897", is_reiser4_inode(map->host));
20971 +       assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
20972 +       return map;
20973 +}
20974 +
20975 +/* ->index() method for unformatted jnodes: returns node->key.j.index */
20976 +unsigned long index_jnode(const jnode * node)
20977 +{
20978 +       /* index is stored in the jnode's key */
20979 +       return node->key.j.index;
20980 +}
20981 +
20982 +/* ->remove() method for unformatted jnodes; @tree is not used here */
20983 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
20984 +{
20985 +       /* unhash only if the jnode is still bound to a mapping */
20986 +       if (node->key.j.mapping)
20987 +               unhash_unformatted_node_nolock(node);
20988 +}
20989 +
20990 +/* ->mapping() method for znodes: mapping of the super block's fake inode */
20991 +static struct address_space *mapping_znode(const jnode * node)
20992 +{
20993 +       /* all znodes belong to fake inode */
20994 +       return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
20995 +}
20996 +
20997 +/* ->index() method for znodes: the znode's address, offset and shifted */
20998 +static unsigned long index_znode(const jnode * node)
20999 +{
21000 +       unsigned long addr;
21001 +       assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
21002 +
21003 +       /* index of znode is just its address (shifted) */
21004 +       addr = (unsigned long)node;
21005 +       return (addr - PAGE_OFFSET) >> znode_shift_order;
21006 +}
21007 +
21008 +/* ->mapping() method for bitmap jnodes; io-head jnodes use it as well */
21009 +static struct address_space *mapping_bitmap(const jnode * node)
21010 +{
21011 +       /* all bitmap blocks belong to special bitmap inode */
21012 +       return get_super_private(jnode_get_tree(node)->super)->bitmap->
21013 +           i_mapping;
21014 +}
21015 +
21016 +/* ->index() method for jnodes indexed by address: address - PAGE_OFFSET */
21017 +static unsigned long index_is_address(const jnode * node)
21018 +{
21019 +       unsigned long ind;
21020 +
21021 +       ind = (unsigned long)node;
21022 +       return ind - PAGE_OFFSET;
21023 +}
21024 +
21025 +/* resolve race with jput: returns @node, or NULL if it is being freed */
21026 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
21027 +{
21028 +       /*
21029 +        * This is used as part of RCU-based jnode handling.
21030 +        *
21031 +        * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
21032 +        * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
21033 +        * not protected during this, so concurrent thread may execute
21034 +        * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
21035 +        * freed in jput_final(). To avoid such races, jput_final() sets
21036 +        * JNODE_RIP on jnode (under tree lock). All places that work with
21037 +        * unreferenced jnodes call this function. It checks for JNODE_RIP bit
21038 +        * (first without taking tree lock), and if this bit is set, releases
21039 +        * the reference acquired by the current thread and returns NULL.
21040 +        *
21041 +        * As a result, if jnode is being concurrently freed, NULL is returned
21042 +        * and caller should pretend that jnode wasn't found in the first
21043 +        * place.
21044 +        *
21045 +        * Otherwise it's safe to release "rcu-read-lock" and continue with
21046 +        * jnode.
21047 +        */
21048 +       if (unlikely(JF_ISSET(node, JNODE_RIP))) {
21049 +               read_lock_tree(tree);
21050 +               if (JF_ISSET(node, JNODE_RIP)) {
21051 +                       dec_x_ref(node);
21052 +                       node = NULL;
21053 +               }
21054 +               read_unlock_tree(tree);
21055 +       }
21056 +       return node;
21057 +}
21058 +
21059 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
21060 +{
21061 +       struct inode *inode;
21062 +       item_plugin *iplug;
21063 +       loff_t off;
21064 +       /* fill @key for the byte offset of this unformatted jnode; returns @key */
21065 +       assert("nikita-3092", node != NULL);
21066 +       assert("nikita-3093", key != NULL);
21067 +       assert("nikita-3094", jnode_is_unformatted(node));
21068 +
21069 +       off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
21070 +       inode = mapping_jnode(node)->host;
21071 +
21072 +       if (node->parent_item_id != 0)
21073 +               iplug = item_plugin_by_id(node->parent_item_id);
21074 +       else
21075 +               iplug = NULL;
21076 +
21077 +       if (iplug != NULL && iplug->f.key_by_offset)
21078 +               iplug->f.key_by_offset(inode, off, key);
21079 +       else {
21080 +               file_plugin *fplug;
21081 +
21082 +               fplug = inode_file_plugin(inode);
21083 +               assert("zam-1007", fplug != NULL);
21084 +               assert("zam-1008", fplug->key_by_inode != NULL);
21085 +
21086 +               fplug->key_by_inode(inode, off, key);
21087 +       }
21088 +
21089 +       return key;
21090 +}
21091 +
21092 +/* ->parse() method for formatted nodes: delegates to zparse() */
21093 +static int parse_znode(jnode * node)
21094 +{
21095 +       return zparse(JZNODE(node));
21096 +}
21097 +
21098 +/* ->delete() method for formatted nodes; tree write lock must be held */
21099 +static void delete_znode(jnode * node, reiser4_tree * tree)
21100 +{
21101 +       znode *z;
21102 +
21103 +       assert_rw_write_locked(&(tree->tree_lock));
21104 +       assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
21105 +
21106 +       z = JZNODE(node);
21107 +       assert("vs-899", z->c_count == 0);
21108 +
21109 +       /* delete znode from sibling list. */
21110 +       sibling_list_remove(z);
21111 +
21112 +       znode_remove(z, tree);
21113 +}
21114 +
21115 +/* ->remove() method for formatted nodes; -EBUSY while c_count is non-zero */
21116 +static int remove_znode(jnode * node, reiser4_tree * tree)
21117 +{
21118 +       znode *z;
21119 +
21120 +       assert_rw_write_locked(&(tree->tree_lock));
21121 +       z = JZNODE(node);
21122 +
21123 +       if (z->c_count == 0) {
21124 +               /* detach znode from sibling list. */
21125 +               sibling_list_drop(z);
21126 +               /* this is called with the tree lock write-held, so call
21127 +                  znode_remove() directly (rather than znode_lock_remove()). */
21128 +               znode_remove(z, tree);
21129 +               return 0;
21130 +       }
21131 +       return RETERR(-EBUSY);
21132 +}
21133 +
21134 +/* ->init() method for formatted nodes: delegates to the node plugin */
21135 +static int init_znode(jnode * node)
21136 +{
21137 +       znode *z;
21138 +
21139 +       z = JZNODE(node);
21140 +       /* call node plugin to do actual initialization */
21141 +       return z->nplug->init(z);
21142 +}
21143 +
21144 +/* ->clone() method for formatted nodes: new znode, same block and level */
21145 +static jnode *clone_formatted(jnode * node)
21146 +{
21147 +       znode *clone;
21148 +
21149 +       assert("vs-1430", jnode_is_znode(node));
21150 +       clone = zalloc(reiser4_ctx_gfp_mask_get());
21151 +       if (clone == NULL)
21152 +               return ERR_PTR(RETERR(-ENOMEM));
21153 +       zinit(clone, NULL, current_tree);
21154 +       jnode_set_block(ZJNODE(clone), jnode_get_block(node));
21155 +       /* ZJNODE(clone)->key.z is not initialized */
21156 +       clone->level = JZNODE(node)->level;
21157 +
21158 +       return ZJNODE(clone);
21159 +}
21160 +
21161 +/* ->clone() method for unformatted nodes: new jnode with the same block */
21162 +static jnode *clone_unformatted(jnode * node)
21163 +{
21164 +       jnode *clone;
21165 +
21166 +       assert("vs-1431", jnode_is_unformatted(node));
21167 +       clone = jalloc();
21168 +       if (clone == NULL)
21169 +               return ERR_PTR(RETERR(-ENOMEM));
21170 +
21171 +       jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
21172 +       jnode_set_block(clone, jnode_get_block(node));
21173 +
21174 +       return clone;
21175 +
21176 +}
21177 +
21178 +/*
21179 + * Setup jnode plugin methods for various jnode types.
21180 + */
21181 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
21182 +       [JNODE_UNFORMATTED_BLOCK] = {
21183 +               .h = {
21184 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
21185 +                       .id = JNODE_UNFORMATTED_BLOCK,
21186 +                       .pops = NULL,
21187 +                       .label = "unformatted",
21188 +                       .desc = "unformatted node",
21189 +                       .linkage = {NULL, NULL}
21190 +               },
21191 +               .init = init_noinit,
21192 +               .parse = parse_noparse,
21193 +               .mapping = mapping_jnode,
21194 +               .index = index_jnode,
21195 +               .clone = clone_unformatted
21196 +       },
21197 +       [JNODE_FORMATTED_BLOCK] = {
21198 +               .h = {
21199 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
21200 +                       .id = JNODE_FORMATTED_BLOCK,
21201 +                       .pops = NULL,
21202 +                       .label = "formatted",
21203 +                       .desc = "formatted tree node",
21204 +                       .linkage = {NULL, NULL}
21205 +               },
21206 +               .init = init_znode,
21207 +               .parse = parse_znode,
21208 +               .mapping = mapping_znode,
21209 +               .index = index_znode,
21210 +               .clone = clone_formatted
21211 +       },
21212 +       [JNODE_BITMAP] = {
21213 +               .h = {
21214 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
21215 +                       .id = JNODE_BITMAP,
21216 +                       .pops = NULL,
21217 +                       .label = "bitmap",
21218 +                       .desc = "bitmap node",
21219 +                       .linkage = {NULL, NULL}
21220 +               },
21221 +               .init = init_noinit,
21222 +               .parse = parse_noparse,
21223 +               .mapping = mapping_bitmap,
21224 +               .index = index_is_address,
21225 +               .clone = NULL
21226 +       },
21227 +       [JNODE_IO_HEAD] = {
21228 +               .h = {
21229 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
21230 +                       .id = JNODE_IO_HEAD,
21231 +                       .pops = NULL,
21232 +                       .label = "io head",
21233 +                       .desc = "io head",
21234 +                       .linkage = {NULL, NULL}
21235 +               },
21236 +               .init = init_noinit,
21237 +               .parse = parse_noparse,
21238 +               .mapping = mapping_bitmap,
21239 +               .index = index_is_address,
21240 +               .clone = NULL
21241 +       },
21242 +       [JNODE_INODE] = {
21243 +               .h = {
21244 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
21245 +                       .id = JNODE_INODE,
21246 +                       .pops = NULL,
21247 +                       .label = "inode",
21248 +                       .desc = "inode's builtin jnode",
21249 +                       .linkage = {NULL, NULL}
21250 +               },
21251 +               .init = NULL,
21252 +               .parse = NULL,
21253 +               .mapping = NULL,
21254 +               .index = NULL,
21255 +               .clone = NULL
21256 +       }
21257 +};
21258 +
21259 +/*
21260 + * jnode destruction.
21261 + *
21262 + * Thread may use a jnode after it acquired a reference to it. References are
21263 + * counted in ->x_count field. Reference protects jnode from being
21264 + * recycled. This is different from protecting jnode data (that are stored in
21265 + * jnode page) from being evicted from memory. Data are protected by jload()
21266 + * and released by jrelse().
21267 + *
21268 + * If thread already possesses a reference to the jnode it can acquire another
21269 + * one through jref(). Initial reference is obtained (usually) by locating
21270 + * jnode in some indexing structure that depends on jnode type: formatted
21271 + * nodes are kept in global hash table, where they are indexed by block
21272 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
21273 + * table, which is indexed by oid and offset within file, and in per-inode
21274 + * radix tree.
21275 + *
21276 + * Reference to jnode is released by jput(). If last reference is released,
21277 + * jput_final() is called. This function determines whether jnode has to be
21278 + * deleted (this happens when corresponding node is removed from the file
21279 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
21280 + * should be just "removed" (deleted from memory).
21281 + *
21282 + * Jnode destruction is signally delicate dance because of locking and RCU.
21283 + */
21284 +
21285 +/*
21286 + * Returns true if jnode cannot be removed right now. This check is called
21287 + * under tree lock. If it returns false, jnode is irrevocably committed to be
21288 + * deleted/removed.
21289 + */
21290 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
21291 +{
21292 +       /* if other thread managed to acquire a reference to this jnode, don't
21293 +        * free it. */
21294 +       if (atomic_read(&node->x_count) > 0)
21295 +               return 1;
21296 +       /* also, don't free znode that has children in memory */
21297 +       if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
21298 +               return 1;
21299 +       return 0;
21300 +}
21301 +
21302 +/*
21303 + * this is called as part of removing jnode. Based on jnode type, call
21304 + * corresponding function that removes jnode from indices. Callers in this
21305 + * file release the jnode afterwards through jnode_free().
21306 + */
21307 +static inline void
21308 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
21309 +{
21310 +       switch (jtype) {
21311 +       case JNODE_UNFORMATTED_BLOCK:
21312 +               remove_jnode(node, tree);
21313 +               break;
21314 +       case JNODE_IO_HEAD:
21315 +       case JNODE_BITMAP:
21316 +               break;
21317 +       case JNODE_INODE:
21318 +               break;
21319 +       case JNODE_FORMATTED_BLOCK:
21320 +               remove_znode(node, tree);
21321 +               break;
21322 +       default:
21323 +               wrong_return_value("nikita-3196", "Wrong jnode type");
21324 +       }
21325 +}
21326 +
21327 +/*
21328 + * this is called as part of deleting jnode. Based on jnode type, call
21329 + * corresponding function that removes jnode from indices; jdelete() releases
21330 + * the jnode afterwards through jnode_free(). @tree is used despite UNUSED_ARG.
21331 + *
21332 + * This differs from jnode_remove() for formatted nodes---for them sibling list
21333 + * handling is different---and for JNODE_INODE, which is a wrong type here.
21334 + */
21335 +static inline void
21336 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
21337 +{
21338 +       switch (jtype) {
21339 +       case JNODE_UNFORMATTED_BLOCK:
21340 +               remove_jnode(node, tree);
21341 +               break;
21342 +       case JNODE_IO_HEAD:
21343 +       case JNODE_BITMAP:
21344 +               break;
21345 +       case JNODE_FORMATTED_BLOCK:
21346 +               delete_znode(node, tree);
21347 +               break;
21348 +       case JNODE_INODE:       /* falls through to the error case */
21349 +       default:
21350 +               wrong_return_value("nikita-3195", "Wrong jnode type");
21351 +       }
21352 +}
21353 +
21354 +#if REISER4_DEBUG
21355 +/*
21356 + * remove jnode from the debugging list of all jnodes hanging off super-block.
21357 + */
21358 +void jnode_list_remove(jnode * node)
21359 +{
21360 +       reiser4_super_info_data *sbinfo;
21361 +
21362 +       sbinfo = get_super_private(jnode_get_tree(node)->super);
21363 +
21364 +       spin_lock_irq(&sbinfo->all_guard);
21365 +       assert("nikita-2422", !list_empty(&node->jnodes));
21366 +       list_del_init(&node->jnodes);
21367 +       spin_unlock_irq(&sbinfo->all_guard);
21368 +}
21369 +#endif
21370 +
21371 +/*
21372 + * this is called by jput_final() to remove jnode when last reference to it is
21373 + * released. Returns 0 if jnode was removed and freed, nonzero otherwise.
21374 + */
21375 +static int jnode_try_drop(jnode * node)
21376 +{
21377 +       int result;
21378 +       reiser4_tree *tree;
21379 +       jnode_type jtype;
21380 +
21381 +       assert("nikita-2491", node != NULL);
21382 +       assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
21383 +
21384 +       tree = jnode_get_tree(node);
21385 +       jtype = jnode_get_type(node);
21386 +
21387 +       spin_lock_jnode(node);
21388 +       write_lock_tree(tree);
21389 +       /*
21390 +        * if jnode has a page---leave it alone. Memory pressure will
21391 +        * eventually kill page and jnode.
21392 +        */
21393 +       if (jnode_page(node) != NULL) {
21394 +               write_unlock_tree(tree);
21395 +               spin_unlock_jnode(node);
21396 +               JF_CLR(node, JNODE_RIP);
21397 +               return RETERR(-EBUSY);
21398 +       }
21399 +
21400 +       /* re-check ->x_count under tree lock. */
21401 +       result = jnode_is_busy(node, jtype);
21402 +       if (result == 0) {
21403 +               assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
21404 +               assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
21405 +
21406 +               spin_unlock_jnode(node);
21407 +               /* no page and no references---despatch him. */
21408 +               jnode_remove(node, jtype, tree);
21409 +               write_unlock_tree(tree);
21410 +               jnode_free(node, jtype);
21411 +       } else {
21412 +               /* busy check failed: reference was acquired by concurrent
21413 +                * thread. JNODE_RIP is cleared and the jnode is kept. */
21414 +               write_unlock_tree(tree);
21415 +               spin_unlock_jnode(node);
21416 +               JF_CLR(node, JNODE_RIP);
21417 +       }
21418 +       return result;
21419 +}
21420 +
21421 +/* jdelete() -- Delete jnode from the tree and file system; 0 on success */
21422 +static int jdelete(jnode * node/* jnode to finish with */)
21423 +{
21424 +       struct page *page;
21425 +       int result;
21426 +       reiser4_tree *tree;
21427 +       jnode_type jtype;
21428 +
21429 +       assert("nikita-467", node != NULL);
21430 +       assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
21431 +
21432 +       jtype = jnode_get_type(node);
21433 +
21434 +       page = jnode_lock_page(node);   /* may be NULL */
21435 +       assert_spin_locked(&(node->guard));
21436 +
21437 +       tree = jnode_get_tree(node);
21438 +
21439 +       write_lock_tree(tree);
21440 +       /* re-check ->x_count under tree lock. */
21441 +       result = jnode_is_busy(node, jtype);
21442 +       if (likely(!result)) {
21443 +               assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
21444 +               assert("jmacd-511", atomic_read(&node->d_count) == 0);
21445 +
21446 +               /* detach page */
21447 +               if (page != NULL) {
21448 +                       /*
21449 +                        * FIXME this is racy against jnode_extent_write().
21450 +                        */
21451 +                       page_clear_jnode(page, node);
21452 +               }
21453 +               spin_unlock_jnode(node);
21454 +               /* goodbye */
21455 +               jnode_delete(node, jtype, tree);
21456 +               write_unlock_tree(tree);
21457 +               jnode_free(node, jtype);
21458 +               /* @node is no longer valid pointer */
21459 +               if (page != NULL)
21460 +                       reiser4_drop_page(page);
21461 +       } else {
21462 +               /* busy check failed: reference was acquired by concurrent
21463 +                * thread. JNODE_RIP is cleared and the jnode is kept. */
21464 +               JF_CLR(node, JNODE_RIP);
21465 +               write_unlock_tree(tree);
21466 +               spin_unlock_jnode(node);
21467 +               if (page != NULL)
21468 +                       unlock_page(page);
21469 +       }
21470 +       return result;
21471 +}
21472 +
21473 +/* drop jnode on the floor.
21474 +
21475 +   Return value:
21476 +
21477 +    1:       failed to drop jnode, because it is busy (see jnode_is_busy())
21478 +
21479 +    0:       successfully dropped jnode
21480 +
21481 +*/
21482 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
21483 +{
21484 +       struct page *page;
21485 +       jnode_type jtype;
21486 +       int result;
21487 +
21488 +       assert("zam-602", node != NULL);
21489 +       assert_rw_not_read_locked(&(tree->tree_lock));
21490 +       assert_rw_not_write_locked(&(tree->tree_lock));
21491 +       assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
21492 +
21493 +       jtype = jnode_get_type(node);
21494 +
21495 +       page = jnode_lock_page(node);   /* may be NULL */
21496 +       assert_spin_locked(&(node->guard));
21497 +
21498 +       write_lock_tree(tree);
21499 +
21500 +       /* re-check ->x_count under tree lock. */
21501 +       result = jnode_is_busy(node, jtype);
21502 +       if (!result) {
21503 +               assert("nikita-2488", page == jnode_page(node));
21504 +               assert("nikita-2533", atomic_read(&node->d_count) == 0);
21505 +               if (page != NULL) {
21506 +                       assert("nikita-2126", !PageDirty(page));
21507 +                       assert("nikita-2127", PageUptodate(page));
21508 +                       assert("nikita-2181", PageLocked(page));
21509 +                       page_clear_jnode(page, node);
21510 +               }
21511 +               spin_unlock_jnode(node);
21512 +               jnode_remove(node, jtype, tree);
21513 +               write_unlock_tree(tree);
21514 +               jnode_free(node, jtype);
21515 +               if (page != NULL)
21516 +                       reiser4_drop_page(page);
21517 +       } else {
21518 +               /* busy check failed: reference was acquired by concurrent
21519 +                * thread. JNODE_RIP is cleared and the jnode is kept. */
21520 +               JF_CLR(node, JNODE_RIP);
21521 +               write_unlock_tree(tree);
21522 +               spin_unlock_jnode(node);
21523 +               if (page != NULL)
21524 +                       unlock_page(page);
21525 +       }
21526 +       return result;
21527 +}
21528 +
21529 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
21530 +   be 0 (where applicable). Result of jdrop_in_tree() is not reported. */
21531 +void jdrop(jnode * node)
21532 +{
21533 +       jdrop_in_tree(node, jnode_get_tree(node));
21534 +}
21535 +
21536 +/* IO head jnode implementation; The io heads are simple j-nodes with limited
21537 +   functionality (these j-nodes are not in any hash table) just for reading
21538 +   from and writing to disk. Allocation returns NULL if jalloc() fails. */
21539 +
21540 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
21541 +{
21542 +       jnode *jal = jalloc();
21543 +
21544 +       /* on allocation failure return NULL without touching the jnode */
21545 +       if (jal == NULL)
21546 +               return NULL;
21547 +
21548 +       jnode_init(jal, current_tree, JNODE_IO_HEAD);
21549 +       jnode_set_block(jal, block);
21550 +       jref(jal);      /* dropped by jput() in reiser4_drop_io_head() */
21551 +       return jal;
21552 +}
21553 +
21554 +void reiser4_drop_io_head(jnode * node)
21555 +{
21556 +       assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
21557 +
21558 +       jput(node);     /* presumably pairs with jref() at allocation */
21559 +       jdrop(node);    /* then try to free the jnode itself */
21560 +}
21561 +
21562 +/* keep jnode data safe from reiser4_releasepage(): take a page reference */
21563 +void pin_jnode_data(jnode * node)
21564 +{
21565 +       assert("zam-671", jnode_page(node) != NULL);
21566 +       page_cache_get(jnode_page(node));
21567 +}
21568 +
21569 +/* make jnode data free-able again: undo pin_jnode_data() */
21570 +void unpin_jnode_data(jnode * node)
21571 +{
21572 +       assert("zam-672", jnode_page(node) != NULL);
21573 +       page_cache_release(jnode_page(node));
21574 +}
21575 +
21576 +struct address_space *jnode_get_mapping(const jnode * node)
21577 +{
21578 +       assert("nikita-3162", node != NULL);
21579 +       return jnode_ops(node)->mapping(node);  /* per-type ->mapping method */
21580 +}
21581 +
21582 +#if REISER4_DEBUG
21583 +/* debugging aid: jnode invariant; *msg names the last condition checked */
21584 +int jnode_invariant_f(const jnode * node, char const **msg)
21585 +{
21586 +#define _ergo(ant, con)                                                \
21587 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
21588 +#define _check(exp) ((*msg) = #exp, (exp))
21589 +
21590 +       return _check(node != NULL) &&
21591 +           /* [jnode-queued] */
21592 +           /* only relocated node can be queued, except that when znode
21593 +            * is being deleted, its JNODE_RELOC bit is cleared */
21594 +           _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
21595 +                 JF_ISSET(node, JNODE_RELOC) ||
21596 +                 JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
21597 +           _check(node->jnodes.prev != NULL) &&
21598 +           _check(node->jnodes.next != NULL) &&
21599 +           /* [jnode-dirty] invariant */
21600 +           /* dirty jnode is part of atom */
21601 +           _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
21602 +           /* [jnode-oid] invariant */
21603 +           /* for unformatted node ->objectid and ->mapping fields are
21604 +            * consistent */
21605 +           _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
21606 +                 node->key.j.objectid ==
21607 +                 get_inode_oid(node->key.j.mapping->host)) &&
21608 +           /* [jnode-atom-valid] invariant */
21609 +           /* node atom has valid state */
21610 +           _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
21611 +           /* [jnode-page-binding] invariant */
21612 +           /* if node points to page, it points back to node */
21613 +           _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
21614 +           /* [jnode-refs] invariant */
21615 +           /* only referenced jnode can be loaded */
21616 +           _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
21617 +
21618 +}
21619 +
21620 +static const char *jnode_type_name(jnode_type type)
21621 +{
21622 +       switch (type) {
21623 +       case JNODE_UNFORMATTED_BLOCK:
21624 +               return "unformatted";
21625 +       case JNODE_FORMATTED_BLOCK:
21626 +               return "formatted";
21627 +       case JNODE_BITMAP:
21628 +               return "bitmap";
21629 +       case JNODE_IO_HEAD:
21630 +               return "io head";
21631 +       case JNODE_INODE:
21632 +               return "inode";
21633 +       case LAST_JNODE_TYPE:
21634 +               return "last";
21635 +       default:{
21636 +                       static char unknown[30]; /* shared: not reentrant */
21637 +
21638 +                       sprintf(unknown, "unknown %i", type);
21639 +                       return unknown;
21640 +               }
21641 +       }
21642 +}
21643 +
21644 +#define jnode_state_name(node, flag)                   \
21645 +       (JF_ISSET((node), (flag)) ? ((#flag "|")+6) : "")
21646 +
21647 +/* debugging aid: output human readable information about @node */
21648 +static void info_jnode(const char *prefix /* prefix to print */ ,
21649 +                      const jnode * node/* node to print */)
21650 +{
21651 +       assert("umka-068", prefix != NULL);
21652 +
21653 +       if (node == NULL) {
21654 +               printk("%s: null\n", prefix);
21655 +               return;
21656 +       }
21657 +
21658 +       printk
21659 +           ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
21660 +            " block: %s, d_count: %d, x_count: %d, "
21661 +            "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
21662 +            node->state,
21663 +            jnode_state_name(node, JNODE_PARSED),
21664 +            jnode_state_name(node, JNODE_HEARD_BANSHEE),
21665 +            jnode_state_name(node, JNODE_LEFT_CONNECTED),
21666 +            jnode_state_name(node, JNODE_RIGHT_CONNECTED),
21667 +            jnode_state_name(node, JNODE_ORPHAN),
21668 +            jnode_state_name(node, JNODE_CREATED),
21669 +            jnode_state_name(node, JNODE_RELOC),
21670 +            jnode_state_name(node, JNODE_OVRWR),
21671 +            jnode_state_name(node, JNODE_DIRTY),
21672 +            jnode_state_name(node, JNODE_IS_DYING),
21673 +            jnode_state_name(node, JNODE_RIP),
21674 +            jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
21675 +            jnode_state_name(node, JNODE_WRITEBACK),
21676 +            jnode_state_name(node, JNODE_NEW),
21677 +            jnode_state_name(node, JNODE_DKSET),
21678 +            jnode_state_name(node, JNODE_REPACK),
21679 +            jnode_state_name(node, JNODE_CLUSTER_PAGE),
21680 +            jnode_get_level(node), sprint_address(jnode_get_block(node)),
21681 +            atomic_read(&node->d_count), atomic_read(&node->x_count),
21682 +            jnode_page(node), node->atom, 0, 0, /* "lock" is always 0:0 */
21683 +            jnode_type_name(jnode_get_type(node))); /* no newline printed */
21684 +       if (jnode_is_unformatted(node)) {
21685 +               printk("inode: %llu, index: %lu, ",
21686 +                      node->key.j.objectid, node->key.j.index);
21687 +       }
21688 +}
21689 +
21690 +/* debugging aid: check jnode invariant and warn if it doesn't hold */
21691 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
21692 +{
21693 +       char const *failed_msg;
21694 +       int result;
21695 +       reiser4_tree *tree;
21696 +
21697 +       /* validate @node before it is dereferenced to find its tree */
21698 +       assert("umka-063312", node != NULL);
21699 +       tree = jnode_get_tree(node);
21700 +       assert("umka-064321", tree != NULL);
21701 +
21702 +       if (!jlocked && !tlocked)
21703 +               spin_lock_jnode((jnode *) node);
21704 +       if (!tlocked)
21705 +               read_lock_tree(tree);
21706 +       result = jnode_invariant_f(node, &failed_msg);
21707 +       if (!result) {
21708 +               info_jnode("corrupted node", node);
21709 +               warning("jmacd-555", "Condition %s failed", failed_msg);
21710 +       }
21711 +       if (!tlocked)
21712 +               read_unlock_tree(tree);
21713 +       if (!jlocked && !tlocked)
21714 +               spin_unlock_jnode((jnode *) node);
21715 +       return result;
21716 +}
21717 +
21718 +#endif                         /* REISER4_DEBUG */
21719 +
21720 +/* Make Linus happy.
21721 +   Local variables:
21722 +   c-indentation-style: "K&R"
21723 +   mode-name: "LC"
21724 +   c-basic-offset: 8
21725 +   tab-width: 8
21726 +   fill-column: 80
21727 +   End:
21728 +*/
21729 diff -puN /dev/null fs/reiser4/jnode.h
21730 --- /dev/null
21731 +++ a/fs/reiser4/jnode.h
21732 @@ -0,0 +1,704 @@
21733 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21734 + * reiser4/README */
21735 +
21736 +/* Declaration of jnode. See jnode.c for details. */
21737 +
21738 +#ifndef __JNODE_H__
21739 +#define __JNODE_H__
21740 +
21741 +#include "forward.h"
21742 +#include "type_safe_hash.h"
21743 +#include "txnmgr.h"
21744 +#include "key.h"
21745 +#include "debug.h"
21746 +#include "dformat.h"
21747 +#include "page_cache.h"
21748 +#include "context.h"
21749 +
21750 +#include "plugin/plugin.h"
21751 +
21752 +#include <linux/fs.h>
21753 +#include <linux/mm.h>
21754 +#include <linux/spinlock.h>
21755 +#include <asm/atomic.h>
21756 +#include <linux/bitops.h>
21757 +#include <linux/list.h>
21758 +#include <linux/rcupdate.h>
21759 +
21760 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
21761 +   nodes)  */
21762 +TYPE_SAFE_HASH_DECLARE(j, jnode);
21763 +
21764 +/* declare hash table of znodes */
21765 +TYPE_SAFE_HASH_DECLARE(z, znode);
21766 +
21767 +struct jnode_key {
21768 +       __u64 objectid;         /* oid of the inode that hosts ->mapping */
21769 +       unsigned long index;    /* presumably offset within the file */
21770 +       struct address_space *mapping;
21771 +};
21772 +
21773 +/*
21774 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
21775 +   be exactly the node we use for unformatted tree nodes.
21776 +
21777 +   Jnode provides following basic functionality:
21778 +
21779 +   . reference counting and indexing.
21780 +
21781 +   . integration with page cache. Jnode has ->pg reference to which page can
21782 +   be attached.
21783 +
21784 +   . interface to transaction manager. It is jnode that is kept in transaction
21785 +   manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
21786 +   means, there should be special type of jnode for inode.)
21787 +
21788 +   Locking:
21789 +
21790 +   Spin lock: the following fields are protected by the per-jnode spin lock:
21791 +
21792 +    ->state
21793 +    ->atom
21794 +    ->capture_link
21795 +
21796 +   Following fields are protected by the global tree lock:
21797 +
21798 +    ->link
21799 +    ->key.z (content of ->key.z is only changed in znode_rehash())
21800 +    ->key.j
21801 +
21802 +   Atomic counters
21803 +
21804 +    ->x_count
21805 +    ->d_count
21806 +
21807 +    ->pg, and ->data are protected by spin lock for unused jnode and are
21808 +    immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
21809 +    is false).
21810 +
21811 +    ->tree is immutable after creation
21812 +
21813 +   Unclear
21814 +
21815 +    ->blocknr: should be under jnode spin-lock, but current interface is based
21816 +    on passing of block address.
21817 +
21818 +   If you ever need to spin lock two nodes at once, do this in "natural"
21819 +   memory order: lock znode with lower address first. (See lock_two_nodes().)
21820 +
21821 +   Invariants involving this data-type:
21822 +
21823 +      [jnode-dirty]
21824 +      [jnode-refs]
21825 +      [jnode-oid]
21826 +      [jnode-queued]
21827 +      [jnode-atom-valid]
21828 +      [jnode-page-binding]
21829 +*/
21830 +
21831 +struct jnode {
21832 +#if REISER4_DEBUG
21833 +#define JMAGIC 0x52654973      /* "ReIs" */
21834 +       int magic;
21835 +#endif
21836 +       /* FIRST CACHE LINE (16 bytes): data used by jload */
21837 +
21838 +       /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
21839 +       /*   0 */ unsigned long state;
21840 +
21841 +       /* lock, protecting jnode's fields. */
21842 +       /*   4 */ spinlock_t load;
21843 +
21844 +       /* counter of references to jnode itself. Increased on jref().
21845 +          Decreased on jput().
21846 +        */
21847 +       /*   8 */ atomic_t x_count;
21848 +
21849 +       /* counter of references to jnode's data. Pin data page(s) in
21850 +          memory while this is greater than 0. Increased on jload().
21851 +          Decreased on jrelse().
21852 +        */
21853 +       /*   12 */ atomic_t d_count;
21854 +
21855 +       /* SECOND CACHE LINE: data used by hash table lookups */
21856 +
21857 +       /*   16 */ union {
21858 +               /* znodes are hashed by block number */
21859 +               reiser4_block_nr z;
21860 +               /* unformatted nodes are hashed by mapping plus offset */
21861 +               struct jnode_key j;
21862 +       } key;
21863 +
21864 +       /* THIRD CACHE LINE */
21865 +
21866 +       /*   32 */ union {
21867 +               /* pointers to maintain hash-table */
21868 +               z_hash_link z;
21869 +               j_hash_link j;
21870 +       } link;
21871 +
21872 +       /* pointer to jnode page.  */
21873 +       /*   36 */ struct page *pg;
21874 +       /* pointer to node itself. This is page_address(node->pg) when page is
21875 +          attached to the jnode
21876 +        */
21877 +       /*   40 */ void *data;
21878 +
21879 +       /*   44 */ reiser4_tree *tree;
21880 +
21881 +       /* FOURTH CACHE LINE: atom related fields */
21882 +
21883 +       /*   48 */ spinlock_t guard;
21884 +
21885 +       /* atom the block is in, if any */
21886 +       /*   52 */ txn_atom *atom;
21887 +
21888 +       /* capture list */
21889 +       /*   56 */ struct list_head capture_link;
21890 +
21891 +       /* FIFTH CACHE LINE */
21892 +
21893 +       /*   64 */ struct rcu_head rcu;
21894 +       /* crosses cache line */
21895 +
21896 +       /* SIXTH CACHE LINE */
21897 +
21898 +       /* the real blocknr (where io is going to/from) */
21899 +       /*   80 */ reiser4_block_nr blocknr;
21900 +       /* Parent item type, unformatted and CRC need it for
21901 +        * offset => key conversion.  */
21902 +       /* NOTE: this parent_item_id looks like jnode type. */
21903 +       /*   88 */ reiser4_plugin_id parent_item_id;
21904 +       /*   92 */
21905 +#if REISER4_DEBUG
21906 +       /* list of all jnodes for debugging purposes. */
21907 +       struct list_head jnodes;
21908 +       /* how many times this jnode was written in one transaction */
21909 +       int written;
21910 +       /* this indicates which atom's list the jnode is on */
21911 +       atom_list list;
21912 +#endif
21913 +} __attribute__ ((aligned(16)));
21914 +
21915 +/*
21916 + * jnode types. Enumeration of existing jnode types.
21917 + */
21918 +typedef enum {
21919 +       JNODE_UNFORMATTED_BLOCK,        /* unformatted block */
21920 +       JNODE_FORMATTED_BLOCK,  /* formatted block, znode */
21921 +       JNODE_BITMAP,           /* bitmap */
21922 +       JNODE_IO_HEAD,          /* jnode representing a block in the
21923 +                                * wandering log */
21924 +       JNODE_INODE,            /* jnode embedded into inode */
21925 +       LAST_JNODE_TYPE
21926 +} jnode_type;
21927 +
21928 +/* jnode states */
21929 +typedef enum {
21930 +       /* jnode's page is loaded and data checked */
21931 +       JNODE_PARSED = 0,
21932 +       /* node was deleted, not all locks on it were released. This
21933 +          node is empty and is going to be removed from the tree
21934 +          shortly. */
21935 +       JNODE_HEARD_BANSHEE = 1,
21936 +       /* left sibling pointer is valid */
21937 +       JNODE_LEFT_CONNECTED = 2,
21938 +       /* right sibling pointer is valid */
21939 +       JNODE_RIGHT_CONNECTED = 3,
21940 +
21941 +       /* znode was just created and doesn't yet have a pointer from
21942 +          its parent */
21943 +       JNODE_ORPHAN = 4,
21944 +
21945 +       /* this node was created by its transaction and has not been assigned
21946 +          a block address. */
21947 +       JNODE_CREATED = 5,
21948 +
21949 +       /* this node is currently relocated */
21950 +       JNODE_RELOC = 6,
21951 +       /* this node is currently wandered */
21952 +       JNODE_OVRWR = 7,
21953 +
21954 +       /* this znode has been modified */
21955 +       JNODE_DIRTY = 8,
21956 +
21957 +       /* znode lock is being invalidated */
21958 +       JNODE_IS_DYING = 9,
21959 +
21960 +       /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
21961 +
21962 +       /* jnode is queued for flushing. */
21963 +       JNODE_FLUSH_QUEUED = 12,
21964 +
21965 +       /* In the following bits jnode type is encoded. */
21966 +       JNODE_TYPE_1 = 13,
21967 +       JNODE_TYPE_2 = 14,
21968 +       JNODE_TYPE_3 = 15,
21969 +
21970 +       /* jnode is being destroyed */
21971 +       JNODE_RIP = 16,
21972 +
21973 +       /* znode was not captured during locking (it might so be because
21974 +          ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
21975 +       JNODE_MISSED_IN_CAPTURE = 17,
21976 +
21977 +       /* write is in progress */
21978 +       JNODE_WRITEBACK = 18,
21979 +
21980 +       /* FIXME: now it is used by crypto-compress plugin only */
21981 +       JNODE_NEW = 19,
21982 +
21983 +       /* delimiting keys are already set for this znode. */
21984 +       JNODE_DKSET = 20,
21985 +
21986 +       /* when this bit is set page and jnode can not be disconnected */
21987 +       JNODE_WRITE_PREPARED = 21,
21988 +
21989 +       JNODE_CLUSTER_PAGE = 22,
21990 +       /* Jnode is marked for repacking, that means the reiser4 flush and the
21991 +        * block allocator should process this node special way  */
21992 +       JNODE_REPACK = 23,
21993 +       /* node should be converted by flush in squalloc phase */
21994 +       JNODE_CONVERTIBLE = 24,
21995 +       /*
21996 +        * When jnode is dirtied for the first time in a given transaction,
21997 +        * do_jnode_make_dirty() checks whether this jnode can possibly become
21998 +        * a member of overwrite set. If so, this bit is set, and one block is
21999 +        * reserved in the ->flush_reserved space of atom.
22000 +        *
22001 +        * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
22002 +        *
22003 +        *     (1) flush decides that we want this block to go into relocate
22004 +        *     set after all.
22005 +        *
22006 +        *     (2) wandering log is allocated (by log writer)
22007 +        *
22008 +        *     (3) extent is allocated
22009 +        *
22010 +        */
22011 +       JNODE_FLUSH_RESERVED = 29
22012 +} reiser4_jnode_state;
22013 +
22014 +/* Inline helpers for accessing the jnode state bits. */
22015 +
22016 +static inline void JF_CLR(jnode * j, int f)
22017 +{
22018 +       assert("unknown-1", j->magic == JMAGIC);
22019 +       clear_bit(f, &j->state);
22020 +}
22021 +static inline int JF_ISSET(const jnode * j, int f)
22022 +{
22023 +       assert("unknown-2", j->magic == JMAGIC);
22024 +       return test_bit(f, &((jnode *) j)->state);
22025 +}
22026 +static inline void JF_SET(jnode * j, int f)
22027 +{
22028 +       assert("unknown-3", j->magic == JMAGIC);
22029 +       set_bit(f, &j->state);
22030 +}
22031 +
22032 +static inline int JF_TEST_AND_SET(jnode * j, int f)
22033 +{
22034 +       assert("unknown-4", j->magic == JMAGIC);
22035 +       return test_and_set_bit(f, &j->state);
22036 +}
22037 +
22038 +static inline void spin_lock_jnode(jnode *node)
22039 +{
22040 +       /* check that spinlocks of lower priorities are not held */
22041 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
22042 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
22043 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
22044 +                   LOCK_CNT_NIL(rw_locked_dk) &&
22045 +                   LOCK_CNT_LT(spin_locked_jnode, 2)));
22046 +
22047 +       spin_lock(&(node->guard));
22048 +
22049 +       LOCK_CNT_INC(spin_locked_jnode);
22050 +       LOCK_CNT_INC(spin_locked);
22051 +}
22052 +
22053 +static inline void spin_unlock_jnode(jnode *node)
22054 +{
22055 +       assert_spin_locked(&(node->guard));
22056 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
22057 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
22058 +
22059 +       LOCK_CNT_DEC(spin_locked_jnode);
22060 +       LOCK_CNT_DEC(spin_locked);
22061 +
22062 +       spin_unlock(&(node->guard));
22063 +}
22064 +
22065 +static inline int jnode_is_in_deleteset(const jnode * node)
22066 +{
22067 +       return JF_ISSET(node, JNODE_RELOC);
22068 +}
22069 +
22070 +extern int init_jnodes(void);
22071 +extern void done_jnodes(void);
22072 +
22073 +/* Jnode routines */
22074 +extern jnode *jalloc(void);
22075 +extern void jfree(jnode * node) NONNULL;
22076 +extern jnode *jclone(jnode *);
22077 +extern jnode *jlookup(reiser4_tree * tree,
22078 +                     oid_t objectid, unsigned long ind) NONNULL;
22079 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
22080 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
22081 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
22082 +void jnode_attach_page(jnode * node, struct page *pg);
22083 +
22084 +void unhash_unformatted_jnode(jnode *);
22085 +extern jnode *page_next_jnode(jnode * node) NONNULL;
22086 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
22087 +extern void jnode_make_dirty(jnode * node) NONNULL;
22088 +extern void jnode_make_clean(jnode * node) NONNULL;
22089 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
22090 +extern void jnode_make_wander(jnode *) NONNULL;
22091 +extern void znode_make_reloc(znode * , flush_queue_t *) NONNULL;
22092 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
22093 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
22094 +
22095 +/**
22096 + * jnode_get_block
22097 + * @node: jnode to query
22098 + *
22099 + */
22100 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
22101 +{
22102 +       assert("nikita-528", node != NULL);
22103 +
22104 +       return &node->blocknr;
22105 +}
22106 +
22107 +/**
22108 + * jnode_set_block
22109 + * @node: jnode to update
22110 + * @blocknr: new block nr
22111 + */
22112 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
22113 +{
22114 +       assert("nikita-2020", node != NULL);
22115 +       assert("umka-055", blocknr != NULL);
22116 +       node->blocknr = *blocknr;
22117 +}
22118 +
22119 +
22120 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
22121 + * jnode was emergency flushed---then block number chosen by eflush is
22122 + * used. */
22123 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
22124 +{
22125 +       assert("nikita-2768", node != NULL);
22126 +       assert_spin_locked(&(node->guard));
22127 +
22128 +       return jnode_get_block(node);
22129 +}
22130 +
22131 +/* Jnode flush interface. */
22132 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos);
22133 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t *pos);
22134 +
22135 +/* FIXME-VS: these are used in plugin/item/extent.c */
22136 +
22137 +/* does extent_get_block have to be called */
22138 +#define jnode_mapped(node)     JF_ISSET (node, JNODE_MAPPED)
22139 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
22140 +
22141 +/* the node should be converted during flush squalloc phase */
22142 +#define jnode_convertible(node)        JF_ISSET (node, JNODE_CONVERTIBLE)
22143 +#define jnode_set_convertible(node)    JF_SET (node, JNODE_CONVERTIBLE)
22144 +
22145 +/* Macros to convert from jnode to znode, znode to jnode.  These are macros
22146 +   because C doesn't allow overloading of const prototypes. */
22147 +#define ZJNODE(x) (&(x)->zjnode)
22148 +#define JZNODE(x)                                              \
22149 +({                                                             \
22150 +       typeof(x) __tmp_x;                                      \
22151 +                                                               \
22152 +       __tmp_x = (x);                                          \
22153 +       assert("jmacd-1300", jnode_is_znode(__tmp_x));          \
22154 +       (znode*) __tmp_x;                                       \
22155 +})
22156 +
22157 +extern int jnodes_tree_init(reiser4_tree * tree);
22158 +extern int jnodes_tree_done(reiser4_tree * tree);
22159 +
22160 +#if REISER4_DEBUG
22161 +
22162 +extern int znode_is_any_locked(const znode * node);
22163 +extern void jnode_list_remove(jnode * node);
22164 +
22165 +#else
22166 +
22167 +#define jnode_list_remove(node) noop
22168 +
22169 +#endif
22170 +
22171 +int znode_is_root(const znode * node) NONNULL;
22172 +
22173 +/* bump reference counter on @node */
22174 +static inline void add_x_ref(jnode * node/* node to increase x_count of */)
22175 +{
22176 +       assert("nikita-1911", node != NULL);
22177 +
22178 +       atomic_inc(&node->x_count);
22179 +       LOCK_CNT_INC(x_refs);
22180 +}
22181 +
22182 +static inline void dec_x_ref(jnode * node)
22183 +{
22184 +       assert("nikita-3215", node != NULL);
22185 +       assert("nikita-3216", atomic_read(&node->x_count) > 0);
22186 +
22187 +       atomic_dec(&node->x_count);
22188 +       assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
22189 +       LOCK_CNT_DEC(x_refs);
22190 +}
22191 +
22192 +/* jref() - increase counter of references to jnode/znode (x_count) */
22193 +static inline jnode *jref(jnode * node)
22194 +{
22195 +       assert("jmacd-508", (node != NULL) && !IS_ERR(node));
22196 +       add_x_ref(node);
22197 +       return node;
22198 +}
22199 +
22200 +/* get the page of jnode */
22201 +static inline struct page *jnode_page(const jnode * node)
22202 +{
22203 +       return node->pg;
22204 +}
22205 +
22206 +/* return pointer to jnode data */
22207 +static inline char *jdata(const jnode * node)
22208 +{
22209 +       assert("nikita-1415", node != NULL);
22210 +       assert("nikita-3198", jnode_page(node) != NULL);
22211 +       return node->data;
22212 +}
22213 +
22214 +static inline int jnode_is_loaded(const jnode * node)
22215 +{
22216 +       assert("zam-506", node != NULL);
22217 +       return atomic_read(&node->d_count) > 0;
22218 +}
22219 +
22220 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
22221 +
22222 +static inline void jnode_set_reloc(jnode * node)
22223 +{
22224 +       assert("nikita-2431", node != NULL);
22225 +       assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
22226 +       JF_SET(node, JNODE_RELOC);
22227 +}
22228 +
22229 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
22230 +
22231 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
22232 +
22233 +static inline int jload(jnode *node)
22234 +{
22235 +       return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
22236 +}
22237 +
22238 +extern int jinit_new(jnode *, gfp_t) NONNULL;
22239 +extern int jstartio(jnode *) NONNULL;
22240 +
22241 +extern void jdrop(jnode *) NONNULL;
22242 +extern int jwait_io(jnode *, int rw) NONNULL;
22243 +
22244 +void jload_prefetch(jnode *);
22245 +
22246 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
22247 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
22248 +
22249 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
22250 +{
22251 +       assert("nikita-2691", node != NULL);
22252 +       return node->tree;
22253 +}
22254 +
22255 +extern void pin_jnode_data(jnode *);
22256 +extern void unpin_jnode_data(jnode *);
22257 +
22258 +static inline jnode_type jnode_get_type(const jnode * node)
22259 +{
22260 +       static const unsigned long state_mask =
22261 +           (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
22262 +       /* read-only: maps the three type bits of ->state to a jnode type */
22263 +       static const jnode_type mask_to_type[] = {
22264 +               /*  JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
22265 +
22266 +               /* 000 */
22267 +               [0] = JNODE_FORMATTED_BLOCK,
22268 +               /* 001 */
22269 +               [1] = JNODE_UNFORMATTED_BLOCK,
22270 +               /* 010 */
22271 +               [2] = JNODE_BITMAP,
22272 +               /* 011 */
22273 +               [3] = LAST_JNODE_TYPE,  /*invalid */
22274 +               /* 100 */
22275 +               [4] = JNODE_INODE,
22276 +               /* 101 */
22277 +               [5] = LAST_JNODE_TYPE,
22278 +               /* 110 */
22279 +               [6] = JNODE_IO_HEAD,
22280 +               /* 111 */
22281 +               [7] = LAST_JNODE_TYPE,  /* invalid */
22282 +       };
22283 +
22284 +       return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
22285 +}
22286 +
22287 +/* returns true if node is a znode */
22288 +static inline int jnode_is_znode(const jnode * node)
22289 +{
22290 +       return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
22291 +}
22292 +
22293 +static inline int jnode_is_flushprepped(jnode * node)
22294 +{
22295 +       assert("jmacd-78212", node != NULL);
22296 +       assert_spin_locked(&(node->guard));
22297 +       return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
22298 +               JF_ISSET(node, JNODE_OVRWR);
22299 +}
22300 +
22301 +/* Return true if @node has already been processed by the squeeze and allocate
22302 +   process.  This implies the block address has been finalized for the
22303 +   duration of this atom (or it is clean and will remain in place).  If this
22304 +   returns true you may use the block number as a hint. */
22305 +static inline int jnode_check_flushprepped(jnode * node)
22306 +{
22307 +       int result;
22308 +
22309 +       /* It must be clean or relocated or wandered.  New allocations are set
22310 +        * to relocate. */
22311 +       spin_lock_jnode(node);
22312 +       result = jnode_is_flushprepped(node);
22313 +       spin_unlock_jnode(node);
22314 +       return result;
22315 +}
22316 +
22317 +/* returns true if node is unformatted */
22318 +static inline int jnode_is_unformatted(const jnode * node)
22319 +{
22320 +       assert("jmacd-0123", node != NULL);
22321 +       return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
22322 +}
22323 +
22324 +/* returns true if node represents a cluster cache page */
22325 +static inline int jnode_is_cluster_page(const jnode * node)
22326 +{
22327 +       assert("edward-50", node != NULL);
22328 +       return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
22329 +}
22330 +
22331 +/* returns true if node is builtin inode's jnode */
22332 +static inline int jnode_is_inode(const jnode * node)
22333 +{
22334 +       assert("vs-1240", node != NULL);
22335 +       return jnode_get_type(node) == JNODE_INODE;
22336 +}
22337 +
22338 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
22339 +{
22340 +       assert("nikita-2367", type < LAST_JNODE_TYPE);
22341 +       return jnode_plugin_by_id((reiser4_plugin_id) type);
22342 +}
22343 +
22344 +static inline jnode_plugin *jnode_ops(const jnode * node)
22345 +{
22346 +       assert("nikita-2366", node != NULL);
22347 +
22348 +       return jnode_ops_of(jnode_get_type(node));
22349 +}
22350 +
22351 +/* Get the index of a block. */
22352 +static inline unsigned long jnode_get_index(jnode * node)
22353 +{
22354 +       return jnode_ops(node)->index(node);
22355 +}
22356 +
22357 +/* return true if "node" is the root */
22358 +static inline int jnode_is_root(const jnode * node)
22359 +{
22360 +       return jnode_is_znode(node) && znode_is_root(JZNODE(node));
22361 +}
22362 +
22363 +extern struct address_space *mapping_jnode(const jnode * node);
22364 +extern unsigned long index_jnode(const jnode * node);
22365 +
22366 +static inline void jput(jnode * node);
22367 +extern void jput_final(jnode * node);
22368 +
22369 +/* bump data counter on @node */
22370 +static inline void add_d_ref(jnode * node/* node to increase d_count of */)
22371 +{
22372 +       assert("nikita-1962", node != NULL);
22373 +
22374 +       atomic_inc(&node->d_count);
22375 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
22376 +               LOCK_CNT_INC(d_refs);
22377 +}
22378 +
22379 +/* jput() - decrement x_count reference counter on znode.
22380 +
22381 +   Count may drop to 0, jnode stays in cache until memory pressure causes the
22382 +   eviction of its page. The c_count variable also ensures that children are
22383 +   pressured out of memory before the parent. The jnode remains hashed as
22384 +   long as the VM allows its page to stay in memory.
22385 +*/
22386 +static inline void jput(jnode * node)
22387 +{
22388 +       assert("jmacd-509", node != NULL);
22389 +       assert("jmacd-510", atomic_read(&node->x_count) > 0);
22390 +       assert("zam-926", reiser4_schedulable());
22391 +       LOCK_CNT_DEC(x_refs);
22392 +
22393 +       rcu_read_lock();
22394 +       /*
22395 +        * we don't need any kind of lock here--jput_final() uses RCU.
22396 +        */
22397 +       if (unlikely(atomic_dec_and_test(&node->x_count)))
22398 +               jput_final(node);
22399 +       else
22400 +               rcu_read_unlock();
22401 +       assert("nikita-3473", reiser4_schedulable());
22402 +}
22403 +
22404 +extern void jrelse(jnode * node);
22405 +extern void jrelse_tail(jnode * node);
22406 +
22407 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
22408 +
22409 +/* resolve race with jput */
22410 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
22411 +{
22412 +       if (unlikely(JF_ISSET(node, JNODE_RIP)))
22413 +               node = jnode_rip_sync(tree, node);
22414 +       return node;
22415 +}
22416 +
22417 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
22418 +
22419 +#if REISER4_DEBUG
22420 +extern int jnode_invariant_f(const jnode *node, char const **msg);
22421 +#endif
22422 +
22423 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
22424 +
22425 +/* __JNODE_H__ */
22426 +#endif
22427 +
22428 +/* Make Linus happy.
22429 +   Local variables:
22430 +   c-indentation-style: "K&R"
22431 +   mode-name: "LC"
22432 +   c-basic-offset: 8
22433 +   tab-width: 8
22434 +   fill-column: 120
22435 +   End:
22436 +*/
22437 diff -puN /dev/null fs/reiser4/kassign.c
22438 --- /dev/null
22439 +++ a/fs/reiser4/kassign.c
22440 @@ -0,0 +1,677 @@
22441 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
22442 + * reiser4/README */
22443 +
22444 +/* Key assignment policy implementation */
22445 +
22446 +/*
22447 + * In reiser4 every piece of file system data and meta-data has a key. Keys
22448 + * are used to store information in and retrieve it from reiser4 internal
22449 + * tree. In addition to this, keys define _ordering_ of all file system
22450 + * information: things having close keys are placed into the same or
22451 + * neighboring (in the tree order) nodes of the tree. As our block allocator
22452 + * tries to respect tree order (see flush.c), keys also define order in which
22453 + * things are laid out on the disk, and hence, affect performance directly.
22454 + *
22455 + * Obviously, assignment of keys to data and meta-data should be consistent
22456 + * across whole file system. Algorithm that calculates a key for a given piece
22457 + * of data or meta-data is referred to as "key assignment".
22458 + *
22459 + * Key assignment is too expensive to be implemented as a plugin (that is,
22460 + * with an ability to support different key assignment schemas in the same
22461 + * compiled kernel image). As a compromise, all key-assignment functions and
22462 + * data-structures are collected in this single file, so that modifications to
22463 + * key assignment algorithm can be localized. Additional changes may be
22464 + * required in key.[ch].
22465 + *
22466 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
22467 + * may guess, there is "Plan B" too.
22468 + *
22469 + */
22470 +
22471 +/*
22472 + * Additional complication with key assignment implementation is a requirement
22473 + * to support different key length.
22474 + */
22475 +
22476 +/*
22477 + *                   KEY ASSIGNMENT: PLAN A, LONG KEYS.
22478 + *
22479 + * DIRECTORY ITEMS
22480 + *
22481 + * |       60     | 4 | 7 |1|   56        |        64        |        64       |
22482 + * +--------------+---+---+-+-------------+------------------+-----------------+
22483 + * |    dirid     | 0 | F |H|  prefix-1   |    prefix-2      |  prefix-3/hash  |
22484 + * +--------------+---+---+-+-------------+------------------+-----------------+
22485 + * |                  |                   |                  |                 |
22486 + * |    8 bytes       |      8 bytes      |     8 bytes      |     8 bytes     |
22487 + *
22488 + * dirid         objectid of directory this item is for
22489 + *
22490 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
22491 + *
22492 + * H             1 if last 8 bytes of the key contain hash,
22493 + *               0 if last 8 bytes of the key contain prefix-3
22494 + *
22495 + * prefix-1      first 7 characters of file name.
22496 + *               Padded by zeroes if name is not long enough.
22497 + *
22498 + * prefix-2      next 8 characters of the file name.
22499 + *
22500 + * prefix-3      next 8 characters of the file name.
22501 + *
22502 + * hash          hash of the rest of file name (i.e., portion of file
22503 + *               name not included into prefix-1 and prefix-2).
22504 + *
22505 + * File names not longer than 23 (== 7 + 8 + 8) characters are completely
22506 + * encoded in the key. Such file names are called "short". They are
22507 + * distinguished by H bit set to 0 in the key.
22508 + *
22509 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
22510 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
22511 + * key. Last 8 bytes of the key are occupied by hash of the remaining
22512 + * characters of the name.
22513 + *
22514 + * This key assignment reaches following important goals:
22515 + *
22516 + *     (1) directory entries are sorted in approximately lexicographical
22517 + *     order.
22518 + *
22519 + *     (2) collisions (when multiple directory items have the same key), while
22520 + *     principally unavoidable in a tree with fixed length keys, are rare.
22521 + *
22522 + * STAT DATA
22523 + *
22524 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
22525 + *  +--------------+---+-----------------+---+--------------+-----------------+
22526 + *  |  locality id | 1 |    ordering     | 0 |  objectid    |        0        |
22527 + *  +--------------+---+-----------------+---+--------------+-----------------+
22528 + *  |                  |                 |                  |                 |
22529 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
22530 + *
22531 + * locality id     object id of a directory where first name was created for
22532 + *                 the object
22533 + *
22534 + * ordering        copy of second 8-byte portion of the key of directory
22535 + *                 entry for the first name of this object. Ordering has a form
22536 + *                         {
22537 + *                                 fibration :7;
22538 + *                                 h         :1;
22539 + *                                 prefix1   :56;
22540 + *                         }
22541 + *                 see description of key for directory entry above.
22542 + *
22543 + * objectid        object id for this object
22544 + *
22545 + * This key assignment policy is designed to keep stat-data in the same order
22546 + * as corresponding directory items, thus speeding up readdir/stat types of
22547 + * workload.
22548 + *
22549 + * FILE BODY
22550 + *
22551 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
22552 + *  +--------------+---+-----------------+---+--------------+-----------------+
22553 + *  |  locality id | 4 |    ordering     | 0 |  objectid    |      offset     |
22554 + *  +--------------+---+-----------------+---+--------------+-----------------+
22555 + *  |                  |                 |                  |                 |
22556 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
22557 + *
22558 + * locality id     object id of a directory where first name was created for
22559 + *                 the object
22560 + *
22561 + * ordering        the same as in the key of stat-data for this object
22562 + *
22563 + * objectid        object id for this object
22564 + *
22565 + * offset          logical offset from the beginning of this file.
22566 + *                 Measured in bytes.
22567 + *
22568 + *
22569 + *                   KEY ASSIGNMENT: PLAN A, SHORT KEYS.
22570 + *
22571 + * DIRECTORY ITEMS
22572 + *
22573 + *  |       60     | 4 | 7 |1|   56        |        64       |
22574 + *  +--------------+---+---+-+-------------+-----------------+
22575 + *  |    dirid     | 0 | F |H|  prefix-1   |  prefix-2/hash  |
22576 + *  +--------------+---+---+-+-------------+-----------------+
22577 + *  |                  |                   |                 |
22578 + *  |    8 bytes       |      8 bytes      |     8 bytes     |
22579 + *
22580 + * dirid         objectid of directory this item is for
22581 + *
22582 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
22583 + *
22584 + * H             1 if last 8 bytes of the key contain hash,
22585 + *               0 if last 8 bytes of the key contain prefix-2
22586 + *
22587 + * prefix-1      first 7 characters of file name.
22588 + *               Padded by zeroes if name is not long enough.
22589 + *
22590 + * prefix-2      next 8 characters of the file name.
22591 + *
22592 + * hash          hash of the rest of file name (i.e., portion of file
22593 + *               name not included into prefix-1).
22594 + *
22595 + * File names not longer than 15 (== 7 + 8) characters are completely encoded
22596 + * in the key. Such file names are called "short". They are distinguished by H
22597 + * bit set to 0 in the key.
22598 + *
22599 + * Other file names are "long". For long name, H bit is 1, and first 7
22600 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
22601 + * key are occupied by hash of the remaining characters of the name.
22602 + *
22603 + * STAT DATA
22604 + *
22605 + *  |       60     | 4 | 4 |     60       |        64       |
22606 + *  +--------------+---+---+--------------+-----------------+
22607 + *  |  locality id | 1 | 0 |  objectid    |        0        |
22608 + *  +--------------+---+---+--------------+-----------------+
22609 + *  |                  |                  |                 |
22610 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
22611 + *
22612 + * locality id     object id of a directory where first name was created for
22613 + *                 the object
22614 + *
22615 + * objectid        object id for this object
22616 + *
22617 + * FILE BODY
22618 + *
22619 + *  |       60     | 4 | 4 |     60       |        64       |
22620 + *  +--------------+---+---+--------------+-----------------+
22621 + *  |  locality id | 4 | 0 |  objectid    |      offset     |
22622 + *  +--------------+---+---+--------------+-----------------+
22623 + *  |                  |                  |                 |
22624 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
22625 + *
22626 + * locality id     object id of a directory where first name was created for
22627 + *                 the object
22628 + *
22629 + * objectid        object id for this object
22630 + *
22631 + * offset          logical offset from the beginning of this file.
22632 + *                 Measured in bytes.
22633 + *
22634 + *
22635 + */
22636 +
22637 +#include "debug.h"
22638 +#include "key.h"
22639 +#include "kassign.h"
22640 +#include "vfs_ops.h"
22641 +#include "inode.h"
22642 +#include "super.h"
22643 +#include "dscale.h"
22644 +
22645 +#include <linux/types.h>       /* for __u??  */
22646 +#include <linux/fs.h>          /* for struct super_block, etc  */
22647 +
22648 +/* bitmask for H bit (see comment at the beginning of this file) */
22649 +static const __u64 longname_mark = 0x0100000000000000ull;
22650 +/* bitmask for F and H portions of the key. */
22651 +static const __u64 fibration_mask = 0xff00000000000000ull;
22652 +
22653 +/* return true if name is not completely encoded in @key */
22654 +int is_longname_key(const reiser4_key * key)
22655 +{
22656 +       __u64 highpart;
22657 +
22658 +       assert("nikita-2863", key != NULL);
22659 +       if (get_key_type(key) != KEY_FILE_NAME_MINOR)
22660 +               reiser4_print_key("oops", key);
22661 +       assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
22662 +
22663 +       if (REISER4_LARGE_KEY)
22664 +               highpart = get_key_ordering(key);
22665 +       else
22666 +               highpart = get_key_objectid(key);
22667 +
22668 +       return (highpart & longname_mark) ? 1 : 0;
22669 +}
22670 +
22671 +/* return true if @name is too long to be completely encoded in the key */
22672 +int is_longname(const char *name UNUSED_ARG, int len)
22673 +{
22674 +       if (REISER4_LARGE_KEY)
22675 +               return len > 23;
22676 +       else
22677 +               return len > 15;
22678 +}
22679 +
22680 +/* code ascii string into __u64.
22681 +
22682 +   Put characters of @name into result (@str) one after another starting
22683 +   from @start_idx-th highest (arithmetically) byte; unused low bytes are
22684 +   zero. This produces endian-safe encoding. memcpy(3) will not do.
22685 +   Returns 0 if no characters were packed (e.g. @name is empty).
22686 +*/
22687 +static __u64 pack_string(const char *name /* string to encode */ ,
22688 +                        int start_idx  /* highest byte in result from
22689 +                                        * which to start encoding */ )
22690 +{
22691 +       unsigned i;
22692 +       __u64 str = 0;
22693 +
22694 +       for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
22695 +               str <<= 8;
22696 +               str |= (unsigned char)name[i];
22697 +       }
22698 +       if (i != 0)     /* avoid undefined 64-bit shift */
22699 +               str <<= (sizeof str - i - start_idx) << 3;
22700 +       return str;
22701 +}
22702 +
22703 +/* opposite to pack_string(). Takes value produced by pack_string(), stores
22704 + * the decoded string in @buf and returns pointer to its terminating NUL */
22705 +char *reiser4_unpack_string(__u64 value, char *buf)
22706 +{
22707 +       do {
22708 +               *buf = value >> (64 - 8);
22709 +               if (*buf)
22710 +                       ++buf;
22711 +               value <<= 8;
22712 +       } while (value != 0);
22713 +       *buf = 0;
22714 +       return buf;
22715 +}
22716 +
22717 +/* obtain name encoded in @key and store it in @buf */
22718 +char *extract_name_from_key(const reiser4_key * key, char *buf)
22719 +{
22720 +       char *c;
22721 +
22722 +       assert("nikita-2868", !is_longname_key(key));
22723 +
22724 +       c = buf;
22725 +       if (REISER4_LARGE_KEY) {
22726 +               c = reiser4_unpack_string(get_key_ordering(key) &
22727 +                                         ~fibration_mask, c);
22728 +               c = reiser4_unpack_string(get_key_fulloid(key), c);
22729 +       } else
22730 +               c = reiser4_unpack_string(get_key_fulloid(key) &
22731 +                                         ~fibration_mask, c);
22732 +       reiser4_unpack_string(get_key_offset(key), c);
22733 +       return buf;
22734 +}
22735 +
22736 +/**
22737 + * complete_entry_key - calculate entry key by name
22738 + * @dir: directory where entry is (or will be) in
22739 + * @name: name to calculate key of
22740 + * @len: length of name
22741 + * @result: place to store result in
22742 + *
22743 + * Sets fields of entry key @result which depend on file name.
22744 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
22745 + * objectid and offset. Otherwise, objectid and offset are set.
22746 + */
22747 +void complete_entry_key(const struct inode *dir, const char *name,
22748 +                       int len, reiser4_key *result)
22749 +{
22750 +#if REISER4_LARGE_KEY
22751 +       __u64 ordering;
22752 +       __u64 objectid;
22753 +       __u64 offset;
22754 +
22755 +       assert("nikita-1139", dir != NULL);
22756 +       assert("nikita-1142", result != NULL);
22757 +       assert("nikita-2867", strlen(name) == len);
22758 +
22759 +       /*
22760 +        * key allocation algorithm for directory entries in case of large
22761 +        * keys:
22762 +        *
22763 +        * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
22764 +        * characters into ordering field of key, next 8 charactes (if any)
22765 +        * into objectid field of key and next 8 ones (of any) into offset
22766 +        * field of key
22767 +        *
22768 +        * If file name is longer than 23 characters, put first 7 characters
22769 +        * into key's ordering, next 8 to objectid and hash of remaining
22770 +        * characters into offset field.
22771 +        *
22772 +        * To distinguish above cases, in latter set up unused high bit in
22773 +        * ordering field.
22774 +        */
22775 +
22776 +       /* [0-6] characters to ordering */
22777 +       ordering = pack_string(name, 1);
22778 +       if (len > 7) {
22779 +               /* [7-14] characters to objectid */
22780 +               objectid = pack_string(name + 7, 0);
22781 +               if (len > 15) {
22782 +                       if (len <= 23) {
22783 +                               /* [15-23] characters to offset */
22784 +                               offset = pack_string(name + 15, 0);
22785 +                       } else {
22786 +                               /* note in a key the fact that offset contains
22787 +                                * hash */
22788 +                               ordering |= longname_mark;
22789 +
22790 +                               /* offset is the hash of the file name's tail */
22791 +                               offset = inode_hash_plugin(dir)->hash(name + 15,
22792 +                                                                     len - 15);
22793 +                       }
22794 +               } else {
22795 +                       offset = 0ull;
22796 +               }
22797 +       } else {
22798 +               objectid = 0ull;
22799 +               offset = 0ull;
22800 +       }
22801 +
22802 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
22803 +       ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
22804 +
22805 +       set_key_ordering(result, ordering);
22806 +       set_key_fulloid(result, objectid);
22807 +       set_key_offset(result, offset);
22808 +       return;
22809 +
22810 +#else
22811 +       __u64 objectid;
22812 +       __u64 offset;
22813 +
22814 +       assert("nikita-1139", dir != NULL);
22815 +       assert("nikita-1142", result != NULL);
22816 +       assert("nikita-2867", strlen(name) == len);
22817 +
22818 +       /*
22819 +        * key allocation algorithm for directory entries in case of not large
22820 +        * keys:
22821 +        *
22822 +        * If name is not longer than 7 + 8 = 15 characters, put first 7
22823 +        * characters into objectid field of key, next 8 characters (if any)
22824 +        * into offset field of key
22825 +        *
22826 +        * If file name is longer than 15 characters, put first 7 characters
22827 +        * into key's objectid, and hash of remaining characters into offset
22828 +        * field.
22829 +        *
22830 +        * To distinguish above cases, in latter set up unused high bit in
22831 +        * objectid field.
22832 +        */
22833 +
22834 +       /* [0-6] characters to objectid */
22835 +       objectid = pack_string(name, 1);
22836 +       if (len > 7) {
22837 +               if (len <= 15) {
22838 +                       /* [7-14] characters to offset */
22839 +                       offset = pack_string(name + 7, 0);
22840 +               } else {
22841 +                       /* note in a key the fact that offset contains hash. */
22842 +                       objectid |= longname_mark;
22843 +
22844 +                       /* offset is the hash of the file name's tail. */
22845 +                       offset = inode_hash_plugin(dir)->hash(name + 7,
22846 +                                                             len - 7);
22847 +               }
22848 +       } else
22849 +               offset = 0ull;
22850 +
22851 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
22852 +       objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
22853 +
22854 +       set_key_fulloid(result, objectid);
22855 +       set_key_offset(result, offset);
22856 +       return;
22857 +#endif                         /* ! REISER4_LARGE_KEY */
22858 +}
22859 +
22860 +/* true, if @key is the key of ".": ordering, objectid and offset are all 0 */
22861 +int is_dot_key(const reiser4_key * key/* key to check */)
22862 +{
22863 +       assert("nikita-1717", key != NULL);
22864 +       assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
22865 +       return
22866 +           (get_key_ordering(key) == 0ull) &&
22867 +           (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
22868 +}
22869 +
22870 +/* build key for stat-data.
22871 +
22872 +   Fill @result with the key of stat-data of @target and return @result.
22873 +   This should become sd plugin method in the future. For now, let it be
22874 +   here.
22875 +*/
22876 +reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ ,
22877 +                         reiser4_key * result  /* resulting key of @target
22878 +                                                  stat-data */ )
22879 +{
22880 +       assert("nikita-261", result != NULL);
22881 +
22882 +       reiser4_key_init(result);
22883 +       set_key_locality(result, reiser4_inode_data(target)->locality_id);
22884 +       set_key_ordering(result, get_inode_ordering(target));
22885 +       set_key_objectid(result, get_inode_oid(target));
22886 +       set_key_type(result, KEY_SD_MINOR);
22887 +       set_key_offset(result, (__u64) 0);
22888 +       return result;
22889 +}
22890 +
22891 +/* encode part of key into &obj_key_id
22892 +
22893 +   This encodes into @id part of @key sufficient to restore @key later,
22894 +   given that latter is key of object (key of stat-data): the leading
22895 +   sizeof(*id) bytes of @key are copied verbatim. Always returns 0.
22896 +   See &obj_key_id
22897 +*/
22898 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
22899 +                    obj_key_id * id/* id where key is encoded in */)
22900 +{
22901 +       assert("nikita-1151", key != NULL);
22902 +       assert("nikita-1152", id != NULL);
22903 +
22904 +       memcpy(id, key, sizeof *id);
22905 +       return 0;
22906 +}
22907 +
22908 +/* encode reference to @obj in @id.
22909 +   This is like build_obj_key_id() above, but takes inode as parameter: the
22910 +   stat-data key of @obj is built and then encoded. Always returns 0. */
22911 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
22912 +                      obj_key_id * id/* result */)
22913 +{
22914 +       reiser4_key sdkey;
22915 +
22916 +       assert("nikita-1166", obj != NULL);
22917 +       assert("nikita-1167", id != NULL);
22918 +
22919 +       build_sd_key(obj, &sdkey);
22920 +       build_obj_key_id(&sdkey, id);
22921 +       return 0;
22922 +}
22923 +
22924 +/* decode @id back into @key
22925 +
22926 +   Restore key of object stat-data from @id. This is dual to
22927 +   build_obj_key_id() above: @key is initialized and then @id is copied
22928 +   over its leading sizeof(*id) bytes. Always returns 0. */
22929 +int extract_key_from_id(const obj_key_id * id  /* object key id to extract key
22930 +                                                * from */ ,
22931 +                       reiser4_key * key/* result */)
22932 +{
22933 +       assert("nikita-1153", id != NULL);
22934 +       assert("nikita-1154", key != NULL);
22935 +
22936 +       reiser4_key_init(key);
22937 +       memcpy(key, id, sizeof *id);
22938 +       return 0;
22939 +}
22940 +
22941 +/* extract objectid of directory from key of directory entry within said
22942 +   directory: this is the locality field of @de_key.
22943 +   */
22944 +oid_t extract_dir_id_from_key(const reiser4_key * de_key       /* key of
22945 +                                                                * directory
22946 +                                                                * entry */ )
22947 +{
22948 +       assert("nikita-1314", de_key != NULL);
22949 +       return get_key_locality(de_key);
22950 +}
22951 +
22952 +/* encode into @id key of directory entry.
22953 +
22954 +   Encode into @id information sufficient to later distinguish directory
22955 +   entries within the same directory. This is not whole key, because all
22956 +   directory entries within directory item share locality which is equal
22957 +   to objectid of their directory.
22958 +   The full entry key is first built by the directory plugin of @dir.
22959 +*/
22960 +int build_de_id(const struct inode *dir /* inode of directory */ ,
22961 +               const struct qstr *name /* name to be given to @obj by
22962 +                                        * directory entry being
22963 +                                        * constructed */ ,
22964 +               de_id * id/* short key of directory entry */)
22965 +{
22966 +       reiser4_key key;
22967 +
22968 +       assert("nikita-1290", dir != NULL);
22969 +       assert("nikita-1292", id != NULL);
22970 +
22971 +       /* NOTE-NIKITA this is suboptimal. */
22972 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
22973 +       return build_de_id_by_key(&key, id);
22974 +}
22975 +
22976 +/* encode into @id key of directory entry.
22977 +
22978 +   Encode into @id information sufficient to later distinguish directory
22979 +   entries within the same directory. This is not whole key, because all
22980 +   directory entries within directory item share locality which is equal
22981 +   to objectid of their directory.
22982 +   sizeof(*id) bytes following the first element of @entry_key are copied.
22983 +   Always returns 0. */
22984 +int build_de_id_by_key(const reiser4_key * entry_key   /* full key of directory
22985 +                                                        * entry */ ,
22986 +                      de_id * id/* short key of directory entry */)
22987 +{
22988 +       memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
22989 +       return 0;
22990 +}
22991 +
22992 +/* restore from @id key of directory entry.
22993 +
22994 +   Function dual to build_de_id(): given @id and locality, build full
22995 +   key of directory entry within directory item.
22996 +   Key type is set to KEY_FILE_NAME_MINOR. Always returns 0.
22997 +*/
22998 +int extract_key_from_de_id(const oid_t locality        /* locality of directory
22999 +                                                * entry */ ,
23000 +                          const de_id * id /* directory entry id */ ,
23001 +                          reiser4_key * key/* result */)
23002 +{
23003 +       /* no need to initialise key here: all fields are overwritten */
23004 +       memcpy(((__u64 *) key) + 1, id, sizeof *id);
23005 +       set_key_locality(key, locality);
23006 +       set_key_type(key, KEY_FILE_NAME_MINOR);
23007 +       return 0;
23008 +}
23009 +
23010 +/* compare two &de_id's as full keys built with the same (zero) locality */
23011 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
23012 +               const de_id * id2/* second &de_id to compare */)
23013 +{
23014 +       /* NOTE-NIKITA ugly implementation */
23015 +       reiser4_key k1;
23016 +       reiser4_key k2;
23017 +
23018 +       extract_key_from_de_id((oid_t) 0, id1, &k1);
23019 +       extract_key_from_de_id((oid_t) 0, id2, &k2);
23020 +       return keycmp(&k1, &k2);
23021 +}
23022 +
23023 +/* compare &de_id with key; only elements 1 and up of @key are looked at */
23024 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
23025 +                   const reiser4_key * key/* key to compare */)
23026 +{
23027 +       cmp_t result;
23028 +       reiser4_key *k1;
23029 +
23030 +       k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
23031 +       result = KEY_DIFF_EL(k1, key, 1);
23032 +       if (result == EQUAL_TO) {
23033 +               result = KEY_DIFF_EL(k1, key, 2);
23034 +               if (REISER4_LARGE_KEY && result == EQUAL_TO)
23035 +                       result = KEY_DIFF_EL(k1, key, 3);
23036 +       }
23037 +       return result;
23038 +}
23039 +
23040 +/*
23041 + * return number of bytes necessary to encode @inode identity.
23042 + */
23043 +int inode_onwire_size(const struct inode *inode)
23044 +{
23045 +       int result;
23046 +
23047 +       result = dscale_bytes_to_write(get_inode_oid(inode));
23048 +       result += dscale_bytes_to_write(get_inode_locality(inode));
23049 +
23050 +       /*
23051 +        * ordering is large (it usually has highest bits set), so it makes
23052 +        * little sense to dscale it.
23053 +        */
23054 +       if (REISER4_LARGE_KEY)
23055 +               result += sizeof(get_inode_ordering(inode));
23056 +       return result;
23057 +}
23058 +
23059 +/*
23060 + * encode @inode identity at @start; return address just past encoded data
23061 + */
23062 +char *build_inode_onwire(const struct inode *inode, char *start)
23063 +{
23064 +       start += dscale_write(start, get_inode_locality(inode));
23065 +       start += dscale_write(start, get_inode_oid(inode));
23066 +
23067 +       if (REISER4_LARGE_KEY) {
23068 +               put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
23069 +               start += sizeof(get_inode_ordering(inode));
23070 +       }
23071 +       return start;
23072 +}
23073 +
23074 +/* extract into @key_id the key previously encoded by build_inode_onwire()
23075 + * at @addr; return the address just past the encoded key.
23076 + */
23077 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
23078 +{
23079 +       __u64 val;
23080 +
23081 +       addr += dscale_read(addr, &val);
23082 +       val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
23083 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
23084 +       addr += dscale_read(addr, &val);
23085 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
23086 +#if REISER4_LARGE_KEY
23087 +       memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
23088 +       addr += sizeof key_id->ordering;
23089 +#endif
23090 +       return addr;
23091 +}
23092 +
23093 +/*
23094 + * skip a key that was previously encoded by build_inode_onwire() at @addr,
23095 + * returning the address just past it. FIXME: handle IO errors.
23096 + */
23097 +char * locate_obj_key_id_onwire(char * addr)
23098 +{
23099 +       /* locality */
23100 +       addr += dscale_bytes_to_read(addr);
23101 +       /* objectid */
23102 +       addr += dscale_bytes_to_read(addr);
23103 +#if REISER4_LARGE_KEY
23104 +       addr += sizeof ((obj_key_id *)0)->ordering;
23105 +#endif
23106 +       return addr;
23107 +}
23108 +
23109 +/* Make Linus happy.
23110 +   Local variables:
23111 +   c-indentation-style: "K&R"
23112 +   mode-name: "LC"
23113 +   c-basic-offset: 8
23114 +   tab-width: 8
23115 +   fill-column: 120
23116 +   End:
23117 +*/
23118 diff -puN /dev/null fs/reiser4/kassign.h
23119 --- /dev/null
23120 +++ a/fs/reiser4/kassign.h
23121 @@ -0,0 +1,111 @@
23122 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
23123 + * reiser4/README */
23124 +
23125 +/* Key assignment policy interface. See kassign.c for details. */
23126 +
23127 +#if !defined(__KASSIGN_H__)
23128 +#define __KASSIGN_H__
23129 +
23130 +#include "forward.h"
23131 +#include "key.h"
23132 +#include "dformat.h"
23133 +
23134 +#include <linux/types.h>       /* for __u??  */
23135 +#include <linux/fs.h>          /* for struct super_block, etc  */
23136 +#include <linux/dcache.h>      /* for struct qstr */
23137 +
23138 +/* key assignment functions */
23139 +
23140 +/* Information from which key of file stat-data can be uniquely
23141 +   restored. This depends on key assignment policy for
23142 +   stat-data. Currently it's enough to store object id and locality id
23143 +   (60+60==120) bits, because minor packing locality and offset of
23144 +   stat-data key are always known constants: KEY_SD_MINOR and 0
23145 +   respectively. For simplicity 4 bits are wasted in each id, and just
23146 +   two 64 bit integers are stored.
23147 +
23148 +   This field has to be byte-aligned, because we don't want to waste
23149 +   space in directory entries. There is another side of a coin of
23150 +   course: we waste CPU and bus bandwidth instead, by copying data back
23151 +   and forth.
23152 +
23153 +   Next optimization: &obj_key_id is mainly used to address stat data from
23154 +   directory entries. Under the assumption that majority of files have
23155 +   only one name (one hard link) from *the* parent directory it seems reasonable
23156 +   to only store objectid of stat data and take its locality from key of
23157 +   directory item.
23158 +
23159 +   This requires some flag to be added to the &obj_key_id to distinguish
23160 +   between these two cases. Remaining bits in flag byte are then asking to be
23161 +   used to store file type.
23162 +
23163 +   This optimization requires changes in directory item handling code.
23164 +
23165 +*/
23166 +typedef struct obj_key_id {
23167 +       d8 locality[sizeof(__u64)];
23168 +        ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
23169 +           )
23170 +       d8 objectid[sizeof(__u64)];
23171 +}
23172 +obj_key_id;
23173 +
23174 +/* Information sufficient to uniquely identify directory entry within
23175 +   compressed directory item.
23176 +
23177 +   For alignment issues see &obj_key_id above.
23178 +*/
23179 +typedef struct de_id {
23180 +       ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
23181 +       d8 objectid[sizeof(__u64)];
23182 +       d8 offset[sizeof(__u64)];
23183 +}
23184 +de_id;
23185 +
23186 +extern int inode_onwire_size(const struct inode *obj);
23187 +extern char *build_inode_onwire(const struct inode *obj, char *area);
23188 +extern char *locate_obj_key_id_onwire(char *area);
23189 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
23190 +
23191 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
23192 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
23193 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
23194 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
23195 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
23196 +                      de_id * id);
23197 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
23198 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
23199 +                                 reiser4_key * key);
23200 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
23201 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
23202 +
23203 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
23204 +extern void build_entry_key_common(const struct inode *dir,
23205 +                                  const struct qstr *name,
23206 +                                  reiser4_key * result);
23207 +extern void build_entry_key_stable_entry(const struct inode *dir,
23208 +                                        const struct qstr *name,
23209 +                                        reiser4_key * result);
23210 +extern int is_dot_key(const reiser4_key * key);
23211 +extern reiser4_key *build_sd_key(const struct inode *target,
23212 +                                reiser4_key * result);
23213 +
23214 +extern int is_longname_key(const reiser4_key * key);
23215 +extern int is_longname(const char *name, int len);
23216 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
23217 +extern char *reiser4_unpack_string(__u64 value, char *buf);
23218 +extern void complete_entry_key(const struct inode *dir, const char *name,
23219 +                              int len, reiser4_key *result);
23220 +
23221 +/* __KASSIGN_H__ */
23222 +#endif
23223 +
23224 +/* Make Linus happy.
23225 +   Local variables:
23226 +   c-indentation-style: "K&R"
23227 +   mode-name: "LC"
23228 +   c-basic-offset: 8
23229 +   tab-width: 8
23230 +   fill-column: 120
23231 +   End:
23232 +*/
23233 diff -puN /dev/null fs/reiser4/key.c
23234 --- /dev/null
23235 +++ a/fs/reiser4/key.c
23236 @@ -0,0 +1,138 @@
23237 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23238 + * reiser4/README */
23239 +
23240 +/* Key manipulations. */
23241 +
23242 +#include "debug.h"
23243 +#include "key.h"
23244 +#include "super.h"
23245 +#include "reiser4.h"
23246 +
23247 +#include <linux/types.h>       /* for __u??  */
23248 +
23249 +/* Minimal possible key: all components are zero. It is presumed that this is
23250 +   independent of key scheme. */
23251 +static const reiser4_key MINIMAL_KEY = {
23252 +       .el = {
23253 +               0ull,
23254 +               ON_LARGE_KEY(0ull,)
23255 +               0ull,
23256 +               0ull
23257 +       }
23258 +};
23259 +
23260 +/* Maximal possible key: all components are ~0. It is presumed that this is
23261 +   independent of key scheme. */
23262 +static const reiser4_key MAXIMAL_KEY = {
23263 +       .el = {
23264 +               __constant_cpu_to_le64(~0ull),
23265 +               ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
23266 +               __constant_cpu_to_le64(~0ull),
23267 +               __constant_cpu_to_le64(~0ull)
23268 +       }
23269 +};
23270 +
23271 +/* Initialize key. */
23272 +void reiser4_key_init(reiser4_key * key/* key to init */)
23273 +{
23274 +       assert("nikita-1169", key != NULL);
23275 +       memset(key, 0, sizeof *key);
23276 +}
23277 +
23278 +/* minimal possible key in the tree. Return pointer to the static storage. */
23279 +const reiser4_key * reiser4_min_key(void)
23280 +{
23281 +       return &MINIMAL_KEY;
23282 +}
23283 +
23284 +/* maximum possible key in the tree. Return pointer to the static storage. */
23285 +const reiser4_key * reiser4_max_key(void)
23286 +{
23287 +       return &MAXIMAL_KEY;
23288 +}
23289 +
23290 +#if REISER4_DEBUG
23291 +/* debugging aid: print symbolic name of key type */
23292 +static const char *type_name(unsigned int key_type/* key type */)
23293 +{
23294 +       switch (key_type) {
23295 +       case KEY_FILE_NAME_MINOR:
23296 +               return "file name";
23297 +       case KEY_SD_MINOR:
23298 +               return "stat data";
23299 +       case KEY_ATTR_NAME_MINOR:
23300 +               return "attr name";
23301 +       case KEY_ATTR_BODY_MINOR:
23302 +               return "attr body";
23303 +       case KEY_BODY_MINOR:
23304 +               return "file body";
23305 +       default:
23306 +               return "unknown";
23307 +       }
23308 +}
23309 +
23310 +/* debugging aid: print human readable information about key */
23311 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
23312 +              const reiser4_key * key/* key to print */)
23313 +{
23314 +       /* turn bold on */
23315 +       /* printf ("\033[1m"); */
23316 +       if (key == NULL)
23317 +               printk("%s: null key\n", prefix);
23318 +       else {
23319 +               if (REISER4_LARGE_KEY)
23320 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
23321 +                              get_key_locality(key),
23322 +                              get_key_type(key),
23323 +                              get_key_ordering(key),
23324 +                              get_key_band(key),
23325 +                              get_key_objectid(key), get_key_offset(key));
23326 +               else
23327 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
23328 +                              get_key_locality(key),
23329 +                              get_key_type(key),
23330 +                              get_key_band(key),
23331 +                              get_key_objectid(key), get_key_offset(key));
23332 +               /*
23333 +                * if this is a key of directory entry, try to decode part of
23334 +                * a name stored in the key, and output it.
23335 +                */
23336 +               if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
23337 +                       char buf[DE_NAME_BUF_LEN];
23338 +                       char *c;
23339 +
23340 +                       c = buf;
23341 +                       c = reiser4_unpack_string(get_key_ordering(key), c);
23342 +                       reiser4_unpack_string(get_key_fulloid(key), c);
23343 +                       printk("[%s", buf);
23344 +                       if (is_longname_key(key))
23345 +                               /*
23346 +                                * only part of the name is stored in the key.
23347 +                                */
23348 +                               printk("...]\n");
23349 +                       else {
23350 +                               /*
23351 +                                * whole name is stored in the key.
23352 +                                */
23353 +                               reiser4_unpack_string(get_key_offset(key), buf);
23354 +                               printk("%s]\n", buf);
23355 +                       }
23356 +               } else {
23357 +                       printk("[%s]\n", type_name(get_key_type(key)));
23358 +               }
23359 +       }
23360 +       /* turn bold off */
23361 +       /* printf ("\033[m\017"); */
23362 +}
23363 +
23364 +#endif
23365 +
23366 +/* Make Linus happy.
23367 +   Local variables:
23368 +   c-indentation-style: "K&R"
23369 +   mode-name: "LC"
23370 +   c-basic-offset: 8
23371 +   tab-width: 8
23372 +   fill-column: 120
23373 +   End:
23374 +*/
23375 diff -puN /dev/null fs/reiser4/key.h
23376 --- /dev/null
23377 +++ a/fs/reiser4/key.h
23378 @@ -0,0 +1,392 @@
23379 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by
23380 + * reiser4/README */
23381 +
23382 +/* Declarations of key-related data-structures and operations on keys. */
23383 +
23384 +#if !defined(__REISER4_KEY_H__)
23385 +#define __REISER4_KEY_H__
23386 +
23387 +#include "dformat.h"
23388 +#include "forward.h"
23389 +#include "debug.h"
23390 +
23391 +#include <linux/types.h>       /* for __u??  */
23392 +
23393 +/* Operations on keys in reiser4 tree */
23394 +
23395 +/* No access to any of these fields shall be done except via a
23396 +   wrapping macro/function, and that wrapping macro/function shall
23397 +   convert to little endian order. Compare keys will consider cpu byte order. */
23398 +
23399 +/* A storage layer implementation difference between a regular unix file body
23400 +   and its attributes is in the typedef below which causes all of the attributes
23401 +   of a file to be near in key to all of the other attributes for all of the
23402 +   files within that directory, and not near to the file itself. It is
23403 +   interesting to consider whether this is the wrong approach, and whether there
23404 +   should be no difference at all. For current usage patterns this choice is
23405 +   probably the right one.  */
23406 +
23407 +/* possible values for minor packing locality (4 bits required) */
23408 +typedef enum {
23409 +       /* file name */
23410 +       KEY_FILE_NAME_MINOR = 0,
23411 +       /* stat-data */
23412 +       KEY_SD_MINOR = 1,
23413 +       /* file attribute name */
23414 +       KEY_ATTR_NAME_MINOR = 2,
23415 +       /* file attribute value */
23416 +       KEY_ATTR_BODY_MINOR = 3,
23417 +       /* file body (tail or extent) */
23418 +       KEY_BODY_MINOR = 4,
23419 +} key_minor_locality;
23420 +
23421 +/* Everything stored in the tree has a unique key, which means that the tree is
23422 +   (logically) fully ordered by key. Physical order is determined by dynamic
23423 +   heuristics that attempt to reflect key order when allocating available space,
23424 +   and by the repacker. It is stylistically better to put aggregation
23425 +   information into the key. Thus, if you want to segregate extents from tails,
23426 +   it is better to give them distinct minor packing localities rather than
23427 +   changing block_alloc.c to check the node type when deciding where to allocate
23428 +   the node.
23429 +
23430 +   The need to randomly displace new directories and large files disturbs this
23431 +   symmetry unfortunately. However, it should be noted that this is a need that
23432 +   is not clearly established given the existence of a repacker. Also, in our
23433 +   current implementation tails have a different minor packing locality from
23434 +   extents, and no files have both extents and tails, so maybe symmetry can be
23435 +   had without performance cost after all. Symmetry is what we ship for now....
23436 +*/
23437 +
23438 +/* Arbitrary major packing localities can be assigned to objects using
23439 +   the reiser4(filenameA/..packing<=some_number) system call.
23440 +
23441 +   In reiser4, the creat() syscall creates a directory
23442 +
23443 +   whose default flow (that which is referred to if the directory is
23444 +   read as a file) is the traditional unix file body.
23445 +
23446 +   whose directory plugin is the 'filedir'
23447 +
23448 +   whose major packing locality is that of the parent of the object created.
23449 +
23450 +   The static_stat item is a particular commonly used directory
23451 +   compression (the one for normal unix files).
23452 +
23453 +   The filedir plugin checks to see if the static_stat item exists.
23454 +   There is a unique key for static_stat.  If yes, then it uses the
23455 +   static_stat item for all of the values that it contains.  The
23456 +   static_stat item contains a flag for each stat it contains which
23457 +   indicates whether one should look outside the static_stat item for its
23458 +   contents.
23459 +*/
23460 +
23461 +/* offset of fields in reiser4_key. Value of each element of this enum
23462 +    is index within key (thought as array of __u64's) where this field
23463 +    is. */
23464 +typedef enum {
23465 +       /* major "locale", aka dirid. Sits in 1st element */
23466 +       KEY_LOCALITY_INDEX = 0,
23467 +       /* minor "locale", aka item type. Sits in 1st element */
23468 +       KEY_TYPE_INDEX = 0,
23469 +       ON_LARGE_KEY(KEY_ORDERING_INDEX,)
23470 +           /* "object band". Sits in 2nd element */
23471 +           KEY_BAND_INDEX,
23472 +       /* objectid. Sits in 2nd element */
23473 +       KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
23474 +       /* full objectid. Sits in 2nd element */
23475 +       KEY_FULLOID_INDEX = KEY_BAND_INDEX,
23476 +       /* Offset. Sits in 3rd element */
23477 +       KEY_OFFSET_INDEX,
23478 +       /* Name hash. Sits in 3rd element */
23479 +       KEY_HASH_INDEX = KEY_OFFSET_INDEX,
23480 +       KEY_CACHELINE_END = KEY_OFFSET_INDEX,
23481 +       KEY_LAST_INDEX
23482 +} reiser4_key_field_index;
23483 +
23484 +/* key in reiser4 internal "balanced" tree. It is just array of three (four
23485 +    with REISER4_LARGE_KEY) 64bit integers in disk byte order (little-endian
23486 +    by default). This array is indexed by reiser4_key_field_index.  Each
23487 +    __u64 within this array is called "element". Logical key components
23488 +    encoded within elements are called "fields".
23489 +
23490 +    We declare this as union with second component dummy to suppress
23491 +    inconvenient array<->pointer casts implied in C. */
23492 +union reiser4_key {
23493 +       __le64 el[KEY_LAST_INDEX];
23494 +       int pad;
23495 +};
23496 +
23497 +/* bitmasks showing where within reiser4_key particular key is stored. */
23498 +/* major locality occupies higher 60 bits of the first element */
23499 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
23500 +
23501 +/* minor locality occupies lower 4 bits of the first element */
23502 +#define KEY_TYPE_MASK 0xfull
23503 +
23504 +/* controversial band occupies higher 4 bits of the 2nd element */
23505 +#define KEY_BAND_MASK 0xf000000000000000ull
23506 +
23507 +/* objectid occupies lower 60 bits of the 2nd element */
23508 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
23509 +
23510 +/* full 64bit objectid*/
23511 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
23512 +
23513 +/* offset is just 3rd element itself */
23514 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
23515 +
23516 +/* ordering is whole second element */
23517 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
23518 +
23519 +/* how many bits key element should be shifted to left to get particular field
23520 + */
23521 +typedef enum {
23522 +       KEY_LOCALITY_SHIFT = 4,
23523 +       KEY_TYPE_SHIFT = 0,
23524 +       KEY_BAND_SHIFT = 60,
23525 +       KEY_OBJECTID_SHIFT = 0,
23526 +       KEY_FULLOID_SHIFT = 0,
23527 +       KEY_OFFSET_SHIFT = 0,
23528 +       KEY_ORDERING_SHIFT = 0,
23529 +} reiser4_key_field_shift;
23530 +
23531 +static inline __u64
23532 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
23533 +{
23534 +       assert("nikita-753", key != NULL);
23535 +       assert("nikita-754", off < KEY_LAST_INDEX);
23536 +       return le64_to_cpu(get_unaligned(&key->el[off]));
23537 +}
23538 +
23539 +static inline void
23540 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
23541 +{
23542 +       assert("nikita-755", key != NULL);
23543 +       assert("nikita-756", off < KEY_LAST_INDEX);
23544 +       put_unaligned(cpu_to_le64(value), &key->el[off]);
23545 +}
23546 +
23547 +/* macro to define get_key_L() and set_key_L() for field KEY_U_* of type T */
23548 +#define DEFINE_KEY_FIELD(L, U, T)                                      \
23549 +static inline T get_key_ ## L(const reiser4_key *key)                  \
23550 +{                                                                      \
23551 +       assert("nikita-750", key != NULL);                              \
23552 +       return (T) (get_key_el(key, KEY_ ## U ## _INDEX) &              \
23553 +                KEY_ ## U ## _MASK) >> KEY_ ## U ## _SHIFT;            \
23554 +}                                                                      \
23555 +                                                                       \
23556 +static inline void set_key_ ## L(reiser4_key * key, T loc)             \
23557 +{                                                                      \
23558 +       __u64 el;                                                       \
23559 +                                                                       \
23560 +       assert("nikita-752", key != NULL);                              \
23561 +                                                                       \
23562 +       el = get_key_el(key, KEY_ ## U ## _INDEX);                      \
23563 +       /* clear field bits in the key */                               \
23564 +       el &= ~KEY_ ## U ## _MASK;                                      \
23565 +       /* actually it should be                                        \
23566 +                                                                       \
23567 +          el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK;   \
23568 +                                                                       \
23569 +          but we trust user to never pass values that wouldn't fit     \
23570 +          into field. Clearing extra bits is one operation, but this   \
23571 +          function is time-critical.                                   \
23572 +          But check this in assertion. */                              \
23573 +       assert("nikita-759", ((loc << KEY_ ## U ## _SHIFT) &            \
23574 +               ~KEY_ ## U ## _MASK) == 0);                             \
23575 +       el |= (loc << KEY_ ## U ## _SHIFT);                             \
23576 +       set_key_el(key, KEY_ ## U ## _INDEX, el);                       \
23577 +}
23578 +
23579 +typedef __u64 oid_t;
23580 +
23581 +/* define get_key_locality(), set_key_locality() */
23582 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
23583 +/* define get_key_type(), set_key_type() */
23584 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
23585 +/* define get_key_band(), set_key_band() */
23586 +DEFINE_KEY_FIELD(band, BAND, __u64);
23587 +/* define get_key_objectid(), set_key_objectid() */
23588 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
23589 +/* define get_key_fulloid(), set_key_fulloid() */
23590 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
23591 +/* define get_key_offset(), set_key_offset() */
23592 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
23593 +#if (REISER4_LARGE_KEY)
23594 +/* define get_key_ordering(), set_key_ordering() */
23595 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
23596 +#else
23597 +static inline __u64 get_key_ordering(const reiser4_key * key)
23598 +{
23599 +       return 0;
23600 +}
23601 +
23602 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
23603 +{
23604 +}
23605 +#endif
23606 +
23607 +/* key comparison result */
23608 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
23609 +       EQUAL_TO = 0,           /* if keys are equal */
23610 +       GREATER_THAN = +1       /* if first key is greater than second */
23611 +} cmp_t;
23612 +
23613 +void reiser4_key_init(reiser4_key * key);
23614 +
23615 +/* minimal possible key in the tree. Return pointer to the static storage. */
23616 +extern const reiser4_key *reiser4_min_key(void);
23617 +extern const reiser4_key *reiser4_max_key(void);
23618 +
23619 +/* helper macro for keycmp() */
23620 +#define KEY_DIFF(k1, k2, field)                                                \
23621 +({                                                                     \
23622 +       typeof(get_key_ ## field(k1)) f1;                               \
23623 +       typeof(get_key_ ## field(k2)) f2;                               \
23624 +                                                                       \
23625 +       f1 = get_key_ ## field(k1);                                     \
23626 +       f2 = get_key_ ## field(k2);                                     \
23627 +                                                                       \
23628 +       (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
23629 +})
23630 +
23631 +/* helper macro for keycmp() */
23632 +#define KEY_DIFF_EL(k1, k2, off)                                       \
23633 +({                                                                     \
23634 +       __u64 e1;                                                       \
23635 +       __u64 e2;                                                       \
23636 +                                                                       \
23637 +       e1 = get_key_el(k1, off);                                       \
23638 +       e2 = get_key_el(k2, off);                                       \
23639 +                                                                       \
23640 +       (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
23641 +})
23642 +
23643 +/* compare `k1' and `k2'.  This function is a heart of "key allocation
23644 +    policy". All you need to implement new policy is to add yet another
23645 +    clause here. */
23646 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
23647 +                          const reiser4_key * k2/* second key to compare */)
23648 +{
23649 +       cmp_t result;
23650 +
23651 +       /*
23652 +        * This function is the heart of reiser4 tree-routines. Key comparison
23653 +        * is among most heavily used operations in the file system.
23654 +        */
23655 +
23656 +       assert("nikita-439", k1 != NULL);
23657 +       assert("nikita-440", k2 != NULL);
23658 +
23659 +       /* there is no actual branch here: condition is compile time constant
23660 +        * and constant folding and propagation ensures that only one branch
23661 +        * is actually compiled in. */
23662 +
23663 +       if (REISER4_PLANA_KEY_ALLOCATION) {
23664 +               /* if physical order of fields in a key is identical
23665 +                  with logical order, we can implement key comparison
23666 +                  as three (four if REISER4_LARGE_KEY) 64bit comparisons. */
23667 +               /* logical order of fields in plan-a:
23668 +                  locality->type->objectid->offset. */
23669 +               /* compare locality and type at once */
23670 +               result = KEY_DIFF_EL(k1, k2, 0);
23671 +               if (result == EQUAL_TO) {
23672 +                       /* compare objectid (and band if it's there) */
23673 +                       result = KEY_DIFF_EL(k1, k2, 1);
23674 +                       /* compare offset */
23675 +                       if (result == EQUAL_TO) {
23676 +                               result = KEY_DIFF_EL(k1, k2, 2);
23677 +                               if (REISER4_LARGE_KEY && result == EQUAL_TO)
23678 +                                       result = KEY_DIFF_EL(k1, k2, 3);
23679 +                       }
23680 +               }
23681 +       } else if (REISER4_3_5_KEY_ALLOCATION) {
23682 +               result = KEY_DIFF(k1, k2, locality);
23683 +               if (result == EQUAL_TO) {
23684 +                       result = KEY_DIFF(k1, k2, objectid);
23685 +                       if (result == EQUAL_TO) {
23686 +                               result = KEY_DIFF(k1, k2, type);
23687 +                               if (result == EQUAL_TO)
23688 +                                       result = KEY_DIFF(k1, k2, offset);
23689 +                       }
23690 +               }
23691 +       } else
23692 +               impossible("nikita-441", "Unknown key allocation scheme!");
23693 +       return result;
23694 +}
23695 +
23696 +/* true if @k1 equals @k2 */
23697 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
23698 +                       const reiser4_key * k2/* second key to compare */)
23699 +{
23700 +       assert("nikita-1879", k1 != NULL);
23701 +       assert("nikita-1880", k2 != NULL);
23702 +       return !memcmp(k1, k2, sizeof *k1);
23703 +}
23704 +
23705 +/* true if @k1 is less than @k2 */
23706 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
23707 +                       const reiser4_key * k2/* second key to compare */)
23708 +{
23709 +       assert("nikita-1952", k1 != NULL);
23710 +       assert("nikita-1953", k2 != NULL);
23711 +       return keycmp(k1, k2) == LESS_THAN;
23712 +}
23713 +
23714 +/* true if @k1 is less than or equal to @k2 */
23715 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
23716 +                       const reiser4_key * k2/* second key to compare */)
23717 +{
23718 +       assert("nikita-1954", k1 != NULL);
23719 +       assert("nikita-1955", k2 != NULL);
23720 +       return keycmp(k1, k2) != GREATER_THAN;
23721 +}
23722 +
23723 +/* true if @k1 is greater than @k2 */
23724 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
23725 +                       const reiser4_key * k2/* second key to compare */)
23726 +{
23727 +       assert("nikita-1959", k1 != NULL);
23728 +       assert("nikita-1960", k2 != NULL);
23729 +       return keycmp(k1, k2) == GREATER_THAN;
23730 +}
23731 +
23732 +/* true if @k1 is greater than or equal to @k2 */
23733 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
23734 +                       const reiser4_key * k2/* second key to compare */)
23735 +{
23736 +       assert("nikita-1956", k1 != NULL);
23737 +       assert("nikita-1957", k2 != NULL);      /* October  4: sputnik launched
23738 +                                                * November 3: Laika */
23739 +       return keycmp(k1, k2) != LESS_THAN;
23740 +}
23741 +
23742 +static inline void prefetchkey(reiser4_key * key)
23743 +{
23744 +       prefetch(key);
23745 +       prefetch(&key->el[KEY_CACHELINE_END]);
23746 +}
23747 +
23748 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
23749 +       1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
23750 +/* size of a buffer suitable to hold human readable key representation */
23751 +#define KEY_BUF_LEN (80)
23752 +
23753 +#if REISER4_DEBUG
23754 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
23755 +#else
23756 +#define reiser4_print_key(p, k) noop
23757 +#endif
23758 +
23759 +/* __FS_REISERFS_KEY_H__ */
23760 +#endif
23761 +
23762 +/* Make Linus happy.
23763 +   Local variables:
23764 +   c-indentation-style: "K&R"
23765 +   mode-name: "LC"
23766 +   c-basic-offset: 8
23767 +   tab-width: 8
23768 +   fill-column: 120
23769 +   End:
23770 +*/
23771 diff -puN /dev/null fs/reiser4/ktxnmgrd.c
23772 --- /dev/null
23773 +++ a/fs/reiser4/ktxnmgrd.c
23774 @@ -0,0 +1,215 @@
23775 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23776 +/* Transaction manager daemon. */
23777 +
23778 +/*
23779 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
23780 + * needed/important for the following reasons:
23781 + *
23782 + *     1. in reiser4 an atom is not committed immediately when the last
23783 + *     transaction handle closes, unless the atom is either too old or too
23784 + *     large (see atom_should_commit()). This is done to avoid committing too
23785 + *     frequently.
23786 + *
23787 + *     2. sometimes we don't want to commit atom when closing last transaction
23788 + *     handle even if it is old and fat enough. For example, because we are at
23789 + *     this point under directory semaphore, and committing would stall all
23790 + *     accesses to this directory.
23791 + *
23792 + * ktxnmgrd bides its time sleeping on a wait queue. When it awakes,
23793 + * either due to (tunable) timeout or because it was explicitly woken up by
23794 + * a call to ktxnmgrd_kick(), it scans the list of all atoms and commits the
23795 + * eligible ones.
23796 + *
23797 + */
23798 +
23799 +#include "debug.h"
23800 +#include "txnmgr.h"
23801 +#include "tree.h"
23802 +#include "ktxnmgrd.h"
23803 +#include "super.h"
23804 +#include "reiser4.h"
23805 +
23806 +#include <linux/sched.h>       /* for struct task_struct */
23807 +#include <linux/wait.h>
23808 +#include <linux/suspend.h>
23809 +#include <linux/kernel.h>
23810 +#include <linux/writeback.h>
23811 +#include <linux/kthread.h>
23812 +#include <linux/freezer.h>
23813 +
23814 +static int scan_mgr(struct super_block *);
23815 +
23816 +/*
23817 + * change current->comm so that ps, top, and friends will see changed
23818 + * state. This serves no useful purpose whatsoever, but also costs nothing.
23819 + * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
23820 + */
23821 +#define set_comm(state)                                                \
23822 +       snprintf(current->comm, sizeof(current->comm),                  \
23823 +                 "%s:%s:%s", __FUNCTION__, (super)->s_id, (state))
23824 +
23825 +/**
23826 + * ktxnmgrd - kernel txnmgr daemon
23827 + * @arg: pointer to super block
23828 + *
23829 + * The background transaction manager daemon, started as a kernel thread by
23830 + * reiser4_init_ktxnmgrd(). Loops until kthread_should_stop() is true.
23831 + */
23832 +static int ktxnmgrd(void *arg)
23833 +{
23834 +       struct super_block *super;
23835 +       ktxnmgrd_context *ctx;
23836 +       txn_mgr *mgr;
23837 +       int done = 0;
23838 +
23839 +       super = arg;
23840 +       mgr = &get_super_private(super)->tmgr;
23841 +
23842 +       /*
23843 +        * do_fork() just copies task_struct into the new thread. ->fs_context
23844 +        * shouldn't be copied of course. This shouldn't be a problem for the
23845 +        * rest of the code though.
23846 +        */
23847 +       current->journal_info = NULL;
23848 +       ctx = mgr->daemon;
23849 +       while (1) {
23850 +               try_to_freeze();
23851 +               set_comm("wait");
23852 +               {
23853 +                       DEFINE_WAIT(__wait);
23854 +
23855 +                       prepare_to_wait(&ctx->wait, &__wait,
23856 +                                       TASK_INTERRUPTIBLE);
23857 +                       if (kthread_should_stop())
23858 +                               done = 1;
23859 +                       else
23860 +                               schedule_timeout(ctx->timeout);
23861 +                       finish_wait(&ctx->wait, &__wait);
23862 +               }
23863 +               if (done)
23864 +                       break;
23865 +               set_comm("run");
23866 +               spin_lock(&ctx->guard);
23867 +               /*
23868 +                * wait timed out or ktxnmgrd was kicked: scan txnmgr atoms for too old
23869 +                * ones. NOTE(review): ->guard is locked again after scan_mgr(), so
23870 +                * the callee presumably unlocks it - confirm.
23871 +                */
23872 +               do {
23873 +                       ctx->rescan = 0;
23874 +                       scan_mgr(super);
23875 +                       spin_lock(&ctx->guard);
23876 +                       if (ctx->rescan) {
23877 +                               /*
23878 +                                * list could change while ctx spinlock
23879 +                                * was released. NOTE(review): break leaves
23880 +                                * the do-while, so no rescan - confirm
23881 +                                */
23882 +                               break;
23883 +                       }
23884 +               } while (ctx->rescan);
23885 +               spin_unlock(&ctx->guard);
23886 +       }
23887 +       return 0;
23888 +}
23889 +
23890 +#undef set_comm
23891 +
23892 +/**
23893 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
23894 + * @super: pointer to super block
23895 + *
23896 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
23897 + * manager. Starts kernel txnmgr daemon. This is called on mount.
23898 + */
23899 +int reiser4_init_ktxnmgrd(struct super_block *super)
23900 +{
23901 +       txn_mgr *mgr;
23902 +       ktxnmgrd_context *ctx;
23903 +
23904 +       mgr = &get_super_private(super)->tmgr;
23905 +
23906 +       assert("zam-1014", mgr->daemon == NULL);
23907 +
23908 +       ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
23909 +       if (!ctx)
23910 +               return RETERR(-ENOMEM);
23911 +
23912 +       assert("nikita-2442", ctx != NULL);
23913 +
23914 +       init_waitqueue_head(&ctx->wait);
23915 +
23916 +       /*kcond_init(&ctx->startup);*/
23917 +       spin_lock_init(&ctx->guard);
23918 +       ctx->timeout = REISER4_TXNMGR_TIMEOUT;
23919 +       ctx->rescan = 1;
23920 +       mgr->daemon = ctx;
23921 +
23922 +       ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
23923 +       if (IS_ERR(ctx->tsk)) {
23924 +               int ret = PTR_ERR(ctx->tsk);
23925 +               mgr->daemon = NULL;
23926 +               kfree(ctx);
23927 +               return RETERR(ret);
23928 +       }
23929 +       return 0;
23930 +}
23931 +
23932 +void ktxnmgrd_kick(txn_mgr *mgr)
23933 +{
23934 +       assert("nikita-3234", mgr != NULL);
23935 +       assert("nikita-3235", mgr->daemon != NULL);
23936 +       wake_up(&mgr->daemon->wait);
23937 +}
23938 +
23939 +int is_current_ktxnmgrd(void)
23940 +{
23941 +       return (get_current_super_private()->tmgr.daemon->tsk == current);
23942 +}
23943 +
23944 +/**
23945 + * scan_mgr - commit atoms which are to be committed
23946 + * @super: super block to commit atoms of
23947 + *
23948 + * Commits old atoms.
23949 + */
23950 +static int scan_mgr(struct super_block *super)
23951 +{
23952 +       int ret;
23953 +       reiser4_context ctx;
23954 +
23955 +       init_stack_context(&ctx, super);
23956 +
23957 +       ret = commit_some_atoms(&get_super_private(super)->tmgr);
23958 +
23959 +       reiser4_exit_context(&ctx);
23960 +       return ret;
23961 +}
23962 +
23963 +/**
23964 + * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
23965 + * @super: super block whose transaction manager daemon is stopped
23966 + *
23967 + * This is called on umount. Stops ktxnmgrd and frees its context.
23968 + */
23969 +void reiser4_done_ktxnmgrd(struct super_block *super)
23970 +{
23971 +       txn_mgr *mgr;
23972 +
23973 +       mgr = &get_super_private(super)->tmgr;
23974 +       assert("zam-1012", mgr->daemon != NULL);
23975 +
23976 +       kthread_stop(mgr->daemon->tsk);
23977 +       kfree(mgr->daemon);
23978 +       mgr->daemon = NULL;
23979 +}
23980 +
23981 +/*
23982 + * Local variables:
23983 + * c-indentation-style: "K&R"
23984 + * mode-name: "LC"
23985 + * c-basic-offset: 8
23986 + * tab-width: 8
23987 + * fill-column: 120
23988 + * End:
23989 + */
23990 diff -puN /dev/null fs/reiser4/ktxnmgrd.h
23991 --- /dev/null
23992 +++ a/fs/reiser4/ktxnmgrd.h
23993 @@ -0,0 +1,52 @@
23994 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23995 + * reiser4/README */
23996 +
23997 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
23998 +
23999 +#ifndef __KTXNMGRD_H__
24000 +#define __KTXNMGRD_H__
24001 +
24002 +#include "txnmgr.h"
24003 +
24004 +#include <linux/fs.h>
24005 +#include <linux/wait.h>
24006 +#include <linux/completion.h>
24007 +#include <linux/spinlock.h>
24008 +#include <asm/atomic.h>
24009 +#include <linux/sched.h>       /* for struct task_struct */
24010 +
24011 +/* in this structure all data necessary to start up, shut down and communicate
24012 + * with ktxnmgrd are kept. */
24013 +struct ktxnmgrd_context {
24014 +       /* wait queue head on which ktxnmgrd sleeps; ktxnmgrd_kick() wakes it */
24015 +       wait_queue_head_t wait;
24016 +       /* spin lock protecting all fields of this structure */
24017 +       spinlock_t guard;
24018 +       /* timeout of sleeping on ->wait, passed to schedule_timeout() */
24019 +       signed long timeout;
24020 +       /* kernel thread running ktxnmgrd, as returned by kthread_run() */
24021 +       struct task_struct *tsk;
24022 +       /* list of all file systems served by this ktxnmgrd. NOTE(review): unused in ktxnmgrd.c - confirm */
24023 +       struct list_head queue;
24024 +       /* should ktxnmgrd repeat scanning of atoms? */
24025 +       unsigned int rescan:1;
24026 +};
24027 +
24028 +extern int reiser4_init_ktxnmgrd(struct super_block *);
24029 +extern void reiser4_done_ktxnmgrd(struct super_block *);
24030 +
24031 +extern void ktxnmgrd_kick(txn_mgr * mgr);
24032 +extern int is_current_ktxnmgrd(void);
24033 +
24034 +/* __KTXNMGRD_H__ */
24035 +#endif
24036 +
24037 +/* Make Linus happy.
24038 +   Local variables:
24039 +   c-indentation-style: "K&R"
24040 +   mode-name: "LC"
24041 +   c-basic-offset: 8
24042 +   tab-width: 8
24043 +   fill-column: 120
24044 +   End:
24045 +*/
24046 diff -puN /dev/null fs/reiser4/lock.c
24047 --- /dev/null
24048 +++ a/fs/reiser4/lock.c
24049 @@ -0,0 +1,1237 @@
24050 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24051 + * reiser4/README */
24052 +
24053 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
24054 +   order.  V4 balances the tree from the bottom up, and searches the tree from
24055 +   the top down, and that is really the way we want it, so tradition won't work
24056 +   for us.
24057 +
24058 +   Instead we have two lock orderings, a high priority lock ordering, and a low
24059 +   priority lock ordering.  Each node in the tree has a lock in its znode.
24060 +
24061 +   Suppose we have a set of processes which lock (R/W) tree nodes. Each process
24062 +   has a set (maybe empty) of already locked nodes ("process locked set"). Each
24063 +   process may have a pending lock request to a node locked by another process.
24064 +   Note: we lock and unlock, but do not transfer locks: it is possible
24065 +   transferring locks instead would save some bus locking....
24066 +
24067 +   Deadlock occurs when we have a loop constructed from process locked sets and
24068 +   lock request vectors.
24069 +
24070 +   NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
24071 +   memory is extended with "znodes" with which we connect nodes with their left
24072 +   and right neighbors using sibling pointers stored in the znodes.  When we
24073 +   perform balancing operations we often go from left to right and from right to
24074 +   left.
24075 +
24076 +   +-P1-+          +-P3-+
24077 +   |+--+|   V1     |+--+|
24078 +   ||N1|| -------> ||N3||
24079 +   |+--+|          |+--+|
24080 +   +----+          +----+
24081 +     ^               |
24082 +     |V2             |V3
24083 +     |               v
24084 +   +---------P2---------+
24085 +   |+--+            +--+|
24086 +   ||N2|  --------  |N4||
24087 +   |+--+            +--+|
24088 +   +--------------------+
24089 +
24090 +   We solve this by ensuring that only low priority processes lock in top to
24091 +   bottom order and from right to left, and high priority processes lock from
24092 +   bottom to top and left to right.
24093 +
24094 +   ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
24095 +   kill those damn busy loops.
24096 +   ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
24097 +   stage) cannot be ordered that way. There are no rules what nodes can belong
24098 +   to the atom and what nodes cannot.  We cannot define what is right or left
24099 +   direction, what is top or bottom.  We can take immediate parent or side
24100 +   neighbor of one node, but nobody guarantees that, say, left neighbor node is
24101 +   not a far right neighbor for other nodes from the same atom.  It breaks
24102 +   deadlock avoidance rules and hi-low priority locking cannot be applied for
24103 +   atom locks.
24104 +
24105 +   How does it help to avoid deadlocks ?
24106 +
24107 +   Suppose we have a deadlock with n processes. Processes from one priority
24108 +   class never deadlock because they take locks in one consistent
24109 +   order.
24110 +
24111 +   So, any possible deadlock loop must have low priority as well as high
24112 +   priority processes.  There are no other lock priority levels except low and
24113 +   high. We know that any deadlock loop contains at least one node locked by a
24114 +   low priority process and requested by a high priority process. If this
24115 +   situation is caught and resolved it is sufficient to avoid deadlocks.
24116 +
24117 +   V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
24118 +
24119 +   The deadlock prevention algorithm is based on comparing
24120 +   priorities of node owners (processes which keep znode locked) and
24121 +   requesters (processes which want to acquire a lock on znode).  We
24122 +   implement a scheme where low-priority owners yield locks to
24123 +   high-priority requesters. We created a signal passing system that
24124 +   is used to ask low-priority processes to yield one or more locked
24125 +   znodes.
24126 +
24127 +   The condition when a znode needs to change its owners is described by the
24128 +   following formula:
24129 +
24130 +   #############################################
24131 +   #                                           #
24132 +   # (number of high-priority requesters) >  0 #
24133 +   #                AND                        #
24134 +   # (number of high-priority owners)     == 0 #
24135 +   #                                           #
24136 +   #############################################
24137 +
24138 +   Note that a low-priority process delays node releasing if another
24139 +   high-priority process owns this node.  So, slightly more strictly speaking,
24140 +   to have a deadlock capable cycle you must have a loop in which a high
24141 +   priority process is waiting on a low priority process to yield a node, which
24142 +   is slightly different from saying a high priority process is waiting on a
24143 +   node owned by a low priority process.
24144 +
24145 +   It is enough to avoid deadlocks if we prevent any low-priority process from
24146 +   falling asleep if its locked set contains a node which satisfies the
24147 +   deadlock condition.
24148 +
24149 +   That condition is implicitly or explicitly checked in all places where new
24150 +   high-priority requests may be added or removed from node request queue or
24151 +   high-priority process takes or releases a lock on node. The main
24152 +   goal of these checks is to never lose the moment when node becomes "has
24153 +   wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
24154 +   at that time.
24155 +
24156 +   The information about received signals is stored in the per-process
24157 +   structure (lock stack) and analyzed before a low-priority process goes to
24158 +   sleep but after a "fast" attempt to lock a node fails. Any signal wakes
24159 +   sleeping process up and forces him to re-check lock status and received
24160 +   signal info. If "must-yield-this-lock" signals were received the locking
24161 +   primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
24162 +
24163 +   V4 LOCKING DRAWBACKS
24164 +
24165 +   If we have already balanced on one level, and we are propagating our changes
24166 +   upward to a higher level, it could be very messy to surrender all locks on
24167 +   the lower level because we put so much computational work into it, and
24168 +   reverting them to their state before they were locked might be very complex.
24169 +   We also don't want to acquire all locks before performing balancing because
24170 +   that would either be almost as much work as the balancing, or it would be
24171 +   too conservative and lock too much.  We want balancing to be done only at
24172 +   high priority.  Yet, we might want to go to the left one node and use some
24173 +   of its empty space... So we make one attempt at getting the node to the left
24174 +   using try_lock, and if it fails we do without it, because we didn't really
24175 +   need it, it was only a nice to have.
24176 +
24177 +   LOCK STRUCTURES DESCRIPTION
24178 +
24179 +   The following data structures are used in the reiser4 locking
24180 +   implementation:
24181 +
24182 +   All fields related to long-term locking are stored in znode->lock.
24183 +
24184 +   The lock stack is a per thread object.  It owns all znodes locked by the
24185 +   thread. One znode may be locked by several threads in case of read lock or
24186 +   one znode may be write locked by one thread several times. The special link
24187 +   objects (lock handles) support n<->m relation between znodes and lock
24188 +   owners.
24189 +
24190 +   <Thread 1>                       <Thread 2>
24191 +
24192 +   +---------+                     +---------+
24193 +   |  LS1    |                     |  LS2    |
24194 +   +---------+                     +---------+
24195 +       ^                                ^
24196 +       |---------------+                +----------+
24197 +       v               v                v          v
24198 +   +---------+      +---------+    +---------+   +---------+
24199 +   |  LH1    |      |   LH2   |    |  LH3    |   |   LH4   |
24200 +   +---------+      +---------+    +---------+   +---------+
24201 +       ^                   ^            ^           ^
24202 +       |                   +------------+           |
24203 +       v                   v                        v
24204 +   +---------+      +---------+                  +---------+
24205 +   |  Z1     |      |   Z2    |                  |  Z3     |
24206 +   +---------+      +---------+                  +---------+
24207 +
24208 +   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
24209 +   picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
24210 +   LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it.  Znode
24211 +   Z1 is locked by only one thread, znode has only one lock handle LH1 on its
24212 +   list, similar situation is for Z3 which is locked by the thread 2 only. Z2
24213 +   is locked (for read) twice by different threads and two lock handles are on
24214 +   its list. Each lock handle represents a single relation of a locking of a
24215 +   znode by a thread. Locking of a znode is an establishing of a locking
24216 +   relation between the lock stack and the znode by adding of a new lock handle
24217 +   to a list of lock handles, the lock stack.  The lock stack links all lock
24218 +   handles for all znodes locked by the lock stack.  The znode list groups all
24219 +   lock handles for all locks stacks which locked the znode.
24220 +
24221 +   Yet another relation may exist between znode and lock owners.  If the lock
24222 +   procedure cannot immediately take a lock on an object it adds the lock owner
24223 +   to a special `requestors' list belonging to the znode.  That list represents
24224 +   a queue of pending lock requests.  Because one lock owner may request only
24225 +   one lock object at a time, it is a 1->n relation between lock objects
24226 +   and a lock owner implemented as it is described above. Full information
24227 +   (priority, pointers to lock and link objects) about each lock request is
24228 +   stored in lock owner structure in `request' field.
24229 +
24230 +   SHORT_TERM LOCKING
24231 +
24232 +   This is a list of primitive operations over lock stacks / lock handles /
24233 +   znodes and locking descriptions for them.
24234 +
24235 +   1. locking / unlocking which is done by two list insertion/deletion, one
24236 +      to/from znode's list of lock handles, another one is to/from lock stack's
24237 +      list of lock handles.  The first insertion is protected by
24238 +      znode->lock.guard spinlock.  The list owned by the lock stack can be
24239 +      modified only by thread who owns the lock stack and nobody else can
24240 +      modify/read it. There is nothing to be protected by a spinlock or
24241 +      something else.
24242 +
24243 +   2. adding/removing a lock request to/from znode requesters list. The rule is
24244 +      that znode->lock.guard spinlock should be taken for this.
24245 +
24246 +   3. we can traverse list of lock handles and use references to lock stacks who
24247 +      locked given znode if znode->lock.guard spinlock is taken.
24248 +
24249 +   4. If a lock stack is associated with a znode as a lock requestor or lock
24250 +      owner its existence is guaranteed by znode->lock.guard spinlock.  Some of
24251 +      its (lock stack's) fields should be protected from being accessed in
24252 +      parallel by two or more threads. Please look at the lock_stack structure
24253 +      definition for the info on how those fields are protected. */
24254 +
24255 +/* Znode lock and capturing intertwining. */
24256 +/* In current implementation we capture formatted nodes before locking
24257 +   them. Take a look at longterm_lock_znode(): the reiser4_try_capture() request
24258 +   precedes locking requests.  The longterm_lock_znode function unconditionally
24259 +   captures the znode before even checking the locking conditions.
24260 +
24261 +   Another variant is to capture znode after locking it.  It was not tested, but
24262 +   at least one deadlock condition is supposed to be there.  One thread has
24263 +   locked a znode (Node-1) and calls reiser4_try_capture() for it.
24264 +   reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
24265 +   Second thread is a flushing thread, its current atom is the atom Node-1
24266 +   belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
24267 +   is locked by the first thread.  The described situation is a deadlock. */
24268 +
24269 +#include "debug.h"
24270 +#include "txnmgr.h"
24271 +#include "znode.h"
24272 +#include "jnode.h"
24273 +#include "tree.h"
24274 +#include "plugin/node/node.h"
24275 +#include "super.h"
24276 +
24277 +#include <linux/spinlock.h>
24278 +
24279 +#if REISER4_DEBUG
24280 +static int request_is_deadlock_safe(znode * , znode_lock_mode,
24281 +                                   znode_lock_request);
24282 +#endif
24283 +
24284 +/* Returns a lock owner associated with current thread */
24285 +lock_stack *get_current_lock_stack(void)
24286 +{
24287 +       return &get_current_context()->stack;
24288 +}
24289 +
24290 +/* Wakes up all low priority owners informing them about possible deadlock */
24291 +static void wake_up_all_lopri_owners(znode * node)
24292 +{
24293 +       lock_handle *handle;
24294 +
24295 +       assert_spin_locked(&(node->lock.guard));
24296 +       list_for_each_entry(handle, &node->lock.owners, owners_link) {
24297 +               assert("nikita-1832", handle->node == node);
24298 +               /* count this signal in owner->nr_signaled */
24299 +               if (!handle->signaled) {
24300 +                       handle->signaled = 1;
24301 +                       atomic_inc(&handle->owner->nr_signaled);
24302 +                       /* Wake up a single process */
24303 +                       reiser4_wake_up(handle->owner);
24304 +               }
24305 +       }
24306 +}
24307 +
24308 +/* Adds a lock to a lock owner, which means creating a link to the lock and
24309 +   putting the link into the two lists all links are on (the doubly linked list
24310 +   that forms the lock_stack, and the doubly linked list of links attached
24311 +   to a lock).
24312 +*/
24313 +static inline void
24314 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
24315 +{
24316 +       assert("jmacd-810", handle->owner == NULL);
24317 +       assert_spin_locked(&(node->lock.guard));
24318 +
24319 +       handle->owner = owner;
24320 +       handle->node = node;
24321 +
24322 +       assert("reiser4-4",
24323 +              ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
24324 +
24325 +       /* add lock handle to the end of lock_stack's list of locks */
24326 +       list_add_tail(&handle->locks_link, &owner->locks);
24327 +       ON_DEBUG(owner->nr_locks++);
24328 +       reiser4_ctx_gfp_mask_set();
24329 +
24330 +       /* add lock handle to the head of znode's list of owners */
24331 +       list_add(&handle->owners_link, &node->lock.owners);
24332 +       handle->signaled = 0;
24333 +}
24334 +
24335 +/* Breaks a relation between a lock and its owner */
24336 +static inline void unlink_object(lock_handle * handle)
24337 +{
24338 +       assert("zam-354", handle->owner != NULL);
24339 +       assert("nikita-1608", handle->node != NULL);
24340 +       assert_spin_locked(&(handle->node->lock.guard));
24341 +       assert("nikita-1829", handle->owner == get_current_lock_stack());
24342 +       assert("reiser4-5", handle->owner->nr_locks > 0);
24343 +
24344 +       /* remove lock handle from lock_stack's list of locks */
24345 +       list_del(&handle->locks_link);
24346 +       ON_DEBUG(handle->owner->nr_locks--);
24347 +       reiser4_ctx_gfp_mask_set();
24348 +       assert("reiser4-6",
24349 +              ergo(list_empty_careful(&handle->owner->locks),
24350 +                   handle->owner->nr_locks == 0));
24351 +       /* remove lock handle from znode's list of owners */
24352 +       list_del(&handle->owners_link);
24353 +       /* indicates that lock handle is free now */
24354 +       handle->node = NULL;
24355 +#if REISER4_DEBUG
24356 +       INIT_LIST_HEAD(&handle->locks_link);
24357 +       INIT_LIST_HEAD(&handle->owners_link);
24358 +       handle->owner = NULL;
24359 +#endif
24360 +}
24361 +
24362 +/* Actually locks an object knowing that we are able to do this */
24363 +static void lock_object(lock_stack * owner)
24364 +{
24365 +       struct lock_request *request;
24366 +       znode *node;
24367 +
24368 +       request = &owner->request;
24369 +       node = request->node;
24370 +       assert_spin_locked(&(node->lock.guard));
24371 +       if (request->mode == ZNODE_READ_LOCK) {
24372 +               node->lock.nr_readers++;
24373 +       } else {
24374 +               /* check that we didn't switch from read to write lock */
24375 +               assert("nikita-1840", node->lock.nr_readers <= 0);
24376 +               /* We allow recursive locking; a node can be locked several
24377 +                  times for write by same process */
24378 +               node->lock.nr_readers--;
24379 +       }
24380 +
24381 +       link_object(request->handle, owner, node);
24382 +
24383 +       if (owner->curpri)
24384 +               node->lock.nr_hipri_owners++;
24385 +}
24386 +
24387 +/* Check for recursive write locking */
24388 +static int recursive(lock_stack * owner)
24389 +{
24390 +       int ret;
24391 +       znode *node;
24392 +       lock_handle *lh;
24393 +
24394 +       node = owner->request.node;
24395 +
24396 +       /* Owners list is not empty for a locked node */
24397 +       assert("zam-314", !list_empty_careful(&node->lock.owners));
24398 +       assert("nikita-1841", owner == get_current_lock_stack());
24399 +       assert_spin_locked(&(node->lock.guard));
24400 +
24401 +       lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
24402 +       ret = (lh->owner == owner);
24403 +
24404 +       /* Recursive read locking should be done usual way */
24405 +       assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
24406 +       /* mixing of read/write locks is not allowed */
24407 +       assert("zam-341", !ret || znode_is_wlocked(node));
24408 +
24409 +       return ret;
24410 +}
24411 +
24412 +#if REISER4_DEBUG
24413 +/* Returns true if the lock is held by the calling thread. */
24414 +int znode_is_any_locked(const znode * node)
24415 +{
24416 +       lock_handle *handle;
24417 +       lock_stack *stack;
24418 +       int ret;
24419 +
24420 +       if (!znode_is_locked(node))
24421 +               return 0;
24422 +
24423 +       stack = get_current_lock_stack();
24424 +
24425 +       spin_lock_stack(stack);
24426 +
24427 +       ret = 0;
24428 +
24429 +       list_for_each_entry(handle, &stack->locks, locks_link) {
24430 +               if (handle->node == node) {
24431 +                       ret = 1;
24432 +                       break;
24433 +               }
24434 +       }
24435 +
24436 +       spin_unlock_stack(stack);
24437 +
24438 +       return ret;
24439 +}
24440 +
24441 +#endif
24442 +
24443 +/* Returns true if a write lock is held by the calling thread. */
24444 +int znode_is_write_locked(const znode * node)
24445 +{
24446 +       lock_stack *stack;
24447 +       lock_handle *handle;
24448 +
24449 +       assert("jmacd-8765", node != NULL);
24450 +
24451 +       if (!znode_is_wlocked(node))
24452 +               return 0;
24453 +
24454 +       stack = get_current_lock_stack();
24455 +
24456 +       /*
24457 +        * When znode is write locked, all owner handles point to the same lock
24458 +        * stack. Get pointer to lock stack from the first lock handle from
24459 +        * znode's owner list
24460 +        */
24461 +       handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
24462 +
24463 +       return (handle->owner == stack);
24464 +}
24465 +
24466 +/* This "deadlock" condition is the essential part of reiser4 locking
24467 +   implementation. This condition is checked explicitly by calling
24468 +   check_deadlock_condition() or implicitly in all places where znode lock
24469 +   state (set of owners and request queue) is changed. Locking code is
24470 +   designed to use this condition to trigger procedure of passing object from
24471 +   low priority owner(s) to high priority one(s).
24472 +
24473 +   The procedure results in passing an event (setting lock_handle->signaled
24474 +   flag) and counting this event in nr_signaled field of owner's lock stack
24475 +   object and wakeup owner's process.
24476 +*/
24477 +static inline int check_deadlock_condition(znode * node)
24478 +{
24479 +       assert_spin_locked(&(node->lock.guard));
24480 +       return node->lock.nr_hipri_requests > 0
24481 +           && node->lock.nr_hipri_owners == 0;
24482 +}
24483 +
24484 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
24485 +{
24486 +       zlock * lock = &node->lock;
24487 +
24488 +       return mode == ZNODE_READ_LOCK &&
24489 +               lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
24490 +}
24491 +
24492 +/* checks lock/request compatibility */
24493 +static int can_lock_object(lock_stack * owner)
24494 +{
24495 +       znode *node = owner->request.node;
24496 +
24497 +       assert_spin_locked(&(node->lock.guard));
24498 +
24499 +       /* See if the node is disconnected. */
24500 +       if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
24501 +               return RETERR(-EINVAL);
24502 +
24503 +       /* Do not ever try to take a lock if we are going in low priority
24504 +          direction and a node has a high priority request without high
24505 +          priority owners. */
24506 +       if (unlikely(!owner->curpri && check_deadlock_condition(node)))
24507 +               return RETERR(-E_REPEAT);
24508 +       if (unlikely(owner->curpri &&
24509 +                    check_livelock_condition(node, owner->request.mode)))
24510 +               return RETERR(-E_REPEAT);
24511 +       if (unlikely(!is_lock_compatible(node, owner->request.mode)))
24512 +               return RETERR(-E_REPEAT);
24513 +       return 0;
24514 +}
24515 +
24516 +/* Setting of a high priority to the process. It clears "signaled" flags
24517 +   because znode locked by high-priority process can't satisfy our "deadlock
24518 +   condition". */
24519 +static void set_high_priority(lock_stack * owner)
24520 +{
24521 +       assert("nikita-1846", owner == get_current_lock_stack());
24522 +       /* Do nothing if current priority is already high */
24523 +       if (!owner->curpri) {
24524 +               /* We don't need locking for owner->locks list, because, this
24525 +                * function is only called with the lock stack of the current
24526 +                * thread, and no other thread can play with owner->locks list
24527 +                * and/or change ->node pointers of lock handles in this list.
24528 +                *
24529 +                * (Interrupts also are not involved.)
24530 +                */
24531 +               lock_handle *item = list_entry(owner->locks.next, lock_handle,
24532 +                                              locks_link);
24533 +               while (&owner->locks != &item->locks_link) {
24534 +                       znode *node = item->node;
24535 +
24536 +                       spin_lock_zlock(&node->lock);
24537 +
24538 +                       node->lock.nr_hipri_owners++;
24539 +
24540 +                       /* we can safely set signaled to zero, because
24541 +                          previous statement (nr_hipri_owners ++) guarantees
24542 +                          that signaled will be never set again. */
24543 +                       item->signaled = 0;
24544 +                       spin_unlock_zlock(&node->lock);
24545 +
24546 +                       item = list_entry(item->locks_link.next, lock_handle,
24547 +                                         locks_link);
24548 +               }
24549 +               owner->curpri = 1;
24550 +               atomic_set(&owner->nr_signaled, 0);
24551 +       }
24552 +}
24553 +
24554 +/* Sets a low priority to the process. */
24555 +static void set_low_priority(lock_stack * owner)
24556 +{
24557 +       assert("nikita-3075", owner == get_current_lock_stack());
24558 +       /* Do nothing if current priority is already low */
24559 +       if (owner->curpri) {
24560 +               /* scan all locks (lock handles) held by @owner, which is
24561 +                  actually current thread, and check whether we are reaching
24562 +                  deadlock possibility anywhere.
24563 +                */
24564 +               lock_handle *handle = list_entry(owner->locks.next, lock_handle,
24565 +                                                locks_link);
24566 +               while (&owner->locks != &handle->locks_link) {
24567 +                       znode *node = handle->node;
24568 +                       spin_lock_zlock(&node->lock);
24569 +                       /* this thread just was hipri owner of @node, so
24570 +                          nr_hipri_owners has to be greater than zero. */
24571 +                       assert("nikita-1835", node->lock.nr_hipri_owners > 0);
24572 +                       node->lock.nr_hipri_owners--;
24573 +                       /* If we have deadlock condition, adjust a nr_signaled
24574 +                          field. It is enough to set "signaled" flag only for
24575 +                          current process, other low-pri owners will be
24576 +                          signaled and woken up after current process unlocks
24577 +                          this object and any high-priority requestor takes
24578 +                          control. */
24579 +                       if (check_deadlock_condition(node)
24580 +                           && !handle->signaled) {
24581 +                               handle->signaled = 1;
24582 +                               atomic_inc(&owner->nr_signaled);
24583 +                       }
24584 +                       spin_unlock_zlock(&node->lock);
24585 +                       handle = list_entry(handle->locks_link.next,
24586 +                                           lock_handle, locks_link);
24587 +               }
24588 +               owner->curpri = 0;
24589 +       }
24590 +}
24591 +
24592 +static void remove_lock_request(lock_stack * requestor)
24593 +{
24594 +       zlock * lock = &requestor->request.node->lock;
24595 +
24596 +       if (requestor->curpri) {
24597 +               assert("nikita-1838", lock->nr_hipri_requests > 0);
24598 +               lock->nr_hipri_requests--;
24599 +               if (requestor->request.mode == ZNODE_WRITE_LOCK)
24600 +                       lock->nr_hipri_write_requests--;
24601 +       }
24602 +       list_del(&requestor->requestors_link);
24603 +}
24604 +
24605 +static void invalidate_all_lock_requests(znode * node)
24606 +{
24607 +       lock_stack *requestor, *tmp;
24608 +
24609 +       assert_spin_locked(&(node->lock.guard));
24610 +
24611 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
24612 +                                requestors_link) {
24613 +               remove_lock_request(requestor);
24614 +               requestor->request.ret_code = -EINVAL;
24615 +               reiser4_wake_up(requestor);
24616 +               requestor->request.mode = ZNODE_NO_LOCK;
24617 +       }
24618 +}
24619 +
24620 +static void dispatch_lock_requests(znode * node)
24621 +{
24622 +       lock_stack *requestor, *tmp;
24623 +
24624 +       assert_spin_locked(&(node->lock.guard));
24625 +
24626 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
24627 +                                requestors_link) {
24628 +               if (znode_is_write_locked(node))
24629 +                       break;
24630 +               if (!can_lock_object(requestor)) {
24631 +                       lock_object(requestor);
24632 +                       remove_lock_request(requestor);
24633 +                       requestor->request.ret_code = 0;
24634 +                       reiser4_wake_up(requestor);
24635 +                       requestor->request.mode = ZNODE_NO_LOCK;
24636 +               }
24637 +       }
24638 +}
24639 +
24640 +/* release long-term lock, acquired by longterm_lock_znode() */
24641 +void longterm_unlock_znode(lock_handle * handle)
24642 +{
24643 +       znode *node = handle->node;
24644 +       lock_stack *oldowner = handle->owner;
24645 +       int hipri;
24646 +       int readers;
24647 +       int rdelta;
24648 +       int youdie;
24649 +
24650 +       /*
24651 +        * this is time-critical and highly optimized code. Modify carefully.
24652 +        */
24653 +
24654 +       assert("jmacd-1021", handle != NULL);
24655 +       assert("jmacd-1022", handle->owner != NULL);
24656 +       assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
24657 +
24658 +       assert("zam-130", oldowner == get_current_lock_stack());
24659 +
24660 +       LOCK_CNT_DEC(long_term_locked_znode);
24661 +
24662 +       /*
24663 +        * to minimize amount of operations performed under lock, pre-compute
24664 +        * all variables used within critical section. This makes code
24665 +        * obscure.
24666 +        */
24667 +
24668 +       /* was this lock of hi or lo priority */
24669 +       hipri = oldowner->curpri ? 1 : 0;
24670 +       /* number of readers */
24671 +       readers = node->lock.nr_readers;
24672 +       /* +1 if write lock, -1 if read lock */
24673 +       rdelta = (readers > 0) ? -1 : +1;
24674 +       /* true if node is to die and write lock is released */
24675 +       youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
24676 +
24677 +       spin_lock_zlock(&node->lock);
24678 +
24679 +       assert("zam-101", znode_is_locked(node));
24680 +
24681 +       /* Adjust a number of high priority owners of this lock */
24682 +       assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
24683 +       node->lock.nr_hipri_owners -= hipri;
24684 +
24685 +       /* Handle znode deallocation on last write-lock release. */
24686 +       if (znode_is_wlocked_once(node)) {
24687 +               if (youdie) {
24688 +                       forget_znode(handle);
24689 +                       assert("nikita-2191", znode_invariant(node));
24690 +                       zput(node);
24691 +                       return;
24692 +               }
24693 +       }
24694 +
24695 +       if (handle->signaled)
24696 +               atomic_dec(&oldowner->nr_signaled);
24697 +
24698 +       /* Unlocking means owner<->object link deletion */
24699 +       unlink_object(handle);
24700 +
24701 +       /* This is enough to be sure whether an object is completely
24702 +          unlocked. */
24703 +       node->lock.nr_readers += rdelta;
24704 +
24705 +       /* If the node is locked it must have an owners list.  Likewise, if
24706 +          the node is unlocked it must have an empty owners list. */
24707 +       assert("zam-319", equi(znode_is_locked(node),
24708 +                              !list_empty_careful(&node->lock.owners)));
24709 +
24710 +#if REISER4_DEBUG
24711 +       if (!znode_is_locked(node))
24712 +               ++node->times_locked;
24713 +#endif
24714 +
24715 +       /* If there are pending lock requests we wake up a requestor */
24716 +       if (!znode_is_wlocked(node))
24717 +               dispatch_lock_requests(node);
24718 +       if (check_deadlock_condition(node))
24719 +               wake_up_all_lopri_owners(node);
24720 +       spin_unlock_zlock(&node->lock);
24721 +
24722 +       /* minus one reference from handle->node */
24723 +       assert("nikita-2190", znode_invariant(node));
24724 +       ON_DEBUG(check_lock_data());
24725 +       ON_DEBUG(check_lock_node_data(node));
24726 +       zput(node);
24727 +}
24728 +
24729 +/* final portion of longterm-lock */
24730 +static int
24731 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
24732 +{
24733 +       znode *node = owner->request.node;
24734 +
24735 +       assert_spin_locked(&(node->lock.guard));
24736 +
24737 +       /* If we broke with (ok == 0) it means we can_lock, now do it. */
24738 +       if (ok == 0) {
24739 +               lock_object(owner);
24740 +               owner->request.mode = 0;
24741 +               /* count a reference from lockhandle->node
24742 +
24743 +                  znode was already referenced at the entry to this function,
24744 +                  hence taking spin-lock here is not necessary (see comment
24745 +                  in the zref()).
24746 +                */
24747 +               zref(node);
24748 +
24749 +               LOCK_CNT_INC(long_term_locked_znode);
24750 +       }
24751 +       spin_unlock_zlock(&node->lock);
24752 +       ON_DEBUG(check_lock_data());
24753 +       ON_DEBUG(check_lock_node_data(node));
24754 +       return ok;
24755 +}
24756 +
24757 +/*
24758 + * version of longterm_lock_znode() optimized for the most common case: read
24759 + * lock without any special flags. This is the kind of lock that any tree
24760 + * traversal takes on the root node of the tree, which is very frequent.
24761 + */
24762 +static int longterm_lock_tryfast(lock_stack * owner)
24763 +{
24764 +       int result;
24765 +       znode *node;
24766 +       zlock *lock;
24767 +
24768 +       node = owner->request.node;
24769 +       lock = &node->lock;
24770 +
24771 +       assert("nikita-3340", reiser4_schedulable());
24772 +       assert("nikita-3341", request_is_deadlock_safe(node,
24773 +                                                      ZNODE_READ_LOCK,
24774 +                                                      ZNODE_LOCK_LOPRI));
24775 +       spin_lock_zlock(lock);
24776 +       result = can_lock_object(owner);
24777 +       spin_unlock_zlock(lock);
24778 +
24779 +       if (likely(result != -EINVAL)) {
24780 +               spin_lock_znode(node);
24781 +               result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
24782 +               spin_unlock_znode(node);
24783 +               spin_lock_zlock(lock);
24784 +               if (unlikely(result != 0)) {
24785 +                       owner->request.mode = 0;
24786 +               } else {
24787 +                       result = can_lock_object(owner);
24788 +                       if (unlikely(result == -E_REPEAT)) {
24789 +                               /* fall back to longterm_lock_znode() */
24790 +                               spin_unlock_zlock(lock);
24791 +                               return 1;
24792 +                       }
24793 +               }
24794 +               return lock_tail(owner, result, ZNODE_READ_LOCK);
24795 +       } else
24796 +               return 1;
24797 +}
24798 +
24799 +/* locks given lock object */
24800 +int longterm_lock_znode(
24801 +                              /* local link object (allocated by lock owner
24802 +                               * thread, usually on its own stack) */
24803 +                              lock_handle * handle,
24804 +                              /* znode we want to lock. */
24805 +                              znode * node,
24806 +                              /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
24807 +                              znode_lock_mode mode,
24808 +                              /* ZNODE_LOCK_* request flags; for results
24809 +                                 {0, -EINVAL, -E_DEADLOCK} see return codes description. */
24810 +                              znode_lock_request request) {
24811 +       int ret;
24812 +       int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
24813 +       int non_blocking = 0;
24814 +       int has_atom;
24815 +       txn_capture cap_flags;
24816 +       zlock *lock;
24817 +       txn_handle *txnh;
24818 +       tree_level level;
24819 +
24820 +       /* Get current process context */
24821 +       lock_stack *owner = get_current_lock_stack();
24822 +
24823 +       /* Check that the lock handle is initialized and isn't already being
24824 +        * used. */
24825 +       assert("jmacd-808", handle->owner == NULL);
24826 +       assert("nikita-3026", reiser4_schedulable());
24827 +       assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
24828 +       assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
24829 +       /* long term locks are not allowed in the VM contexts (->writepage(),
24830 +        * prune_{d,i}cache()).
24831 +        *
24832 +        * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
24833 +        * bug caused by d_splice_alias() only working for directories.
24834 +        */
24835 +       assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
24836 +       assert("zam-1055", mode != ZNODE_NO_LOCK);
24837 +
24838 +       cap_flags = 0;
24839 +       if (request & ZNODE_LOCK_NONBLOCK) {
24840 +               cap_flags |= TXN_CAPTURE_NONBLOCKING;
24841 +               non_blocking = 1;
24842 +       }
24843 +
24844 +       if (request & ZNODE_LOCK_DONT_FUSE)
24845 +               cap_flags |= TXN_CAPTURE_DONT_FUSE;
24846 +
24847 +       /* If we are changing our process priority we must adjust a number
24848 +          of high priority owners for each znode that we already lock */
24849 +       if (hipri) {
24850 +               set_high_priority(owner);
24851 +       } else {
24852 +               set_low_priority(owner);
24853 +       }
24854 +
24855 +       level = znode_get_level(node);
24856 +
24857 +       /* Fill request structure with our values. */
24858 +       owner->request.mode = mode;
24859 +       owner->request.handle = handle;
24860 +       owner->request.node = node;
24861 +
24862 +       txnh = get_current_context()->trans;
24863 +       lock = &node->lock;
24864 +
24865 +       if (mode == ZNODE_READ_LOCK && request == 0) {
24866 +               ret = longterm_lock_tryfast(owner);
24867 +               if (ret <= 0)
24868 +                       return ret;
24869 +       }
24870 +
24871 +       has_atom = (txnh->atom != NULL);
24872 +
24873 +       /* Synchronize on node's zlock guard lock. */
24874 +       spin_lock_zlock(lock);
24875 +
24876 +       if (znode_is_locked(node) &&
24877 +           mode == ZNODE_WRITE_LOCK && recursive(owner))
24878 +               return lock_tail(owner, 0, mode);
24879 +
24880 +       for (;;) {
24881 +               /* Check the lock's availability: if it is unavailable we get
24882 +                  E_REPEAT, 0 indicates "can_lock", otherwise the node is
24883 +                  invalid.  */
24884 +               ret = can_lock_object(owner);
24885 +
24886 +               if (unlikely(ret == -EINVAL)) {
24887 +                       /* @node is dying. Leave it alone. */
24888 +                       break;
24889 +               }
24890 +
24891 +               if (unlikely(ret == -E_REPEAT && non_blocking)) {
24892 +                       /* either locking of @node by the current thread will
24893 +                        * lead to the deadlock, or lock modes are
24894 +                        * incompatible. */
24895 +                       break;
24896 +               }
24897 +
24898 +               assert("nikita-1844", (ret == 0)
24899 +                      || ((ret == -E_REPEAT) && !non_blocking));
24900 +               /* If we can get the lock... Try to capture first before
24901 +                  taking the lock. */
24902 +
24903 +               /* first handle commonest case where node and txnh are already
24904 +                * in the same atom. */
24905 +               /* safe to do without taking locks, because:
24906 +                *
24907 +                * 1. read of aligned word is atomic with respect to writes to
24908 +                * this word
24909 +                *
24910 +                * 2. false negatives are handled in reiser4_try_capture().
24911 +                *
24912 +                * 3. false positives are impossible.
24913 +                *
24914 +                * PROOF: left as an exercise to the curious reader.
24915 +                *
24916 +                * Just kidding. Here is one:
24917 +                *
24918 +                * At the time T0 txnh->atom is stored in txnh_atom.
24919 +                *
24920 +                * At the time T1 node->atom is stored in node_atom.
24921 +                *
24922 +                * At the time T2 we observe that
24923 +                *
24924 +                *     txnh_atom != NULL && node_atom == txnh_atom.
24925 +                *
24926 +                * Imagine that at this moment we acquire node and txnh spin
24927 +                * lock in this order. Suppose that under spin lock we have
24928 +                *
24929 +                *     node->atom != txnh->atom,                       (S1)
24930 +                *
24931 +                * at the time T3.
24932 +                *
24933 +                * txnh->atom != NULL still, because txnh is open by the
24934 +                * current thread.
24935 +                *
24936 +                * Suppose node->atom == NULL, that is, node was un-captured
24937 +                * between T1, and T3. But un-capturing of formatted node is
24938 +                * always preceded by the call to reiser4_invalidate_lock(),
24939 +                * which marks znode as JNODE_IS_DYING under zlock spin
24940 +                * lock. Contradiction, because can_lock_object() above checks
24941 +                * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
24942 +                *
24943 +                * Suppose that node->atom != node_atom, that is, the atom the
24944 +                * node belongs to was fused into another atom: node_atom was fused
24945 +                * into node->atom. Atom of txnh was equal to node_atom at T2,
24946 +                * which means that under spin lock, txnh->atom == node->atom,
24947 +                * because txnh->atom can only follow fusion
24948 +                * chain. Contradicts S1.
24949 +                *
24950 +                * The same for hypothesis txnh->atom != txnh_atom. Hence,
24951 +                * node->atom == node_atom == txnh_atom == txnh->atom. Again
24952 +                * contradicts S1. Hence S1 is false. QED.
24953 +                *
24954 +                */
24955 +
24956 +               if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
24957 +                       ;
24958 +               } else {
24959 +                       /*
24960 +                        * unlock zlock spin lock here. It is possible for
24961 +                        * longterm_unlock_znode() to sneak in here, but there
24962 +                        * is no harm: reiser4_invalidate_lock() will mark znode
24963 +                        * as JNODE_IS_DYING and this will be noted by
24964 +                        * can_lock_object() below.
24965 +                        */
24966 +                       spin_unlock_zlock(lock);
24967 +                       spin_lock_znode(node);
24968 +                       ret = reiser4_try_capture(ZJNODE(node), mode,
24969 +                                                 cap_flags);
24970 +                       spin_unlock_znode(node);
24971 +                       spin_lock_zlock(lock);
24972 +                       if (unlikely(ret != 0)) {
24973 +                               /* In the failure case, the txnmgr releases
24974 +                                  the znode's lock (or in some cases, it was
24975 +                                  released a while ago).  There's no need to
24976 +                                  reacquire it so we should return here,
24977 +                                  avoid releasing the lock. */
24978 +                               owner->request.mode = 0;
24979 +                               break;
24980 +                       }
24981 +
24982 +                       /* Check the lock's availability again -- this is
24983 +                          because under some circumstances the capture code
24984 +                          has to release and reacquire the znode spinlock. */
24985 +                       ret = can_lock_object(owner);
24986 +               }
24987 +
24988 +               /* This time, a return of (ret == 0) means we can lock, so we
24989 +                  should break out of the loop. */
24990 +               if (likely(ret != -E_REPEAT || non_blocking))
24991 +                       break;
24992 +
24993 +               /* Lock is unavailable, we have to wait. */
24994 +               ret = reiser4_prepare_to_sleep(owner);
24995 +               if (unlikely(ret != 0))
24996 +                       break;
24997 +
24998 +               assert_spin_locked(&(node->lock.guard));
24999 +               if (hipri) {
25000 +                       /* If we are going in high priority direction then
25001 +                          increase high priority requests counter for the
25002 +                          node */
25003 +                       lock->nr_hipri_requests++;
25004 +                       if (mode == ZNODE_WRITE_LOCK)
25005 +                               lock->nr_hipri_write_requests++;
25006 +                       /* If there are no high priority owners for a node,
25007 +                          then immediately wake up low priority owners, so
25008 +                          they can detect possible deadlock */
25009 +                       if (lock->nr_hipri_owners == 0)
25010 +                               wake_up_all_lopri_owners(node);
25011 +               }
25012 +               list_add_tail(&owner->requestors_link, &lock->requestors);
25013 +
25014 +               /* Ok, here we have prepared a lock request, so unlock
25015 +                  a znode ... */
25016 +               spin_unlock_zlock(lock);
25017 +               /* ... and sleep */
25018 +               reiser4_go_to_sleep(owner);
25019 +               if (owner->request.mode == ZNODE_NO_LOCK)
25020 +                       goto request_is_done;
25021 +               spin_lock_zlock(lock);
25022 +               if (owner->request.mode == ZNODE_NO_LOCK) {
25023 +                       spin_unlock_zlock(lock);
25024 +request_is_done:
25025 +                       if (owner->request.ret_code == 0) {
25026 +                               LOCK_CNT_INC(long_term_locked_znode);
25027 +                               zref(node);
25028 +                       }
25029 +                       return owner->request.ret_code;
25030 +               }
25031 +               remove_lock_request(owner);
25032 +       }
25033 +
25034 +       return lock_tail(owner, ret, mode);
25035 +}
25036 +
25037 +/* lock object invalidation means changing of lock object state to `INVALID'
25038 +   and waiting for all other processes to cancel their lock requests. */
25039 +void reiser4_invalidate_lock(lock_handle * handle      /* path to lock
25040 +                                                        * owner and lock
25041 +                                                        * object is being
25042 +                                                        * invalidated. */ )
25043 +{
25044 +       znode *node = handle->node;
25045 +       lock_stack *owner = handle->owner;
25046 +
25047 +       assert("zam-325", owner == get_current_lock_stack());
25048 +       assert("zam-103", znode_is_write_locked(node));
25049 +       assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
25050 +       assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
25051 +       assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
25052 +       assert("nikita-3097", znode_is_wlocked_once(node));
25053 +       assert_spin_locked(&(node->lock.guard));
25054 +
25055 +       if (handle->signaled)
25056 +               atomic_dec(&owner->nr_signaled);
25057 +
25058 +       ZF_SET(node, JNODE_IS_DYING);
25059 +       unlink_object(handle);
25060 +       node->lock.nr_readers = 0;
25061 +
25062 +       invalidate_all_lock_requests(node);
25063 +       spin_unlock_zlock(&node->lock);
25064 +}
25065 +
25066 +/* Initializes lock_stack. */
25067 +void init_lock_stack(lock_stack * owner /* pointer to allocated structure */)
25068 +{
25069 +       /* nr_signaled, request and wakeup are not set here -- presumably the
25070 +          caller hands in zeroed memory (TODO confirm) */
25071 +       owner->curpri = 1;
25072 +       spin_lock_init(&owner->sguard);
25073 +       init_waitqueue_head(&owner->wait);
25074 +       INIT_LIST_HEAD(&owner->locks);
25075 +       INIT_LIST_HEAD(&owner->requestors_link);
25076 +}
25077 +
25078 +/* Initializes lock object. */
25079 +void reiser4_init_lock(zlock * lock /* allocated, uninitialized lock object */)
25080 +{
25081 +       /* zero every counter first, then set up the members that need a
25082 +          dedicated initializer */
25083 +       memset(lock, 0, sizeof(*lock));
25084 +       INIT_LIST_HEAD(&lock->owners);
25085 +       INIT_LIST_HEAD(&lock->requestors);
25086 +       spin_lock_init(&lock->guard);
25087 +}
25088 +
25089 +/* Transfer a lock handle (presumably so that variables can be moved between
25090 +   stack and heap locations). */
25091 +static void
25092 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
25093 +{
25094 +       znode *node = old->node;
25095 +       lock_stack *owner = old->owner;
25096 +       int signaled;
25097 +
25098 +       /* locks_list, modified by link_object() is not protected by
25099 +          anything. This is valid because only current thread ever modifies
25100 +          locks_list of its lock_stack.
25101 +        */
25102 +       assert("nikita-1827", owner == get_current_lock_stack());
25103 +       assert("nikita-1831", new->owner == NULL);
25104 +
25105 +       spin_lock_zlock(&node->lock);
25106 +
25107 +       signaled = old->signaled;
25108 +       if (unlink_old) {       /* move: @old is detached, @new replaces it */
25109 +               unlink_object(old);
25110 +       } else {                /* copy: @old stays, count one more lock */
25111 +               if (node->lock.nr_readers > 0) {
25112 +                       node->lock.nr_readers += 1; /* one more reader */
25113 +               } else {
25114 +                       node->lock.nr_readers -= 1; /* one more write lock */
25115 +               }
25116 +               if (signaled)
25117 +                       atomic_inc(&owner->nr_signaled);
25118 +               if (owner->curpri)
25119 +                       node->lock.nr_hipri_owners += 1;
25120 +               LOCK_CNT_INC(long_term_locked_znode);
25121 +
25122 +               zref(node);
25123 +       }
25124 +       link_object(new, owner, node);
25125 +       new->signaled = signaled;
25126 +
25127 +       spin_unlock_zlock(&node->lock);
25128 +}
25129 +
25130 +void move_lh(lock_handle * new, lock_handle * old)
25131 +{
25132 +       move_lh_internal(new, old, /*unlink_old */ 1);
25133 +}
25134 +
25135 +void copy_lh(lock_handle * new, lock_handle * old)
25136 +{
25137 +       move_lh_internal(new, old, /*unlink_old */ 0);
25138 +}
25139 +
25140 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false
25141 + */
25142 +int reiser4_check_deadlock(void)
25143 +{
25144 +       /* non-zero while at least one signal to yield a lock is counted */
25145 +       return atomic_read(&get_current_lock_stack()->nr_signaled) != 0;
25146 +}
25147 +
25148 +/* Before going to sleep we re-check "release lock" requests which might come
25149 +   from threads with hi-pri lock priorities. */
25150 +int reiser4_prepare_to_sleep(lock_stack * owner)
25151 +{
25152 +       assert("nikita-1847", owner == get_current_lock_stack());
25153 +
25154 +       /* no "give me the lock" message is pending: sleeping is fine */
25155 +       if (likely(atomic_read(&owner->nr_signaled) == 0))
25156 +               return 0;
25157 +
25158 +       /* signaled owners are expected to be low priority ones */
25159 +       assert("zam-959", !owner->curpri);
25160 +       return RETERR(-E_DEADLOCK);
25161 +}
25162 +
25163 +/* Wakes up a single thread */
25164 +void __reiser4_wake_up(lock_stack * owner)
25165 +{
25166 +       atomic_set(&owner->wakeup, 1);
25167 +       wake_up(&owner->wait);
25168 +}
25169 +
25170 +/* Puts a thread to sleep */
25171 +void reiser4_go_to_sleep(lock_stack * owner)
25172 +{
25173 +       /* Well, we might sleep here, so holding of any spinlocks is no-no */
25174 +       assert("nikita-3027", reiser4_schedulable());
25175 +
25176 +       wait_event(owner->wait, atomic_read(&owner->wakeup));
25177 +       atomic_set(&owner->wakeup, 0);
25178 +}
25179 +
25180 +int lock_stack_isclean(lock_stack * owner)
25181 +{
25182 +       if (!list_empty_careful(&owner->locks))
25183 +               return 0;
25184 +
25185 +       /* a stack without locks must not have pending yield signals */
25186 +       assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
25187 +       return 1;
25188 +}
25189 +
25190 +#if REISER4_DEBUG
25191 +
25192 +/*
25193 + * debugging functions
25194 + */
25195 +
25196 +static void list_check(struct list_head *head)
25197 +{
25198 +       struct list_head *pos;
25199 +
25200 +       list_for_each(pos, head)
25201 +               assert("", (pos->prev != NULL && pos->next != NULL &&
25202 +                           pos->prev->next == pos && pos->next->prev == pos));
25203 +}
25204 +
25205 +/* check consistency of locking data-structures hanging off the @stack */
25206 +static void check_lock_stack(lock_stack * stack)
25207 +{
25208 +       spin_lock_stack(stack);
25209 +       /* check that stack->locks is not corrupted */
25210 +       list_check(&stack->locks);
25211 +       spin_unlock_stack(stack);
25212 +}
25213 +
25214 +/* check consistency of locking data structures */
25215 +void check_lock_data(void)
25216 +{
25217 +       check_lock_stack(&get_current_context()->stack);
25218 +}
25219 +
25220 +/* check consistency of locking data structures for @node */
25221 +void check_lock_node_data(znode * node)
25222 +{
25223 +       spin_lock_zlock(&node->lock);
25224 +       list_check(&node->lock.owners);
25225 +       list_check(&node->lock.requestors);
25226 +       spin_unlock_zlock(&node->lock);
25227 +}
25228 +
25229 +/* check that given lock request is dead lock safe. This check is, of course,
25230 + * not exhaustive. */
25231 +static int
25232 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
25233 +                        znode_lock_request request)
25234 +{
25235 +       lock_stack *owner;
25236 +
25237 +       owner = get_current_lock_stack();
25238 +       /*
25239 +        * check that hipri lock request is not issued when there are locked
25240 +        * nodes at the higher levels.
25241 +        */
25242 +       if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
25243 +           znode_get_level(node) != 0) {
25244 +               lock_handle *item;
25245 +
25246 +               list_for_each_entry(item, &owner->locks, locks_link) {
25247 +                       znode *other;
25248 +
25249 +                       other = item->node;
25250 +
25251 +                       if (znode_get_level(other) == 0)
25252 +                               continue;
25253 +                       if (znode_get_level(other) > znode_get_level(node))
25254 +                               return 0;
25255 +               }
25256 +       }
25257 +       return 1;
25258 +}
25259 +
25260 +#endif
25261 +
25262 +/* return pointer to static storage with name of lock_mode. For
25263 +    debugging */
25264 +const char *lock_mode_name(znode_lock_mode lock/* lock mode to get name of */)
25265 +{
25266 +       /* scratch buffer for the "unknown" case; it is static, so the result
25267 +          is overwritten by the next such call (debugging use only) */
25268 +       static char buf[30];
25269 +
25270 +       if (lock == ZNODE_READ_LOCK)
25271 +               return "read";
25272 +       if (lock == ZNODE_WRITE_LOCK)
25273 +               return "write";
25274 +       snprintf(buf, sizeof(buf), "unknown: %i", lock);
25275 +       return buf;
25276 +}
25277 +
25278 +/* Make Linus happy.
25279 +   Local variables:
25280 +   c-indentation-style: "K&R"
25281 +   mode-name: "LC"
25282 +   c-basic-offset: 8
25283 +   tab-width: 8
25284 +   fill-column: 79
25285 +   End:
25286 +*/
25287 diff -puN /dev/null fs/reiser4/lock.h
25288 --- /dev/null
25289 +++ a/fs/reiser4/lock.h
25290 @@ -0,0 +1,250 @@
25291 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25292 + * reiser4/README */
25293 +
25294 +/* Long term locking data structures. See lock.c for details. */
25295 +
25296 +#ifndef __LOCK_H__
25297 +#define __LOCK_H__
25298 +
25299 +#include "forward.h"
25300 +#include "debug.h"
25301 +#include "dformat.h"
25302 +#include "key.h"
25303 +#include "coord.h"
25304 +#include "plugin/node/node.h"
25305 +#include "txnmgr.h"
25306 +#include "readahead.h"
25307 +
25308 +#include <linux/types.h>
25309 +#include <linux/spinlock.h>
25310 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
25311 +#include <asm/atomic.h>
25312 +#include <linux/wait.h>
25313 +
25314 +/* Per-znode lock object */
25315 +struct zlock {
25316 +       spinlock_t guard;       /* the zlock spin lock, see spin_lock_zlock() */
25317 +       /* The number of readers if positive; the number of recursively taken
25318 +          write locks if negative. Protected by zlock spin lock. */
25319 +       int nr_readers;
25320 +       /* A number of processes (lock_stacks) that have this object
25321 +          locked with high priority */
25322 +       unsigned nr_hipri_owners;
25323 +       /* A number of attempts to lock znode in high priority direction */
25324 +       unsigned nr_hipri_requests;
25325 +       /* nr_hipri_write_requests: the write-mode part of nr_hipri_requests.
25326 +          owners: lock_handle objects of all lock_stacks holding this lock */
25327 +       unsigned nr_hipri_write_requests;
25328 +       struct list_head owners;
25329 +       /* A linked list of lock_stacks that wait for this lock */
25330 +       struct list_head requestors;
25331 +};
25332 +
25333 +static inline void spin_lock_zlock(zlock *lock)
25334 +{
25335 +       /* check that zlock is not locked */
25336 +       assert("", LOCK_CNT_NIL(spin_locked_zlock));
25337 +       /* check that spinlocks of lower priorities are not held */
25338 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
25339 +
25340 +       spin_lock(&lock->guard);
25341 +
25342 +       LOCK_CNT_INC(spin_locked_zlock);
25343 +       LOCK_CNT_INC(spin_locked);
25344 +}
25345 +
25346 +static inline void spin_unlock_zlock(zlock *lock)
25347 +{
25348 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
25349 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
25350 +
25351 +       LOCK_CNT_DEC(spin_locked_zlock);
25352 +       LOCK_CNT_DEC(spin_locked);
25353 +
25354 +       spin_unlock(&lock->guard);
25355 +}
25356 +
25357 +#define lock_is_locked(lock)          ((lock)->nr_readers != 0)
25358 +#define lock_is_rlocked(lock)         ((lock)->nr_readers > 0)
25359 +#define lock_is_wlocked(lock)         ((lock)->nr_readers < 0)
25360 +#define lock_is_wlocked_once(lock)    ((lock)->nr_readers == -1)
25361 +#define lock_can_be_rlocked(lock)     ((lock)->nr_readers >= 0)
25362 +#define lock_mode_compatible(lock, mode)                               \
25363 +             (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
25364 +             ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
25365 +
25366 +/* Since we have R/W znode locks we need additional bidirectional `link'
25367 +   objects to implement n<->m relationship between lock owners and lock
25368 +   objects. We call them `lock handles'.
25369 +
25370 +   Locking: see lock.c/"SHORT-TERM LOCKING"
25371 +*/
25372 +struct lock_handle {
25373 +       /* This flag indicates that a signal to yield a lock was passed to
25374 +          lock owner and counted in owner->nr_signaled
25375 +
25376 +          Locking: this is accessed under zlock spin lock of ->node.
25377 +        */
25378 +       int signaled;
25379 +       /* A link to owner of a lock */
25380 +       lock_stack *owner;
25381 +       /* A link to znode locked */
25382 +       znode *node;
25383 +       /* Link in owner->locks, a list of all locks for a process */
25384 +       struct list_head locks_link;
25385 +       /* A list of all owners for a znode */
25386 +       struct list_head owners_link;
25387 +};
25388 +
25389 +struct lock_request {
25390 +       /* A pointer to uninitialized link object */
25391 +       lock_handle *handle;
25392 +       /* A pointer to the object we want to lock */
25393 +       znode *node;
25394 +       /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
25395 +       znode_lock_mode mode;
25396 +       /* how dispatch_lock_requests() returns lock request result code */
25397 +       int ret_code;
25398 +};
25399 +
25400 +/* A lock stack structure for accumulating locks owned by a process */
25401 +struct lock_stack {
25402 +       /* A guard lock protecting a lock stack */
25403 +       spinlock_t sguard;
25404 +       /* number of znodes which were requested by high priority processes */
25405 +       atomic_t nr_signaled;
25406 +       /* Current priority of a process
25407 +
25408 +          This is only accessed by the current thread and thus requires no
25409 +          locking.
25410 +        */
25411 +       int curpri;
25412 +       /* A list of all locks owned by this process. Elements can be added to
25413 +        * this list only by the current thread. ->node pointers in this list
25414 +        * can be only changed by the current thread. */
25415 +       struct list_head locks;
25416 +       /* When lock_stack waits for the lock, it puts itself on double-linked
25417 +          requestors list of that lock */
25418 +       struct list_head requestors_link;
25419 +       /* Current lock request info.
25420 +
25421 +          This is only accessed by the current thread and thus requires no
25422 +          locking.
25423 +        */
25424 +       struct lock_request request;
25425 +       /* the following two fields are the lock stack's
25426 +        * synchronization object to use with the standard linux/wait.h
25427 +        * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
25428 +        * usage details. */
25429 +       wait_queue_head_t wait;
25430 +       atomic_t wakeup;
25431 +#if REISER4_DEBUG
25432 +       int nr_locks;           /* number of lock handles in the above list */
25433 +#endif
25434 +};
25435 +
25436 +/*
25437 +  User-visible znode locking functions
25438 +*/
25439 +
25440 +extern int longterm_lock_znode(lock_handle * handle,
25441 +                              znode * node,
25442 +                              znode_lock_mode mode,
25443 +                              znode_lock_request request);
25444 +
25445 +extern void longterm_unlock_znode(lock_handle * handle);
25446 +
25447 +extern int reiser4_check_deadlock(void);
25448 +
25449 +extern lock_stack *get_current_lock_stack(void);
25450 +
25451 +extern void init_lock_stack(lock_stack * owner);
25452 +extern void reiser4_init_lock(zlock * lock);
25453 +
25454 +static inline void init_lh(lock_handle *lh)
25455 +{
25456 +#if REISER4_DEBUG
25457 +       memset(lh, 0, sizeof *lh);
25458 +       INIT_LIST_HEAD(&lh->locks_link);
25459 +       INIT_LIST_HEAD(&lh->owners_link);
25460 +#else
25461 +       lh->node = NULL;
25462 +#endif
25463 +}
25464 +
25465 +static inline  void done_lh(lock_handle *lh)
25466 +{
25467 +       assert("zam-342", lh != NULL);
25468 +       if (lh->node != NULL)
25469 +               longterm_unlock_znode(lh);
25470 +}
25471 +
25472 +extern void move_lh(lock_handle * new, lock_handle * old);
25473 +extern void copy_lh(lock_handle * new, lock_handle * old);
25474 +
25475 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
25476 +extern void reiser4_go_to_sleep(lock_stack * owner);
25477 +extern void __reiser4_wake_up(lock_stack * owner);
25478 +
25479 +extern int lock_stack_isclean(lock_stack * owner);
25480 +
25481 +/* zlock object state check macros: only used in assertions. Both forms imply
25482 +   that the lock is held by the current thread. */
25483 +extern int znode_is_write_locked(const znode *);
25484 +extern void reiser4_invalidate_lock(lock_handle *);
25485 +
25486 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
25487 +#define spin_ordering_pred_stack(stack)                        \
25488 +       (LOCK_CNT_NIL(spin_locked_stack) &&             \
25489 +        LOCK_CNT_NIL(spin_locked_txnmgr) &&            \
25490 +        LOCK_CNT_NIL(spin_locked_inode) &&             \
25491 +        LOCK_CNT_NIL(rw_locked_cbk_cache) &&           \
25492 +        LOCK_CNT_NIL(spin_locked_super_eflush))
25493 +
25494 +static inline void spin_lock_stack(lock_stack *stack)
25495 +{
25496 +       assert("", spin_ordering_pred_stack(stack));
25497 +       spin_lock(&(stack->sguard));
25498 +       LOCK_CNT_INC(spin_locked_stack);
25499 +       LOCK_CNT_INC(spin_locked);
25500 +}
25501 +
25502 +static inline void spin_unlock_stack(lock_stack *stack)
25503 +{
25504 +       assert_spin_locked(&(stack->sguard));
25505 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
25506 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
25507 +       LOCK_CNT_DEC(spin_locked_stack);
25508 +       LOCK_CNT_DEC(spin_locked);
25509 +       spin_unlock(&(stack->sguard));
25510 +}
25511 +
25512 +static inline void reiser4_wake_up(lock_stack * owner)
25513 +{
25514 +       spin_lock_stack(owner);
25515 +       __reiser4_wake_up(owner);
25516 +       spin_unlock_stack(owner);
25517 +}
25518 +
25519 +const char *lock_mode_name(znode_lock_mode lock);
25520 +
25521 +#if REISER4_DEBUG
25522 +extern void check_lock_data(void);
25523 +extern void check_lock_node_data(znode * node);
25524 +#else /* !REISER4_DEBUG: keep the call syntax of the debug build valid */
25525 +#define check_lock_data() noop
25526 +#define check_lock_node_data(node) noop
25527 +#endif
25528 +
25529 +/* __LOCK_H__ */
25530 +#endif
25531 +
25532 +/* Make Linus happy.
25533 +   Local variables:
25534 +   c-indentation-style: "K&R"
25535 +   mode-name: "LC"
25536 +   c-basic-offset: 8
25537 +   tab-width: 8
25538 +   fill-column: 120
25539 +   End:
25540 +*/
25541 diff -puN /dev/null fs/reiser4/oid.c
25542 --- /dev/null
25543 +++ a/fs/reiser4/oid.c
25544 @@ -0,0 +1,141 @@
25545 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
25546 +
25547 +#include "debug.h"
25548 +#include "super.h"
25549 +#include "txnmgr.h"
25550 +
25551 +/* we used to have oid allocation plugin. It was removed because it
25552 +   was recognized as providing unneeded level of abstraction. If one
25553 +   ever will find it useful - look at yet_unneeded_abstractions/oid
25554 +*/
25555 +
25556 +/*
25557 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
25558 + * are provided by disk format plugin that reads them from the disk during
25559 + * mount.
25560 + */
25561 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
25562 +{
25563 +       reiser4_super_info_data *info = get_super_private(super);
25564 +
25565 +       /* seed the in-memory allocator state with the values passed in */
25566 +       info->oids_in_use = nr_files;
25567 +       info->next_to_use = next;
25568 +
25569 +       return 0;
25570 +}
25571 +
25572 +/*
25573 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
25574 + * runs out of oids.
25575 + */
25576 +oid_t oid_allocate(struct super_block *super)
25577 +{
25578 +       reiser4_super_info_data *info = get_super_private(super);
25579 +       oid_t result = ABSOLUTE_MAX_OID;
25580 +
25581 +       spin_lock_reiser4_super(info);
25582 +       /* hand out the next oid unless the oid space is exhausted */
25583 +       if (info->next_to_use != ABSOLUTE_MAX_OID) {
25584 +               result = info->next_to_use;
25585 +               info->next_to_use++;
25586 +               info->oids_in_use++;
25587 +       }
25588 +       spin_unlock_reiser4_super(info);
25589 +
25590 +       return result;
25591 +}
25592 +
25593 +/*
25594 + * Tell oid allocator that @oid is now free.
25595 + */
25596 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
25597 +{
25598 +       reiser4_super_info_data *info = get_super_private(super);
25599 +
25600 +       /* only the in-use counter changes; @oid itself is not looked at */
25601 +       spin_lock_reiser4_super(info);
25602 +       info->oids_in_use -= 1;
25603 +       spin_unlock_reiser4_super(info);
25604 +
25605 +       return 0;
25606 +}
25607 +
25608 +/*
25609 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
25610 + * without actually allocating it. This is used by disk format plugin to save
25611 + * oid allocator state on the disk.
25612 + */
25613 +oid_t oid_next(const struct super_block *super)
25614 +{
25615 +       reiser4_super_info_data *info = get_super_private(super);
25616 +       oid_t next;
25617 +
25618 +       /* read-only snapshot taken under spin_lock_reiser4_super() */
25619 +       spin_lock_reiser4_super(info);
25620 +       next = info->next_to_use;
25621 +       spin_unlock_reiser4_super(info);
25622 +
25623 +       return next;
25624 +}
25625 +
25626 +/*
25627 + * returns number of currently used oids, or -1 if that number does not fit
25628 + * into a long. This is used by statfs(2) to report number of "inodes" and by
25629 + * disk format plugin to save oid allocator state on the disk.
25630 + */
25631 +long oids_used(const struct super_block *super)
25632 +{
25633 +       reiser4_super_info_data *sbinfo;
25634 +       oid_t used;
25635 +
25636 +       sbinfo = get_super_private(super);
25637 +
25638 +       spin_lock_reiser4_super(sbinfo);
25639 +       used = sbinfo->oids_in_use;
25640 +       spin_unlock_reiser4_super(sbinfo);
25641 +       /* compare against LONG_MAX of this platform, spelled ~0UL >> 1 */
25642 +       if (used < (__u64) (~0UL >> 1))
25643 +               return (long)used;
25644 +       return (long)-1;
25645 +}
25646 +
25647 +/*
25648 + * Count oid as allocated in atom. This is done after call to oid_allocate()
25649 + * at the point when we are irrevocably committed to creation of the new file
25650 + * (i.e., when oid allocation cannot be any longer rolled back due to some
25651 + * error).
25652 + */
25653 +void oid_count_allocated(void)
25654 +{
25655 +       /* the atom is unlocked below, right after its counter is bumped */
25656 +       txn_atom *cur = get_current_atom_locked();
25657 +
25658 +       cur->nr_objects_created += 1;
25659 +       spin_unlock_atom(cur);
25660 +}
25661 +
25662 +/*
25663 + * Count oid as free in atom. This is done after call to oid_release() at the
25664 + * point when we are irrevocably committed to the deletion of the file (i.e.,
25665 + * when oid release cannot be any longer rolled back due to some error).
25666 + */
25667 +void oid_count_released(void)
25668 +{
25669 +       /* the atom is unlocked below, right after its counter is bumped */
25670 +       txn_atom *cur = get_current_atom_locked();
25671 +
25672 +       cur->nr_objects_deleted += 1;
25673 +       spin_unlock_atom(cur);
25674 +}
25675 +
25676 +/*
25677 +   Local variables:
25678 +   c-indentation-style: "K&R"
25679 +   mode-name: "LC"
25680 +   c-basic-offset: 8
25681 +   tab-width: 8
25682 +   fill-column: 120
25683 +   scroll-step: 1
25684 +   End:
25685 +*/
25686 diff -puN /dev/null fs/reiser4/page_cache.c
25687 --- /dev/null
25688 +++ a/fs/reiser4/page_cache.c
25689 @@ -0,0 +1,714 @@
25690 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25691 + * reiser4/README */
25692 +
25693 +/* Memory pressure hooks. Fake inodes handling. */
25694 +
25695 +/*   GLOSSARY
25696 +
25697 +   . Formatted and unformatted nodes.
25698 +     Elements of reiser4 balanced tree to store data and metadata.
25699 +     Unformatted nodes are pointed to by extent pointers. Such nodes
25700 +     are used to store data of large objects. Unlike unformatted nodes,
25701 +     formatted ones have associated format described by node4X plugin.
25702 +
25703 +   . Jnode (or journal node)
25704 +     The in-memory header which is used to track formatted and unformatted
25705 +     nodes, bitmap nodes, etc. In particular, jnodes are used to track
25706 +     transactional information associated with each block(see reiser4/jnode.c
25707 +     for details).
25708 +
25709 +   . Znode
25710 +     The in-memory header which is used to track formatted nodes. Contains
25711 +     embedded jnode (see reiser4/znode.c for details).
25712 +*/
25713 +
25714 +/* We store all file system meta data (and data, of course) in the page cache.
25715 +
25716 +   What does this mean? Instead of using bread/brelse we create special
25717 +   "fake" inode (one per super block) and store content of formatted nodes
25718 +   into pages bound to this inode in the page cache. In newer kernels bread()
25719 +   already uses inode attached to block device (bd_inode). Advantage of having
25720 +   our own fake inode is that we can install appropriate methods in its
25721 +   address_space operations. Such methods are called by VM on memory pressure
25722 +   (or during background page flushing) and we can use them to react
25723 +   appropriately.
25724 +
25725 +   In initial version we only support one block per page. Support for multiple
25726 +   blocks per page is complicated by relocation.
25727 +
25728 +   To each page, used by reiser4, jnode is attached. jnode is analogous to
25729 +   buffer head. Difference is that jnode is bound to the page permanently:
25730 +   jnode cannot be removed from memory until its backing page is.
25731 +
25732 +   jnode contains pointer to page (->pg field) and page contains pointer to
25733 +   jnode in ->private field. Pointer from jnode to page is protected by
25734 +   jnode's spinlock and pointer from page to jnode is protected by page lock
25735 +   (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
25736 +   lock. To go into reverse direction use jnode_lock_page() function that uses
25737 +   standard try-lock-and-release device.
25738 +
25739 +   Properties:
25740 +
25741 +   1. when jnode-to-page mapping is established (by jnode_attach_page()), page
25742 +   reference counter is increased.
25743 +
25744 +   2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
25745 +   reference counter is decreased.
25746 +
25747 +   3. on jload() reference counter on jnode page is increased, page is
25748 +   kmapped and `referenced'.
25749 +
25750 +   4. on jrelse() inverse operations are performed.
25751 +
25752 +   5. kmapping/kunmapping of unformatted pages is done by read/write methods.
25753 +
25754 +   DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
25755 +   historically.]
25756 +
25757 +   [In the following discussion, `lock' invariably means long term lock on
25758 +   znode.] (What about page locks?)
25759 +
25760 +   There is some special class of deadlock possibilities related to memory
25761 +   pressure. Locks acquired by other reiser4 threads are accounted for in
25762 +   deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
25763 +   invoked additional hidden arc is added to the locking graph: thread that
25764 +   tries to allocate memory waits for ->vm_writeback() to finish. If this
25765 +   thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
25766 +   prevention is useless.
25767 +
25768 +   Another related problem is possibility for ->vm_writeback() to run out of
25769 +   memory itself. This is not a problem for ext2 and friends, because their
25770 +   ->vm_writeback() don't allocate much memory, but reiser4 flush is
25771 +   definitely able to allocate huge amounts of memory.
25772 +
25773 +   It seems that there is no reliable way to cope with the problems above.
25774 +   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
25775 +   context) wouldn't perform any flushing itself, but rather should just wake
25776 +   up some auxiliary thread dedicated for this purpose (or, the same thread
25777 +   that does periodic commit of old atoms (ktxnmgrd.c)).
25778 +
25779 +   Details:
25780 +
25781 +   1. Page is called `reclaimable' against particular reiser4 mount F if this
25782 +   page can be ultimately released by try_to_free_pages() under presumptions
25783 +   that:
25784 +
25785 +    a. ->vm_writeback() for F is no-op, and
25786 +
25787 +    b. none of the threads accessing F are making any progress, and
25788 +
25789 +    c. other reiser4 mounts obey the same memory reservation protocol as F
25790 +    (described below).
25791 +
25792 +   For example, clean un-pinned page, or page occupied by ext2 data are
25793 +   reclaimable against any reiser4 mount.
25794 +
25795 +   When there is more than one reiser4 mount in a system, condition (c) makes
25796 +   reclaim-ability not easily verifiable beyond trivial cases mentioned above.
25797 +
25798 +   THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
25799 +
25800 +   Fake inode is used to bound formatted nodes and each node is indexed within
25801 +   fake inode by its block number. If block size is smaller than page size, it
25802 +   may so happen that block mapped to the page with formatted node is occupied
25803 +   by unformatted node or is unallocated. This leads to some complications,
25804 +   because flushing whole page can lead to an incorrect overwrite of
25805 +   unformatted node that, moreover, can be cached in some other place as
25806 +   part of the file body. To avoid this, buffers for unformatted nodes are
25807 +   never marked dirty. Also pages in the fake inode are never marked dirty.
25808 +   This rules out usage of ->writepage() as memory pressure hook. Instead
25809 +   ->releasepage() is used.
25810 +
25811 +   Josh is concerned that page->buffer is going to die. This should not pose
25812 +   significant problem though, because we need to add some data structures to
25813 +   the page anyway (jnode) and all necessary book keeping can be put there.
25814 +
25815 +*/
25816 +
25817 +/* Life cycle of pages/nodes.
25818 +
25819 +   jnode contains reference to page and page contains reference back to
25820 +   jnode. This reference is counted in page ->count. Thus, page bound to jnode
25821 +   cannot be released back into free pool.
25822 +
25823 +    1. Formatted nodes.
25824 +
25825 +      1. formatted node is represented by znode. When new znode is created its
25826 +      ->pg pointer is NULL initially.
25827 +
25828 +      2. when node content is loaded into znode (by call to zload()) for the
25829 +      first time following happens (in call to ->read_node() or
25830 +      ->allocate_node()):
25831 +
25832 +       1. new page is added to the page cache.
25833 +
25834 +       2. this page is attached to znode and its ->count is increased.
25835 +
25836 +       3. page is kmapped.
25837 +
25838 +      3. if more calls to zload() follow (without corresponding zrelses), page
25839 +      counter is left intact and in its stead ->d_count is increased in znode.
25840 +
25841 +      4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
25842 +      ->release_node() is called and page is kunmapped as result.
25843 +
25844 +      5. at some moment node can be captured by a transaction. Its ->x_count
25845 +      is then increased by transaction manager.
25846 +
25847 +      6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
25848 +      bit set) following will happen (also see comment at the top of znode.c):
25849 +
25850 +       1. when last lock is released, node will be uncaptured from
25851 +       transaction. This releases the reference that transaction manager
25852 +       acquired at the step 5.
25853 +
25854 +       2. when last reference is released, zput() detects that node is
25855 +       actually deleted and calls ->delete_node()
25856 +       operation. page_cache_delete_node() implementation detaches jnode from
25857 +       page and releases page.
25858 +
25859 +      7. otherwise (node wasn't removed from the tree), last reference to
25860 +      znode will be released after transaction manager committed transaction
25861 +      node was in. This implies squallocing of this node (see
25862 +      flush.c). Nothing special happens at this point. Znode is still in the
25863 +      hash table and page is still attached to it.
25864 +
25865 +      8. znode is actually removed from the memory because of the memory
25866 +      pressure, or during umount (znodes_tree_done()). Anyway, znode is
25867 +      removed by the call to zdrop(). At this moment, page is detached from
25868 +      znode and removed from the inode address space.
25869 +
25870 +*/
25871 +
25872 +#include "debug.h"
25873 +#include "dformat.h"
25874 +#include "key.h"
25875 +#include "txnmgr.h"
25876 +#include "jnode.h"
25877 +#include "znode.h"
25878 +#include "block_alloc.h"
25879 +#include "tree.h"
25880 +#include "vfs_ops.h"
25881 +#include "inode.h"
25882 +#include "super.h"
25883 +#include "entd.h"
25884 +#include "page_cache.h"
25885 +#include "ktxnmgrd.h"
25886 +
25887 +#include <linux/types.h>
25888 +#include <linux/fs.h>
25889 +#include <linux/mm.h>          /* for struct page */
25890 +#include <linux/swap.h>                /* for struct page */
25891 +#include <linux/pagemap.h>
25892 +#include <linux/bio.h>
25893 +#include <linux/writeback.h>
25894 +#include <linux/blkdev.h>
25895 +
25896 +static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp);
25897 +
25898 +static struct address_space_operations formatted_fake_as_ops;
25899 +
25900 +static const oid_t fake_ino = 0x1;
25901 +static const oid_t bitmap_ino = 0x2;
25902 +static const oid_t cc_ino = 0x3;
25903 +
25904 +static void
25905 +init_fake_inode(struct super_block *super, struct inode *fake,
25906 +               struct inode **pfake)
25907 +{
25908 +       assert("nikita-2168", fake->i_state & I_NEW);
25909 +       fake->i_mapping->a_ops = &formatted_fake_as_ops;
25910 +       *pfake = fake;
25911 +       /* NOTE-NIKITA something else? */
25912 +       unlock_new_inode(fake);
25913 +}
25914 +
25915 +/**
25916 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
25917 + * @super: super block to init fake inode for
25918 + *
25919 + * Initializes fake inode to which formatted nodes are bound in the page cache,
25920 + * inode for bitmaps and the cc inode. Returns 0, or RETERR(-ENOMEM) on failure.
25921 + */
25922 +int reiser4_init_formatted_fake(struct super_block *super)
25923 +{
25924 +       struct inode *fake;
25925 +       struct inode *bitmap;
25926 +       struct inode *cc;
25927 +       reiser4_super_info_data *sinfo;
25928 +
25929 +       assert("nikita-1703", super != NULL);
25930 +
25931 +       sinfo = get_super_private_nocheck(super);
25932 +       fake = iget_locked(super, oid_to_ino(fake_ino));
25933 +
25934 +       if (fake != NULL) {
25935 +               init_fake_inode(super, fake, &sinfo->fake);
25936 +
25937 +               bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
25938 +               if (bitmap != NULL) {
25939 +                       init_fake_inode(super, bitmap, &sinfo->bitmap);
25940 +
25941 +                       cc = iget_locked(super, oid_to_ino(cc_ino));
25942 +                       if (cc != NULL) {
25943 +                               init_fake_inode(super, cc, &sinfo->cc);
25944 +                               return 0;
25945 +                       } else {
25946 +                               iput(sinfo->fake);
25947 +                               iput(sinfo->bitmap);
25948 +                               sinfo->fake = NULL;
25949 +                               sinfo->bitmap = NULL;
25950 +                       }
25951 +               } else {
25952 +                       iput(sinfo->fake);
25953 +                       sinfo->fake = NULL;
25954 +               }
25955 +       }
25956 +       return RETERR(-ENOMEM);
25957 +}
25958 +
25959 +/**
25960 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
25961 + * @super: super block to release fake inodes of
25962 + *
25963 + * Releases inodes which were used as address spaces of bitmap and formatted
25964 + * nodes, as well as the cc inode.
25965 + */
25966 +void reiser4_done_formatted_fake(struct super_block *super)
25967 +{
25968 +       reiser4_super_info_data *sinfo;
25969 +
25970 +       sinfo = get_super_private_nocheck(super);
25971 +
25972 +       if (sinfo->fake != NULL) {
25973 +               iput(sinfo->fake);
25974 +               sinfo->fake = NULL;
25975 +       }
25976 +
25977 +       if (sinfo->bitmap != NULL) {
25978 +               iput(sinfo->bitmap);
25979 +               sinfo->bitmap = NULL;
25980 +       }
25981 +
25982 +       if (sinfo->cc != NULL) {
25983 +               iput(sinfo->cc);
25984 +               sinfo->cc = NULL;
25985 +       }
25986 +       return;
25987 +}
25988 +
25989 +void reiser4_wait_page_writeback(struct page *page)
25990 +{
25991 +       assert("zam-783", PageLocked(page));
25992 +
25993 +       do {
25994 +               unlock_page(page);
25995 +               wait_on_page_writeback(page);
25996 +               lock_page(page);
25997 +       } while (PageWriteback(page));
25998 +}
25999 +
26000 +/* return tree @page is in */
26001 +reiser4_tree *reiser4_tree_by_page(const struct page *page/* page to query */)
26002 +{
26003 +       assert("nikita-2461", page != NULL);
26004 +       return &get_super_private(page->mapping->host->i_sb)->tree;
26005 +}
26006 +
26007 +/* completion handler for single page bio-based read.
26008 +
26009 +   mpage_end_io_read() would also do. But it's static.
26010 +
26011 +*/
26012 +static void
26013 +end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG)
26014 +{
26015 +       struct page *page;
26016 +
26017 +       page = bio->bi_io_vec[0].bv_page;
26018 +
26019 +       if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
26020 +               SetPageUptodate(page);
26021 +       } else {
26022 +               ClearPageUptodate(page);
26023 +               SetPageError(page);
26024 +       }
26025 +       unlock_page(page);
26026 +       bio_put(bio);
26027 +}
26028 +
26029 +/* completion handler for single page bio-based write.
26030 +
26031 +   mpage_end_io_write() would also do. But it's static.
26032 +
26033 +*/
26034 +static void
26035 +end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG)
26036 +{
26037 +       struct page *page;
26038 +
26039 +       page = bio->bi_io_vec[0].bv_page;
26040 +
26041 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
26042 +               SetPageError(page);
26043 +       end_page_writeback(page);
26044 +       bio_put(bio);
26045 +}
26046 +
26047 +/* ->readpage() method for formatted nodes */
26048 +static int formatted_readpage(struct file *f UNUSED_ARG,
26049 +                             struct page *page/* page to read */)
26050 +{
26051 +       assert("nikita-2412", PagePrivate(page) && jprivate(page));
26052 +       return reiser4_page_io(page, jprivate(page), READ,
26053 +                              reiser4_ctx_gfp_mask_get());
26054 +}
26055 +
26056 +/**
26057 + * reiser4_page_io - submit single-page bio request
26058 + * @page: page to perform io for
26059 + * @node: jnode of page
26060 + * @rw: read or write
26061 + * @gfp: gfp mask for bio allocation
26062 + *
26063 + * Submits single page read or write.
26064 + */
26065 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
26066 +{
26067 +       struct bio *bio;
26068 +       int result;
26069 +
26070 +       assert("nikita-2094", page != NULL);
26071 +       assert("nikita-2226", PageLocked(page));
26072 +       assert("nikita-2634", node != NULL);
26073 +       assert("nikita-2893", rw == READ || rw == WRITE);
26074 +
26075 +       if (rw) {
26076 +               if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
26077 +                       unlock_page(page);
26078 +                       return 0;
26079 +               }
26080 +       }
26081 +
26082 +       bio = page_bio(page, node, rw, gfp);
26083 +       if (!IS_ERR(bio)) {
26084 +               if (rw == WRITE) {
26085 +                       set_page_writeback(page);
26086 +                       unlock_page(page);
26087 +               }
26088 +               reiser4_submit_bio(rw, bio);
26089 +               result = 0;
26090 +       } else {
26091 +               unlock_page(page);
26092 +               result = PTR_ERR(bio);
26093 +       }
26094 +
26095 +       return result;
26096 +}
26097 +
26098 +/* helper function to construct single-page bio for @page of jnode @node; @rw
26099 +   selects the completion handler. Returns bio, or ERR_PTR() on failure */
26100 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
26101 +{
26102 +       struct bio *bio;
26103 +       assert("nikita-2092", page != NULL);
26104 +       assert("nikita-2633", node != NULL);
26105 +
26106 +       /* Simple implementation in the assumption that blocksize == pagesize.
26107 +          We only have to submit one block, but submit_bh() will allocate bio
26108 +          anyway, so let's use all the bells-and-whistles of bio code. */
26109 +
26110 +       bio = bio_alloc(gfp, 1);
26111 +       if (bio != NULL) {
26112 +               int blksz;
26113 +               struct super_block *super;
26114 +               reiser4_block_nr blocknr;
26115 +
26116 +               super = page->mapping->host->i_sb;
26117 +               assert("nikita-2029", super != NULL);
26118 +               blksz = super->s_blocksize;
26119 +               assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
26120 +
26121 +               spin_lock_jnode(node);
26122 +               blocknr = *jnode_get_io_block(node);
26123 +               spin_unlock_jnode(node);
26124 +
26125 +               assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
26126 +               assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
26127 +
26128 +               bio->bi_bdev = super->s_bdev;
26129 +               /* fill bio->bi_sector before calling bio_add_page(), because
26130 +                * q->merge_bvec_fn may want to inspect it (see
26131 +                * drivers/md/linear.c:linear_mergeable_bvec() for example). */
26132 +               bio->bi_sector = blocknr * (blksz >> 9);
26133 +
26134 +               if (!bio_add_page(bio, page, blksz, 0)) {
26135 +                       warning("nikita-3452",
26136 +                               "Single page bio cannot be constructed");
26137 +                       bio_put(bio);   /* not returned to caller: drop it */
26138 +                       return ERR_PTR(RETERR(-EINVAL));
26139 +               }
26140 +
26141 +               /* bio -> bi_idx is filled by bio_init() */
26142 +               bio->bi_end_io = (rw == READ) ?
26143 +                   end_bio_single_page_read : end_bio_single_page_write;
26144 +
26145 +               return bio;
26146 +       } else
26147 +               return ERR_PTR(RETERR(-ENOMEM));
26148 +}
26149 +
26150 +/* this function is internally called by jnode_make_dirty() */
26151 +int reiser4_set_page_dirty_internal(struct page *page)
26152 +{
26153 +       struct address_space *mapping;
26154 +
26155 +       mapping = page->mapping;
26156 +       BUG_ON(mapping == NULL);
26157 +
26158 +       if (!TestSetPageDirty(page)) {
26159 +               if (mapping_cap_account_dirty(mapping))
26160 +                       inc_zone_page_state(page, NR_FILE_DIRTY);
26161 +
26162 +               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
26163 +       }
26164 +
26165 +       /* znode must be dirty ? */
26166 +       if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
26167 +               assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
26168 +       return 0;
26169 +}
26170 +
26171 +#if 0
26172 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
26173 +{
26174 +       if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
26175 +               return 1;
26176 +       if (ctx->super != s)
26177 +               return 1;
26178 +       if (get_super_private(s)->entd.tsk == current)
26179 +               return 0;
26180 +       if (!lock_stack_isclean(&ctx->stack))
26181 +               return 0;
26182 +       if (ctx->trans->atom != NULL)
26183 +               return 0;
26184 +       return 1;
26185 +}
26186 +#endif
26187 +
26188 +/**
26189 + * reiser4_writepage - writepage of struct address_space_operations
26190 + * @page: page to write
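26191 + * @wbc: writeback control, passed on to write_page_by_ent()
26192 + *
26193 + * Passes @page to write_page_by_ent() and returns its result.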
26194 + */
26195 +/* Common memory pressure notification. */
26196 +int reiser4_writepage(struct page *page,
26197 +                     struct writeback_control *wbc)
26198 +{
26199 +       struct super_block *s;
26200 +       reiser4_context *ctx;
26201 +
26202 +       assert("vs-828", PageLocked(page));
26203 +
26204 +       s = page->mapping->host->i_sb;
26205 +       ctx = get_current_context_check();
26206 +
26207 +       /* assert("", can_hit_entd(ctx, s)); */
26208 +       return write_page_by_ent(page, wbc);
26209 +}
26210 +
26211 +/* ->set_page_dirty() method of formatted address_space */
26212 +static int formatted_set_page_dirty(struct page *page)
26213 +{
26214 +       assert("nikita-2173", page != NULL);
26215 +       BUG();
26216 +       return __set_page_dirty_nobuffers(page);
26217 +}
26218 +
26219 +/* writepages method of address space operations in reiser4 is used to involve
26220 +   into transactions pages which are dirtied via mmap. Only regular files can
26221 +   have such pages. Fake inode is used to access formatted nodes via page
26222 +   cache. As formatted nodes can never be mmaped, fake inode's writepages has
26223 +   nothing to do */
26224 +static int
26225 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
26226 +{
26227 +       return 0;
26228 +}
26229 +
26230 +/* address space operations for the fake inode */
26231 +static struct address_space_operations formatted_fake_as_ops = {
26232 +       /* Perform a writeback of a single page as a memory-freeing
26233 +        * operation. */
26234 +       .writepage = reiser4_writepage,
26235 +       /* this is called to read formatted node */
26236 +       .readpage = formatted_readpage,
26237 +       /* ->sync_page() method of fake inode address space operations. Called
26238 +          from wait_on_page() and lock_page().
26239 +
26240 +          This is most annoyingly misnomered method. Actually it is called
26241 +          from wait_on_page_bit() and lock_page() and its purpose is to
26242 +          actually start io by jabbing device drivers.
26243 +        */
26244 +       .sync_page = block_sync_page,
26245 +       /* Write back some dirty pages from this mapping. Called from sync.
26246 +          called during sync (pdflush) */
26247 +       .writepages = writepages_fake,
26248 +       /* Set a page dirty */
26249 +       .set_page_dirty = formatted_set_page_dirty,
26250 +       /* used for read-ahead. Not applicable */
26251 +       .readpages = NULL,
26252 +       .prepare_write = NULL,
26253 +       .commit_write = NULL,
26254 +       .bmap = NULL,
26255 +       /* called just before page is being detached from inode mapping and
26256 +          removed from memory. Called on truncate, cut/squeeze, and
26257 +          umount. */
26258 +       .invalidatepage = reiser4_invalidatepage,
26259 +       /* this is called by shrink_cache() so that file system can try to
26260 +          release objects (jnodes, buffers, journal heads) attached to page
26261 +          and, may be made page itself free-able.
26262 +        */
26263 +       .releasepage = reiser4_releasepage,
26264 +       .direct_IO = NULL
26265 +};
26266 +
26267 +/* called just before page is released (no longer used by reiser4). Callers:
26268 +   jdelete() and extent2tail(). */
26269 +void reiser4_drop_page(struct page *page)
26270 +{
26271 +       assert("nikita-2181", PageLocked(page));
26272 +       clear_page_dirty_for_io(page);
26273 +       ClearPageUptodate(page);
26274 +#if defined(PG_skipped)
26275 +       ClearPageSkipped(page);
26276 +#endif
26277 +       unlock_page(page);
26278 +}
26279 +
26280 +#define JNODE_GANG_SIZE (16)
26281 +
26282 +/* find all jnodes from range specified and invalidate them */
26283 +static int
26284 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
26285 +{
26286 +       reiser4_inode *info;
26287 +       int truncated_jnodes;
26288 +       reiser4_tree *tree;
26289 +       unsigned long index;
26290 +       unsigned long end;
26291 +
26292 +       if (inode_file_plugin(inode) ==
26293 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
26294 +               /*
26295 +                * No need to get rid of jnodes here: if the single jnode of
26296 +                * page cluster did not have page, then it was found and killed
26297 +                * before in
26298 +                * truncate_complete_page_cluster()->jput()->jput_final(),
26299 +                * otherwise it will be dropped by reiser4_invalidatepage()
26300 +                */
26301 +               return 0;
26302 +       truncated_jnodes = 0;
26303 +
26304 +       info = reiser4_inode_data(inode);
26305 +       tree = reiser4_tree_by_inode(inode);
26306 +
26307 +       index = from;
26308 +       end = from + count;
26309 +
26310 +       while (1) {
26311 +               jnode *gang[JNODE_GANG_SIZE];
26312 +               int taken;
26313 +               int i;
26314 +               jnode *node;
26315 +
26316 +               assert("nikita-3466", index <= end);
26317 +
26318 +               read_lock_tree(tree);
26319 +               taken =
26320 +                   radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
26321 +                                          (void **)gang, index,
26322 +                                          JNODE_GANG_SIZE);
26323 +               for (i = 0; i < taken; ++i) {
26324 +                       node = gang[i];
26325 +                       if (index_jnode(node) < end)
26326 +                               jref(node);
26327 +                       else
26328 +                               gang[i] = NULL;
26329 +               }
26330 +               read_unlock_tree(tree);
26331 +
26332 +               for (i = 0; i < taken; ++i) {
26333 +                       node = gang[i];
26334 +                       if (node != NULL) {
26335 +                               index = max(index, index_jnode(node));
26336 +                               spin_lock_jnode(node);
26337 +                               assert("edward-1457", node->pg == NULL);
26338 +                               /* this is always called after
26339 +                                  truncate_inode_pages_range(). Therefore, here
26340 +                                  jnode can not have page. New pages can not be
26341 +                                  created because truncate_jnodes_range goes
26342 +                                  under exclusive access on file obtained,
26343 +                                  where as new page creation requires
26344 +                                  non-exclusive access obtained */
26345 +                               JF_SET(node, JNODE_HEARD_BANSHEE);
26346 +                               reiser4_uncapture_jnode(node);
26347 +                               unhash_unformatted_jnode(node);
26348 +                               truncated_jnodes++;
26349 +                               jput(node);
26350 +                       } else
26351 +                               break;
26352 +               }
26353 +               if (i != taken || taken == 0)
26354 +                       break;
26355 +       }
26356 +       return truncated_jnodes;
26357 +}
26358 +
26359 +/* Truncating files in reiser4: problems and solutions.
26360 +
26361 +   VFS calls fs's truncate after it has called truncate_inode_pages()
26362 +   to get rid of pages corresponding to part of file being truncated.
26363 +   In reiser4 it may cause existence of unallocated extents which do
26364 +   not have jnodes. Flush code does not expect that. Solution of this
26365 +   problem is straightforward. As vfs's truncate is implemented using
26366 +   setattr operation, it seems reasonable to have ->setattr() that
26367 +   will cut file body. However, flush code also does not expect dirty
26368 +   pages without parent items, so it is impossible to cut all items,
26369 +   then truncate all pages in two steps. We resolve this problem by
26370 +   cutting items one-by-one. Each such fine-grained step performed
26371 +   under longterm znode lock calls at the end ->kill_hook() method of
26372 +   a killed item to remove its bound pages and jnodes.
26373 +
26374 +   The following function is a common part of the mentioned kill hooks.
26375 +   Also, this is called before tail-to-extent conversion (so as not to
26376 +   manage several copies of the data).
26377 +*/
26378 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
26379 +                             unsigned long count, int even_cows)
26380 +{
26381 +       loff_t from_bytes, count_bytes;
26382 +
26383 +       if (count == 0)
26384 +               return;
26385 +       from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
26386 +       count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
26387 +
26388 +       unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
26389 +       truncate_inode_pages_range(mapping, from_bytes,
26390 +                                  from_bytes + count_bytes - 1);
26391 +       truncate_jnodes_range(mapping->host, from, count);
26392 +}
26393 +
26394 +/*
26395 + * Local variables:
26396 + * c-indentation-style: "K&R"
26397 + * mode-name: "LC"
26398 + * c-basic-offset: 8
26399 + * tab-width: 8
26400 + * fill-column: 120
26401 + * scroll-step: 1
26402 + * End:
26403 + */
26404 diff -puN /dev/null fs/reiser4/page_cache.h
26405 --- /dev/null
26406 +++ a/fs/reiser4/page_cache.h
26407 @@ -0,0 +1,68 @@
26408 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
26409 + * reiser4/README */
26410 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
26411 +
26412 +#if !defined(__REISER4_PAGE_CACHE_H__)
26413 +#define __REISER4_PAGE_CACHE_H__
26414 +
26415 +#include "forward.h"
26416 +#include "context.h"            /* for reiser4_ctx_gfp_mask_get() */
26417 +
26418 +#include <linux/fs.h>          /* for struct super_block, address_space  */
26419 +#include <linux/mm.h>          /* for struct page  */
26420 +#include <linux/pagemap.h>     /* for lock_page()  */
26421 +#include <linux/vmalloc.h>     /* for __vmalloc()  */
26422 +
26423 +extern int reiser4_init_formatted_fake(struct super_block *);
26424 +extern void reiser4_done_formatted_fake(struct super_block *);
26425 +
26426 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
26427 +
26428 +extern int reiser4_set_page_dirty_internal(struct page *);
26429 +
26430 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
26431 +
26432 +extern void reiser4_wait_page_writeback(struct page *);
26433 +static inline void lock_and_wait_page_writeback(struct page *page)
26434 +{
26435 +       lock_page(page);
26436 +       if (unlikely(PageWriteback(page)))
26437 +               reiser4_wait_page_writeback(page);
26438 +}
26439 +
26440 +#define jprivate(page) ((jnode *)page_private(page))
26441 +
26442 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
26443 +extern void reiser4_drop_page(struct page *);
26444 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
26445 +                                    unsigned long count, int even_cows);
26446 +extern void capture_reiser4_inodes(struct super_block *,
26447 +                                  struct writeback_control *);
26448 +static inline void *reiser4_vmalloc(unsigned long size)
26449 +{
26450 +       return __vmalloc(size,
26451 +                        reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
26452 +                        PAGE_KERNEL);
26453 +}
26454 +
26455 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
26456 +
26457 +#if REISER4_DEBUG
26458 +extern void print_page(const char *prefix, struct page *page);
26459 +#else
26460 +#define print_page(prf, p) noop
26461 +#endif
26462 +
26463 +/* __REISER4_PAGE_CACHE_H__ */
26464 +#endif
26465 +
26466 +/* Make Linus happy.
26467 +   Local variables:
26468 +   c-indentation-style: "K&R"
26469 +   mode-name: "LC"
26470 +   c-basic-offset: 8
26471 +   tab-width: 8
26472 +   fill-column: 120
26473 +   scroll-step: 1
26474 +   End:
26475 +*/
26476 diff -puN /dev/null fs/reiser4/plugin/Makefile
26477 --- /dev/null
26478 +++ a/fs/reiser4/plugin/Makefile
26479 @@ -0,0 +1,26 @@
26480 +obj-$(CONFIG_REISER4_FS) += plugins.o
26481 +
26482 +plugins-objs :=                        \
26483 +       plugin.o                \
26484 +       plugin_set.o            \
26485 +       object.o                \
26486 +       inode_ops.o             \
26487 +       inode_ops_rename.o      \
26488 +       file_ops.o              \
26489 +       file_ops_readdir.o      \
26490 +       file_plugin_common.o    \
26491 +       dir_plugin_common.o     \
26492 +       digest.o                \
26493 +       hash.o                  \
26494 +       fibration.o             \
26495 +       tail_policy.o           \
26496 +       regular.o
26497 +
26498 +obj-$(CONFIG_REISER4_FS) += item/
26499 +obj-$(CONFIG_REISER4_FS) += file/
26500 +obj-$(CONFIG_REISER4_FS) += dir/
26501 +obj-$(CONFIG_REISER4_FS) += node/
26502 +obj-$(CONFIG_REISER4_FS) += compress/
26503 +obj-$(CONFIG_REISER4_FS) += space/
26504 +obj-$(CONFIG_REISER4_FS) += disk_format/
26505 +obj-$(CONFIG_REISER4_FS) += security/
26506 diff -puN /dev/null fs/reiser4/plugin/cluster.c
26507 --- /dev/null
26508 +++ a/fs/reiser4/plugin/cluster.c
26509 @@ -0,0 +1,72 @@
26510 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
26511 + * reiser4/README */
26512 +
26513 +/* Contains reiser4 cluster plugins (see
26514 +   http://www.namesys.com/cryptcompress_design.html
26515 +   "Concepts of clustering" for details). */
26516 +
26517 +#include "plugin_header.h"
26518 +#include "plugin.h"
26519 +#include "../inode.h"
26520 +
26521 +/* ->change() of cluster plugins: set @plugin as cluster plugin of @inode */
26522 +static int change_cluster(struct inode *inode, reiser4_plugin * plugin,
26523 +                         pset_member memb)
26524 +{
26525 +       assert("edward-1324", inode != NULL);
26526 +       assert("edward-1325", plugin != NULL);
26527 +       assert("edward-1326", is_reiser4_inode(inode));
26528 +       assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
26529 +
26530 +       /* Can't change the cluster plugin for already existent regular files */
26531 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
26532 +               return RETERR(-EINVAL);
26533 +
26534 +       /* If the same cluster plugin is already set, nothing to change. */
26535 +       if (inode_cluster_plugin(inode) != NULL &&
26536 +           inode_cluster_plugin(inode)->h.id == plugin->h.id)
26537 +               return 0;
26538 +
26539 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
26540 +                              PSET_CLUSTER, plugin);
26541 +}
26542 +
26543 +static reiser4_plugin_ops cluster_plugin_ops = {
26544 +       .init = NULL,
26545 +       .load = NULL,
26546 +       .save_len = NULL,
26547 +       .save = NULL,
26548 +       .change = &change_cluster
26549 +};
26550 +
26551 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC)                        \
26552 +       [CLUSTER_ ## ID ## _ID] = {                             \
26553 +               .h = {                                          \
26554 +                       .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
26555 +                       .id = CLUSTER_ ## ID ## _ID,            \
26556 +                       .pops = &cluster_plugin_ops,            \
26557 +                       .label = LABEL,                         \
26558 +                       .desc = DESC,                           \
26559 +                       .linkage = {NULL, NULL}                 \
26560 +               },                                              \
26561 +               .shift = SHIFT                                  \
26562 +       }
26563 +
26564 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
26565 +       SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
26566 +       SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
26567 +       SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
26568 +       SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
26569 +       SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
26570 +};
26571 +
26572 +/*
26573 +  Local variables:
26574 +  c-indentation-style: "K&R"
26575 +  mode-name: "LC"
26576 +  c-basic-offset: 8
26577 +  tab-width: 8
26578 +  fill-column: 120
26579 +  scroll-step: 1
26580 +  End:
26581 +*/
26582 diff -puN /dev/null fs/reiser4/plugin/cluster.h
26583 --- /dev/null
26584 +++ a/fs/reiser4/plugin/cluster.h
26585 @@ -0,0 +1,410 @@
26586 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26587 +
26588 +/* This file contains size/offset translators, modulators
26589 +   and other helper functions. */
26590 +
26591 +#if !defined(__FS_REISER4_CLUSTER_H__)
26592 +#define __FS_REISER4_CLUSTER_H__
26593 +
26594 +#include "../inode.h"
26595 +
26596 +static inline int inode_cluster_shift(struct inode *inode)
26597 +{
26598 +       assert("edward-92", inode != NULL);
26599 +       assert("edward-93", reiser4_inode_data(inode) != NULL);
26600 +
26601 +       return inode_cluster_plugin(inode)->shift;
26602 +}
26603 +
26604 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
26605 +{
26606 +       return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
26607 +}
26608 +
26609 +/* cluster size in page units */
26610 +static inline unsigned cluster_nrpages(struct inode *inode)
26611 +{
26612 +       return 1U << cluster_nrpages_shift(inode);
26613 +}
26614 +
26615 +static inline size_t inode_cluster_size(struct inode *inode)
26616 +{
26617 +       assert("edward-96", inode != NULL);
26618 +
26619 +       return 1U << inode_cluster_shift(inode);
26620 +}
26621 +
26622 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
26623 +{
26624 +       return idx >> cluster_nrpages_shift(inode);
26625 +}
26626 +
26627 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
26628 +{
26629 +       return idx << cluster_nrpages_shift(inode);
26630 +}
26631 +
26632 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
26633 +{
26634 +       return clust_to_pg(pg_to_clust(idx, inode), inode);
26635 +}
26636 +
26637 +static inline pgoff_t off_to_pg(loff_t off)
26638 +{
26639 +       return (off >> PAGE_CACHE_SHIFT);
26640 +}
26641 +
26642 +static inline loff_t pg_to_off(pgoff_t idx)
26643 +{
26644 +       return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
26645 +}
26646 +
26647 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
26648 +{
26649 +       return off >> inode_cluster_shift(inode);
26650 +}
26651 +
26652 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
26653 +{
26654 +       return (loff_t) idx << inode_cluster_shift(inode);
26655 +}
26656 +
26657 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
26658 +{
26659 +       return clust_to_off(off_to_clust(off, inode), inode);
26660 +}
26661 +
26662 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
26663 +{
26664 +       return clust_to_pg(off_to_clust(off, inode), inode);
26665 +}
26666 +
26667 +static inline unsigned off_to_pgoff(loff_t off)
26668 +{
26669 +       return off & (PAGE_CACHE_SIZE - 1);
26670 +}
26671 +
26672 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
26673 +{
26674 +       return off & ((loff_t) (inode_cluster_size(inode)) - 1);
26675 +}
26676 +
26677 +static inline  pgoff_t offset_in_clust(struct page *page)
26678 +{
26679 +       assert("edward-1488", page != NULL);
26680 +       assert("edward-1489", page->mapping != NULL);
26681 +
26682 +       return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
26683 +}
26684 +
26685 +static inline int first_page_in_cluster(struct page *page)
26686 +{
26687 +       return offset_in_clust(page) == 0;
26688 +}
26689 +
26690 +static inline int last_page_in_cluster(struct page *page)
26691 +{
26692 +       return offset_in_clust(page) ==
26693 +               cluster_nrpages(page->mapping->host) - 1;
26694 +}
26695 +
26696 +static inline unsigned
26697 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
26698 +{
26699 +       return off_to_cloff(pg_to_off(idx), inode);
26700 +}
26701 +
26702 +/*********************** Size translators **************************/
26703 +
26704 +/* Translate linear size.
26705 + * New units are (1 << @blkbits) times larger than the old ones.
26706 + * In other words, calculate number of logical blocks, occupied
26707 + * by @count elements
26708 + */
26709 +static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
26710 +{
26711 +       return (count + (1UL << blkbits) - 1) >> blkbits;
26712 +}
26713 +
26714 +/* size in pages */
26715 +static inline pgoff_t size_in_pages(loff_t size)
26716 +{
26717 +       return size_in_blocks(size, PAGE_CACHE_SHIFT);
26718 +}
26719 +
26720 +/* size in logical clusters */
26721 +static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
26722 +{
26723 +       return size_in_blocks(size, inode_cluster_shift(inode));
26724 +}
26725 +
26726 +/* size in pages to the size in page clusters */
26727 +static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
26728 +{
26729 +       return size_in_blocks(size, cluster_nrpages_shift(inode));
26730 +}
26731 +
26732 +/*********************** Size modulators ***************************/
26733 +
26734 +/*
26735 +  Modulate linear size by nominated block size and offset.
26736 +
26737 +  The "finite" function (which is zero almost everywhere).
26738 +  Pack @size bytes into columns of height (1 << @blkbits), as pictured
26739 +  below, and return the height of the column at position @pos: a full
26740 +  block before the last column, the remainder at it, and zero beyond it.
26741 +
26742 +  ******
26743 +  *******
26744 +  *******
26745 +  *******
26746 +  ----------> pos
26747 +*/
26748 +static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
26749 +{
26750 +       unsigned long end = size >> blkbits;    /* as wide as @pos */
26751 +       if (pos < end)
26752 +               return 1U << blkbits;
26753 +       if (unlikely(pos > end))
26754 +               return 0;
26755 +       return size & ~(~0ull << blkbits);
26756 +}
26757 +
26758 +/* the same as above, but block size is page size */
26759 +static inline unsigned __mbp(loff_t size, pgoff_t pos)
26760 +{
26761 +       return __mbb(size, pos, PAGE_CACHE_SHIFT);
26762 +}
26763 +
26764 +/* number of file's bytes in the nominated logical cluster */
26765 +static inline unsigned lbytes(cloff_t index, struct inode *inode)
26766 +{
26767 +       return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
26768 +}
26769 +
26770 +/* number of file's bytes in the nominated page */
26771 +static inline unsigned pbytes(pgoff_t index, struct inode *inode)
26772 +{
26773 +       return __mbp(i_size_read(inode), index);
26774 +}
26775 +
26776 +/**
26777 + * number of pages occupied by @win->count bytes starting from
26778 + * @win->off at logical cluster defined by @win. This is exactly
26779 + * a number of pages to be modified and dirtied in any cluster operation.
26780 + */
26781 +static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win)
26782 +{
26783 +       return ((win->off + win->count +
26784 +                (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) -
26785 +               off_to_pg(win->off);
26786 +}
26787 +
26788 +/* return true, if logical cluster is not occupied by the file */
26789 +static inline int new_logical_cluster(struct cluster_handle *clust,
26790 +                                     struct inode *inode)
26791 +{
26792 +       return clust_to_off(clust->index, inode) >= i_size_read(inode);
26793 +}
26794 +
26795 +/* return true, if pages @p1 and @p2 are of the same page cluster */
26796 +static inline int same_page_cluster(struct page *p1, struct page *p2)
26797 +{
26798 +       assert("edward-1490", p1 != NULL);
26799 +       assert("edward-1491", p2 != NULL);
26800 +       assert("edward-1492", p1->mapping != NULL);
26801 +       assert("edward-1493", p2->mapping != NULL);
26802 +
26803 +       return (pg_to_clust(page_index(p1), p1->mapping->host) ==
26804 +               pg_to_clust(page_index(p2), p2->mapping->host));
26805 +}
26806 +
26807 +static inline int cluster_is_complete(struct cluster_handle *clust,
26808 +                                     struct inode *inode)
26809 +{
26810 +       return clust->tc.lsize == inode_cluster_size(inode);
26811 +}
26812 +
26813 +static inline void reiser4_slide_init(struct reiser4_slide *win)
26814 +{
26815 +       assert("edward-1084", win != NULL);
26816 +       memset(win, 0, sizeof *win);
26817 +}
26818 +
26819 +static inline tfm_action
26820 +cluster_get_tfm_act(struct tfm_cluster *tc)
26821 +{
26822 +       assert("edward-1356", tc != NULL);
26823 +       return tc->act;
26824 +}
26825 +
26826 +static inline void
26827 +cluster_set_tfm_act(struct tfm_cluster *tc, tfm_action act)
26828 +{
26829 +       assert("edward-1356", tc != NULL);
26830 +       tc->act = act;
26831 +}
26832 +
26833 +static inline void cluster_init_act(struct cluster_handle *clust,
26834 +                                   tfm_action act,
26835 +                                   struct reiser4_slide *window)
26836 +{
26837 +       assert("edward-84", clust != NULL);
26838 +       memset(clust, 0, sizeof *clust);
26839 +       cluster_set_tfm_act(&clust->tc, act);
26840 +       clust->dstat = INVAL_DISK_CLUSTER;
26841 +       clust->win = window;
26842 +}
26843 +
26844 +static inline void cluster_init_read(struct cluster_handle *clust,
26845 +                                    struct reiser4_slide *window)
26846 +{
26847 +       cluster_init_act(clust, TFMA_READ, window);
26848 +}
26849 +
26850 +static inline void cluster_init_write(struct cluster_handle *clust,
26851 +                                     struct reiser4_slide *window)
26852 +{
26853 +       cluster_init_act(clust, TFMA_WRITE, window);
26854 +}
26855 +
26856 +/* true if @p1 and @p2 are items of the same disk cluster */
26857 +static inline int same_disk_cluster(const coord_t *p1, const coord_t *p2)
26858 +{
26859 +       /* drop this if you have other items to aggregate */
26860 +       assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
26861 +
26862 +       return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
26863 +}
26864 +
26865 +static inline int dclust_get_extension_dsize(hint_t *hint)
26866 +{
26867 +       return hint->ext_coord.extension.ctail.dsize;
26868 +}
26869 +
26870 +static inline void dclust_set_extension_dsize(hint_t *hint, int dsize)
26871 +{
26872 +       hint->ext_coord.extension.ctail.dsize = dsize;
26873 +}
26874 +
26875 +static inline int dclust_get_extension_shift(hint_t *hint)
26876 +{
26877 +       return hint->ext_coord.extension.ctail.shift;
26878 +}
26879 +
26880 +static inline int dclust_get_extension_ncount(hint_t *hint)
26881 +{
26882 +       return hint->ext_coord.extension.ctail.ncount;
26883 +}
26884 +
26885 +static inline void dclust_inc_extension_ncount(hint_t *hint)
26886 +{
26887 +       hint->ext_coord.extension.ctail.ncount++;
26888 +}
26889 +
26890 +static inline void dclust_init_extension(hint_t *hint)
26891 +{
26892 +       memset(&hint->ext_coord.extension.ctail, 0,
26893 +              sizeof(hint->ext_coord.extension.ctail));
26894 +}
26895 +
26896 +static inline int hint_is_unprepped_dclust(hint_t *hint)
26897 +{
26898 +       assert("edward-1451", hint_is_valid(hint));
26899 +       return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
26900 +}
26901 +
26902 +static inline void coord_set_between_clusters(coord_t *coord)
26903 +{
26904 +#if REISER4_DEBUG
26905 +       int result;
26906 +       result = zload(coord->node);
26907 +       assert("edward-1296", !result);
26908 +#endif
26909 +       if (!coord_is_between_items(coord)) {
26910 +               coord->between = AFTER_ITEM;
26911 +               coord->unit_pos = 0;
26912 +       }
26913 +#if REISER4_DEBUG
26914 +       zrelse(coord->node);
26915 +#endif
26916 +}
26917 +
26918 +int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
26919 +int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
26920 +                     znode_lock_mode mode);
26921 +int checkout_logical_cluster(struct cluster_handle *, jnode * , struct inode *);
26922 +int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
26923 +void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
26924 +                                        int even_cows);
26925 +void invalidate_hint_cluster(struct cluster_handle *clust);
26926 +int get_disk_cluster_locked(struct cluster_handle *clust, struct inode *inode,
26927 +                           znode_lock_mode lock_mode);
26928 +void reset_cluster_params(struct cluster_handle *clust);
26929 +int set_cluster_by_page(struct cluster_handle *clust, struct page *page,
26930 +                       int count);
26931 +int prepare_page_cluster(struct inode *inode, struct cluster_handle *clust,
26932 +                        rw_op rw);
26933 +void __put_page_cluster(int from, int count, struct page **pages,
26934 +                       struct inode *inode);
26935 +void put_page_cluster(struct cluster_handle *clust,
26936 +                     struct inode *inode, rw_op rw);
26937 +void put_cluster_handle(struct cluster_handle *clust);
26938 +int grab_tfm_stream(struct inode *inode, struct tfm_cluster *tc,
26939 +                   tfm_stream_id id);
26940 +int tfm_cluster_is_uptodate(struct tfm_cluster *tc);
26941 +void tfm_cluster_set_uptodate(struct tfm_cluster *tc);
26942 +void tfm_cluster_clr_uptodate(struct tfm_cluster *tc);
26943 +
26944 +/* move cluster handle to the target position
26945 +   specified by the page of index @pgidx */
26946 +static inline void move_cluster_forward(struct cluster_handle *clust,
26947 +                                       struct inode *inode,
26948 +                                       pgoff_t pgidx)
26949 +{
26950 +       assert("edward-1297", clust != NULL);
26951 +       assert("edward-1298", inode != NULL);
26952 +
26953 +       reset_cluster_params(clust);
26954 +       if (clust->index_valid &&
26955 +           /* Hole in the indices. Hint became invalid and can not be
26956 +              used by find_cluster_item() even if seal/node versions
26957 +              will coincide */
26958 +           pg_to_clust(pgidx, inode) != clust->index + 1) {
26959 +               reiser4_unset_hint(clust->hint);
26960 +               invalidate_hint_cluster(clust);
26961 +       }
26962 +       clust->index = pg_to_clust(pgidx, inode);
26963 +       clust->index_valid = 1;
26964 +}
26965 +
26966 +static inline int alloc_clust_pages(struct cluster_handle *clust,
26967 +                                   struct inode *inode)
26968 +{
26969 +       assert("edward-791", clust != NULL);
26970 +       assert("edward-792", inode != NULL);
26971 +       clust->pages =
26972 +               kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
26973 +                       reiser4_ctx_gfp_mask_get());
26974 +       if (!clust->pages)
26975 +               return -ENOMEM;
26976 +       return 0;
26977 +}
26978 +
26979 +static inline void free_clust_pages(struct cluster_handle *clust)
26980 +{
26981 +       kfree(clust->pages);
26982 +}
26983 +
26984 +#endif                         /* __FS_REISER4_CLUSTER_H__ */
26985 +
26986 +/* Make Linus happy.
26987 +   Local variables:
26988 +   c-indentation-style: "K&R"
26989 +   mode-name: "LC"
26990 +   c-basic-offset: 8
26991 +   tab-width: 8
26992 +   fill-column: 120
26993 +   scroll-step: 1
26994 +   End:
26995 +*/
26996 diff -puN /dev/null fs/reiser4/plugin/compress/Makefile
26997 --- /dev/null
26998 +++ a/fs/reiser4/plugin/compress/Makefile
26999 @@ -0,0 +1,5 @@
27000 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
27001 +
27002 +compress_plugins-objs :=       \
27003 +       compress.o              \
27004 +       compress_mode.o
27005 diff -puN /dev/null fs/reiser4/plugin/compress/compress.c
27006 --- /dev/null
27007 +++ a/fs/reiser4/plugin/compress/compress.c
27008 @@ -0,0 +1,367 @@
27009 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27010 +/* reiser4 compression transform plugins */
27011 +
27012 +#include "../../debug.h"
27013 +#include "../../inode.h"
27014 +#include "../plugin.h"
27015 +
27016 +#include <linux/lzo.h>
27017 +#include <linux/zlib.h>
27018 +#include <linux/types.h>
27019 +#include <linux/hardirq.h>
27020 +
27021 +/* ->change() of compression plugins: install @plugin into @inode's pset */
27022 +static int change_compression(struct inode *inode, reiser4_plugin * plugin,
27023 +                             pset_member memb)
27024 +{
27025 +       assert("edward-1316", inode != NULL);
27026 +       assert("edward-1317", plugin != NULL);
27027 +       assert("edward-1318", is_reiser4_inode(inode));
27028 +       assert("edward-1319",
27029 +              plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
27030 +
27031 +       /* cannot change compression plugin of already existing regular object */
27032 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
27033 +               return RETERR(-EINVAL);
27034 +
27035 +       /* If @inode already uses this compression plugin, nothing to change. */
27036 +       if (inode_compression_plugin(inode) != NULL &&
27037 +           inode_compression_plugin(inode)->h.id == plugin->h.id)
27038 +               return 0;
27039 +
27040 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
27041 +                              PSET_COMPRESSION, plugin);
27042 +}
27043 +
27044 +static reiser4_plugin_ops compression_plugin_ops = {
27045 +       .init = NULL,
27046 +       .load = NULL,
27047 +       .save_len = NULL,
27048 +       .save = NULL,
27049 +       .change = &change_compression
27050 +};
27051 +
27052 +/******************************************************************************/
27053 +/*                         gzip1 compression                                  */
27054 +/******************************************************************************/
27055 +
27056 +#define GZIP1_DEF_LEVEL                        Z_BEST_SPEED
27057 +#define GZIP1_DEF_WINBITS              15
27058 +#define GZIP1_DEF_MEMLEVEL             MAX_MEM_LEVEL
27059 +
27060 +static int gzip1_init(void)
27061 +{
27062 +       int ret = -EINVAL;
27063 +#if REISER4_ZLIB
27064 +       ret = 0;
27065 +#endif
27066 +       if (ret == -EINVAL)
27067 +               warning("edward-1337", "Zlib not compiled into kernel");
27068 +       return ret;
27069 +}
27070 +
27071 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
27072 +{
27073 +       return 0;
27074 +}
27075 +
27076 +static coa_t gzip1_alloc(tfm_action act)
27077 +{
27078 +       coa_t coa = NULL;
27079 +#if REISER4_ZLIB
27080 +       int ret = 0;
27081 +       switch (act) {
27082 +       case TFMA_WRITE:        /* compress */
27083 +               coa = reiser4_vmalloc(zlib_deflate_workspacesize());
27084 +               if (!coa) {
27085 +                       ret = -ENOMEM;
27086 +                       break;
27087 +               }
27088 +               break;
27089 +       case TFMA_READ: /* decompress */
27090 +               coa = reiser4_vmalloc(zlib_inflate_workspacesize());
27091 +               if (!coa) {
27092 +                       ret = -ENOMEM;
27093 +                       break;
27094 +               }
27095 +               break;
27096 +       default:
27097 +               impossible("edward-767",
27098 +                          "trying to alloc workspace for unknown tfm action");
27099 +       }
27100 +       if (ret) {
27101 +               warning("edward-768",
27102 +                       "alloc workspace for gzip1 (tfm action = %d) failed\n",
27103 +                       act);
27104 +               return ERR_PTR(ret);
27105 +       }
27106 +#endif
27107 +       return coa;
27108 +}
27109 +
27110 +static void gzip1_free(coa_t coa, tfm_action act)
27111 +{
27112 +       assert("edward-769", coa != NULL);
27113 +
27114 +       switch (act) {
27115 +       case TFMA_WRITE:        /* compress */
27116 +               vfree(coa);
27117 +               break;
27118 +       case TFMA_READ:         /* decompress */
27119 +               vfree(coa);
27120 +               break;
27121 +       default:
27122 +               impossible("edward-770", "unknown tfm action");
27123 +       }
27124 +       return;
27125 +}
27126 +
27127 +static int gzip1_min_size_deflate(void)
27128 +{
27129 +       return 64;
27130 +}
27131 +
27132 +static void
27133 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
27134 +              __u8 * dst_first, unsigned *dst_len)
27135 +{
27136 +#if REISER4_ZLIB
27137 +       int ret = 0;
27138 +       struct z_stream_s stream;
27139 +
27140 +       assert("edward-842", coa != NULL);
27141 +       assert("edward-875", src_len != 0);
27142 +
27143 +       stream.workspace = coa;
27144 +       ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
27145 +                               -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
27146 +                               Z_DEFAULT_STRATEGY);
27147 +       if (ret != Z_OK) {
27148 +               warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
27149 +               goto rollback;
27150 +       }
27151 +       ret = zlib_deflateReset(&stream);
27152 +       if (ret != Z_OK) {
27153 +               warning("edward-772", "zlib_deflateReset returned %d\n", ret);
27154 +               goto rollback;
27155 +       }
27156 +       stream.next_in = src_first;
27157 +       stream.avail_in = src_len;
27158 +       stream.next_out = dst_first;
27159 +       stream.avail_out = *dst_len;
27160 +
27161 +       ret = zlib_deflate(&stream, Z_FINISH);
27162 +       if (ret != Z_STREAM_END) {
27163 +               if (ret != Z_OK)
27164 +                       warning("edward-773",
27165 +                               "zlib_deflate returned %d\n", ret);
27166 +               goto rollback;
27167 +       }
27168 +       *dst_len = stream.total_out;
27169 +       return;
27170 +      rollback:
27171 +       *dst_len = src_len;
27172 +#endif
27173 +       return;
27174 +}
27175 +
27176 +static void
27177 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
27178 +                __u8 * dst_first, unsigned *dst_len)
27179 +{
27180 +#if REISER4_ZLIB
27181 +       int ret = 0;
27182 +       struct z_stream_s stream;
27183 +
27184 +       assert("edward-843", coa != NULL);
27185 +       assert("edward-876", src_len != 0);
27186 +
27187 +       stream.workspace = coa;
27188 +       ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
27189 +       if (ret != Z_OK) {
27190 +               warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
27191 +               return;
27192 +       }
27193 +       ret = zlib_inflateReset(&stream);
27194 +       if (ret != Z_OK) {
27195 +               warning("edward-775", "zlib_inflateReset returned %d\n", ret);
27196 +               return;
27197 +       }
27198 +
27199 +       stream.next_in = src_first;
27200 +       stream.avail_in = src_len;
27201 +       stream.next_out = dst_first;
27202 +       stream.avail_out = *dst_len;
27203 +
27204 +       ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
27205 +       /*
27206 +        * Work around a bug in zlib, which sometimes wants to taste an extra
27207 +        * byte when being used in the (undocumented) raw deflate mode.
27208 +        * (From USAGI).
27209 +        */
27210 +       if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
27211 +               u8 zerostuff = 0;
27212 +               stream.next_in = &zerostuff;
27213 +               stream.avail_in = 1;
27214 +               ret = zlib_inflate(&stream, Z_FINISH);
27215 +       }
27216 +       if (ret != Z_STREAM_END) {
27217 +               warning("edward-776", "zlib_inflate returned %d\n", ret);
27218 +               return;
27219 +       }
27220 +       *dst_len = stream.total_out;
27221 +#endif
27222 +       return;
27223 +}
27224 +
27225 +/******************************************************************************/
27226 +/*                            lzo1 compression                                */
27227 +/******************************************************************************/
27228 +
27229 +static int lzo1_init(void)
27230 +{
27231 +       return 0;
27232 +}
27233 +
27234 +static int lzo1_overrun(unsigned in_len)
27235 +{
27236 +       return in_len / 64 + 16 + 3;
27237 +}
27238 +
27239 +static coa_t lzo1_alloc(tfm_action act)
27240 +{
27241 +       int ret = 0;
27242 +       coa_t coa = NULL;
27243 +
27244 +       switch (act) {
27245 +       case TFMA_WRITE:        /* compress */
27246 +               coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
27247 +               if (!coa) {
27248 +                       ret = -ENOMEM;
27249 +                       break;
27250 +               }
27251 +       case TFMA_READ:         /* decompress */
27252 +               break;
27253 +       default:
27254 +               impossible("edward-877",
27255 +                          "trying to alloc workspace for unknown tfm action");
27256 +       }
27257 +       if (ret) {
27258 +               warning("edward-878",
27259 +                       "alloc workspace for lzo1 (tfm action = %d) failed\n",
27260 +                       act);
27261 +               return ERR_PTR(ret);
27262 +       }
27263 +       return coa;
27264 +}
27265 +
27266 +static void lzo1_free(coa_t coa, tfm_action act)
27267 +{
27268 +       assert("edward-879", coa != NULL);
27269 +
27270 +       switch (act) {
27271 +       case TFMA_WRITE:        /* compress */
27272 +               vfree(coa);
27273 +               break;
27274 +       case TFMA_READ:         /* decompress */
27275 +               impossible("edward-1304",
27276 +                          "trying to free non-allocated workspace");
27277 +       default:
27278 +               impossible("edward-880", "unknown tfm action");
27279 +       }
27280 +       return;
27281 +}
27282 +
27283 +static int lzo1_min_size_deflate(void)
27284 +{
27285 +       return 256;
27286 +}
27287 +
27288 +/* compress with LZO1X-1; on failure or no gain *dst_len is set to src_len */
27289 +static void lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
27290 +                         __u8 * dst_first, unsigned *dst_len)
27291 +{
27292 +       int result;
27293 +       size_t out_len = 0;     /* lzo API takes size_t *, not unsigned * */
27294 +
27295 +       assert("edward-846", coa != NULL);
27296 +       assert("edward-847", src_len != 0);
27297 +
27298 +       result = lzo1x_1_compress(src_first, src_len, dst_first, &out_len, coa);
27299 +       if (unlikely(result != LZO_E_OK)) {
27300 +               warning("edward-849", "lzo1x_1_compress failed\n");
27301 +               goto out;
27302 +       }
27303 +       /* incompressible data: report the "no gain" length instead */
27304 +       if (out_len >= src_len)
27305 +               goto out;
27306 +       *dst_len = out_len;
27307 +       return;
27308 +      out:
27309 +       *dst_len = src_len;
27310 +}
27311 +
27312 +/* decompress LZO1X data; *dst_len is passed to and updated by the library */
27313 +static void lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
27314 +                           __u8 * dst_first, unsigned *dst_len)
27315 +{
27316 +       int result;
27317 +       size_t out_len = *dst_len; /* lzo API takes size_t *, not unsigned * */
27318 +
27319 +       assert("edward-851", coa == NULL);
27320 +       assert("edward-852", src_len != 0);
27321 +       result = lzo1x_decompress_safe(src_first, src_len, dst_first, &out_len);
27322 +       if (result != LZO_E_OK)
27323 +               warning("edward-853", "lzo1x_1_decompress failed\n");
27324 +       *dst_len = out_len;
27325 +}
27326 +
27327 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
27328 +       [LZO1_COMPRESSION_ID] = {
27329 +               .h = {
27330 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
27331 +                       .id = LZO1_COMPRESSION_ID,
27332 +                       .pops = &compression_plugin_ops,
27333 +                       .label = "lzo1",
27334 +                       .desc = "lzo1 compression transform",
27335 +                       .linkage = {NULL, NULL}
27336 +               },
27337 +               .init = lzo1_init,
27338 +               .overrun = lzo1_overrun,
27339 +               .alloc = lzo1_alloc,
27340 +               .free = lzo1_free,
27341 +               .min_size_deflate = lzo1_min_size_deflate,
27342 +               .checksum = reiser4_adler32,
27343 +               .compress = lzo1_compress,
27344 +               .decompress = lzo1_decompress
27345 +       },
27346 +       [GZIP1_COMPRESSION_ID] = {
27347 +               .h = {
27348 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
27349 +                       .id = GZIP1_COMPRESSION_ID,
27350 +                       .pops = &compression_plugin_ops,
27351 +                       .label = "gzip1",
27352 +                       .desc = "gzip1 compression transform",
27353 +                       .linkage = {NULL, NULL}
27354 +               },
27355 +               .init = gzip1_init,
27356 +               .overrun = gzip1_overrun,
27357 +               .alloc = gzip1_alloc,
27358 +               .free = gzip1_free,
27359 +               .min_size_deflate = gzip1_min_size_deflate,
27360 +               .checksum = reiser4_adler32,
27361 +               .compress = gzip1_compress,
27362 +               .decompress = gzip1_decompress
27363 +       }
27364 +};
27365 +
27366 +/*
27367 +  Local variables:
27368 +  c-indentation-style: "K&R"
27369 +  mode-name: "LC"
27370 +  c-basic-offset: 8
27371 +  tab-width: 8
27372 +  fill-column: 120
27373 +  scroll-step: 1
27374 +  End:
27375 +*/
27376 diff -puN /dev/null fs/reiser4/plugin/compress/compress.h
27377 --- /dev/null
27378 +++ a/fs/reiser4/plugin/compress/compress.h
27379 @@ -0,0 +1,43 @@
27380 +#if !defined( __FS_REISER4_COMPRESS_H__ )
27381 +#define __FS_REISER4_COMPRESS_H__
27382 +
27383 +#include <linux/types.h>
27384 +#include <linux/string.h>
27385 +
27386 +/* transform direction */
27387 +typedef enum {
27388 +       TFMA_READ,   /* decrypt, decompress */
27389 +       TFMA_WRITE,  /* encrypt, compress */
27390 +       TFMA_LAST
27391 +} tfm_action;
27392 +
27393 +/* supported compression algorithms */
27394 +typedef enum {
27395 +       LZO1_COMPRESSION_ID,
27396 +       GZIP1_COMPRESSION_ID,
27397 +       LAST_COMPRESSION_ID,
27398 +} reiser4_compression_id;
27399 +
27400 +/* the same as pgoff, but units are page clusters */
27401 +typedef unsigned long cloff_t;
27402 +
27403 +/* working data of a (de)compression algorithm */
27404 +typedef void *coa_t;
27405 +
27406 +/* table for all supported (de)compression algorithms */
27407 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
27408 +
27409 +__u32 reiser4_adler32(char *data, __u32 len);
27410 +
27411 +#endif                         /* __FS_REISER4_COMPRESS_H__ */
27412 +
27413 +/* Make Linus happy.
27414 +   Local variables:
27415 +   c-indentation-style: "K&R"
27416 +   mode-name: "LC"
27417 +   c-basic-offset: 8
27418 +   tab-width: 8
27419 +   fill-column: 120
27420 +   scroll-step: 1
27421 +   End:
27422 +*/
27423 diff -puN /dev/null fs/reiser4/plugin/compress/compress_mode.c
27424 --- /dev/null
27425 +++ a/fs/reiser4/plugin/compress/compress_mode.c
27426 @@ -0,0 +1,162 @@
27427 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27428 +/* This file contains Reiser4 compression mode plugins.
27429 +
27430 +   A compression mode plugin is a set of handlers called by the compressor
27431 +   at flush time. They implement heuristics, including the ones
27432 +   which avoid compression of incompressible data, see
27433 +   http://www.namesys.com/cryptcompress_design.html for more details.
27434 +*/
27435 +#include "../../inode.h"
27436 +#include "../plugin.h"
27437 +
27438 +static int should_deflate_none(struct inode * inode, cloff_t index)
27439 +{
27440 +       return 0;
27441 +}
27442 +
27443 +static int should_deflate_common(struct inode * inode, cloff_t index)
27444 +{
27445 +       return compression_is_on(cryptcompress_inode_data(inode));
27446 +}
27447 +
27448 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
27449 +{
27450 +       turn_off_compression(cryptcompress_inode_data(inode)); /* @index ignored */
27451 +       return 0;       /* always reports success */
27452 +}
27453 +
27454 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
27455 +{
27456 +       struct cryptcompress_info * info = cryptcompress_inode_data(inode);
27457 +
27458 +       assert("edward-1462",
27459 +              get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
27460 +              get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
27461 +       /* stop compressing; double the factor while it is below the maximum */
27462 +       turn_off_compression(info);
27463 +       if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
27464 +               set_lattice_factor(info, get_lattice_factor(info) << 1);
27465 +       return 0;       /* always reports success; @index is ignored */
27466 +}
27467 +
27468 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
27469 +{
27470 +       turn_on_compression(cryptcompress_inode_data(inode));
27471 +       set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR); /* reset factor to its minimum */
27472 +       return 0;       /* always reports success; @index is ignored */
27473 +}
27474 +
27475 +/* Check on dynamic lattice: an adaptive compression mode which
27476 +   defines the following behavior:
27477 +
27478 +   Compression is on: try to compress everything and turn
27479 +   it off whenever a cluster is incompressible.
27480 +
27481 +   Compression is off: try to compress clusters of indexes
27482 +   k * FACTOR (k = 0, 1, 2, ...) and turn it on if one of
27483 +   them is compressible. If incompressible, then increase FACTOR */
27484 +
27485 +/* tell whether @index is a node of the one-dimensional lattice of
27486 +   sparse factor @factor; with a zero @factor only index 0 is a node */
27487 +static int is_on_lattice(cloff_t index, int factor)
27488 +{
27489 +       return (factor == 0) ? (index == 0) : (index % factor == 0);
27490 +}
27491 +
27492 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
27493 +{
27494 +       if (should_deflate_common(inode, index))
27495 +               return 1;
27496 +       return !!is_on_lattice(index, get_lattice_factor(
27497 +                                       cryptcompress_inode_data(inode)));
27498 +}
27499 +
27500 +/* compression mode plugins, one entry per compression mode id */
27501 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
27502 +       [NONE_COMPRESSION_MODE_ID] = {  /* should_deflate always answers 0 */
27503 +               .h = {
27504 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
27505 +                       .id = NONE_COMPRESSION_MODE_ID,
27506 +                       .pops = NULL,
27507 +                       .label = "none",
27508 +                       .desc = "Compress nothing",
27509 +                       .linkage = {NULL, NULL}
27510 +               },
27511 +               .should_deflate = should_deflate_none,
27512 +               .accept_hook = NULL,
27513 +               .discard_hook = NULL
27514 +       },
27515 +       /* Check-on-dynamic-lattice adaptive compression mode */
27516 +       [LATTD_COMPRESSION_MODE_ID] = {
27517 +               .h = {
27518 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
27519 +                       .id = LATTD_COMPRESSION_MODE_ID,
27520 +                       .pops = NULL,
27521 +                       .label = "lattd",
27522 +                       .desc = "Check on dynamic lattice",
27523 +                       .linkage = {NULL, NULL}
27524 +               },
27525 +               .should_deflate = should_deflate_lattd,
27526 +               .accept_hook = accept_hook_lattd,
27527 +               .discard_hook = discard_hook_lattd
27528 +       },
27529 +       /* Check-ultimately compression mode:
27530 +          Turn off compression forever as soon as we meet
27531 +          incompressible data */
27532 +       [ULTIM_COMPRESSION_MODE_ID] = {
27533 +               .h = {
27534 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
27535 +                       .id = ULTIM_COMPRESSION_MODE_ID,
27536 +                       .pops = NULL,
27537 +                       .label = "ultim",
27538 +                       .desc = "Check ultimately",
27539 +                       .linkage = {NULL, NULL}
27540 +               },
27541 +               .should_deflate = should_deflate_common,
27542 +               .accept_hook = NULL,
27543 +               .discard_hook = discard_hook_ultim
27544 +       },
27545 +       /* Force-to-compress-everything compression mode */
27546 +       [FORCE_COMPRESSION_MODE_ID] = {
27547 +               .h = {
27548 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
27549 +                       .id = FORCE_COMPRESSION_MODE_ID,
27550 +                       .pops = NULL,
27551 +                       .label = "force",
27552 +                       .desc = "Force to compress everything",
27553 +                       .linkage = {NULL, NULL}
27554 +               },
27555 +               .should_deflate = NULL, /* NOTE(review): NULL presumably means "always deflate" - confirm in the caller */
27556 +               .accept_hook = NULL,
27557 +               .discard_hook = NULL
27558 +       },
27559 +       /* Convert-to-extent compression mode.
27560 +          In this mode items will be converted to extents and management
27561 +          will be passed to (classic) unix file plugin as soon as ->write()
27562 +          detects that the first complete logical cluster (of index #0) is
27563 +          incompressible. */
27564 +       [CONVX_COMPRESSION_MODE_ID] = {
27565 +               .h = {
27566 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
27567 +                       .id = CONVX_COMPRESSION_MODE_ID,
27568 +                       .pops = NULL,
27569 +                       .label = "conv",
27570 +                       .desc = "Convert to extent",
27571 +                       .linkage = {NULL, NULL}
27572 +               },
27573 +               .should_deflate = should_deflate_common,
27574 +               .accept_hook = NULL,
27575 +               .discard_hook = NULL
27576 +       }
27577 +};
27578 +
27579 +/*
27580 +  Local variables:
27581 +  c-indentation-style: "K&R"
27582 +  mode-name: "LC"
27583 +  c-basic-offset: 8
27584 +  tab-width: 8
27585 +  fill-column: 120
27586 +  scroll-step: 1
27587 +  End:
27588 +*/
27589 diff -puN /dev/null fs/reiser4/plugin/crypto/cipher.c
27590 --- /dev/null
27591 +++ a/fs/reiser4/plugin/crypto/cipher.c
27592 @@ -0,0 +1,37 @@
27593 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
27594 +   licensing governed by reiser4/README */
27595 +/* Reiser4 cipher transform plugins */
27596 +
27597 +#include "../../debug.h"
27598 +#include "../plugin.h"
27599 +
27600 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
27601 +       [NONE_CIPHER_ID] = {    /* "no cipher" entry: every method is NULL */
27602 +               .h = {
27603 +                       .type_id = REISER4_CIPHER_PLUGIN_TYPE,
27604 +                       .id = NONE_CIPHER_ID,
27605 +                       .pops = NULL,
27606 +                       .label = "none",
27607 +                       .desc = "no cipher transform",
27608 +                       .linkage = {NULL, NULL}
27609 +               },
27610 +               .alloc = NULL,
27611 +               .free = NULL,
27612 +               .scale = NULL,
27613 +               .align_stream = NULL,
27614 +               .setkey = NULL,
27615 +               .encrypt = NULL,
27616 +               .decrypt = NULL
27617 +       }
27618 +};
27619 +
27620 +/* Make Linus happy.
27621 +   Local variables:
27622 +   c-indentation-style: "K&R"
27623 +   mode-name: "LC"
27624 +   c-basic-offset: 8
27625 +   tab-width: 8
27626 +   fill-column: 120
27627 +   scroll-step: 1
27628 +   End:
27629 +*/
27630 diff -puN /dev/null fs/reiser4/plugin/crypto/cipher.h
27631 --- /dev/null
27632 +++ a/fs/reiser4/plugin/crypto/cipher.h
27633 @@ -0,0 +1,55 @@
27634 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27635 +/* This file contains definitions for the objects operated
27636 +   by reiser4 key manager, which is something like keyring
27637 +   wrapped by appropriate reiser4 plugin */
27638 +
27639 +#if !defined( __FS_REISER4_CRYPT_H__ )
27640 +#define __FS_REISER4_CRYPT_H__
27641 +
27642 +#include <linux/crypto.h>
27643 +
27644 +/* key info imported from user space: uninstantiated key and passphrase */
27645 +struct reiser4_crypto_data {
27646 +       int keysize;    /* uninstantiated key size */
27647 +       __u8 * key;     /* uninstantiated key */
27648 +       int keyid_size; /* size of passphrase */
27649 +       __u8 * keyid;   /* passphrase */
27650 +};
27651 +
27652 +/* This object contains all needed infrastructure to implement
27653 +   cipher transform. This is operated (allocating, inheriting,
27654 +   validating, binding to host inode, etc..) by reiser4 key manager.
27655 +
27656 +   This info can be allocated in two cases:
27657 +   1. importing a key from user space.
27658 +   2. reading inode from disk */
27659 +struct reiser4_crypto_info {
27660 +       struct inode * host;       /* NOTE(review): presumably the inode this info is bound to - confirm */
27661 +       struct crypto_hash      * digest;
27662 +       struct crypto_blkcipher * cipher;
27663 +#if 0
27664 +       cipher_key_plugin * kplug; /* key manager */
27665 +#endif
27666 +       __u8 * keyid;              /* key fingerprint, created by digest plugin,
27667 +                                     using uninstantiated key and passphrase.
27668 +                                     supposed to be stored in disk stat-data */
27669 +       int inst;                  /* this indicates if the cipher key is
27670 +                                     instantiated (case 1 above) */
27671 +       int keysize;               /* uninstantiated key size (bytes), supposed
27672 +                                     to be stored in disk stat-data */
27673 +       int keyload_count;         /* number of objects that have this
27674 +                                     crypto-stat attached */
27675 +};
27676 +
27677 +#endif /* __FS_REISER4_CRYPT_H__ */
27678 +
27679 +/*
27680 +   Local variables:
27681 +   c-indentation-style: "K&R"
27682 +   mode-name: "LC"
27683 +   c-basic-offset: 8
27684 +   tab-width: 8
27685 +   fill-column: 120
27686 +   scroll-step: 1
27687 +   End:
27688 +*/
27689 diff -puN /dev/null fs/reiser4/plugin/crypto/digest.c
27690 --- /dev/null
27691 +++ a/fs/reiser4/plugin/crypto/digest.c
27692 @@ -0,0 +1,58 @@
27693 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27694 +
27695 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
27696 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
27697 +#include "../../debug.h"
27698 +#include "../plugin_header.h"
27699 +#include "../plugin.h"
27700 +#include "../file/cryptcompress.h"
27701 +
27702 +#include <linux/types.h>
27703 +
27704 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
27705 +
27706 +static struct crypto_hash * alloc_sha256 (void)
27707 +{
27708 +#if REISER4_SHA256
27709 +       return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
27710 +#else /* !REISER4_SHA256 */
27711 +       warning("edward-1418", "sha256 unsupported");
27712 +       return ERR_PTR(-EINVAL);        /* an error pointer, not NULL */
27713 +#endif
27714 +}
27715 +
27716 +static void free_sha256 (struct crypto_hash * tfm)
27717 +{
27718 +       /* nothing is released when sha256 support is compiled out */
27719 +#if REISER4_SHA256
27720 +       crypto_free_hash(tfm);
27721 +#endif
27722 +}
27723 +
27724 +/* digest plugins, one entry per digest id */
27725 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
27726 +       [SHA256_32_DIGEST_ID] = {
27727 +               .h = {
27728 +                       .type_id = REISER4_DIGEST_PLUGIN_TYPE,
27729 +                       .id = SHA256_32_DIGEST_ID,
27730 +                       .pops = NULL,
27731 +                       .label = "sha256_32",
27732 +                       .desc = "sha256_32 digest transform",
27733 +                       .linkage = {NULL, NULL}
27734 +               },
27735 +               .fipsize = sizeof(__u32),       /* NOTE(review): presumably fingerprint size in bytes - confirm */
27736 +               .alloc = alloc_sha256,
27737 +               .free = free_sha256
27738 +       }
27739 +};
27740 +
27741 +/*
27742 +  Local variables:
27743 +  c-indentation-style: "K&R"
27744 +  mode-name: "LC"
27745 +  c-basic-offset: 8
27746 +  tab-width: 8
27747 +  fill-column: 120
27748 +  scroll-step: 1
27749 +  End:
27750 +*/
27751 diff -puN /dev/null fs/reiser4/plugin/dir/Makefile
27752 --- /dev/null
27753 +++ a/fs/reiser4/plugin/dir/Makefile
27754 @@ -0,0 +1,5 @@
27755 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
27756 +
27757 +dir_plugins-objs :=    \
27758 +       hashed_dir.o    \
27759 +       seekable_dir.o
27760 diff -puN /dev/null fs/reiser4/plugin/dir/dir.h
27761 --- /dev/null
27762 +++ a/fs/reiser4/plugin/dir/dir.h
27763 @@ -0,0 +1,36 @@
27764 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
27765 + * reiser4/README */
27766 +
27767 +/* this file contains declarations of methods implementing directory plugins */
27768 +
27769 +#if !defined( __REISER4_DIR_H__ )
27770 +#define __REISER4_DIR_H__
27771 +
27772 +/*#include "../../key.h"
27773 +
27774 +#include <linux/fs.h>*/
27775 +
27776 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
27777 +
27778 +/* "hashed" directory methods of dir plugin */
27779 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
27780 +                           reiser4_key *);
27781 +
27782 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
27783 +
27784 +/* "seekable" directory methods of dir plugin */
27785 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
27786 +                             reiser4_key *);
27787 +
27788 +/* __REISER4_DIR_H__ */
27789 +#endif
27790 +
27791 +/*
27792 +   Local variables:
27793 +   c-indentation-style: "K&R"
27794 +   mode-name: "LC"
27795 +   c-basic-offset: 8
27796 +   tab-width: 8
27797 +   fill-column: 120
27798 +   End:
27799 +*/
27800 diff -puN /dev/null fs/reiser4/plugin/dir/hashed_dir.c
27801 --- /dev/null
27802 +++ a/fs/reiser4/plugin/dir/hashed_dir.c
27803 @@ -0,0 +1,81 @@
27804 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
27805 + * reiser4/README */
27806 +
27807 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
27808 +   names to the files. */
27809 +
27810 +/*
27811 + * Hashed directory logically consists of persistent directory
27812 + * entries. Directory entry is a pair of a file name and a key of stat-data of
27813 + * a file that has this name in the given directory.
27814 + *
27815 + * Directory entries are stored in the tree in the form of directory
27816 + * items. Directory item should implement dir_entry_ops portion of item plugin
27817 + * interface (see plugin/item/item.h). Hashed directory interacts with
27818 + * directory item plugin exclusively through dir_entry_ops operations.
27819 + *
27820 + * Currently there are two implementations of directory items: "simple
27821 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
27822 + * (plugin/item/cde.[ch]) with the latter being the default.
27823 + *
27824 + * There is, however some delicate way through which directory code interferes
27825 + * with item plugin: key assignment policy. A key for a directory item is
27826 + * chosen by directory code, and as described in kassign.c, this key contains
27827 + * a portion of file name. Directory item uses this knowledge to avoid storing
27828 + * this portion of file name twice: in the key and in the directory item body.
27829 + *
27830 + */
27831 +
27832 +#include "../../inode.h"
27833 +
27834 +void complete_entry_key(const struct inode *, const char *name,
27835 +                       int len, reiser4_key * result);
27836 +
27837 +/* implementation of build_entry_key method of dir plugin for
27838 +   HASHED_DIR_PLUGIN_ID: fills @result with the key of entry @qname in @dir
27839 + */
27840 +void build_entry_key_hashed(const struct inode *dir,   /* directory where entry is
27841 +                                                        * (or will be) in.*/
27842 +                           const struct qstr *qname,   /* name of file referenced
27843 +                                                        * by this entry */
27844 +                           reiser4_key * result        /* resulting key of directory
27845 +                                                        * entry */ )
27846 +{
27847 +       const char *name;
27848 +       int len;
27849 +
27850 +       assert("nikita-1139", dir != NULL);
27851 +       assert("nikita-1140", qname != NULL);
27852 +       assert("nikita-1141", qname->name != NULL);
27853 +       assert("nikita-1142", result != NULL);
27854 +
27855 +       name = qname->name;
27856 +       len = qname->len;
27857 +       /* the C string length of the name must agree with the qstr length */
27858 +       assert("nikita-2867", strlen(name) == len);
27859 +
27860 +       reiser4_key_init(result);
27861 +       /* locality of directory entry's key is objectid of parent
27862 +          directory */
27863 +       set_key_locality(result, get_inode_oid(dir));
27864 +       /* minor packing locality is constant */
27865 +       set_key_type(result, KEY_FILE_NAME_MINOR);
27866 +       /* dot is special case---we always want it to be first entry in
27867 +          a directory. Actually, we just want to have smallest
27868 +          directory entry.
27869 +        */
27870 +       if (len == 1 && name[0] == '.')
27871 +               return; /* key of "." carries only locality and type */
27872 +
27873 +       /* initialize part of entry key which depends on file name */
27874 +       complete_entry_key(dir, name, len, result);
27875 +}
27876 +
27877 +/* Local variables:
27878 +   c-indentation-style: "K&R"
27879 +   mode-name: "LC"
27880 +   c-basic-offset: 8
27881 +   tab-width: 8
27882 +   fill-column: 120
27883 +   End:
27884 +*/
27885 diff -puN /dev/null fs/reiser4/plugin/dir/seekable_dir.c
27886 --- /dev/null
27887 +++ a/fs/reiser4/plugin/dir/seekable_dir.c
27888 @@ -0,0 +1,46 @@
27889 +/* Copyright 2005 by Hans Reiser, licensing governed by
27890 + * reiser4/README */
27891 +
27892 +#include "../../inode.h"
27893 +
27894 +/* build_entry_key method of dir plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID.
27895 +   This is for directories where we want repeatable and restartable
27896 +   readdir() even when user level uses a 32bit struct dirent
27897 +   (readdir(3)): the objectid of an entry key is limited to the 31
27898 +   lowest bits of the name hash.
27899 +*/
27900 +void
27901 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
27902 +                        reiser4_key * result)
27903 +{
27904 +       oid_t hashed;
27905 +       int is_dot;
27906 +
27907 +       assert("nikita-2283", dir != NULL);
27908 +       assert("nikita-2284", name != NULL);
27909 +       assert("nikita-2285", name->name != NULL);
27910 +       assert("nikita-2286", result != NULL);
27911 +
27912 +       /* start from a clean key */
27913 +       reiser4_key_init(result);
27914 +       /* the key's locality is the objectid of the parent directory */
27915 +       set_key_locality(result, get_inode_oid(dir));
27916 +       /* minor packing locality is the same for every entry */
27917 +       set_key_type(result, KEY_FILE_NAME_MINOR);
27918 +
27919 +       /* "." is special: it should be the first entry of a directory,
27920 +          i.e. have the smallest key, so neither objectid nor offset is
27921 +          set for it here */
27922 +       is_dot = name->len == 1 && name->name[0] == '.';
27923 +       if (!is_dot) {
27924 +               /* objectid of the key is the 31 lowest bits of the hash */
27925 +               hashed = inode_hash_plugin(dir)->hash(name->name,
27926 +                                                     (int)name->len)
27927 +                        & 0x7fffffff;
27928 +               assert("nikita-2303", !(hashed & ~KEY_OBJECTID_MASK));
27929 +               set_key_objectid(result, hashed);
27930 +
27931 +               /* offset is always 0 */
27932 +               set_key_offset(result, (__u64) 0);
27933 +       }
27934 +}
27935 diff -puN /dev/null fs/reiser4/plugin/dir_plugin_common.c
27936 --- /dev/null
27937 +++ a/fs/reiser4/plugin/dir_plugin_common.c
27938 @@ -0,0 +1,873 @@
27939 +/* Copyright 2005 by Hans Reiser, licensing governed by
27940 +   reiser4/README */
27941 +
27942 +/* this file contains typical implementations for most of methods of
27943 +   directory plugin
27944 +*/
27945 +
27946 +#include "../inode.h"
27947 +
27948 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
27949 +              lock_handle * , znode_lock_mode, reiser4_dir_entry_desc *);
27950 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry,
27951 +                       reiser4_key * key);
27952 +void check_light_weight(struct inode *inode, struct inode *parent);
27953 +
27954 +/* Common implementation of the get_parent method of dir plugin, used by
27955 +   the NFS kernel server to "climb" up the directory tree to check
27956 +   permissions. Returns a dentry for the parent of @child, or ERR_PTR().
27957 + */
27958 +struct dentry *get_parent_common(struct inode *child)
27959 +{
27960 +       struct super_block *s;
27961 +       struct inode *parent;
27962 +       struct dentry dotdot;
27963 +       struct dentry *dentry;
27964 +       reiser4_key key;
27965 +       int result;
27966 +
27967 +       /*
27968 +        * lookup dotdot entry, using a temporary on-stack dentry named "..".
27969 +        */
27970 +
27971 +       s = child->i_sb;
27972 +       memset(&dotdot, 0, sizeof(dotdot));
27973 +       dotdot.d_name.name = "..";
27974 +       dotdot.d_name.len = 2;
27975 +       dotdot.d_op = &get_super_private(s)->ops.dentry;
27976 +
27977 +       result = reiser4_lookup_name(child, &dotdot, &key);
27978 +       if (result != 0)
27979 +               return ERR_PTR(result);
27980 +
27981 +       parent = reiser4_iget(s, &key, 1);
27982 +       if (!IS_ERR(parent)) {
27983 +               /*
27984 +                * FIXME-NIKITA dubious: attributes are inherited from @child
27985 +                * to @parent. But:
27986 +                *
27987 +                *     (*) this is the only thing we can do
27988 +                *
27989 +                *     (*) attributes of light-weight object are inherited
27990 +                *     from a parent through which object was looked up first,
27991 +                *     so it is ambiguous anyway.
27992 +                *
27993 +                */
27994 +               check_light_weight(parent, child);
27995 +               reiser4_iget_complete(parent);
27996 +               /* d_obtain_alias() never returns NULL: on failure it
27997 +                * yields ERR_PTR() and has already dropped the inode
27998 +                * reference, so there must be no iput() here. */
27999 +               dentry = d_obtain_alias(parent);
28000 +               if (!IS_ERR(dentry))
28001 +                       dentry->d_op = &get_super_private(s)->ops.dentry;
28002 +       } else if (PTR_ERR(parent) == -ENOENT)
28003 +               dentry = ERR_PTR(RETERR(-ESTALE));
28004 +       else
28005 +               dentry = (void *)parent;
28006 +       return dentry;
28007 +}
28008 +
28009 +/* common implementation of is_name_acceptable method of dir plugin:
28010 +   non-zero iff @len does not exceed reiser4_max_filename_len(@inode);
28011 +   the contents of @name are not examined */
28012 +int is_name_acceptable_common(const struct inode *inode, /* directory to check*/
28013 +                             const char *name UNUSED_ARG, /* name to check */
28014 +                             int len/* @name's length */)
28015 +{
28016 +       assert("nikita-733", inode != NULL);
28017 +       assert("nikita-734", name != NULL);
28018 +       assert("nikita-735", len > 0);
28019 +
28020 +       return len <= reiser4_max_filename_len(inode);
28021 +}
28022 +
28023 +/* there is no common implementation of build_entry_key method of dir
28024 +   plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
28025 +   plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
28026 +*/
28027 +
28028 +/* common implementation of build_readdir_key method of dir plugin:
28029 +   builds in @result a key from the readdir position kept in the fsdata
28030 +   of @dir; see reiser4_readdir_common for more details
28031 +*/
28032 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
28033 +                            reiser4_key * result/* where to store key */)
28034 +{
28035 +       reiser4_file_fsdata *fdata;
28036 +       struct inode *inode;
28037 +
28038 +       assert("nikita-1361", dir != NULL);
28039 +       assert("nikita-1362", result != NULL);
28040 +       assert("nikita-1363", dir->f_dentry != NULL);
28041 +       inode = dir->f_dentry->d_inode;
28042 +       assert("nikita-1373", inode != NULL);
28043 +
28044 +       fdata = reiser4_get_file_fsdata(dir);
28045 +       if (IS_ERR(fdata))
28046 +               return PTR_ERR(fdata);  /* no fsdata for @dir: pass the error up */
28047 +       assert("nikita-1364", fdata != NULL);
28048 +       return extract_key_from_de_id(get_inode_oid(inode),
28049 +                                     &fdata->dir.readdir.position.
28050 +                                     dir_entry_key, result);
28051 +
28052 +}
28053 +
28054 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
28055 +                            int adj);
28056 +
28057 +/* common implementation of add_entry method of dir plugin; @data is unused
28058 +   here. Returns 0, -EEXIST if the name already exists, or another error */
28059 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
28060 +                                                   * in */
28061 +                            struct dentry *where,      /* new name */
28062 +                            reiser4_object_create_data * data, /* parameters of
28063 +                                                               *  new object */
28064 +                            reiser4_dir_entry_desc * entry /* parameters of
28065 +                                                            * new directory
28066 +                                                            * entry */)
28067 +{
28068 +       int result;
28069 +       coord_t *coord;
28070 +       lock_handle lh;
28071 +       struct reiser4_dentry_fsdata *fsdata;
28072 +       reiser4_block_nr reserve;
28073 +
28074 +       assert("nikita-1114", object != NULL);
28075 +       assert("nikita-1250", where != NULL);
28076 +
28077 +       fsdata = reiser4_get_dentry_fsdata(where);
28078 +       if (unlikely(IS_ERR(fsdata)))
28079 +               return PTR_ERR(fsdata);
28080 +       /* grab the space the dir plugin estimates for adding one entry */
28081 +       reserve = inode_dir_plugin(object)->estimate.add_entry(object);
28082 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
28083 +               return RETERR(-ENOSPC);
28084 +
28085 +       init_lh(&lh);
28086 +       coord = &fsdata->dec.entry_coord;
28087 +       coord_clear_iplug(coord);
28088 +
28089 +       /* check for this entry in a directory. This is plugin method. */
28090 +       result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
28091 +                                   entry);
28092 +       if (likely(result == -ENOENT)) {
28093 +               /* add new entry. Just pass control to the directory
28094 +                  item plugin. */
28095 +               assert("nikita-1709", inode_dir_item_plugin(object));
28096 +               assert("nikita-2230", coord->node == lh.node);
28097 +               reiser4_seal_done(&fsdata->dec.entry_seal);
28098 +               result =
28099 +                   inode_dir_item_plugin(object)->s.dir.add_entry(object,
28100 +                                                                  coord, &lh,
28101 +                                                                  where,
28102 +                                                                  entry);
28103 +               if (result == 0) {
28104 +                       reiser4_adjust_dir_file(object, where,
28105 +                                               fsdata->dec.pos + 1, +1);
28106 +                       INODE_INC_FIELD(object, i_size);
28107 +               }
28108 +       } else if (result == 0) {       /* the name is already present */
28109 +               assert("nikita-2232", coord->node == lh.node);
28110 +               result = RETERR(-EEXIST);
28111 +       }
28112 +       done_lh(&lh);
28113 +
28114 +       return result;
28115 +}
28116 +
28117 +/**
28118 + * rem_entry - remove entry from directory item
28119 + * @dir: directory; its directory item plugin performs the removal
28120 + * @dentry: entry to remove; ->d_inode must be set, ->d_name is the name
28121 + * @entry: entry description, handed to the item plugin unchanged
28122 + * @coord: coordinate the item plugin extracts the key from and cuts at
28123 + * @lh: lock handle, handed to the item plugin unchanged
28124 + *
28125 + * Checks that coordinate @coord is set properly and calls item plugin
28126 + * method to cut entry.
28127 + */
28128 +static int
28129 +rem_entry(struct inode *dir, struct dentry *dentry,
28130 +         reiser4_dir_entry_desc * entry, coord_t *coord, lock_handle * lh)
28131 +{
28132 +       item_plugin *iplug;
28133 +       struct inode *child;
28134 +
28135 +       iplug = inode_dir_item_plugin(dir);
28136 +       child = dentry->d_inode;
28137 +       assert("nikita-3399", child != NULL);
28138 +
28139 +       /* check that we are really destroying an entry for @child */
28140 +       if (REISER4_DEBUG) {
28141 +               int result;
28142 +               reiser4_key key;
28143 +
28144 +               result = iplug->s.dir.extract_key(coord, &key);
28145 +               if (result != 0)
28146 +                       return result;
28147 +               if (get_key_objectid(&key) != get_inode_oid(child)) {
28148 +                       warning("nikita-3397",
28149 +                               "rem_entry: %#llx != %#llx\n",
28150 +                               get_key_objectid(&key),
28151 +                               (unsigned long long)get_inode_oid(child));
28152 +                       return RETERR(-EIO);    /* key at @coord is not @child's */
28153 +               }
28154 +       }
28155 +       return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
28156 +}
28157 +
28158 +/**
28159 + * reiser4_rem_entry_common - remove entry from a directory
28160 + * @dir: directory to remove entry from
28161 + * @dentry: name that is being removed
28162 + * @entry: description of entry being removed
28163 + *
28164 + * Common implementation of rem_entry method of dir plugin; 0 or -errno.
28165 + */
28166 +int reiser4_rem_entry_common(struct inode *dir,
28167 +                            struct dentry *dentry,
28168 +                            reiser4_dir_entry_desc * entry)
28169 +{
28170 +       int result;
28171 +       coord_t *coord;
28172 +       lock_handle lh;
28173 +       struct reiser4_dentry_fsdata *fsdata;
28174 +       __u64 tograb;
28175 +
28176 +       assert("nikita-1124", dir != NULL);
28177 +       assert("nikita-1125", dentry != NULL);
28178 +
28179 +       tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
28180 +       result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
28181 +       if (result != 0)
28182 +               return RETERR(-ENOSPC); /* any grab failure is reported as -ENOSPC */
28183 +
28184 +       init_lh(&lh);
28185 +
28186 +       /* check for this entry in a directory. This is plugin method. */
28187 +       result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
28188 +       fsdata = reiser4_get_dentry_fsdata(dentry);
28189 +       if (IS_ERR(fsdata)) {
28190 +               done_lh(&lh);
28191 +               return PTR_ERR(fsdata);
28192 +       }
28193 +
28194 +       coord = &fsdata->dec.entry_coord;
28195 +
28196 +       assert("nikita-3404",
28197 +              get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
28198 +              dir->i_size <= 1);
28199 +
28200 +       coord_clear_iplug(coord);
28201 +       if (result == 0) {
28202 +               /* remove entry. Just pass control to the directory item
28203 +                  plugin. */
28204 +               assert("vs-542", inode_dir_item_plugin(dir));
28205 +               reiser4_seal_done(&fsdata->dec.entry_seal);
28206 +               reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
28207 +               result =
28208 +                   WITH_COORD(coord,
28209 +                              rem_entry(dir, dentry, entry, coord, &lh));
28210 +               if (result == 0) {
28211 +                       if (dir->i_size >= 1)
28212 +                               INODE_DEC_FIELD(dir, i_size);
28213 +                       else {
28214 +                               warning("nikita-2509", "Dir %llu is runt",
28215 +                                       (unsigned long long)
28216 +                                       get_inode_oid(dir));
28217 +                               result = RETERR(-EIO);
28218 +                       }
28219 +
28220 +                       assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
28221 +                              dentry->d_inode->i_size != 2 ||
28222 +                              inode_dir_plugin(dentry->d_inode) == NULL);
28223 +               }
28224 +       }
28225 +       done_lh(&lh);
28226 +
28227 +       return result;
28228 +}
28229 +
28230 +static reiser4_block_nr estimate_init(struct inode *parent,
28231 +                                     struct inode *object);
28232 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
28233 +
28234 +/* Common implementation of the dir plugin's init method: reserve space
28235 +   (estimate_init()) and create the "." and ".." entries of @object.
28236 +*/
28237 +int reiser4_dir_init_common(struct inode *object,      /* new directory */
28238 +                           struct inode *parent,       /* parent directory */
28239 +                           reiser4_object_create_data * data /* info passed
28240 +                                                              * to us, this
28241 +                                                              * is filled by
28242 +                                                              * reiser4()
28243 +                                                              * syscall in
28244 +                                                              * particular */)
28245 +{
28246 +       reiser4_block_nr reserve;
28247 +
28248 +       assert("nikita-680", object != NULL);
28249 +       assert("nikita-681", S_ISDIR(object->i_mode));
28250 +       assert("nikita-682", parent != NULL);
28251 +       assert("nikita-684", data != NULL);
28252 +       assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
28253 +       assert("nikita-687", object->i_mode & S_IFDIR);
28254 +
28255 +       reserve = estimate_init(parent, object);
28256 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
28257 +               return RETERR(-ENOSPC);
28258 +
28259 +       return create_dot_dotdot(object, parent);
28260 +}
28261 +
28262 +/* Common implementation of the dir plugin's done method: remove the "."
28263 +   entry; a failed removal is only warned about and 0 is still returned.
28264 +*/
28265 +int reiser4_dir_done_common(struct inode *object/* object being deleted */)
28266 +{
28267 +       int result;
28268 +       reiser4_block_nr reserve;
28269 +       struct dentry goodby_dots;
28270 +       reiser4_dir_entry_desc entry;
28271 +
28272 +       assert("nikita-1449", object != NULL);
28273 +
28274 +       if (reiser4_inode_get_flag(object, REISER4_NO_SD))
28275 +               return 0;
28276 +
28277 +       /* of course, this can be rewritten to sweep everything in one
28278 +          reiser4_cut_tree(). */
28279 +       memset(&entry, 0, sizeof entry);
28280 +
28281 +       /* FIXME: this done method is called from reiser4_delete_dir_common
28282 +        * which reserved space already */
28283 +       reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
28284 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
28285 +               return RETERR(-ENOSPC);
28286 +
28287 +       memset(&goodby_dots, 0, sizeof goodby_dots);
28288 +       entry.obj = goodby_dots.d_inode = object;
28289 +       goodby_dots.d_name.name = ".";
28290 +       goodby_dots.d_name.len = 1;
28291 +       result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
28292 +       reiser4_free_dentry_fsdata(&goodby_dots);
28293 +       if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
28294 +               /* only worth a warning
28295 +
28296 +                  "values of \ eB\ f will give rise to dom!\n"
28297 +                  -- v6src/s2/mv.c:89
28298 +                */
28299 +               warning("nikita-2252", "Cannot remove dot of %llu: %i",
28300 +                       (unsigned long long)get_inode_oid(object), result);
28301 +       return 0;
28302 +}
28303 +
28304 +/* Common implementation of the dir plugin's attach method: only asserts
28305 +   that both inodes are non-NULL and returns 0. */
28306 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
28307 +                         struct inode *parent UNUSED_ARG)
28308 +{
28309 +       assert("nikita-2647", child != NULL);
28310 +       assert("nikita-2648", parent != NULL);
28311 +
28312 +       return 0;
28313 +}
28314 +
28315 +/* Common implementation of the dir plugin's detach method: remove the ".."
28316 +   entry from @object and, on success, drop one link from @parent.
28317 +*/
28318 +int reiser4_detach_common(struct inode *object, struct inode *parent)
28319 +{
28320 +       int result;
28321 +       struct dentry goodby_dots;
28322 +       reiser4_dir_entry_desc entry;
28323 +
28324 +       assert("nikita-2885", object != NULL);
28325 +       assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
28326 +
28327 +       memset(&entry, 0, sizeof entry);
28328 +
28329 +       /* NOTE-NIKITA this only works if @parent is -the- parent of
28330 +          @object, viz. object whose key is stored in dotdot
28331 +          entry. Wouldn't work with hard-links on directories. */
28332 +       memset(&goodby_dots, 0, sizeof goodby_dots);
28333 +       entry.obj = goodby_dots.d_inode = parent;
28334 +       goodby_dots.d_name.name = "..";
28335 +       goodby_dots.d_name.len = 2;
28336 +       result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
28337 +       reiser4_free_dentry_fsdata(&goodby_dots);
28338 +       if (result == 0) {
28339 +               /* the dot should be the only entry remaining at this time... */
28340 +               assert("nikita-3400",
28341 +                      object->i_size == 1 && object->i_nlink <= 2);
28342 +#if 0
28343 +               /* and, together with the only name a directory can have, they
28344 +                * provide for the last 2 remaining references. If we get
28345 +                * here as part of error handling during mkdir, @object
28346 +                * possibly has no name yet, so its nlink == 1. If we get here
28347 +                * from rename (targeting empty directory), it has no name
28348 +                * already, so its nlink == 1. */
28349 +               assert("nikita-3401",
28350 +                      object->i_nlink == 2 || object->i_nlink == 1);
28351 +#endif
28352 +
28353 +               /* decrement nlink of the directory that the removed ".."
28354 +                  pointed to */
28355 +               reiser4_del_nlink(parent, NULL, 0);
28356 +       }
28357 +       return result;
28358 +}
28359 +
28360 +/* Common implementation of the estimate.add_entry method of the dir
28361 +   plugin.
28362 +   The estimate assumes that adding an entry inserts one unit into an
28363 +   item (estimate_one_insert_into_item()).
28364 +*/
28365 +reiser4_block_nr estimate_add_entry_common(const struct inode *inode)
28366 +{
28367 +       return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
28368 +}
28369 +
28370 +/* Common implementation of the estimate.rem_entry method of the dir
28371 +   plugin: the cost of removing one item (estimate_one_item_removal()).
28372 +*/
28373 +reiser4_block_nr estimate_rem_entry_common(const struct inode *inode)
28374 +{
28375 +       return estimate_one_item_removal(reiser4_tree_by_inode(inode));
28376 +}
28377 +
28378 +/* Common implementation of the estimate.unlink method of the dir plugin:
28379 +   one entry removal in @object plus twice the update estimate of @parent.
28380 +*/
28381 +reiser4_block_nr
28382 +dir_estimate_unlink_common(const struct inode *parent,
28383 +                          const struct inode *object)
28384 +{
28385 +       reiser4_block_nr res;
28386 +
28387 +       /* hashed_rem_entry(object) */
28388 +       res = inode_dir_plugin(object)->estimate.rem_entry(object);
28389 +       /* del_nlink(parent) */
28390 +       res += 2 * inode_file_plugin(parent)->estimate.update(parent);
28391 +
28392 +       return res;
28393 +}
28394 +
28395 +/*
28396 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
28397 + * methods: if @inode is a light-weight file, copy uid/gid from @parent,
28398 + * as these credentials are not stored in the stat-data in this case
28399 + */
28400 +void check_light_weight(struct inode *inode, struct inode *parent)
28401 +{
28402 +       if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
28403 +               inode->i_uid = parent->i_uid;
28404 +               inode->i_gid = parent->i_gid;
28405 +               /* clear light-weight flag. If the inode is later read by
28406 +                  any other name, its [ug]id won't change. */
28407 +               reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
28408 +       }
28409 +}
28410 +
28411 +/* Look up the name given by @dentry in directory @parent; if it is found,
28412 +   the key of the object the entry points to is stored in @key. */
28413 +int reiser4_lookup_name(struct inode *parent,  /* inode of directory to lookup
28414 +                                        * for name in */
28415 +               struct dentry *dentry,  /* name to look for */
28416 +               reiser4_key * key/* place to store key */)
28417 +{
28418 +       int result;
28419 +       coord_t *coord;
28420 +       lock_handle lh;
28421 +       const char *name;
28422 +       int len;
28423 +       reiser4_dir_entry_desc entry;
28424 +       struct reiser4_dentry_fsdata *fsdata;
28425 +
28426 +       assert("nikita-1247", parent != NULL);
28427 +       assert("nikita-1248", dentry != NULL);
28428 +       assert("nikita-1123", dentry->d_name.name != NULL);
28429 +       assert("vs-1486",
28430 +              dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
28431 +
28432 +       name = dentry->d_name.name;
28433 +       len = dentry->d_name.len;
28434 +
28435 +       if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
28436 +               /* some arbitrary error code to return */
28437 +               return RETERR(-ENAMETOOLONG);
28438 +
28439 +       fsdata = reiser4_get_dentry_fsdata(dentry);
28440 +       if (IS_ERR(fsdata))
28441 +               return PTR_ERR(fsdata);
28442 +
28443 +       coord = &fsdata->dec.entry_coord;
28444 +       coord_clear_iplug(coord);
28445 +       init_lh(&lh);
28446 +
28447 +       /* find the entry in the directory under a read lock. */
28448 +       result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
28449 +                                   &entry);
28450 +       if (result == 0) {
28451 +               /* entry was found, extract object key from it. */
28452 +               result =
28453 +                   WITH_COORD(coord,
28454 +                              item_plugin_by_coord(coord)->s.dir.
28455 +                              extract_key(coord, key));
28456 +       }
28457 +       done_lh(&lh);
28458 +       return result;
28459 +
28460 +}
28461 +
28462 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
28463 +static reiser4_block_nr
28464 +estimate_init(struct inode *parent, struct inode *object)
28465 +{
28466 +       reiser4_block_nr res = 0;
28467 +
28468 +       assert("vpf-321", parent != NULL);
28469 +       assert("vpf-322", object != NULL);
28470 +
28471 +       /* hashed_add_entry(object): "." */
28472 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
28473 +       /* reiser4_add_nlink(object) */
28474 +       res += inode_file_plugin(object)->estimate.update(object);
28475 +       /* hashed_add_entry(object): ".." */
28476 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
28477 +       /* reiser4_add_nlink(parent) */
28478 +       res += inode_file_plugin(parent)->estimate.update(parent);
28479 +       /* return the accumulated sum; a constant 0 reserved nothing */
28480 +       return res;
28481 +}
28482 +
28483 +/* helper function for reiser4_dir_init_common(). Create "." and ".." */
28484 +static int create_dot_dotdot(struct inode *object/* object to create dot and
28485 +                                                 * dotdot for */ ,
28486 +                            struct inode *parent/* parent of @object */)
28487 +{
28488 +       int result;
28489 +       struct dentry dots_entry;
28490 +       reiser4_dir_entry_desc entry;
28491 +
28492 +       assert("nikita-688", object != NULL);
28493 +       assert("nikita-689", S_ISDIR(object->i_mode));
28494 +       assert("nikita-691", parent != NULL);
28495 +
28496 +       /* We store dot and dotdot as normal directory entries. This is
28497 +          not necessary, because almost all information stored in them
28498 +          is already in the stat-data of the directory; the only thing
28499 +          missing is the objectid of the grand-parent directory, which can
28500 +          easily be added there as an extension.
28501 +
28502 +          But it is done the way it is done, because not storing dot
28503 +          and dotdot will lead to the following complications:
28504 +
28505 +          . special case handling in ->lookup().
28506 +          . addition of another extension to the sd.
28507 +          . dependency on key allocation policy for stat data.
28508 +
28509 +        */
28510 +
28511 +       memset(&entry, 0, sizeof entry);
28512 +       memset(&dots_entry, 0, sizeof dots_entry);
28513 +       entry.obj = dots_entry.d_inode = object;
28514 +       dots_entry.d_name.name = ".";
28515 +       dots_entry.d_name.len = 1;
28516 +       result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
28517 +       reiser4_free_dentry_fsdata(&dots_entry);
28518 +
28519 +       if (result == 0) {
28520 +               result = reiser4_add_nlink(object, object, 0);
28521 +               if (result == 0) {
28522 +                       entry.obj = dots_entry.d_inode = parent;
28523 +                       dots_entry.d_name.name = "..";
28524 +                       dots_entry.d_name.len = 2;
28525 +                       result = reiser4_add_entry_common(object,
28526 +                                                 &dots_entry, NULL, &entry);
28527 +                       reiser4_free_dentry_fsdata(&dots_entry);
28528 +                       /* if creation of ".." failed, iput() will delete
28529 +                          object with ".". */
28530 +                       if (result == 0) {
28531 +                               result = reiser4_add_nlink(parent, object, 0);
28532 +                               if (result != 0)
28533 +                                       /*
28534 +                                        * if we failed to bump i_nlink, try
28535 +                                        * to remove ".."
28536 +                                        */
28537 +                                       reiser4_detach_common(object, parent);
28538 +                       }
28539 +               }
28540 +       }
28541 +
28542 +       if (result != 0) {
28543 +               /*
28544 +                * in the case of error, at least update stat-data so that
28545 +                * ->i_nlink updates are not lingering.
28546 +                */
28547 +               reiser4_update_sd(object);
28548 +               reiser4_update_sd(parent);
28549 +       }
28550 +
28551 +       return result;
28552 +}
28553 +
28554 +/*
28555 + * return 0 iff @coord contains a directory entry for the file with the name
28556 + * @name, 1 if the name differs, RETERR(-EIO) on a missing/wrong item plugin.
28557 + */
28558 +static int
28559 +check_item(const struct inode *dir, const coord_t *coord, const char *name)
28560 +{
28561 +       item_plugin *iplug;
28562 +       char buf[DE_NAME_BUF_LEN];
28563 +
28564 +       iplug = item_plugin_by_coord(coord);
28565 +       if (iplug == NULL) {
28566 +               warning("nikita-1135", "Cannot get item plugin");
28567 +               print_coord("coord", coord, 1);
28568 +               return RETERR(-EIO);
28569 +       } else if (item_id_by_coord(coord) !=
28570 +                  item_id_by_plugin(inode_dir_item_plugin(dir))) {
28571 +               /* item id of the current item does not match the id of the
28572 +                  items this directory is built of */
28573 +               warning("nikita-1136", "Wrong item plugin");
28574 +               print_coord("coord", coord, 1);
28575 +               return RETERR(-EIO);
28576 +       }
28577 +       assert("nikita-1137", iplug->s.dir.extract_name);
28578 +
28579 +       /* Compare name stored in this entry with name we are looking for.
28580 +
28581 +          NOTE-NIKITA Here should go code for support of something like
28582 +          unicode, code tables, etc.
28583 +        */
28584 +       return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
28585 +}
28586 +
28587 +static int
28588 +check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name)
28589 +{      /* check_item() on @coord, evaluated inside WITH_COORD() */
28590 +       return WITH_COORD(coord, check_item(dir, coord, name->name));
28591 +}
28592 +
28593 +/*
28594 + * argument package used by entry_actor() to scan entries with identical keys.
28595 + */
28596 +struct entry_actor_args {
28597 +       /* name we are looking for */
28598 +       const char *name;
28599 +       /* key of directory entry. entry_actor() scans through sequence of
28600 +        * items/units having the same key */
28601 +       reiser4_key *key;
28602 +       /* how many entries with duplicate key were scanned so far. */
28603 +       int non_uniq;
28604 +#if REISER4_USE_COLLISION_LIMIT
28605 +       /* scan limit */
28606 +       int max_non_uniq;
28607 +#endif
28608 +       /* return parameter: set to true, if ->name wasn't found */
28609 +       int not_found;
28610 +       /* what type of lock to take when moving to the next node during
28611 +        * scan */
28612 +       znode_lock_mode mode;
28613 +
28614 +       /* last coord that was visited during scan */
28615 +       coord_t last_coord;
28616 +       /* last node locked during scan */
28617 +       lock_handle last_lh;
28618 +       /* inode of directory */
28619 +       const struct inode *inode;
28620 +};
28621 +
28622 +/* Actor passed by reiser4_find_entry() to reiser4_iterate_tree(): checks
28623 +   one entry per call while scanning entries with identical keys. */
28624 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
28625 +                      coord_t *coord /* current coord */ ,
28626 +                      lock_handle * lh /* current lock handle */ ,
28627 +                      void *entry_actor_arg/* argument to scan */)
28628 +{
28629 +       reiser4_key unit_key;
28630 +       struct entry_actor_args *args;
28631 +
28632 +       assert("nikita-1131", tree != NULL);
28633 +       assert("nikita-1132", coord != NULL);
28634 +       assert("nikita-1133", entry_actor_arg != NULL);
28635 +
28636 +       args = entry_actor_arg;
28637 +       ++args->non_uniq;
28638 +#if REISER4_USE_COLLISION_LIMIT
28639 +       if (args->non_uniq > args->max_non_uniq) {
28640 +               args->not_found = 1;
28641 +               /* hash collision overflow. */
28642 +               return RETERR(-EBUSY);
28643 +       }
28644 +#endif
28645 +
28646 +       /*
28647 +        * did we just reach the end of the sequence of items/units with
28648 +        * identical keys?
28649 +        */
28650 +       if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
28651 +               assert("nikita-1791",
28652 +                      keylt(args->key, unit_key_by_coord(coord, &unit_key)));
28653 +               args->not_found = 1;
28654 +               args->last_coord.between = AFTER_UNIT;
28655 +               return 0;
28656 +       }
28657 +
28658 +       coord_dup(&args->last_coord, coord);
28659 +       /*
28660 +        * did the scan just move to the next node?
28661 +        */
28662 +       if (args->last_lh.node != lh->node) {
28663 +               int lock_result;
28664 +
28665 +               /*
28666 +                * if so, lock new node with the mode requested by the caller
28667 +                */
28668 +               done_lh(&args->last_lh);
28669 +               assert("nikita-1896", znode_is_any_locked(lh->node));
28670 +               lock_result = longterm_lock_znode(&args->last_lh, lh->node,
28671 +                                                 args->mode, ZNODE_LOCK_HIPRI);
28672 +               if (lock_result != 0)
28673 +                       return lock_result;
28674 +       }
28675 +       return check_item(args->inode, coord, args->name);
28676 +}
28677 +
28678 +/* Look for the name of dentry @de within directory @dir.
28679 +
28680 +   This is called during lookup, creation and removal of directory
28681 +   entries, and from reiser4_rename_common.
28682 +
28683 +   First calculate the key that a directory entry for the name would have.
28684 +   Search for this key in the tree. If such a key is found, scan all items
28685 +   with the same key, checking the name in each entry along the way.
28686 +*/
28687 +int reiser4_find_entry(struct inode *dir,      /* directory to scan */
28688 +                      struct dentry *de,       /* name to search for */
28689 +                      lock_handle * lh,        /* resulting lock handle */
28690 +                      znode_lock_mode mode,    /* required lock mode */
28691 +                      reiser4_dir_entry_desc * entry   /* parameters of found
28692 +                                                          directory entry */)
28693 +{
28694 +       const struct qstr *name;
28695 +       seal_t *seal;
28696 +       coord_t *coord;
28697 +       int result;
28698 +       __u32 flags;
28699 +       struct de_location *dec;
28700 +       struct reiser4_dentry_fsdata *fsdata;
28701 +
28702 +       assert("nikita-1130", lh != NULL);
28703 +       assert("nikita-1128", dir != NULL);
28704 +
28705 +       name = &de->d_name;
28706 +       assert("nikita-1129", name != NULL);
28707 +
28708 +       /* dentry private data don't require lock, because dentry
28709 +          manipulations are protected by i_mutex on parent.
28710 +
28711 +          This is not so for inodes, because there is no -the- parent in
28712 +          inode case.
28713 +        */
28714 +       fsdata = reiser4_get_dentry_fsdata(de);
28715 +       if (IS_ERR(fsdata))
28716 +               return PTR_ERR(fsdata);
28717 +       dec = &fsdata->dec;
28718 +
28719 +       coord = &dec->entry_coord;
28720 +       coord_clear_iplug(coord);
28721 +       seal = &dec->entry_seal;
28722 +       /* compose key of directory entry for @name */
28723 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
28724 +
28725 +       if (reiser4_seal_is_set(seal)) {
28726 +               /* check seal */
28727 +               result = reiser4_seal_validate(seal, coord, &entry->key,
28728 +                                              lh, mode, ZNODE_LOCK_LOPRI);
28729 +               if (result == 0) {
28730 +                       /* key was found. Check that it is really item we are
28731 +                          looking for. */
28732 +                       result = check_entry(dir, coord, name);
28733 +                       if (result == 0)
28734 +                               return 0;
28735 +               }
28736 +       }
28737 +       flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
28738 +       /*
28739 +        * find place in the tree where directory item should be located.
28740 +        */
28741 +       result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
28742 +                                      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
28743 +                                      flags, NULL/*ra_info */);
28744 +       if (result == CBK_COORD_FOUND) {
28745 +               struct entry_actor_args arg;
28746 +
28747 +               /* fast path: no hash collisions */
28748 +               result = check_entry(dir, coord, name);
28749 +               if (result == 0) {
28750 +                       reiser4_seal_init(seal, coord, &entry->key);
28751 +                       dec->pos = 0;
28752 +               } else if (result > 0) {
28753 +                       /* Iterate through all units with the same keys. */
28754 +                       arg.name = name->name;
28755 +                       arg.key = &entry->key;
28756 +                       arg.not_found = 0;
28757 +                       arg.non_uniq = 0;
28758 +#if REISER4_USE_COLLISION_LIMIT
28759 +                       arg.max_non_uniq = max_hash_collisions(dir);
28760 +                       assert("nikita-2851", arg.max_non_uniq > 1);
28761 +#endif
28762 +                       arg.mode = mode;
28763 +                       arg.inode = dir;
28764 +                       coord_init_zero(&arg.last_coord);
28765 +                       init_lh(&arg.last_lh);
28766 +
28767 +                       result = reiser4_iterate_tree
28768 +                               (reiser4_tree_by_inode(dir),
28769 +                                coord, lh,
28770 +                                entry_actor, &arg, mode, 1);
28771 +                       /* if end of the tree or extent was reached during
28772 +                          scanning. */
28773 +                       if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
28774 +                               /* step back */
28775 +                               done_lh(lh);
28776 +
28777 +                               result = zload(arg.last_coord.node);
28778 +                               if (result == 0) {
28779 +                                       coord_clear_iplug(&arg.last_coord);
28780 +                                       coord_dup(coord, &arg.last_coord);
28781 +                                       move_lh(lh, &arg.last_lh);
28782 +                                       result = RETERR(-ENOENT);
28783 +                                       zrelse(arg.last_coord.node);
28784 +                                       --arg.non_uniq;
28785 +                               }
28786 +                       }
28787 +
28788 +                       done_lh(&arg.last_lh);
28789 +                       if (result == 0)
28790 +                               reiser4_seal_init(seal, coord, &entry->key);
28791 +
28792 +                       if (result == 0 || result == -ENOENT) {
28793 +                               assert("nikita-2580", arg.non_uniq > 0);
28794 +                               dec->pos = arg.non_uniq - 1;
28795 +                       }
28796 +               }
28797 +       } else
28798 +               dec->pos = -1;
28799 +       return result;
28800 +}
28801 +
28802 +/*
28803 +   Local variables:
28804 +   c-indentation-style: "K&R"
28805 +   mode-name: "LC"
28806 +   c-basic-offset: 8
28807 +   tab-width: 8
28808 +   fill-column: 120
28809 +   scroll-step: 1
28810 +   End:
28811 +*/
28812 diff -puN /dev/null fs/reiser4/plugin/disk_format/Makefile
28813 --- /dev/null
28814 +++ a/fs/reiser4/plugin/disk_format/Makefile
28815 @@ -0,0 +1,5 @@
28816 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
28817 +
28818 +df_plugins-objs :=     \
28819 +       disk_format40.o \
28820 +       disk_format.o
28821 diff -puN /dev/null fs/reiser4/plugin/disk_format/disk_format.c
28822 --- /dev/null
28823 +++ a/fs/reiser4/plugin/disk_format/disk_format.c
28824 @@ -0,0 +1,38 @@
28825 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28826 +
28827 +#include "../../debug.h"
28828 +#include "../plugin_header.h"
28829 +#include "disk_format40.h"
28830 +#include "disk_format.h"
28831 +#include "../plugin.h"
28832 +
28833 +/* table of disk layout plugins, indexed by disk format id */
28834 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28835 +       [FORMAT40_ID] = {
28836 +               .h = {
28837 +                       .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28838 +                       .id = FORMAT40_ID,
28839 +                       .pops = NULL,
28840 +                       .label = "reiser40",
28841 +                       .desc = "standard disk layout for reiser40",
28842 +                       .linkage = {NULL, NULL}
28843 +               },
28844 +               .init_format = init_format_format40,
28845 +               .root_dir_key = root_dir_key_format40,
28846 +               .release = release_format40,
28847 +               .log_super = log_super_format40,
28848 +               .check_open = check_open_format40,
28849 +               .version_update = version_update_format40
28850 +       }
28851 +};
28852 +
28853 +/* Make Linus happy.
28854 +   Local variables:
28855 +   c-indentation-style: "K&R"
28856 +   mode-name: "LC"
28857 +   c-basic-offset: 8
28858 +   tab-width: 8
28859 +   fill-column: 120
28860 +   scroll-step: 1
28861 +   End:
28862 +*/
28863 diff -puN /dev/null fs/reiser4/plugin/disk_format/disk_format.h
28864 --- /dev/null
28865 +++ a/fs/reiser4/plugin/disk_format/disk_format.h
28866 @@ -0,0 +1,27 @@
28867 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28868 +
28869 +/* identifiers for disk layouts, they are also used as indexes in array of disk
28870 +   plugins */
28871 +
28872 +#if !defined( __REISER4_DISK_FORMAT_H__ )
28873 +#define __REISER4_DISK_FORMAT_H__
28874 +
28875 +typedef enum {
28876 +       /* standard reiser4 disk layout plugin id */
28877 +       FORMAT40_ID,
28878 +       LAST_FORMAT_ID  /* count of format ids; keep it last */
28879 +} disk_format_id;
28880 +
28881 +/* __REISER4_DISK_FORMAT_H__ */
28882 +#endif
28883 +
28884 +/* Make Linus happy.
28885 +   Local variables:
28886 +   c-indentation-style: "K&R"
28887 +   mode-name: "LC"
28888 +   c-basic-offset: 8
28889 +   tab-width: 8
28890 +   fill-column: 120
28891 +   scroll-step: 1
28892 +   End:
28893 +*/
28894 diff -puN /dev/null fs/reiser4/plugin/disk_format/disk_format40.c
28895 --- /dev/null
28896 +++ a/fs/reiser4/plugin/disk_format/disk_format40.c
28897 @@ -0,0 +1,655 @@
28898 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28899 +
28900 +#include "../../debug.h"
28901 +#include "../../dformat.h"
28902 +#include "../../key.h"
28903 +#include "../node/node.h"
28904 +#include "../space/space_allocator.h"
28905 +#include "disk_format40.h"
28906 +#include "../plugin.h"
28907 +#include "../../txnmgr.h"
28908 +#include "../../jnode.h"
28909 +#include "../../tree.h"
28910 +#include "../../super.h"
28911 +#include "../../wander.h"
28912 +#include "../../inode.h"
28913 +#include "../../ktxnmgrd.h"
28914 +#include "../../status_flags.h"
28915 +
28916 +#include <linux/types.h>       /* for __u??  */
28917 +#include <linux/fs.h>          /* for struct super_block  */
28918 +#include <linux/buffer_head.h>
28919 +
28920 +/* reiser 4.0 default disk layout */
28921 +
28922 +/* Amount of free blocks needed to perform release_format40 when fs gets
28923 +   mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
28924 +   & tx record. */
28925 +#define RELEASE_RESERVED 4
28926 +
28927 +/* The greatest supported format40 version number */
28928 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
28929 +
28930 +/* This flag (top bit of the 32-bit version field) indicates that backup
28931 +   should be updated (the update is performed by fsck) */
28932 +#define FORMAT40_UPDATE_BACKUP (1U << 31)
28933 +
28934 +/* accessors for format40_disk_super_block fields: unaligned LE -> CPU */
28935 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
28936 +{
28937 +       return le64_to_cpu(get_unaligned(&sb->block_count));
28938 +}
28939 +
28940 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
28941 +{
28942 +       return le64_to_cpu(get_unaligned(&sb->free_blocks));
28943 +}
28944 +
28945 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
28946 +{
28947 +       return le64_to_cpu(get_unaligned(&sb->root_block));
28948 +}
28949 +
28950 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
28951 +{
28952 +       return le16_to_cpu(get_unaligned(&sb->tree_height));
28953 +}
28954 +
28955 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
28956 +{
28957 +       return le64_to_cpu(get_unaligned(&sb->file_count));
28958 +}
28959 +
28960 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
28961 +{
28962 +       return le64_to_cpu(get_unaligned(&sb->oid));
28963 +}
28964 +
28965 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
28966 +{
28967 +       return le32_to_cpu(get_unaligned(&sb->mkfs_id));
28968 +}
28969 +
28970 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
28971 +{
28972 +       return le64_to_cpu(get_unaligned(&sb->flags));
28973 +}
28974 +
28975 +static __u32 get_format40_version(const format40_disk_super_block * sb)
28976 +{
28977 +       return le32_to_cpu(get_unaligned(&sb->version)) &
28978 +               ~FORMAT40_UPDATE_BACKUP;
28979 +}
28980 +
28981 +static int update_backup_version(const format40_disk_super_block * sb)
28982 +{
28983 +       return (le32_to_cpu(get_unaligned(&sb->version)) &
28984 +               FORMAT40_UPDATE_BACKUP);
28985 +}
28986 +
28987 +static int update_disk_version(const format40_disk_super_block * sb)
28988 +{
28989 +       return (get_format40_version(sb) < FORMAT40_VERSION);
28990 +}
28991 +
28992 +static int incomplete_compatibility(const format40_disk_super_block * sb)
28993 +{
28994 +       return (get_format40_version(sb) > FORMAT40_VERSION);
28995 +}
28996 +
28997 +static format40_super_info *get_sb_info(struct super_block *super)
28998 +{
28999 +       return &get_super_private(super)->u.format40;
29000 +}
29001 +
29002 +static int consult_diskmap(struct super_block *s)
29003 +{
29004 +       format40_super_info *info;
29005 +       journal_location *jloc;
29006 +
29007 +       info = get_sb_info(s);
29008 +       jloc = &get_super_private(s)->jloc;
29009 +       /* Default format-specific locations, if there is nothing in
29010 +        * diskmap */
29011 +       jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
29012 +       jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
29013 +       info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
29014 +#ifdef CONFIG_REISER4_BADBLOCKS
29015 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
29016 +                                 &jloc->footer);
29017 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
29018 +                                 &jloc->header);
29019 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
29020 +                                 &info->loc.super);
29021 +#endif
29022 +       return 0;
29023 +}
29024 +
29025 +/* Read the format40 super block from the location recorded in loc.super and
29026 +   verify its magic. On success publish the block counters to @s and return
29027 +   the buffer head (caller must brelse() it); on failure return an ERR_PTR. */
29028 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
29029 +                                                           *s)
29030 +{
29031 +       struct buffer_head *super_bh;
29032 +       format40_disk_super_block *disk_sb;
29033 +       __u64 nr_total, nr_free;
29034 +
29035 +       assert("umka-487", s != NULL);
29036 +
29037 +       super_bh = sb_bread(s, get_sb_info(s)->loc.super);
29038 +       if (super_bh == NULL)
29039 +               return ERR_PTR(RETERR(-EIO));
29040 +
29041 +       disk_sb = (format40_disk_super_block *) super_bh->b_data;
29042 +       if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
29043 +               brelse(super_bh);
29044 +               return ERR_PTR(RETERR(-EINVAL));
29045 +       }
29046 +
29047 +       /* decode through the shared accessors rather than open-coding it */
29048 +       nr_total = get_format40_block_count(disk_sb);
29049 +       nr_free = get_format40_free_blocks(disk_sb);
29050 +       reiser4_set_block_count(s, nr_total);
29051 +       reiser4_set_data_blocks(s, nr_total - nr_free);
29052 +       reiser4_set_free_blocks(s, nr_free);
29053 +
29054 +       return super_bh;
29055 +}
29056 +
29057 +/* find the most recent version of super block. This is called after journal is
29058 +   replayed */
29059 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
29060 +{
29061 +       /* Here the most recent superblock copy has to be read. However, as
29062 +          journal replay isn't complete, we are using
29063 +          find_a_disk_format40_super_block() function. */
29064 +       return find_a_disk_format40_super_block(s);
29065 +}
29066 +
29067 +static int get_super_jnode(struct super_block *s)
29068 +{
29069 +       reiser4_super_info_data *sbinfo = get_super_private(s);
29070 +       jnode *sb_jnode;
29071 +       int ret;
29072 +
29073 +       sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
29074 +
29075 +       ret = jload(sb_jnode);
29076 +
29077 +       if (ret) {
29078 +               reiser4_drop_io_head(sb_jnode);
29079 +               return ret;
29080 +       }
29081 +
29082 +       pin_jnode_data(sb_jnode);
29083 +       jrelse(sb_jnode);
29084 +
29085 +       sbinfo->u.format40.sb_jnode = sb_jnode;
29086 +
29087 +       return 0;
29088 +}
29089 +
29090 +static void done_super_jnode(struct super_block *s)
29091 +{
29092 +       jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
29093 +
29094 +       if (sb_jnode) {
29095 +               unpin_jnode_data(sb_jnode);
29096 +               reiser4_drop_io_head(sb_jnode);
29097 +       }
29098 +}
29099 +
29100 +typedef enum format40_init_stage {
29101 +       NONE_DONE = 0,
29102 +       CONSULT_DISKMAP,
29103 +       FIND_A_SUPER,
29104 +       INIT_JOURNAL_INFO,
29105 +       INIT_STATUS,
29106 +       JOURNAL_REPLAY,
29107 +       READ_SUPER,
29108 +       KEY_CHECK,
29109 +       INIT_OID,
29110 +       INIT_TREE,
29111 +       JOURNAL_RECOVER,
29112 +       INIT_SA,
29113 +       INIT_JNODE,
29114 +       ALL_DONE
29115 +} format40_init_stage;
29116 +
29117 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
29118 +{
29119 +       format40_disk_super_block *sb_copy;
29120 +
29121 +       sb_copy = kmalloc(sizeof(format40_disk_super_block),
29122 +                         reiser4_ctx_gfp_mask_get());
29123 +       if (sb_copy == NULL)
29124 +               return ERR_PTR(RETERR(-ENOMEM));
29125 +       memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
29126 +              sizeof(format40_disk_super_block));
29127 +       return sb_copy;
29128 +}
29129 +
29130 +static int check_key_format(const format40_disk_super_block *sb_copy)
29131 +{
29132 +       if (!equi(REISER4_LARGE_KEY,
29133 +                 get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
29134 +               warning("nikita-3228", "Key format mismatch. "
29135 +                       "Only %s keys are supported.",
29136 +                       REISER4_LARGE_KEY ? "large" : "small");
29137 +               return RETERR(-EINVAL);
29138 +       }
29139 +       return 0;
29140 +}
29141 +
29142 +/**
29143 + * try_init_format40
29144 + * @super:
29145 + * @stage:
29146 + *
29147 + */
29148 +static int try_init_format40(struct super_block *super,
29149 +                            format40_init_stage *stage)
29150 +{
29151 +       int result;
29152 +       struct buffer_head *super_bh;
29153 +       reiser4_super_info_data *sbinfo;
29154 +       format40_disk_super_block *sb_copy;
29155 +       tree_level height;
29156 +       reiser4_block_nr root_block;
29157 +       node_plugin *nplug;
29158 +
29159 +       assert("vs-475", super != NULL);
29160 +       assert("vs-474", get_super_private(super));
29161 +
29162 +       *stage = NONE_DONE;
29163 +
29164 +       result = consult_diskmap(super);
29165 +       if (result)
29166 +               return result;
29167 +       *stage = CONSULT_DISKMAP;
29168 +
29169 +       super_bh = find_a_disk_format40_super_block(super);
29170 +       if (IS_ERR(super_bh))
29171 +               return PTR_ERR(super_bh);
29172 +       brelse(super_bh);
29173 +       *stage = FIND_A_SUPER;
29174 +
29175 +       /* ok, we are sure that filesystem format is a format40 format */
29176 +
29177 +       /* map jnodes for journal control blocks (header, footer) to disk  */
29178 +       result = reiser4_init_journal_info(super);
29179 +       if (result)
29180 +               return result;
29181 +       *stage = INIT_JOURNAL_INFO;
29182 +
29183 +       /* ok, we are sure that filesystem format is a format40 format */
29184 +       /* Now check it's state */
29185 +       result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
29186 +       if (result != 0 && result != -EINVAL)
29187 +               /* -EINVAL means there is no magic, so probably just old
29188 +                * fs. */
29189 +               return result;
29190 +       *stage = INIT_STATUS;
29191 +
29192 +       result = reiser4_status_query(NULL, NULL);
29193 +       if (result == REISER4_STATUS_MOUNT_WARN)
29194 +               notice("vpf-1363", "Warning: mounting %s with errors.",
29195 +                      super->s_id);
29196 +       if (result == REISER4_STATUS_MOUNT_RO)
29197 +               notice("vpf-1364", "Warning: mounting %s with fatal errors,"
29198 +                      " forcing read-only mount.", super->s_id);
29199 +       result = reiser4_journal_replay(super);
29200 +       if (result)
29201 +               return result;
29202 +       *stage = JOURNAL_REPLAY;
29203 +
29204 +       super_bh = read_super_block(super);
29205 +       if (IS_ERR(super_bh))
29206 +               return PTR_ERR(super_bh);
29207 +       *stage = READ_SUPER;
29208 +
29209 +       /* allocate and make a copy of format40_disk_super_block */
29210 +       sb_copy = copy_sb(super_bh);
29211 +       brelse(super_bh);
29212 +
29213 +       if (IS_ERR(sb_copy))
29214 +               return PTR_ERR(sb_copy);
29215 +       printk("reiser4: %s: found disk format 4.0.%u.\n",
29216 +              super->s_id,
29217 +              get_format40_version(sb_copy));
29218 +       if (incomplete_compatibility(sb_copy))
29219 +               printk("reiser4: Warning: The last completely supported "
29220 +                      "version of disk format40 is %u. Some objects of "
29221 +                      "the semantic tree can be unaccessible.\n",
29222 +                      FORMAT40_VERSION);
29223 +       /* make sure that key format of kernel and filesystem match */
29224 +       result = check_key_format(sb_copy);
29225 +       if (result) {
29226 +               kfree(sb_copy);
29227 +               return result;
29228 +       }
29229 +       *stage = KEY_CHECK;
29230 +
29231 +       result = oid_init_allocator(super, get_format40_file_count(sb_copy),
29232 +                                   get_format40_oid(sb_copy));
29233 +       if (result) {
29234 +               kfree(sb_copy);
29235 +               return result;
29236 +       }
29237 +       *stage = INIT_OID;
29238 +
29239 +       /* get things necessary to init reiser4_tree */
29240 +       root_block = get_format40_root_block(sb_copy);
29241 +       height = get_format40_tree_height(sb_copy);
29242 +       nplug = node_plugin_by_id(NODE40_ID);
29243 +
29244 +       /* initialize reiser4_super_info_data */
29245 +       sbinfo = get_super_private(super);
29246 +       assert("", sbinfo->tree.super == super);
29247 +       /* init reiser4_tree for the filesystem */
29248 +       result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
29249 +       if (result) {
29250 +               kfree(sb_copy);
29251 +               return result;
29252 +       }
29253 +       *stage = INIT_TREE;
29254 +
29255 +       /*
29256 +        * initialize reiser4_super_info_data with data from format40 super
29257 +        * block
29258 +        */
29259 +       sbinfo->default_uid = 0;
29260 +       sbinfo->default_gid = 0;
29261 +       sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
29262 +       /* number of blocks in filesystem and reserved space */
29263 +       reiser4_set_block_count(super, get_format40_block_count(sb_copy));
29264 +       sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
29265 +       sbinfo->version = get_format40_version(sb_copy);
29266 +       kfree(sb_copy);
29267 +
29268 +       if (update_backup_version(sb_copy))
29269 +               printk("reiser4: Warning: metadata backup is not updated. "
29270 +                      "Please run 'fsck.reiser4 --fix' on %s.\n",
29271 +                      super->s_id);
29272 +
29273 +       sbinfo->fsuid = 0;
29274 +       sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
29275 +                                                * are not supported */
29276 +       sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN);     /* all nodes in
29277 +                                                                * layout 40 are
29278 +                                                                * of one
29279 +                                                                * plugin */
29280 +       /* sbinfo->tmgr is initialized already */
29281 +
29282 +       /* recover sb data which were logged separately from sb block */
29283 +
29284 +       /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
29285 +        * oid_init_allocator() and reiser4_set_free_blocks() with new
29286 +        * data. What's the reason to call them above? */
29287 +       result = reiser4_journal_recover_sb_data(super);
29288 +       if (result != 0)
29289 +               return result;
29290 +       *stage = JOURNAL_RECOVER;
29291 +
29292 +       /*
29293 +        * Set number of used blocks.  The number of used blocks is not stored
29294 +        * neither in on-disk super block nor in the journal footer blocks.  At
29295 +        * this moment actual values of total blocks and free block counters
29296 +        * are set in the reiser4 super block (in-memory structure) and we can
29297 +        * calculate number of used blocks from them.
29298 +        */
29299 +       reiser4_set_data_blocks(super,
29300 +                               reiser4_block_count(super) -
29301 +                               reiser4_free_blocks(super));
29302 +
29303 +#if REISER4_DEBUG
29304 +       sbinfo->min_blocks_used = 16 /* reserved area */  +
29305 +               2 /* super blocks */  +
29306 +               2 /* journal footer and header */ ;
29307 +#endif
29308 +
29309 +       /* init disk space allocator */
29310 +       result = sa_init_allocator(reiser4_get_space_allocator(super),
29311 +                                  super, NULL);
29312 +       if (result)
29313 +               return result;
29314 +       *stage = INIT_SA;
29315 +
29316 +       result = get_super_jnode(super);
29317 +       if (result == 0)
29318 +               *stage = ALL_DONE;
29319 +       return result;
29320 +}
29321 +
29322 +/* plugin->u.format.get_ready: run try_init_format40(), undo it on failure */
29323 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
29324 +{
29325 +       int result;
29326 +       format40_init_stage stage;
29327 +       /* on failure, unwind from the reached stage down; cases fall through */
29328 +       result = try_init_format40(s, &stage);
29329 +       switch (stage) {
29330 +       case ALL_DONE:
29331 +               assert("nikita-3458", result == 0);
29332 +               break;
29333 +       case INIT_JNODE:
29334 +               done_super_jnode(s);    /* fall through */
29335 +       case INIT_SA:
29336 +               sa_destroy_allocator(reiser4_get_space_allocator(s), s);
29337 +       case JOURNAL_RECOVER:   /* fall through */
29338 +       case INIT_TREE:
29339 +               reiser4_done_tree(&get_super_private(s)->tree);
29340 +       case INIT_OID:          /* fall through */
29341 +       case KEY_CHECK:
29342 +       case READ_SUPER:
29343 +       case JOURNAL_REPLAY:
29344 +       case INIT_STATUS:
29345 +               reiser4_status_finish();        /* fall through */
29346 +       case INIT_JOURNAL_INFO:
29347 +               reiser4_done_journal_info(s);   /* fall through */
29348 +       case FIND_A_SUPER:
29349 +       case CONSULT_DISKMAP:
29350 +       case NONE_DONE:
29351 +               break;
29352 +       default:
29353 +               impossible("nikita-3457", "init stage: %i", stage);
29354 +       }
29355 +       /* only a successful mount is checked for space: keep the init error */
29356 +       if (result == 0 && !rofs_super(s) &&
29357 +           reiser4_free_blocks(s) < RELEASE_RESERVED)
29358 +               return RETERR(-ENOSPC);
29359 +       return result;
29360 +}
29361 +
29362 +static void pack_format40_super(const struct super_block *s, char *data)
29363 +{
29364 +       format40_disk_super_block *super_data =
29365 +           (format40_disk_super_block *) data;
29366 +
29367 +       reiser4_super_info_data *sbinfo = get_super_private(s);
29368 +
29369 +       assert("zam-591", data != NULL);
29370 +
29371 +       put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
29372 +                     &super_data->free_blocks);
29373 +
29374 +       put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
29375 +                     &super_data->root_block);
29376 +
29377 +       put_unaligned(cpu_to_le64(oid_next(s)),
29378 +                     &super_data->oid);
29379 +
29380 +       put_unaligned(cpu_to_le64(oids_used(s)),
29381 +                     &super_data->file_count);
29382 +
29383 +       put_unaligned(cpu_to_le16(sbinfo->tree.height),
29384 +                     &super_data->tree_height);
29385 +
29386 +       if (update_disk_version(super_data)) {
29387 +               __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
29388 +
29389 +               put_unaligned(cpu_to_le32(version), &super_data->version);
29390 +       }
29391 +}
29392 +
29393 +/* plugin->u.format.log_super
29394 +   return a jnode which should be added to transaction when the super block
29395 +   gets logged */
29396 +jnode *log_super_format40(struct super_block *s)
29397 +{
29398 +       jnode *sb_jnode;
29399 +
29400 +       sb_jnode = get_super_private(s)->u.format40.sb_jnode;
29401 +       /* NOTE(review): jload() result is ignored; presumably safe as data is pinned */
29402 +       jload(sb_jnode);
29403 +       /* write the current in-memory counters into the loaded super block image */
29404 +       pack_format40_super(s, jdata(sb_jnode));
29405 +
29406 +       jrelse(sb_jnode);
29407 +
29408 +       return sb_jnode;
29409 +}
29410 +
29411 +/* plugin->u.format.release */
29412 +int release_format40(struct super_block *s)
29413 +{
29414 +       int ret;
29415 +       reiser4_super_info_data *sbinfo;
29416 +
29417 +       sbinfo = get_super_private(s);
29418 +       assert("zam-579", sbinfo != NULL);
29419 +
29420 +       if (!rofs_super(s)) {
29421 +               ret = reiser4_capture_super_block(s);
29422 +               if (ret != 0)
29423 +                       warning("vs-898",
29424 +                               "reiser4_capture_super_block failed: %d",
29425 +                               ret);
29426 +
29427 +               ret = txnmgr_force_commit_all(s, 1);
29428 +               if (ret != 0)
29429 +                       warning("jmacd-74438", "txn_force failed: %d", ret);
29430 +
29431 +               all_grabbed2free();
29432 +       }
29433 +
29434 +       sa_destroy_allocator(&sbinfo->space_allocator, s);
29435 +       reiser4_done_journal_info(s);
29436 +       done_super_jnode(s);
29437 +
29438 +       rcu_barrier();
29439 +       reiser4_done_tree(&sbinfo->tree);
29440 +       /* call finish_rcu(), because some znode were "released" in
29441 +        * reiser4_done_tree(). */
29442 +       rcu_barrier();
29443 +
29444 +       return 0;
29445 +}
29446 +
29447 +#define FORMAT40_ROOT_LOCALITY 41
29448 +#define FORMAT40_ROOT_OBJECTID 42
29449 +
29450 +/* plugin->u.format.root_dir_key */
29451 +const reiser4_key *root_dir_key_format40(const struct super_block *super
29452 +                                        UNUSED_ARG)
29453 +{
29454 +       static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
29455 +               .el = {
29456 +                       __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
29457 +#if REISER4_LARGE_KEY
29458 +                       ON_LARGE_KEY(0ull,)
29459 +#endif
29460 +                       __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
29461 +                       0ull
29462 +               }
29463 +       };
29464 +
29465 +       return &FORMAT40_ROOT_DIR_KEY;
29466 +}
29467 +
29468 +/* plugin->u.format.check_open.
29469 +   Check the opened object for validity. For now it checks only for a valid oid
29470 +   & locality; this can be improved later, and its work may depend on the mount
29471 +   options. */
29472 +int check_open_format40(const struct inode *object)
29473 +{
29474 +       oid_t max, oid;
29475 +
29476 +       max = oid_next(object->i_sb) - 1;
29477 +
29478 +       /* Check the oid. */
29479 +       oid = get_inode_oid(object);
29480 +       if (oid > max) {
29481 +               warning("vpf-1360", "The object with the oid %llu "
29482 +                       "greater then the max used oid %llu found.",
29483 +                       (unsigned long long)oid, (unsigned long long)max);
29484 +
29485 +               return RETERR(-EIO);
29486 +       }
29487 +
29488 +       /* Check the locality. */
29489 +       oid = reiser4_inode_data(object)->locality_id;
29490 +       if (oid > max) {
29491 +               warning("vpf-1361", "The object with the locality %llu "
29492 +                       "greater then the max used oid %llu found.",
29493 +                       (unsigned long long)oid, (unsigned long long)max);
29494 +
29495 +               return RETERR(-EIO);
29496 +       }
29497 +
29498 +       return 0;
29499 +}
29500 +
29501 +/* plugin->u.format.version_update.
29502 +   Perform all version update operations from the on-disk
29503 +   format40_disk_super_block.version on disk to FORMAT40_VERSION.
29504 + */
29505 +int version_update_format40(struct super_block *super) {
29506 +       txn_handle * trans;
29507 +       lock_handle lh;
29508 +       txn_atom *atom;
29509 +       int ret;
29510 +
29511 +       /* Nothing to do if RO mount or the on-disk version is not less. */
29512 +       if (super->s_flags & MS_RDONLY)
29513 +               return 0;
29514 +
29515 +       if (get_super_private(super)->version >= FORMAT40_VERSION)
29516 +               return 0;
29517 +
29518 +       printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
29519 +              "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
29520 +              "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
29521 +
29522 +       /* Mark the uber znode dirty to call log_super on write_logs. */
29523 +       init_lh(&lh);
29524 +       ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
29525 +                            ZNODE_LOCK_HIPRI, &lh);
29526 +       if (ret != 0)
29527 +               return ret;
29528 +
29529 +       znode_make_dirty(lh.node);
29530 +       done_lh(&lh);
29531 +
29532 +       /* The backup blocks are not updated here; fsck does it (see above). */
29533 +
29534 +       /* Force write_logs immediately. */
29535 +       trans = get_current_context()->trans;
29536 +       atom = get_current_atom_locked();
29537 +       assert("vpf-1906", atom != NULL);
29538 +       /* NOTE(review): presumably force_commit_atom() drops both locks - confirm */
29539 +       spin_lock_txnh(trans);
29540 +       return force_commit_atom(trans);
29541 +}
29542 +
29543 +/* Make Linus happy.
29544 +   Local variables:
29545 +   c-indentation-style: "K&R"
29546 +   mode-name: "LC"
29547 +   c-basic-offset: 8
29548 +   tab-width: 8
29549 +   fill-column: 120
29550 +   scroll-step: 1
29551 +   End:
29552 +*/
29553 diff -puN /dev/null fs/reiser4/plugin/disk_format/disk_format40.h
29554 --- /dev/null
29555 +++ a/fs/reiser4/plugin/disk_format/disk_format40.h
29556 @@ -0,0 +1,109 @@
29557 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
29558 +
29559 +/* this file contains:
29560 +   - definition of ondisk super block of standard disk layout for
29561 +     reiser 4.0 (layout 40)
29562 +   - definition of layout 40 specific portion of in-core super block
29563 +   - declarations of functions implementing methods of layout plugin
29564 +     for layout 40
29565 +   - declarations of functions used to get/set fields in layout 40 super block
29566 +*/
29567 +
29568 +#ifndef __DISK_FORMAT40_H__
29569 +#define __DISK_FORMAT40_H__
29570 +
29571 +/* magic for default reiser4 layout */
29572 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
29573 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
29574 +
29575 +#include "../../dformat.h"
29576 +
29577 +#include <linux/fs.h>          /* for struct super_block  */
29578 +
29579 +typedef enum {
29580 +       FORMAT40_LARGE_KEYS
29581 +} format40_flags;
29582 +
29583 +/* ondisk super block for format 40. It is 512 bytes long */
29584 +typedef struct format40_disk_super_block {
29585 +       /*   0 */ d64 block_count;
29586 +       /* number of blocks in a filesystem */
29587 +       /*   8 */ d64 free_blocks;
29588 +       /* number of free blocks */
29589 +       /*  16 */ d64 root_block;
29590 +       /* filesystem tree root block */
29591 +       /*  24 */ d64 oid;
29592 +       /* smallest free objectid */
29593 +       /*  32 */ d64 file_count;
29594 +       /* number of files in a filesystem */
29595 +       /*  40 */ d64 flushes;
29596 +       /* number of times super block was
29597 +          flushed. Needed if format 40
29598 +          will have few super blocks */
29599 +       /*  48 */ d32 mkfs_id;
29600 +       /* unique identifier of fs */
29601 +       /*  52 */ char magic[16];
29602 +       /* magic string ReIsEr40FoRmAt */
29603 +       /*  68 */ d16 tree_height;
29604 +       /* height of filesystem tree */
29605 +       /*  70 */ d16 formatting_policy;
29606 +       /* not used anymore */
29607 +       /*  72 */ d64 flags;
29608 +       /*  80 */ d32 version;
29609 +       /* on-disk format version number
29610 +          initially assigned by mkfs as the greatest format40
29611 +          version number supported by reiser4progs and updated
29612 +          at mount time in accordance with the greatest format40
29613 +          version number supported by kernel.
29614 +          It is used by fsck to catch possible corruption and
29615 +          for various compatibility issues */
29616 +       /*  84 */ char not_used[428];
29617 +} format40_disk_super_block;
29618 +
29619 +/* format 40 specific part of reiser4_super_info_data */
29620 +typedef struct format40_super_info {
29621 +/*     format40_disk_super_block actual_sb; */
29622 +       jnode *sb_jnode;
29623 +       struct {
29624 +               reiser4_block_nr super;
29625 +       } loc;
29626 +} format40_super_info;
29627 +
29628 +/* Defines for journal header and footer respectively. */
29629 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
29630 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
29631 +
29632 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
29633 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
29634 +
29635 +#define FORMAT40_STATUS_BLOCKNR \
29636 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
29637 +
29638 +/* Diskmap declarations */
29639 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
29640 +#define FORMAT40_SUPER 1
29641 +#define FORMAT40_JH 2
29642 +#define FORMAT40_JF 3
29643 +
29644 +/* declarations of functions implementing methods of layout plugin for
29645 +   format 40. The functions themselves are in disk_format40.c */
29646 +extern int init_format_format40(struct super_block *, void *data);
29647 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
29648 +extern int release_format40(struct super_block *s);
29649 +extern jnode *log_super_format40(struct super_block *s);
29650 +extern int check_open_format40(const struct inode *object);
29651 +extern int version_update_format40(struct super_block *super);
29652 +
29653 +/* __DISK_FORMAT40_H__ */
29654 +#endif
29655 +
29656 +/* Make Linus happy.
29657 +   Local variables:
29658 +   c-indentation-style: "K&R"
29659 +   mode-name: "LC"
29660 +   c-basic-offset: 8
29661 +   tab-width: 8
29662 +   fill-column: 120
29663 +   scroll-step: 1
29664 +   End:
29665 +*/
29666 diff -puN /dev/null fs/reiser4/plugin/fibration.c
29667 --- /dev/null
29668 +++ a/fs/reiser4/plugin/fibration.c
29669 @@ -0,0 +1,175 @@
29670 +/* Copyright 2004 by Hans Reiser, licensing governed by
29671 + * reiser4/README */
29672 +
29673 +/* Directory fibrations */
29674 +
29675 +/*
29676 + * Suppose we have a directory tree with sources of some project. During
29677 + * compilation .o files are created within this tree. This makes access
29678 + * to the original source files less efficient, because source files are
29679 + * now "diluted" by object files: default directory plugin uses prefix
29680 + * of a file name as a part of the key for directory entry (and this
29681 + * part is also inherited by the key of file body). This means that
29682 + * foo.o will be located close to foo.c and foo.h in the tree.
29683 + *
29684 + * To avoid this effect the directory plugin fills the highest 7 (originally
29685 + * unused) bits of the second component of the directory entry key
29686 + * with a bit-pattern depending on the file name (see
29687 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
29688 + * "fibre". Fibre of the file name key is inherited by key of stat data
29689 + * and keys of file body (in the case of REISER4_LARGE_KEY).
29690 + *
29691 + * Fibre for a given file is chosen by per-directory fibration
29692 + * plugin. Names within given fibre are ordered lexicographically.
29693 + */
29694 +
29695 +#include "../debug.h"
29696 +#include "plugin_header.h"
29697 +#include "plugin.h"
29698 +#include "../super.h"
29699 +#include "../inode.h"
29700 +
29701 +#include <linux/types.h>
29702 +
29703 +static const int fibre_shift = 57;
29704 +
29705 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
29706 +
29707 +/*
29708 + * Trivial fibration: all files of directory are just ordered
29709 + * lexicographically.
29710 + */
29711 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
29712 +{
29713 +       return FIBRE_NO(0);
29714 +}
29715 +
29716 +/*
29717 + * dot-o fibration: object files (*.o) get fibre 1, everything else fibre 0.
29718 + */
29719 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
29720 +{
29721 +       /* anything not ending in ".o" after a non-empty stem stays in fibre 0 */
29722 +       if (len <= 2 || name[len - 1] != 'o' || name[len - 2] != '.')
29723 +               return FIBRE_NO(0);
29724 +
29725 +       return FIBRE_NO(1);
29726 +}
29727 +
29728 +/*
29729 + * ext.1 fibration: a name with a one-character extension is placed into the
29730 + * fibre numbered by that character (file "foo.h" goes into fibre "h"); all
29731 + * other names share fibre 0.
29732 + */
29733 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
29734 +{
29735 +       /* names without a one-character extension stay in fibre 0 */
29736 +       if (len <= 2 || name[len - 2] != '.')
29737 +               return FIBRE_NO(0);
29738 +       return FIBRE_NO(name[len - 1]);
29739 +}
29740 +
29741 +/*
29742 + * ext.3 fibration: the fibre of a name with a three-character extension is
29743 + * derived from the sum of the extension characters; other names get fibre 0.
29744 + */
29745 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
29746 +{
29747 +       /* names without a three-character extension stay in fibre 0 */
29748 +       if (len <= 4 || name[len - 4] != '.')
29749 +               return FIBRE_NO(0);
29750 +       return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
29751 +}
29752 +
29753 +static int change_fibration(struct inode *inode,
29754 +                           reiser4_plugin * plugin,
29755 +                           pset_member memb)
29756 +{
29757 +       int result;
29758 +
29759 +       assert("nikita-3503", inode != NULL);
29760 +       assert("nikita-3504", plugin != NULL);
29761 +
29762 +       assert("nikita-3505", is_reiser4_inode(inode));
29763 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
29764 +       assert("nikita-3507",
29765 +              plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
29766 +
29767 +       result = 0;
29768 +       if (inode_fibration_plugin(inode) == NULL ||
29769 +           inode_fibration_plugin(inode)->h.id != plugin->h.id) {
29770 +               if (is_dir_empty(inode) == 0)
29771 +                       result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
29772 +                                                PSET_FIBRATION, plugin);
29773 +               else
29774 +                       result = RETERR(-ENOTEMPTY);
29775 +
29776 +       }
29777 +       return result;
29778 +}
29779 +
29780 +static reiser4_plugin_ops fibration_plugin_ops = {
29781 +       .init = NULL,
29782 +       .load = NULL,
29783 +       .save_len = NULL,
29784 +       .save = NULL,
29785 +       .change = change_fibration
29786 +};
29787 +
29788 +/* fibration plugins */
29789 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
29790 +       [FIBRATION_LEXICOGRAPHIC] = {
29791 +               .h = {
29792 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
29793 +                       .id = FIBRATION_LEXICOGRAPHIC,
29794 +                       .pops = &fibration_plugin_ops,
29795 +                       .label = "lexicographic",
29796 +                       .desc = "no fibration",
29797 +                       .linkage = {NULL, NULL}
29798 +               },
29799 +               .fibre = fibre_trivial
29800 +       },
29801 +       [FIBRATION_DOT_O] = {
29802 +               .h = {
29803 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
29804 +                       .id = FIBRATION_DOT_O,
29805 +                       .pops = &fibration_plugin_ops,
29806 +                       .label = "dot-o",
29807 +                       .desc = "fibrate .o files separately",
29808 +                       .linkage = {NULL, NULL}
29809 +               },
29810 +               .fibre = fibre_dot_o
29811 +       },
29812 +       [FIBRATION_EXT_1] = {
29813 +               .h = {
29814 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
29815 +                       .id = FIBRATION_EXT_1,
29816 +                       .pops = &fibration_plugin_ops,
29817 +                       .label = "ext-1",
29818 +                       .desc = "fibrate file by single character extension",
29819 +                       .linkage = {NULL, NULL}
29820 +               },
29821 +               .fibre = fibre_ext_1
29822 +       },
29823 +       [FIBRATION_EXT_3] = {
29824 +               .h = {
29825 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
29826 +                       .id = FIBRATION_EXT_3,
29827 +                       .pops = &fibration_plugin_ops,
29828 +                       .label = "ext-3",
29829 +                       .desc = "fibrate file by three character extension",
29830 +                       .linkage = {NULL, NULL}
29831 +               },
29832 +               .fibre = fibre_ext_3
29833 +       }
29834 +};
29835 +
29836 +/*
29837 + * Local variables:
29838 + * c-indentation-style: "K&R"
29839 + * mode-name: "LC"
29840 + * c-basic-offset: 8
29841 + * tab-width: 8
29842 + * fill-column: 79
29843 + * End:
29844 + */
29845 diff -puN /dev/null fs/reiser4/plugin/fibration.h
29846 --- /dev/null
29847 +++ a/fs/reiser4/plugin/fibration.h
29848 @@ -0,0 +1,37 @@
29849 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
29850 +
29851 +/* Fibration plugin used by hashed directory plugin to segment content
29852 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
29853 +
29854 +#if !defined(__FS_REISER4_PLUGIN_FIBRATION_H__)
29855 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
29856 +
29857 +#include "plugin_header.h"
29858 +
29859 +typedef struct fibration_plugin {
29860 +       /* generic fields */
29861 +       plugin_header h;
29862 +       /* returns the fibre for the name @name (@len characters) in @dir */
29863 +        __u64(*fibre) (const struct inode *dir, const char *name, int len);
29864 +} fibration_plugin;
29865 +
29866 +typedef enum {
29867 +       FIBRATION_LEXICOGRAPHIC,        /* "lexicographic": no fibration */
29868 +       FIBRATION_DOT_O,                /* "dot-o": .o files separately */
29869 +       FIBRATION_EXT_1,                /* "ext-1": by 1-character extension */
29870 +       FIBRATION_EXT_3,                /* "ext-3": by 3-character extension */
29871 +       LAST_FIBRATION_ID               /* size of the fibration plugin table */
29872 +} reiser4_fibration_id;
29873 +
29874 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
29875 +#endif
29876 +
29877 +/* Make Linus happy.
29878 +   Local variables:
29879 +   c-indentation-style: "K&R"
29880 +   mode-name: "LC"
29881 +   c-basic-offset: 8
29882 +   tab-width: 8
29883 +   fill-column: 120
29884 +   End:
29885 +*/
29886 diff -puN /dev/null fs/reiser4/plugin/file/Makefile
29887 --- /dev/null
29888 +++ a/fs/reiser4/plugin/file/Makefile
29889 @@ -0,0 +1,7 @@
29890 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
29891 +
29892 +file_plugins-objs :=           \
29893 +       file.o                  \
29894 +       tail_conversion.o       \
29895 +       symlink.o               \
29896 +       cryptcompress.o
29897 diff -puN /dev/null fs/reiser4/plugin/file/cryptcompress.c
29898 --- /dev/null
29899 +++ a/fs/reiser4/plugin/file/cryptcompress.c
29900 @@ -0,0 +1,3775 @@
29901 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
29902 +   reiser4/README */
29903 +/*
29904 + * Written by Edward Shishkin.
29905 + *
29906 + * Implementations of inode/file/address_space operations
29907 + * specific for cryptcompress file plugin which manages
29908 + * regular files built of compressed and(or) encrypted bodies.
29909 + * See http://dev.namesys.com/CryptcompressPlugin for details.
29910 + */
29911 +
29912 +#include "../../inode.h"
29913 +#include "../cluster.h"
29914 +#include "../object.h"
29915 +#include "../../tree_walk.h"
29916 +#include "cryptcompress.h"
29917 +
29918 +#include <linux/pagevec.h>
29919 +#include <asm/uaccess.h>
29920 +#include <linux/swap.h>
29921 +#include <linux/writeback.h>
29922 +#include <linux/random.h>
29923 +#include <linux/scatterlist.h>
29924 +
29925 +/*
29926 +               Managing primary and secondary caches by Reiser4
29927 +               cryptcompress file plugin. Synchronization scheme.
29928 +
29929 +
29930 +                                             +------------------+
29931 +                        +------------------->|    tfm stream    |
29932 +                        |                    | (compressed data)|
29933 +                  flush |                    +------------------+
29934 +                        +-----------------+           |
29935 +                        |(->)longterm lock|           V
29936 +--+        writepages() |                 |        +-***-+  reiser4        +---+
29937 +  |                     |                 +--+     | *** |  storage tree   |   |
29938 +  |                     |                    |     +-***-+  (primary cache)|   |
29939 +u | write()   (secondary| cache)             V    /   |   \                |   |
29940 +s | ---->  +----+ +----+ +----+ +----+     +-***** ******* **----+  ---->  | d |
29941 +e |        |    | |page cluster |    |     | **disk cluster**    |         | i |
29942 +r | <----  +----+ +----+ +----+ +----+     +-***** **********----+  <----  | s |
29943 +  | read()              ^                      ^      |                    | k |
29944 +  |                     |     (->)longterm lock|      |           page_io()|   |
29945 +  |                     |                      +------+                    |   |
29946 +--+         readpages() |                             |                    +---+
29947 +                        |                             V
29948 +                        |                    +------------------+
29949 +                        +--------------------|    tfm stream    |
29950 +                                             |   (plain text)   |
29951 +                                             +------------------+
29952 +*/
29953 +
29954 +/* get cryptcompress specific portion of inode */
29955 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
29956 +{
29957 +       return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
29958 +}
29959 +
29960 +/* plugin->u.file.init_inode_data: reset cryptcompress part of @inode */
29961 +void init_inode_data_cryptcompress(struct inode *inode,
29962 +                                  reiser4_object_create_data * crd,
29963 +                                  int create)
29964 +{
29965 +       struct cryptcompress_info *data = cryptcompress_inode_data(inode);
29966 +
29967 +       assert("edward-685", data != NULL);
29968 +       /* wipe everything, then set up the fields with non-zero defaults */
29969 +       memset(data, 0, sizeof(*data));
29970 +
29971 +       mutex_init(&data->checkin_mutex);
29972 +       data->trunc_index = ULONG_MAX;
29973 +       turn_on_compression(data);
29974 +       set_lattice_factor(data, MIN_LATTICE_FACTOR);
29975 +
29976 +       init_inode_ordering(inode, crd, create);
29977 +}
29978 +
29979 +/* The following is a part of reiser4 cipher key manager
29980 +   which is called when opening/creating a cryptcompress file */
29981 +
29982 +/* get/set cipher key info */
29983 +struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
29984 +{
29985 +       assert("edward-90", inode != NULL);
29986 +       assert("edward-91", reiser4_inode_data(inode) != NULL);
29987 +       return cryptcompress_inode_data(inode)->crypt;
29988 +}
29989 +
29990 +static void set_inode_crypto_info (struct inode * inode,
29991 +                                  struct reiser4_crypto_info * info)
29992 +{
29993 +       cryptcompress_inode_data(inode)->crypt = info;
29994 +}
29995 +
29996 +/* allocate a cipher key info for @inode; ERR_PTR(-ENOMEM) on failure */
29997 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
29998 +{
29999 +       struct reiser4_crypto_info *crypt;
30000 +       int keyid_len;
30001 +
30002 +       crypt = kzalloc(sizeof(*crypt), reiser4_ctx_gfp_mask_get());
30003 +       if (crypt == NULL)
30004 +               return ERR_PTR(-ENOMEM);
30005 +       /* the keyid buffer is sized by the digest plugin's fipsize */
30006 +       keyid_len = inode_digest_plugin(inode)->fipsize;
30007 +       crypt->keyid = kmalloc(keyid_len, reiser4_ctx_gfp_mask_get());
30008 +       if (crypt->keyid == NULL) {
30009 +               kfree(crypt);
30010 +               return ERR_PTR(-ENOMEM);
30011 +       }
30012 +       crypt->host = inode;
30013 +       return crypt;
30014 +}
30015 +
30016 +#if 0
30017 +/* allocate/free low-level info for cipher and digest
30018 +   transforms */
30019 +static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
30020 +{
30021 +       struct crypto_blkcipher * ctfm = NULL;
30022 +       struct crypto_hash      * dtfm = NULL;
30023 +       cipher_plugin * cplug = inode_cipher_plugin(info->host);
30024 +       digest_plugin * dplug = inode_digest_plugin(info->host);
30025 +
30026 +       if (cplug->alloc) {
30027 +               ctfm = cplug->alloc();
30028 +               if (IS_ERR(ctfm)) {
30029 +                       warning("edward-1364",
30030 +                               "Can not allocate info for %s\n",
30031 +                               cplug->h.desc);
30032 +                       return RETERR(PTR_ERR(ctfm));
30033 +               }
30034 +       }
30035 +       info_set_cipher(info, ctfm);
30036 +       if (dplug->alloc) {
30037 +               dtfm = dplug->alloc();
30038 +               if (IS_ERR(dtfm)) {
30039 +                       warning("edward-1365",
30040 +                               "Can not allocate info for %s\n",
30041 +                               dplug->h.desc);
30042 +                       goto unhappy_with_digest;
30043 +               }
30044 +       }
30045 +       info_set_digest(info, dtfm);
30046 +       return 0;
30047 + unhappy_with_digest:
30048 +       if (cplug->free) {
30049 +               cplug->free(ctfm);
30050 +               info_set_cipher(info, NULL);
30051 +       }
30052 +       return RETERR(PTR_ERR(dtfm));
30053 +}
30054 +#endif
30055 +
30056 +static void
30057 +free_crypto_tfms(struct reiser4_crypto_info * info)
30058 +{
30059 +       assert("edward-1366", info != NULL);
30060 +       /* without a cipher transform no digest transform is expected either */
30061 +       if (info_get_cipher(info) == NULL) {
30062 +               assert("edward-1601", !info_get_digest(info));
30063 +               return;
30064 +       }
30065 +       inode_cipher_plugin(info->host)->free(info_get_cipher(info));
30066 +       info_set_cipher(info, NULL);
30067 +       inode_digest_plugin(info->host)->free(info_get_digest(info));
30068 +       info_set_digest(info, NULL);
30069 +}
30070 +
30071 +#if 0
30072 +/* create a key fingerprint for disk stat-data */
30073 +static int create_keyid (struct reiser4_crypto_info * info,
30074 +                        struct reiser4_crypto_data * data)
30075 +{
30076 +       int ret = -ENOMEM;
30077 +       size_t blk, pad;
30078 +       __u8 * dmem;
30079 +       __u8 * cmem;
30080 +       struct hash_desc      ddesc;
30081 +       struct blkcipher_desc cdesc;
30082 +       struct scatterlist sg;
30083 +
30084 +       assert("edward-1367", info != NULL);
30085 +       assert("edward-1368", info->keyid != NULL);
30086 +
30087 +       ddesc.tfm = info_get_digest(info);
30088 +       ddesc.flags = 0;
30089 +       cdesc.tfm = info_get_cipher(info);
30090 +       cdesc.flags = 0;
30091 +
30092 +       dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
30093 +                      reiser4_ctx_gfp_mask_get());
30094 +       if (!dmem)
30095 +               goto exit1;
30096 +
30097 +       blk = crypto_blkcipher_blocksize(cdesc.tfm);
30098 +
30099 +       pad = data->keyid_size % blk;
30100 +       pad = (pad ? blk - pad : 0);
30101 +
30102 +       cmem = kmalloc((size_t)data->keyid_size + pad,
30103 +                      reiser4_ctx_gfp_mask_get());
30104 +       if (!cmem)
30105 +               goto exit2;
30106 +       memcpy(cmem, data->keyid, data->keyid_size);
30107 +       memset(cmem + data->keyid_size, 0, pad);
30108 +
30109 +       sg_init_one(&sg, cmem, data->keyid_size + pad);
30110 +
30111 +       ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
30112 +                                      data->keyid_size + pad);
30113 +       if (ret) {
30114 +               warning("edward-1369",
30115 +                       "encryption failed flags=%x\n", cdesc.flags);
30116 +               goto exit3;
30117 +       }
30118 +       ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
30119 +       if (ret) {
30120 +               warning("edward-1602",
30121 +                       "digest failed flags=%x\n", ddesc.flags);
30122 +               goto exit3;
30123 +       }
30124 +       memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
30125 + exit3:
30126 +       kfree(cmem);
30127 + exit2:
30128 +       kfree(dmem);
30129 + exit1:
30130 +       return ret;
30131 +}
30132 +#endif
30133 +
30134 +static void destroy_keyid(struct reiser4_crypto_info * info)
30135 +{
30136 +       /* release the keyid buffer owned by @info */
30137 +       assert("edward-1370", info != NULL);
30138 +       assert("edward-1371", info->keyid != NULL);
30139 +       kfree(info->keyid);
30140 +}
30141 +
30142 +static void __free_crypto_info (struct inode * inode)
30143 +{
30144 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
30145 +       assert("edward-1372", info != NULL);
30146 +       /* release the transforms and the keyid before the info itself */
30147 +       free_crypto_tfms(info);
30148 +       destroy_keyid(info);
30149 +       kfree(info);
30150 +}
30151 +
30152 +#if 0
30153 +static void instantiate_crypto_info(struct reiser4_crypto_info * info)
30154 +{
30155 +       assert("edward-1373", info != NULL);
30156 +       assert("edward-1374", info->inst == 0);
30157 +       info->inst = 1;
30158 +}
30159 +#endif
30160 +
30161 +static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
30162 +{
30163 +       assert("edward-1375", info != NULL);
30164 +       info->inst = 0;
30165 +}
30166 +
30167 +#if 0
30168 +static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
30169 +{
30170 +       return info->inst;
30171 +}
30172 +
30173 +static int inode_has_cipher_key(struct inode * inode)
30174 +{
30175 +       assert("edward-1376", inode != NULL);
30176 +       return inode_crypto_info(inode) &&
30177 +               is_crypto_info_instantiated(inode_crypto_info(inode));
30178 +}
30179 +#endif
30180 +
30181 +static void free_crypto_info (struct inode * inode)
30182 +{
30183 +       uninstantiate_crypto_info(inode_crypto_info(inode));
30184 +       __free_crypto_info(inode);
30185 +}
30186 +
30187 +static int need_cipher(struct inode * inode)
30188 +{
30189 +       return inode_cipher_plugin(inode) !=
30190 +               cipher_plugin_by_id(NONE_CIPHER_ID);
30191 +}
30192 +
30193 +/* Parse @data which contains a (uninstantiated) cipher key imported
30194 +   from user space and build an instantiated cipher key info for @object.
30195 +   Returns the info or ERR_PTR(); nothing here attaches it to @object */
30196 +#if 0
30197 +struct reiser4_crypto_info * create_crypto_info(struct inode * object,
30198 +                                 struct reiser4_crypto_data * data)
30199 +{
30200 +       int ret;
30201 +       struct reiser4_crypto_info * info;
30202 +
30203 +       assert("edward-1377", data != NULL);
30204 +       assert("edward-1378", need_cipher(object));
30205 +       /* only objects with the directory file plugin are accepted */
30206 +       if (inode_file_plugin(object) !=
30207 +           file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
30208 +               return ERR_PTR(-EINVAL);
30209 +
30210 +       info = reiser4_alloc_crypto_info(object);
30211 +       if (IS_ERR(info))
30212 +               return info;
30213 +       ret = alloc_crypto_tfms(info);
30214 +       if (ret)
30215 +               goto err;
30216 +       /* instantiating a key */
30217 +       ret = crypto_blkcipher_setkey(info_get_cipher(info),
30218 +                                     data->key,
30219 +                                     data->keysize);
30220 +       if (ret) {
30221 +               warning("edward-1379",
30222 +                       "setkey failed flags=%x",
30223 +                       crypto_blkcipher_get_flags(info_get_cipher(info)));
30224 +               goto err;
30225 +       }
30226 +       info->keysize = data->keysize;
30227 +       ret = create_keyid(info, data);
30228 +       if (ret)
30229 +               goto err;
30230 +       instantiate_crypto_info(info);
30231 +       return info;
30232 + err:   /* NOTE(review): frees the info found via @object, not local @info */
30233 +       __free_crypto_info(object);
30234 +       return ERR_PTR(ret);
30235 +}
30236 +#endif
30237 +
30238 +/* increment/decrement a load counter when
30239 +   attaching/detaching the crypto-stat to any object */
30240 +static void load_crypto_info(struct reiser4_crypto_info * info)
30241 +{
30242 +       assert("edward-1380", info != NULL);
30243 +       inc_keyload_count(info);
30244 +}
30245 +
30246 +static void unload_crypto_info(struct inode * inode)
30247 +{
30248 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
30249 +
30250 +       assert("edward-1381", info->keyload_count > 0);
30251 +       dec_keyload_count(info);
30252 +       /* final release once the load counter has dropped to zero */
30253 +       if (info->keyload_count == 0)
30254 +               free_crypto_info(inode);
30255 +}
30256 +
30257 +/* attach/detach an existing crypto-stat */
30258 +void reiser4_attach_crypto_info(struct inode * inode,
30259 +                               struct reiser4_crypto_info * info)
30260 +{
30261 +       assert("edward-1382", inode != NULL);
30262 +       assert("edward-1383", info != NULL);
30263 +       assert("edward-1384", inode_crypto_info(inode) == NULL);
30264 +
30265 +       set_inode_crypto_info(inode, info);
30266 +       load_crypto_info(info);
30267 +}
30268 +
30269 +/* returns true, if crypto stat can be attached to the @host */
30270 +#if REISER4_DEBUG
30271 +static int host_allows_crypto_info(struct inode * host)
30272 +{
30273 +       file_plugin *fplug;
30274 +
30275 +       fplug = inode_file_plugin(host);
30276 +       /*
30277 +        * Only objects managed by the cryptcompress file plugin
30278 +        * are allowed to carry a crypto stat.
30279 +        */
30280 +       if (fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID)
30281 +               return 1;
30282 +
30283 +       return 0;
30284 +}
30285 +#endif  /*  REISER4_DEBUG  */
30286 +
30287 +static void reiser4_detach_crypto_info(struct inode * inode)
30288 +{
30289 +       assert("edward-1385", inode != NULL);
30290 +       assert("edward-1386", host_allows_crypto_info(inode));
30291 +       /* drop this inode's load of the info (if any), then forget it */
30292 +       if (inode_crypto_info(inode) != NULL)
30293 +               unload_crypto_info(inode);
30294 +       set_inode_crypto_info(inode, NULL);
30295 +}
30296 +
30297 +#if 0
30298 +
30299 +/* compare fingerprints of @child and @parent */
30300 +static int keyid_eq(struct reiser4_crypto_info * child,
30301 +                   struct reiser4_crypto_info * parent)
30302 +{
30303 +       return !memcmp(child->keyid,
30304 +                      parent->keyid,
30305 +                      info_digest_plugin(parent)->fipsize);
30306 +}
30307 +
30308 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
30309 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
30310 +{
30311 +       if (!need_cipher(child))
30312 +               return 0;
30313 +       /* the child is created */
30314 +       if (!inode_crypto_info(child))
30315 +               return 1;
30316 +       /* the child is looked up */
30317 +       if (!inode_crypto_info(parent))
30318 +               return 0;
30319 +       return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
30320 +               inode_digest_plugin(child) == inode_digest_plugin(parent) &&
30321 +               inode_crypto_info(child)->keysize ==
30322 +               inode_crypto_info(parent)->keysize &&
30323 +               keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
30324 +}
30325 +#endif
30326 +
30327 +/* helper functions for ->create() method of the cryptcompress plugin */
30328 +static int inode_set_crypto(struct inode * object)
30329 +{
30330 +       if (inode_crypto_info(object) != NULL) {
30331 +               /* key info is attached: flag the crypto stat extension */
30332 +               reiser4_inode_data(object)->extmask |= (1 << CRYPTO_STAT);
30333 +               return 0;
30334 +       }
30335 +       /* no key info: an error only if a real cipher plugin is set */
30336 +       if (need_cipher(object))
30337 +               return RETERR(-EINVAL);
30338 +       /* the file is not to be encrypted */
30339 +       return 0;
30340 +}
30341 +
30342 +static int inode_init_compression(struct inode * object)
30343 +{
30344 +       assert("edward-1461", object != NULL);
30345 +       /* ->init() is optional; without it there is nothing to set up */
30346 +       if (inode_compression_plugin(object)->init == NULL)
30347 +               return 0;
30348 +       return inode_compression_plugin(object)->init();
30349 +}
30350 +
30351 +static int inode_check_cluster(struct inode * object)
30352 +{
30353 +       assert("edward-696", object != NULL);
30354 +       /* 0 if the cluster spans >= a page and its shift fits an int's bits */
30355 +       if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
30356 +               warning("edward-1320", "Can not support '%s' "
30357 +                       "logical clusters (less then page size)",
30358 +                       inode_cluster_plugin(object)->h.label);
30359 +               return RETERR(-EINVAL);
30360 +       }
30361 +       if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))) {
30362 +               warning("edward-1463", "Can not support '%s' "
30363 +                       "logical clusters (too big for transform)",
30364 +                       inode_cluster_plugin(object)->h.label);
30365 +               return RETERR(-EINVAL);
30366 +       }
30367 +       return 0;
30368 +}
30369 +
30370 +/* plugin->destroy_inode() */
30371 +void destroy_inode_cryptcompress(struct inode * inode)
30372 +{
30373 +       assert("edward-1464", INODE_PGCOUNT(inode) == 0);
30374 +       reiser4_detach_crypto_info(inode);
30375 +       return;
30376 +}
30377 +
30378 +/* plugin->create_object():
30379 +. set the file plugin bit in the plugin mask
30380 +. check crypto info, init compression, validate the cluster size
30381 +. save everything in disk stat-data
30382 +Returns 0 on success; on failure detaches crypto info and returns the error
30383 +*/
30384 +int create_object_cryptcompress(struct inode *object, struct inode *parent,
30385 +                               reiser4_object_create_data * data)
30386 +{
30387 +       int result;
30388 +       reiser4_inode *info;
30389 +
30390 +       assert("edward-23", object != NULL);
30391 +       assert("edward-24", parent != NULL);
30392 +       assert("edward-30", data != NULL);
30393 +       assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
30394 +       assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
30395 +
30396 +       info = reiser4_inode_data(object);
30397 +
30398 +       assert("edward-29", info != NULL);
30399 +
30400 +       /* set file bit */
30401 +       info->plugin_mask |= (1 << PSET_FILE);
30402 +
30403 +       /* set crypto */
30404 +       result = inode_set_crypto(object);
30405 +       if (result)
30406 +               goto error;
30407 +       /* set compression */
30408 +       result = inode_init_compression(object);
30409 +       if (result)
30410 +               goto error;
30411 +       /* set cluster */
30412 +       result = inode_check_cluster(object);
30413 +       if (result)
30414 +               goto error;
30415 +
30416 +       /* save everything in disk stat-data */
30417 +       result = write_sd_by_inode_common(object);
30418 +       if (!result)
30419 +               return 0;
30420 + error:
30421 +       reiser4_detach_crypto_info(object);
30422 +       return result;
30423 +}
30424 +
30425 +/* plugin->open() */
30426 +int open_cryptcompress(struct inode * inode, struct file * file)
30427 +{
30428 +       return 0;
30429 +}
30430 +
30431 +/* returns a blocksize, the attribute of a cipher algorithm */
30432 +static unsigned int
30433 +cipher_blocksize(struct inode * inode)
30434 +{
30435 +       assert("edward-758", need_cipher(inode));
30436 +       assert("edward-1400", inode_crypto_info(inode) != NULL);
30437 +       return crypto_blkcipher_blocksize
30438 +               (info_get_cipher(inode_crypto_info(inode)));
30439 +}
30440 +
30441 +/* returns offset translated by scale factor of the crypto-algorithm */
30442 +static loff_t inode_scaled_offset (struct inode * inode,
30443 +                                  const loff_t src_off /* input offset */)
30444 +{
30445 +       assert("edward-97", inode != NULL);
30446 +       /* no cipher, or @src_off is the minimal/maximal key offset: as is */
30447 +       if (!need_cipher(inode) ||
30448 +           src_off == get_key_offset(reiser4_min_key()) ||
30449 +           src_off == get_key_offset(reiser4_max_key()))
30450 +               return src_off;
30451 +       /* otherwise the cipher plugin scales it using the cipher blocksize */
30452 +       return inode_cipher_plugin(inode)->scale(inode,
30453 +                                                cipher_blocksize(inode),
30454 +                                                src_off);
30455 +}
30456 +
30457 +/* returns disk cluster size */
30458 +size_t inode_scaled_cluster_size(struct inode * inode)
30459 +{
30460 +       assert("edward-110", inode != NULL);
30461 +
30462 +       return inode_scaled_offset(inode, inode_cluster_size(inode));
30463 +}
30464 +
30465 +/* set clust->old_nrpages and clust->nr_pages for cluster clust->index */
30466 +static void set_cluster_nrpages(struct cluster_handle * clust,
30467 +                               struct inode *inode)
30468 +{
30469 +       struct reiser4_slide * win;
30470 +
30471 +       assert("edward-180", clust != NULL);
30472 +       assert("edward-1040", inode != NULL);
30473 +       /* page count derived from lbytes() alone, ignoring any window */
30474 +       clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
30475 +       win = clust->win;
30476 +       if (!win) {
30477 +               clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
30478 +               return;
30479 +       }
30480 +       assert("edward-1176", clust->op != LC_INVAL);
30481 +       assert("edward-1064", win->off + win->count + win->delta != 0);
30482 +       /* a hole window covering the whole logical cluster needs no pages */
30483 +       if (win->stat == HOLE_WINDOW &&
30484 +           win->off == 0 && win->count == inode_cluster_size(inode)) {
30485 +               /* special case: writing a "fake" logical cluster */
30486 +               clust->nr_pages = 0;
30487 +               return;
30488 +       }
30489 +       clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
30490 +                                           lbytes(clust->index, inode)));
30491 +       return;
30492 +}
30493 +
30494 +/* plugin->key_by_inode()
30495 +   build key of a disk cluster; always returns 0 */
30496 +int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
30497 +                              reiser4_key * key)
30498 +{
30499 +       assert("edward-64", inode != 0);
30500 +       /* the maximal key offset is taken as is, without cluster mapping */
30501 +       if (likely(off != get_key_offset(reiser4_max_key())))
30502 +               off = off_to_clust_to_off(off, inode);
30503 +       if (inode_crypto_info(inode) != NULL)
30504 +               off = inode_scaled_offset(inode, off);
30505 +       /* start from the common key, then override its offset */
30506 +       key_by_inode_and_offset_common(inode, 0, key);
30507 +       set_key_offset(key, (__u64)off);
30508 +       return 0;
30509 +}
30510 +
30511 +/* plugin->flow_by_inode() */
30512 +/* fills @f, the flow used to read/write disk clusters, from the arguments */
30513 +int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
30514 +                               int user,       /* 1: @buf is of user space,
30515 +                                                  0: kernel space */
30516 +                               loff_t size,    /* @buf size */
30517 +                               loff_t off,     /* offset to start io from */
30518 +                               rw_op op,       /* READ or WRITE */
30519 +                               flow_t * f      /* resulting flow */)
30520 +{
30521 +       assert("edward-436", f != NULL);
30522 +       assert("edward-149", inode != NULL);
30523 +       assert("edward-150", inode_file_plugin(inode) != NULL);
30524 +       assert("edward-1465", user == 0); /* we use flow to read/write
30525 +                                           disk clusters located in
30526 +                                           kernel space */
30527 +       f->length = size;
30528 +       memcpy(&f->data, &buf, sizeof(buf)); /* copies the pointer itself */
30529 +       f->user = user;
30530 +       f->op = op;
30531 +       /* the flow's key is built from @inode and @off; its result is returned */
30532 +       return key_by_inode_cryptcompress(inode, off, &f->key);
30533 +}
30534 +
30535 +static int
30536 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
30537 +                           znode_lock_mode lock_mode)
30538 +{
30539 +       coord_t *coord;
30540 +
30541 +       assert("edward-704", hint != NULL);
30542 +       assert("edward-1089", !hint_is_valid(hint));
30543 +       assert("edward-706", hint->lh.owner == NULL);
30544 +
30545 +       /* hint either not set or set by different operation */
30546 +       if (hint == NULL || !hint_is_set(hint) || hint->mode != lock_mode)
30547 +               return RETERR(-E_REPEAT);
30548 +
30549 +       /* hint is set for different key */
30550 +       if (hint->offset != get_key_offset(key))
30551 +               return RETERR(-E_REPEAT);
30552 +
30553 +       assert("edward-707", reiser4_schedulable());
30554 +
30555 +       coord = &hint->ext_coord.coord;
30556 +
30557 +       return reiser4_seal_validate(&hint->seal, coord,
30558 +                                    key, &hint->lh, lock_mode,
30559 +                                    ZNODE_LOCK_LOPRI);
30560 +}
30561 +
30562 +/* reserve disk space for writing a logical cluster: 0 or the grab error */
30563 +static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
30564 +{
30565 +       int result = 0;
30566 +
30567 +       assert("edward-965", reiser4_schedulable());
30568 +       assert("edward-439", inode != NULL);
30569 +       assert("edward-440", clust != NULL);
30570 +       assert("edward-441", clust->pages != NULL);
30571 +
30572 +       if (clust->nr_pages == 0) {
30573 +               assert("edward-1152", clust->win != NULL);
30574 +               assert("edward-1153", clust->win->stat == HOLE_WINDOW);
30575 +               /* don't reserve disk space for fake logical cluster */
30576 +               return 0;
30577 +       }
30578 +       assert("edward-442", jprivate(clust->pages[0]) != NULL);
30579 +       /* grab the sum of the insert and the update estimates */
30580 +       result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
30581 +                                         estimate_update_cluster(inode),
30582 +                                         BA_CAN_COMMIT);
30583 +       if (result)
30584 +               return result;
30585 +       clust->reserved = 1;
30586 +       grabbed2cluster_reserved(estimate_insert_cluster(inode) +
30587 +                                estimate_update_cluster(inode));
30588 +#if REISER4_DEBUG
30589 +       clust->reserved_prepped = estimate_update_cluster(inode);
30590 +       clust->reserved_unprepped = estimate_insert_cluster(inode);
30591 +#endif
30592 +       /* there can be space grabbed by txnmgr_force_commit_all */
30593 +       return 0;
30594 +}
30595 +
30596 +/* Free @count blocks of reserved disk space if writing a logical cluster fails, and clear ch->reserved */
30597 +static void free_reserved4cluster(struct inode *inode,
30598 +                                 struct cluster_handle *ch, int count)
30599 +{
30600 +       assert("edward-967", ch->reserved == 1); /* a reservation must exist; @inode is not used here */
30601 +
30602 +       cluster_reserved2free(count);
30603 +       ch->reserved = 0;
30604 +}
30605 +
30606 +/* The core search procedure of the cryptcompress plugin: in-place lookup via @hint, else coord_by_key().
30607 +   If the returned value is not cbk_errored, then the current znode is locked */
30608 +static int find_cluster_item(hint_t * hint,
30609 +                            const reiser4_key * key, /* key of the item we are
30610 +                                                        looking for */
30611 +                            znode_lock_mode lock_mode /* which lock */ ,
30612 +                            ra_info_t * ra_info, lookup_bias bias, __u32 flags)
30613 +{
30614 +       int result;
30615 +       reiser4_key ikey;
30616 +       int went_right = 0; /* set when the lookup moved to the right neighbor node */
30617 +       coord_t *coord = &hint->ext_coord.coord;
30618 +       coord_t orig = *coord; /* saved so that not_found can roll the coord back */
30619 +
30620 +       assert("edward-152", hint != NULL); /* NOTE(review): @hint is already dereferenced by the initializers above */
30621 +
30622 +       if (!hint_is_valid(hint)) {
30623 +               result = cryptcompress_hint_validate(hint, key, lock_mode);
30624 +               if (result == -E_REPEAT) /* hint is unusable: fall back to tree traversal */
30625 +                       goto traverse_tree;
30626 +               else if (result) {
30627 +                       assert("edward-1216", 0);
30628 +                       return result;
30629 +               }
30630 +               hint_set_valid(hint);
30631 +       }
30632 +       assert("edward-709", znode_is_any_locked(coord->node));
30633 +
30634 +       /* In-place lookup: we only need to check whether the
30635 +          item next to @coord matches @key */
30636 +
30637 +       if (equal_to_rdk(coord->node, key)) { /* @key equals the right delimiting key: look in the right neighbor */
30638 +               result = goto_right_neighbor(coord, &hint->lh);
30639 +               if (result == -E_NO_NEIGHBOR) {
30640 +                       assert("edward-1217", 0);
30641 +                       return RETERR(-EIO);
30642 +               }
30643 +               if (result)
30644 +                       return result;
30645 +               assert("edward-1218", equal_to_ldk(coord->node, key));
30646 +               went_right = 1;
30647 +       } else { /* otherwise step to the first unit of the next item in the same node */
30648 +               coord->item_pos++;
30649 +               coord->unit_pos = 0;
30650 +               coord->between = AT_UNIT;
30651 +       }
30652 +       result = zload(coord->node); /* paired with zrelse() on every path below */
30653 +       if (result)
30654 +               return result;
30655 +       assert("edward-1219", !node_is_empty(coord->node));
30656 +
30657 +       if (!coord_is_existing_item(coord)) {
30658 +               zrelse(coord->node);
30659 +               goto not_found;
30660 +       }
30661 +       item_key_by_coord(coord, &ikey);
30662 +       zrelse(coord->node);
30663 +       if (!keyeq(key, &ikey))
30664 +               goto not_found;
30665 +       /* Ok, item is found, update node counts */
30666 +       if (went_right)
30667 +               dclust_inc_extension_ncount(hint);
30668 +       return CBK_COORD_FOUND;
30669 +
30670 + not_found:
30671 +       assert("edward-1220", coord->item_pos > 0);
30672 +       //coord->item_pos--;
30673 +       /* roll back to the coord saved on entry */
30674 +       *coord = orig;
30675 +       ON_DEBUG(coord_update_v(coord));
30676 +       return CBK_COORD_NOTFOUND;
30677 +
30678 + traverse_tree:
30679 +       assert("edward-713", hint->lh.owner == NULL);
30680 +       assert("edward-714", reiser4_schedulable());
30681 +
30682 +       reiser4_unset_hint(hint); /* start from a clean hint before the full lookup */
30683 +       dclust_init_extension(hint);
30684 +       coord_init_zero(coord);
30685 +       result = coord_by_key(current_tree, key, coord, &hint->lh,
30686 +                             lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
30687 +                             CBK_UNIQUE | flags, ra_info);
30688 +       if (cbk_errored(result))
30689 +               return result;
30690 +       if(result == CBK_COORD_FOUND)
30691 +               dclust_inc_extension_ncount(hint);
30692 +       hint_set_valid(hint); /* hint is marked valid for both found and not-found results */
30693 +       return result;
30694 +}
30695 +
30696 +/* This function is called by the deflate[inflate] manager when
30697 +   creating a transformed/plain stream to check if we should
30698 +   create/cut some overhead. If this returns true, then @oh
30699 +   contains the size of this overhead (@oh is written for both WRITE_OP and READ_OP).
30700 + */
30701 +static int need_cut_or_align(struct inode * inode,
30702 +                            struct cluster_handle * ch, rw_op rw, int * oh)
30703 +{
30704 +       struct tfm_cluster * tc = &ch->tc;
30705 +       switch (rw) {
30706 +       case WRITE_OP: /* estimate align */
30707 +               *oh = tc->len % cipher_blocksize(inode); /* bytes past the last full cipher block */
30708 +               if (*oh != 0)
30709 +                       return 1;
30710 +               break;
30711 +       case READ_OP:  /* estimate cut */
30712 +               *oh = *(tfm_output_data(ch) + tc->len - 1); /* last byte of the output stream */
30713 +               break;
30714 +       default:
30715 +               impossible("edward-1401", "bad option");
30716 +       }
30717 +       return (tc->len != tc->lsize); /* reached for READ_OP and for an already aligned WRITE_OP */
30718 +}
30719 +
30720 +/* Create (WRITE_OP) or cut (READ_OP) the cipher overhead of a transformed/plain stream by adjusting ch->tc.len */
30721 +static void align_or_cut_overhead(struct inode * inode,
30722 +                                 struct cluster_handle * ch, rw_op rw)
30723 +{
30724 +       unsigned int oh; /* NOTE(review): passed below as &oh to a parameter declared 'int *' */
30725 +       cipher_plugin * cplug = inode_cipher_plugin(inode);
30726 +
30727 +       assert("edward-1402", need_cipher(inode));
30728 +
30729 +       if (!need_cut_or_align(inode, ch, rw, &oh))
30730 +               return;
30731 +       switch (rw) {
30732 +       case WRITE_OP: /* do align */
30733 +               ch->tc.len +=
30734 +                       cplug->align_stream(tfm_input_data(ch) +
30735 +                                           ch->tc.len, ch->tc.len,
30736 +                                           cipher_blocksize(inode)); /* grow len by the value align_stream returns */
30737 +               *(tfm_input_data(ch) + ch->tc.len - 1) =
30738 +                       cipher_blocksize(inode) - oh; /* store the overhead size in the last byte */
30739 +               break;
30740 +       case READ_OP: /* do cut */
30741 +               assert("edward-1403", oh <= cipher_blocksize(inode));
30742 +               ch->tc.len -= oh; /* drop the overhead read by need_cut_or_align() */
30743 +               break;
30744 +       default:
30745 +               impossible("edward-1404", "bad option");
30746 +       }
30747 +       return;
30748 +}
30749 +
30750 +static unsigned max_cipher_overhead(struct inode * inode) /* cipher overhead bound used by save_compressed() */
30751 +{
30752 +       if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
30753 +               return 0; /* no cipher, or the cipher plugin has no align_stream method */
30754 +       return cipher_blocksize(inode); /* otherwise one cipher block */
30755 +}
30756 +
30757 +static int deflate_overhead(struct inode *inode) /* bytes a checksum adds to a compressed cluster */
30758 +{
30759 +       return (inode_compression_plugin(inode)->
30760 +               checksum ? DC_CHECKSUM_SIZE : 0); /* 0 when the compression plugin has no checksum method */
30761 +}
30762 +
30763 +static unsigned deflate_overrun(struct inode * inode, int ilen) /* coa_overrun() of the inode's compression plugin */
30764 +{
30765 +       return coa_overrun(inode_compression_plugin(inode), ilen); /* added to the stream size by grab_tfm_stream() */
30766 +}
30767 +
30768 +/* Estimating compressibility of a logical cluster by various
30769 +   policies represented by the compression and compression mode plugins.
30770 +   If this returns false, then the compressor won't be called for
30771 +   the cluster of index @index. A missing plugin method counts as "yes".
30772 +*/
30773 +static int should_compress(struct tfm_cluster * tc, cloff_t index,
30774 +                          struct inode *inode)
30775 +{
30776 +       compression_plugin *cplug = inode_compression_plugin(inode);
30777 +       compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
30778 +
30779 +       assert("edward-1321", tc->len != 0);
30780 +       assert("edward-1322", cplug != NULL);
30781 +       assert("edward-1323", mplug != NULL);
30782 +
30783 +       return /* estimate by size */
30784 +               (cplug->min_size_deflate ?
30785 +                tc->len >= cplug->min_size_deflate() :
30786 +                1) &&
30787 +               /* estimate by compression mode plugin */
30788 +               (mplug->should_deflate ?
30789 +                mplug->should_deflate(inode, index) :
30790 +                1);
30791 +}
30792 +
30793 +/* Evaluating results of compression transform.
30794 +   Returns true if compressed size plus checksum and cipher overheads is still smaller than @size_before */
30795 +static int save_compressed(int size_before, int size_after, struct inode *inode)
30796 +{
30797 +       return (size_after + deflate_overhead(inode) +
30798 +               max_cipher_overhead(inode) < size_before);
30799 +}
30800 +
30801 +/* Guess the result of save_compressed(): true if tc->len is smaller than the (scaled, if @encrypted) logical size */
30802 +static int need_inflate(struct cluster_handle * ch, struct inode * inode,
30803 +                       int encrypted /* is cluster encrypted */ )
30804 +{
30805 +       struct tfm_cluster * tc = &ch->tc;
30806 +
30807 +       assert("edward-142", tc != 0);
30808 +       assert("edward-143", inode != NULL);
30809 +
30810 +       return tc->len <
30811 +           (encrypted ?
30812 +            inode_scaled_offset(inode, tc->lsize) :
30813 +            tc->lsize);
30814 +}
30815 +
30816 +/* If results of compression were accepted, then we add
30817 +   a checksum to catch possible disk cluster corruption.
30818 +   The following is a format of the data stored in disk clusters:
30819 +
30820 +                  data                   This is (transformed) logical cluster.
30821 +                  cipher_overhead        This is created by ->align_stream()
30822 +                                          of cipher plugin. May be absent.
30823 +                  checksum          (4)  This is created by ->checksum method
30824 +                                          of compression plugin to check
30825 +                                          integrity. May be absent.
30826 +
30827 +                  Crypto overhead format:
30828 +
30829 +                  data
30830 +                  control_byte      (1)   contains aligned overhead size:
30831 +                                          1 <= overhead <= cipher_blksize
30832 +*/
30833 +/* Append a little-endian 32-bit checksum at the end of the output stream and grow tc->len accordingly */
30834 +static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
30835 +{
30836 +       __u32 checksum;
30837 +
30838 +       assert("edward-1309", tc != NULL);
30839 +       assert("edward-1310", tc->len > 0);
30840 +       assert("edward-1311", cplug->checksum != NULL);
30841 +
30842 +       checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len); /* over the first tc->len bytes */
30843 +       put_unaligned(cpu_to_le32(checksum),
30844 +                (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len)); /* stored right after the data */
30845 +       tc->len += (int)DC_CHECKSUM_SIZE;
30846 +}
30847 +
30848 +/* Check a disk cluster checksum stored in the last DC_CHECKSUM_SIZE bytes of the input stream.
30849 +   Returns 0 if checksum is correct (and strips it from tc->len), otherwise returns 1 */
30850 +static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
30851 +{
30852 +       assert("edward-1312", tc != NULL);
30853 +       assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE); /* NOTE(review): length is checked only by this assert */
30854 +       assert("edward-1314", cplug->checksum != NULL);
30855 +
30856 +       if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
30857 +                           tc->len - (int)DC_CHECKSUM_SIZE) !=
30858 +           le32_to_cpu(get_unaligned((d32 *)
30859 +                                     (tfm_stream_data(tc, INPUT_STREAM)
30860 +                                      + tc->len - (int)DC_CHECKSUM_SIZE)))) { /* computed vs. stored value */
30861 +               warning("edward-156",
30862 +                       "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
30863 +                       (int)le32_to_cpu
30864 +                       (get_unaligned((d32 *)
30865 +                                      (tfm_stream_data(tc, INPUT_STREAM) +
30866 +                                       tc->len - (int)DC_CHECKSUM_SIZE))), /* stored checksum */
30867 +                       (int)cplug->checksum
30868 +                       (tfm_stream_data(tc, INPUT_STREAM),
30869 +                        tc->len - (int)DC_CHECKSUM_SIZE)); /* recomputed checksum */
30870 +               return 1;
30871 +       }
30872 +       tc->len -= (int)DC_CHECKSUM_SIZE; /* the checksum is no longer part of the stream */
30873 +       return 0;
30874 +}
30875 +
30876 +/* Get input/output stream @id for some transform action; (re)allocate it if absent or too small. Returns 0 or error */
30877 +int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
30878 +                   tfm_stream_id id)
30879 +{
30880 +       size_t size = inode_scaled_cluster_size(inode);
30881 +
30882 +       assert("edward-901", tc != NULL);
30883 +       assert("edward-1027", inode_compression_plugin(inode) != NULL);
30884 +
30885 +       if (cluster_get_tfm_act(tc) == TFMA_WRITE)
30886 +               size += deflate_overrun(inode, inode_cluster_size(inode)); /* extra room for writes */
30887 +
30888 +       if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
30889 +               alternate_streams(tc); /* a missing input stream: try the streams swapped first */
30890 +       if (!get_tfm_stream(tc, id))
30891 +               return alloc_tfm_stream(tc, size, id);
30892 +
30893 +       assert("edward-902", tfm_stream_is_set(tc, id));
30894 +
30895 +       if (tfm_stream_size(tc, id) < size)
30896 +               return realloc_tfm_stream(tc, size, id);
30897 +       return 0;
30898 +}
30899 +
30900 +/* Common deflate manager: optionally compress, then optionally encrypt, then checksum the cluster's tfm stream */
30901 +int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
30902 +{
30903 +       int result = 0;
30904 +       int compressed = 0; /* set when the compression result was accepted */
30905 +       int encrypted = 0; /* set when the cipher step succeeded */
30906 +       struct tfm_cluster * tc = &clust->tc;
30907 +       compression_plugin * coplug;
30908 +
30909 +       assert("edward-401", inode != NULL);
30910 +       assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
30911 +       assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
30912 +       assert("edward-498", !tfm_cluster_is_uptodate(tc));
30913 +
30914 +       coplug = inode_compression_plugin(inode);
30915 +       if (should_compress(tc, clust->index, inode)) {
30916 +               /* try to compress, discard bad results */
30917 +               __u32 dst_len;
30918 +               compression_mode_plugin * mplug =
30919 +                       inode_compression_mode_plugin(inode);
30920 +               assert("edward-602", coplug != NULL);
30921 +               assert("edward-1423", coplug->compress != NULL);
30922 +
30923 +               result = grab_coa(tc, coplug);
30924 +               if (result) {
30925 +                   warning("edward-1424",
30926 +                           "alloc_coa failed with ret=%d, skipped compression",
30927 +                           result);
30928 +                   goto cipher; /* NOTE(review): @result stays non-zero and is returned unless the cipher path overwrites it */
30929 +               }
30930 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
30931 +               if (result) {
30932 +                   warning("edward-1425",
30933 +                        "alloc stream failed with ret=%d, skipped compression",
30934 +                           result);
30935 +                   goto cipher; /* NOTE(review): same as above, the error survives to the final return */
30936 +               }
30937 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM); /* in: output capacity; out: compressed length */
30938 +               coplug->compress(get_coa(tc, coplug->h.id, tc->act),
30939 +                                tfm_input_data(clust), tc->len,
30940 +                                tfm_output_data(clust), &dst_len);
30941 +               /* make sure we didn't overwrite extra bytes */
30942 +               assert("edward-603",
30943 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
30944 +
30945 +               /* evaluate results of compression transform */
30946 +               if (save_compressed(tc->len, dst_len, inode)) {
30947 +                       /* good result, accept */
30948 +                       tc->len = dst_len;
30949 +                       if (mplug->accept_hook != NULL) {
30950 +                              result = mplug->accept_hook(inode, clust->index);
30951 +                              if (result)
30952 +                                      warning("edward-1426",
30953 +                                              "accept_hook failed with ret=%d",
30954 +                                              result);
30955 +                       }
30956 +                       compressed = 1;
30957 +               }
30958 +               else {
30959 +                       /* bad result, discard */
30960 +#if 0
30961 +                       if (cluster_is_complete(clust, inode))
30962 +                             warning("edward-1496",
30963 +                                     "incompressible cluster %lu (inode %llu)",
30964 +                                     clust->index,
30965 +                                     (unsigned long long)get_inode_oid(inode));
30966 +#endif
30967 +                       if (mplug->discard_hook != NULL &&
30968 +                           cluster_is_complete(clust, inode)) { /* the hook runs for complete clusters only */
30969 +                               result = mplug->discard_hook(inode,
30970 +                                                            clust->index);
30971 +                               if (result)
30972 +                                     warning("edward-1427",
30973 +                                             "discard_hook failed with ret=%d",
30974 +                                             result);
30975 +                       }
30976 +               }
30977 +       }
30978 + cipher:
30979 +       if (need_cipher(inode)) {
30980 +               cipher_plugin * ciplug;
30981 +               struct blkcipher_desc desc;
30982 +               struct scatterlist src;
30983 +               struct scatterlist dst;
30984 +
30985 +               ciplug = inode_cipher_plugin(inode); /* NOTE(review): ciplug is not used after this assignment */
30986 +               desc.tfm = info_get_cipher(inode_crypto_info(inode));
30987 +               desc.flags = 0;
30988 +               if (compressed)
30989 +                       alternate_streams(tc); /* compressed output becomes the cipher input */
30990 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
30991 +               if (result)
30992 +                       return result;
30993 +
30994 +               align_or_cut_overhead(inode, clust, WRITE_OP); /* may grow tc->len before encryption */
30995 +               sg_init_one(&src, tfm_input_data(clust), tc->len);
30996 +               sg_init_one(&dst, tfm_output_data(clust), tc->len);
30997 +
30998 +               result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
30999 +               if (result) {
31000 +                       warning("edward-1405",
31001 +                               "encryption failed flags=%x\n", desc.flags);
31002 +                       return result;
31003 +               }
31004 +               encrypted = 1;
31005 +       }
31006 +       if (compressed && coplug->checksum != NULL)
31007 +               dc_set_checksum(coplug, tc); /* checksum is appended only to accepted compressed data */
31008 +       if (!compressed && !encrypted)
31009 +               alternate_streams(tc); /* nothing was transformed: swap streams instead */
31010 +       return result;
31011 +}
31012 +
31013 +/* Common inflate manager: verify checksum, then decrypt, then decompress the cluster's tfm stream as needed */
31014 +int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
31015 +{
31016 +       int result = 0;
31017 +       int transformed = 0; /* set once decryption or decompression has been done */
31018 +       struct tfm_cluster * tc = &clust->tc;
31019 +       compression_plugin * coplug;
31020 +
31021 +       assert("edward-905", inode != NULL);
31022 +       assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
31023 +       assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
31024 +       assert("edward-1349", tc->act == TFMA_READ);
31025 +       assert("edward-907", !tfm_cluster_is_uptodate(tc));
31026 +
31027 +       /* Handle a checksum (if any) */
31028 +       coplug = inode_compression_plugin(inode);
31029 +       if (need_inflate(clust, inode, need_cipher(inode)) &&
31030 +           coplug->checksum != NULL) {
31031 +               result = dc_check_checksum(coplug, tc); /* strips the checksum from tc->len on success */
31032 +               if (unlikely(result)) {
31033 +                       warning("edward-1460",
31034 +                               "Inode %llu: disk cluster %lu looks corrupted",
31035 +                               (unsigned long long)get_inode_oid(inode),
31036 +                               clust->index);
31037 +                       return RETERR(-EIO);
31038 +               }
31039 +       }
31040 +       if (need_cipher(inode)) {
31041 +               cipher_plugin * ciplug;
31042 +               struct blkcipher_desc desc;
31043 +               struct scatterlist src;
31044 +               struct scatterlist dst;
31045 +
31046 +               ciplug = inode_cipher_plugin(inode); /* NOTE(review): ciplug is not used after this assignment */
31047 +               desc.tfm = info_get_cipher(inode_crypto_info(inode));
31048 +               desc.flags = 0;
31049 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31050 +               if (result)
31051 +                       return result;
31052 +               assert("edward-909", tfm_cluster_is_set(tc));
31053 +
31054 +               sg_init_one(&src, tfm_input_data(clust), tc->len);
31055 +               sg_init_one(&dst, tfm_output_data(clust), tc->len);
31056 +
31057 +               result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
31058 +               if (result) {
31059 +                       warning("edward-1600", "decrypt failed flags=%x\n",
31060 +                               desc.flags);
31061 +                       return result;
31062 +               }
31063 +               align_or_cut_overhead(inode, clust, READ_OP); /* may shrink tc->len after decryption */
31064 +               transformed = 1;
31065 +       }
31066 +       if (need_inflate(clust, inode, 0)) {
31067 +               unsigned dst_len = inode_cluster_size(inode); /* in: output capacity; out: plain length */
31068 +               if(transformed)
31069 +                       alternate_streams(tc); /* decrypted output becomes the decompressor input */
31070 +
31071 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
31072 +               if (result)
31073 +                       return result;
31074 +               assert("edward-1305", coplug->decompress != NULL);
31075 +               assert("edward-910", tfm_cluster_is_set(tc));
31076 +
31077 +               coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
31078 +                                  tfm_input_data(clust), tc->len,
31079 +                                  tfm_output_data(clust), &dst_len);
31080 +               /* check length */
31081 +               tc->len = dst_len;
31082 +               assert("edward-157", dst_len == tc->lsize);
31083 +               transformed = 1;
31084 +       }
31085 +       if (!transformed)
31086 +               alternate_streams(tc); /* nothing was transformed: swap streams instead */
31087 +       return result;
31088 +}
31089 +
31090 +/* This is the implementation of the readpage method of struct
31091 +   address_space_operations for cryptcompress plugin; it delegates to the CTAIL item plugin's readpage. */
31092 +int readpage_cryptcompress(struct file *file, struct page *page)
31093 +{
31094 +       reiser4_context *ctx;
31095 +       struct cluster_handle clust;
31096 +       item_plugin *iplug;
31097 +       int result;
31098 +
31099 +       assert("edward-88", PageLocked(page));
31100 +       assert("vs-976", !PageUptodate(page));
31101 +       assert("edward-89", page->mapping && page->mapping->host);
31102 +
31103 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
31104 +       if (IS_ERR(ctx)) {
31105 +               unlock_page(page); /* the early-exit paths in this function unlock the page themselves */
31106 +               return PTR_ERR(ctx);
31107 +       }
31108 +       assert("edward-113",
31109 +              ergo(file != NULL,
31110 +                   page->mapping == file->f_dentry->d_inode->i_mapping));
31111 +
31112 +       if (PageUptodate(page)) { /* nothing to read */
31113 +               warning("edward-1338", "page is already uptodate\n");
31114 +               unlock_page(page);
31115 +               reiser4_exit_context(ctx);
31116 +               return 0;
31117 +       }
31118 +       cluster_init_read(&clust, NULL);
31119 +       clust.file = file;
31120 +       iplug = item_plugin_by_id(CTAIL_ID);
31121 +       if (!iplug->s.file.readpage) { /* the item plugin provides no readpage method */
31122 +               unlock_page(page);
31123 +               put_cluster_handle(&clust);
31124 +               reiser4_exit_context(ctx);
31125 +               return -EINVAL;
31126 +       }
31127 +       result = iplug->s.file.readpage(&clust, page); /* presumably unlocks @page itself — TODO confirm */
31128 +
31129 +       put_cluster_handle(&clust);
31130 +       reiser4_txn_restart(ctx);
31131 +       reiser4_exit_context(ctx);
31132 +       return result;
31133 +}
31134 +
31135 +/* Number of pages to check in, depending on the page cluster operation clust->op */
31136 +static int get_new_nrpages(struct cluster_handle * clust)
31137 +{
31138 +       switch (clust->op) {
31139 +       case LC_APPOV:
31140 +               return clust->nr_pages; /* all pages of the page cluster */
31141 +       case LC_TRUNC:
31142 +               assert("edward-1179", clust->win != NULL);
31143 +               return size_in_pages(clust->win->off + clust->win->count); /* pages up to the end of the window */
31144 +       default:
31145 +               impossible("edward-1180", "bad page cluster option");
31146 +               return 0;
31147 +       }
31148 +}
31149 +
31150 +static void set_cluster_pages_dirty(struct cluster_handle * clust, /* dirty the first get_new_nrpages() pages */
31151 +                                   struct inode * inode)
31152 +{
31153 +       int i;
31154 +       struct page *pg;
31155 +       int nrpages = get_new_nrpages(clust);
31156 +
31157 +       for (i = 0; i < nrpages; i++) {
31158 +
31159 +               pg = clust->pages[i];
31160 +               assert("edward-968", pg != NULL);
31161 +               lock_page(pg); /* the page is marked dirty under its page lock */
31162 +               assert("edward-1065", PageUptodate(pg));
31163 +               reiser4_set_page_dirty_internal(pg);
31164 +               unlock_page(pg);
31165 +               mark_page_accessed(pg);
31166 +       }
31167 +}
31168 +
31169 +/* Grab a page cluster for read/write operations.
31170 +   Attach a jnode for write operations (when preparing for modifications, which
31171 +   are supposed to be committed). Returns 0 on success (clust->node is set to
31172 +   the jnode, or NULL for reads), otherwise an error with all pages released.
31173 +   We allocate only one jnode per page cluster; this jnode is bound to the
31174 +   first page of this cluster, so we have an extra reference (that will be put
31175 +   as soon as jnode is evicted from memory), other references will be cleaned
31176 +   up in flush time (assume that check in page cluster was successful).
31177 +*/
31178 +int grab_page_cluster(struct inode * inode,
31179 +                     struct cluster_handle * clust, rw_op rw)
31180 +{
31181 +       int i;
31182 +       int result = 0;
31183 +       jnode *node = NULL;
31184 +
31185 +       assert("edward-182", clust != NULL);
31186 +       assert("edward-183", clust->pages != NULL);
31187 +       assert("edward-1466", clust->node == NULL);
31188 +       assert("edward-1428", inode != NULL);
31189 +       assert("edward-1429", inode->i_mapping != NULL);
31190 +       assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
31191 +
31192 +       if (clust->nr_pages == 0)
31193 +               return 0;
31194 +
31195 +       for (i = 0; i < clust->nr_pages; i++) {
31196 +               assert("edward-1044", clust->pages[i] == NULL);
31197 +
31198 +               clust->pages[i] =
31199 +                      find_or_create_page(inode->i_mapping,
31200 +                                          clust_to_pg(clust->index, inode) + i,
31201 +                                          reiser4_ctx_gfp_mask_get());
31202 +               if (!clust->pages[i]) {
31203 +                       result = RETERR(-ENOMEM);
31204 +                       break;
31205 +               }
31206 +               if (i == 0 && rw == WRITE_OP) {
31207 +                       node = jnode_of_page(clust->pages[i]);
31208 +                       if (IS_ERR(node)) {
31209 +                               result = PTR_ERR(node);
31210 +                               unlock_page(clust->pages[i]);
31211 +                               put_cluster_page(clust->pages[i]); /* cleanup below stops before index i */
31212 +                               break;
31213 +                       }
31214 +                       JF_SET(node, JNODE_CLUSTER_PAGE);
31215 +                       assert("edward-920", jprivate(clust->pages[0]));
31216 +               }
31217 +               INODE_PGCOUNT_INC(inode);
31218 +               unlock_page(clust->pages[i]);
31219 +       }
31220 +       if (unlikely(result)) {
31221 +               while (i) { /* release pages 0..i-1, which were fully set up */
31222 +                       put_cluster_page(clust->pages[--i]);
31223 +                       INODE_PGCOUNT_DEC(inode);
31224 +               }
31225 +               if (node && !IS_ERR(node))
31226 +                       jput(node);
31227 +               return result;
31228 +       }
31229 +       clust->node = node;
31230 +       return 0;
31231 +}
31232 +
31233 +static void truncate_page_cluster_range(struct inode * inode, /* invalidate @count pages of cluster @index */
31234 +                                       struct page ** pages, /* not used in this function */
31235 +                                       cloff_t index,
31236 +                                       int from, int count, /* @from is a page offset within the cluster */
31237 +                                       int even_cows) /* passed through to reiser4_invalidate_pages() */
31238 +{
31239 +       assert("edward-1467", count > 0);
31240 +       reiser4_invalidate_pages(inode->i_mapping,
31241 +                                clust_to_pg(index, inode) + from,
31242 +                                count, even_cows);
31243 +}
31244 +
31245 +/* Put @count pages of @pages starting from index @from, decrementing the inode page count for each */
31246 +void __put_page_cluster(int from, int count,
31247 +                       struct page ** pages, struct inode  * inode)
31248 +{
31249 +       int i;
31250 +       assert("edward-1468", pages != NULL);
31251 +       assert("edward-1469", inode != NULL);
31252 +       assert("edward-1470", from >= 0 && count >= 0);
31253 +
31254 +       for (i = 0; i < count; i++) {
31255 +               assert("edward-1471", pages[from + i] != NULL);
31256 +               assert("edward-1472",
31257 +                      pages[from + i]->index == pages[from]->index + i); /* pages must be consecutive */
31258 +
31259 +               put_cluster_page(pages[from + i]);
31260 +               INODE_PGCOUNT_DEC(inode);
31261 +       }
31262 +}
31263 +
31264 +/*
31265 + * This is dual to grab_page_cluster,
31266 + * however if @rw == WRITE_OP, then we call this function
31267 + * only if something failed before checking in the page cluster.
31268 + */
31269 +void put_page_cluster(struct cluster_handle * clust,
31270 +                     struct inode * inode, rw_op rw)
31271 +{
31272 +       assert("edward-445", clust != NULL);
31273 +       assert("edward-922", clust->pages != NULL);
31274 +       assert("edward-446",
31275 +              ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
31276 +
31277 +       __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
31278 +       if (rw == WRITE_OP) {
31279 +               if (unlikely(clust->node)) { /* drop the jnode reference kept in the handle */
31280 +                       assert("edward-447",
31281 +                              clust->node == jprivate(clust->pages[0])); /* NOTE(review): reads pages[0] after it was put above */
31282 +                       jput(clust->node);
31283 +                       clust->node = NULL;
31284 +               }
31285 +       }
31286 +}
31287 +
31288 +#if REISER4_DEBUG
31289 +int cryptcompress_inode_ok(struct inode *inode) /* debug check: returns 1 if @inode looks sane, else 0 */
31290 +{
31291 +       if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE))) /* PSET_FILE bit must be set */
31292 +               return 0;
31293 +       if (!cluster_shift_ok(inode_cluster_shift(inode))) /* cluster shift must be acceptable */
31294 +               return 0;
31295 +       return 1;
31296 +}
31297 +
31298 +static int window_ok(struct reiser4_slide * win, struct inode *inode) /* debug check of window bounds */
31299 +{
31300 +       assert("edward-1115", win != NULL);
31301 +       assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW)); /* non-zero delta implies a hole window */
31302 +
31303 +       return (win->off != inode_cluster_size(inode)) &&
31304 +           (win->off + win->count + win->delta <= inode_cluster_size(inode)); /* window fits in one cluster */
31305 +}
31306 +
31307 +static int cluster_ok(struct cluster_handle * clust, struct inode *inode) /* debug check of a cluster handle */
31308 +{
31309 +       assert("edward-279", clust != NULL);
31310 +
31311 +       if (!clust->pages) /* a handle without a pages array is not ok */
31312 +               return 0;
31313 +       return (clust->win ? window_ok(clust->win, inode) : 1); /* the window is optional */
31314 +}
31315 +#if 0 /* disabled variant: looks for one page in the mapping from @start */
31316 +static int pages_truncate_ok(struct inode *inode, pgoff_t start)
31317 +{
31318 +       int found;
31319 +       struct page * page;
31320 +
31321 +       found = find_get_pages(inode->i_mapping, start, 1, &page);
31322 +       if (found)
31323 +               put_cluster_page(page);
31324 +       return !found;
31325 +}
31326 +#else /* active variant: the check is stubbed out and always yields 1 */
31327 +#define pages_truncate_ok(inode, start) 1
31328 +#endif /* 0 */
31329 +
31330 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
31331 +{
31332 +       /* ok (1) only if no jnode exists at the page index of @index */
31333 +       jnode *found = jlookup(current_tree, get_inode_oid(inode),
31334 +                              clust_to_pg(index, inode));
31335 +       if (likely(found == NULL))
31336 +               return 1;
31337 +       jput(found);
31338 +       return 0;
31339 +}
31340 +
31341 +static int find_fake_appended(struct inode *inode, cloff_t * index);
31342 +
31343 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
31344 +{
31345 +       cloff_t real_aidx;
31346 +       /* ok only if the lookup succeeds and reports the expected @aidx */
31347 +       if (find_fake_appended(inode, &real_aidx) != 0)
31348 +               return 0;
31349 +       return aidx == real_aidx;
31350 +}
31351 +#endif
31352 +
31353 +/* Guess next window stat: a hole window with no delta stays a hole */
31354 +static inline window_stat next_window_stat(struct reiser4_slide * win)
31355 +{
31356 +       assert("edward-1130", win != NULL);
31357 +       return ((win->stat != HOLE_WINDOW || win->delta != 0) ?
31358 +               DATA_WINDOW : HOLE_WINDOW);
31359 +}
31360 +
31361 +/* guess and set next cluster index and window params (no-op if no window) */
31362 +static void move_update_window(struct inode * inode,
31363 +                              struct cluster_handle * clust,
31364 +                              loff_t file_off, loff_t to_file)
31365 +{
31366 +       struct reiser4_slide * win;
31367 +
31368 +       assert("edward-185", clust != NULL);
31369 +       assert("edward-438", clust->pages != NULL);
31370 +       assert("edward-281", cluster_ok(clust, inode));
31371 +
31372 +       win = clust->win;
31373 +       if (!win)
31374 +               return;
31375 +
31376 +       switch (win->stat) {
31377 +       case DATA_WINDOW:
31378 +               /* increment: go to the start of the next cluster */
31379 +               clust->index++;
31380 +               win->stat = DATA_WINDOW;
31381 +               win->off = 0;
31382 +               win->count = min((loff_t)inode_cluster_size(inode), to_file);
31383 +               break;
31384 +       case HOLE_WINDOW:
31385 +               switch (next_window_stat(win)) {
31386 +               case HOLE_WINDOW:
31387 +                       /* skip: jump to the cluster derived from @file_off */
31388 +                       clust->index = off_to_clust(file_off, inode);
31389 +                       win->stat = HOLE_WINDOW;
31390 +                       win->off = 0;
31391 +                       win->count = off_to_cloff(file_off, inode);
31392 +                       win->delta = min((loff_t)(inode_cluster_size(inode) -
31393 +                                                 win->count), to_file);
31394 +                       break;
31395 +               case DATA_WINDOW:
31396 +                       /* stay: same cluster, the delta part becomes the data */
31397 +                       win->stat = DATA_WINDOW;
31398 +                       /* off+count+delta=inv: the sum is left unchanged */
31399 +                       win->off = win->off + win->count;
31400 +                       win->count = win->delta;
31401 +                       win->delta = 0;
31402 +                       break;
31403 +               default:
31404 +                       impossible("edward-282", "wrong next window state");
31405 +               }
31406 +               break;
31407 +       default:
31408 +               impossible("edward-283", "wrong current window state");
31409 +       }
31410 +       assert("edward-1068", cluster_ok(clust, inode));
31411 +}
31412 +
31413 +/* Grab space for a stat data update, stamp ctime/mtime, update stat data */
31414 +static int update_sd_cryptcompress(struct inode *inode)
31415 +{
31416 +       int ret;
31417 +
31418 +       assert("edward-978", reiser4_schedulable());
31419 +
31420 +       /* one for stat data update */
31421 +       ret = reiser4_grab_space_force(estimate_update_common(inode),
31422 +                                      BA_CAN_COMMIT);
31423 +       if (ret != 0)
31424 +               return ret;
31425 +       inode->i_mtime = CURRENT_TIME;
31426 +       inode->i_ctime = inode->i_mtime;
31427 +       return reiser4_update_sd(inode);
31428 +}
31429 +
31430 +static void uncapture_cluster_jnode(jnode * node)
31431 +{
31432 +       txn_atom *atom;
31433 +       /* Uncapture @node from its atom, if any; caller holds the jnode guard */
31434 +       assert_spin_locked(&(node->guard));
31435 +
31436 +       atom = jnode_get_atom(node);
31437 +       if (atom == NULL) {
31438 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
31439 +               spin_unlock_jnode(node); /* no atom: only drop the guard */
31440 +               return;
31441 +       }
31442 +       reiser4_uncapture_block(node); /* NOTE(review): presumably unlocks @node */
31443 +       spin_unlock_atom(atom);
31444 +       jput(node);
31445 +}
31446 +
31447 +static void put_found_pages(struct page **pages, int nr)
31448 +{
31449 +       int i; /* index into @pages, runs over 0 .. @nr - 1 */
31450 +       for (i = 0; i < nr; i++) {
31451 +               assert("edward-1045", pages[i] != NULL);
31452 +               put_cluster_page(pages[i]); /* one put per array entry */
31453 +       }
31454 +}
31455 +
31456 +/*             Lifecycle of a logical cluster in the system.
31457 + *
31458 + *
31459 + * Logical cluster of a cryptcompress file is represented in the system by
31460 + * . page cluster (in memory, primary cache, contains plain text);
31461 + * . disk cluster (in memory, secondary cache, contains transformed text).
31462 + * Primary cache is to reduce number of transform operations (compression,
31463 + * encryption), i.e. to implement transform-caching strategy.
31464 + * Secondary cache is to reduce number of I/O operations, i.e. for usual
31465 + * write-caching strategy. Page cluster is a set of pages, i.e. mapping of
31466 + * a logical cluster to the primary cache. Disk cluster is a set of items
31467 + * of the same type defined by some reiser4 item plugin id.
31468 + *
31469 + *              1. Performing modifications
31470 + *
31471 + * Every modification of a cryptcompress file is considered as a set of
31472 + * operations performed on file's logical clusters. Every such "atomic"
31473 + * modification is a truncate, append and (or) overwrite of some bytes of a
31474 + * logical cluster, performed in the primary cache and followed by
31475 + * synchronization with the secondary cache (at flush time). Disk clusters,
31476 + * which live in the secondary cache, are supposed to be synchronized with
31477 + * disk. The mechanism of synchronization of primary and secondary caches
31478 + * includes so-called checkin/checkout technique described below.
31479 + *
31480 + *              2. Submitting modifications
31481 + *
31482 + * Each page cluster has associated jnode (a special in-memory header to
31483 + * keep a track of transactions in reiser4), which is attached to its first
31484 + * page when grabbing page cluster for modifications (see grab_page_cluster).
31485 + * Submitting modifications (see checkin_logical_cluster) is going per logical
31486 + * cluster and includes:
31487 + * . checkin_cluster_size;
31488 + * . checkin_page_cluster.
31489 + * checkin_cluster_size() is resolved to a file size update, which completely
31490 + * defines the new size of the logical cluster (the number of the file's bytes
31491 + * in the logical cluster).
31492 + * checkin_page_cluster() captures jnode of a page cluster and installs
31493 + * jnode's dirty flag (if needed) to indicate that modifications are
31494 + * successfully checked in.
31495 + *
31496 + *              3. Checking out modifications
31497 + *
31498 + * Performed per logical cluster at flush time (see checkout_logical_cluster).
31499 + * This is the time of synchronizing primary and secondary caches.
31500 + * checkout_logical_cluster() includes:
31501 + * . checkout_page_cluster (retrieving checked in pages).
31502 + * . uncapture jnode (including clear dirty flag and unlock)
31503 + *
31504 + *              4. Committing modifications
31505 + *
31506 + * Performing the synchronization of primary and secondary caches. When checking
31507 + * out page cluster (the phase above) pages are locked/flushed/unlocked
31508 + * one-by-one in ascending order of their indexes to contiguous stream, which
31509 + * is supposed to be transformed (compressed, encrypted), chopped up into items
31510 + * and committed to disk as a disk cluster.
31511 + *
31512 + *              5. Managing page references
31513 + *
31514 + * Every checked in page has a special additional "control" reference,
31515 + * which is dropped at checkout. We need this to avoid unexpected eviction
31516 + * of pages from memory before checkout. Control references are managed so
31517 + * that they do not accumulate with every checkin:
31518 + *
31519 + *            0
31520 + * checkin -> 1
31521 + *            0 -> checkout
31522 + * checkin -> 1
31523 + * checkin -> 1
31524 + * checkin -> 1
31525 + *            0 -> checkout
31526 + *           ...
31527 + *
31528 + * Every page cluster has its own unique "cluster lock". Update/drop
31529 + * references are serialized via this lock. Number of checked in cluster
31530 + * pages is calculated by i_size under cluster lock. File size is updated
31531 + * at every checkin action also under cluster lock (except cases of
31532 + * appending/truncating fake logical clusters).
31533 + *
31534 + * Proof of correctness:
31535 + *
31536 + * Since we update file size under cluster lock, in the case of non-fake
31537 + * logical cluster with its lock held we do have expected number of checked
31538 + * in pages. On the other hand, append/truncate of fake logical clusters
31539 + * doesn't change number of checked in pages of any cluster.
31540 + *
31541 + * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
31542 + * Currently, I don't see any reason to create a special lock for those
31543 + * needs.
31544 + */
31545 +
31546 +static inline void lock_cluster(jnode * node)
31547 +{
31548 +       spin_lock_jnode(node); /* the jnode spinlock serves as the cluster lock */
31549 +}
31550 +
31551 +static inline void unlock_cluster(jnode * node)
31552 +{
31553 +       spin_unlock_jnode(node); /* releases what lock_cluster() acquired */
31554 +}
31555 +
31556 +static inline void unlock_cluster_uncapture(jnode * node)
31557 +{
31558 +       uncapture_cluster_jnode(node); /* expects the cluster lock to be held */
31559 +}
31560 +
31561 +/*
31562 + * Set new file size by window. Cluster lock is required.
31563 + * An append/overwrite that ends within the current size changes nothing.
31564 + */
31565 +static void checkin_file_size(struct cluster_handle * clust,
31566 +                             struct inode * inode)
31567 +{
31568 +       loff_t new_size;
31569 +       struct reiser4_slide * win;
31570 +
31571 +       assert("edward-1181", clust != NULL);
31572 +       assert("edward-1182", inode != NULL);
31573 +       assert("edward-1473", clust->pages != NULL);
31574 +       assert("edward-1474", clust->pages[0] != NULL);
31575 +       assert("edward-1475", jprivate(clust->pages[0]) != NULL);
31576 +       assert_spin_locked(&(jprivate(clust->pages[0])->guard));
31577 +
31578 +       win = clust->win;
31579 +       assert("edward-1183", win != NULL);
31580 +
31581 +       /* start from the file offset at which the window begins */
31582 +       new_size = clust_to_off(clust->index, inode) + win->off;
31583 +
31584 +       if (clust->op == LC_APPOV) {
31585 +               /* candidate size is the file offset where the window ends */
31586 +               new_size += win->count;
31587 +               if (new_size <= i_size_read(inode))
31588 +                       /* overwrite only */
31589 +                       return;
31590 +       } else if (clust->op != LC_TRUNC) {
31591 +               impossible("edward-1184", "bad page cluster option");
31592 +       }
31593 +
31594 +       /* LC_TRUNC keeps the window start as the new size */
31595 +       inode_check_scale_nolock(inode, i_size_read(inode), new_size);
31596 +       i_size_write(inode, new_size);
31597 +}
31598 +
31599 +static inline void checkin_cluster_size(struct cluster_handle * clust,
31600 +                                       struct inode * inode)
31601 +{
31602 +       if (clust->win) /* without a window the file size is left untouched */
31603 +               checkin_file_size(clust, inode);
31604 +}
31605 +
31606 +static int checkin_page_cluster(struct cluster_handle * clust,
31607 +                               struct inode * inode)
31608 +{
31609 +       int result;
31610 +       jnode * node;
31611 +       int old_nrpages = clust->old_nrpages;
31612 +       int new_nrpages = get_new_nrpages(clust);
31613 +       /* entered with the cluster lock held; every path below unlocks it */
31614 +       node = clust->node;
31615 +
31616 +       assert("edward-221", node != NULL);
31617 +       assert("edward-971", clust->reserved == 1);
31618 +       assert("edward-1263",
31619 +              clust->reserved_prepped == estimate_update_cluster(inode));
31620 +       assert("edward-1264", clust->reserved_unprepped == 0);
31621 +
31622 +       if (JF_ISSET(node, JNODE_DIRTY)) {
31623 +               /*
31624 +                * jnode is already dirty: page cluster was checked in, but
31625 +                * not yet checked out, so release related resources
31626 +                */
31627 +               free_reserved4cluster(inode, clust,
31628 +                                     estimate_update_cluster(inode));
31629 +               __put_page_cluster(0, clust->old_nrpages,
31630 +                                  clust->pages, inode);
31631 +       } else {
31632 +               result = capture_cluster_jnode(node);
31633 +               if (unlikely(result)) {
31634 +                       unlock_cluster(node);
31635 +                       return result;
31636 +               }
31637 +               jnode_make_dirty_locked(node);
31638 +               clust->reserved = 0;
31639 +       }
31640 +       unlock_cluster(node);
31641 +       /* the rest runs without the cluster lock */
31642 +       if (new_nrpages < old_nrpages) {
31643 +               /* truncate >= 1 complete pages: put them, truncate the range */
31644 +               __put_page_cluster(new_nrpages,
31645 +                                  old_nrpages - new_nrpages,
31646 +                                  clust->pages, inode);
31647 +               truncate_page_cluster_range(inode,
31648 +                                           clust->pages, clust->index,
31649 +                                           new_nrpages,
31650 +                                           old_nrpages - new_nrpages,
31651 +                                           0);
31652 +       }
31653 +#if REISER4_DEBUG
31654 +       clust->reserved_prepped -= estimate_update_cluster(inode);
31655 +#endif
31656 +       return 0;
31657 +}
31658 +
31659 +/* Submit modifications of a logical cluster */
31660 +static int checkin_logical_cluster(struct cluster_handle * clust,
31661 +                                  struct inode *inode)
31662 +{
31663 +       int result = 0;
31664 +       jnode * node;
31665 +
31666 +       node = clust->node;
31667 +
31668 +       assert("edward-1035", node != NULL);
31669 +       assert("edward-1029", clust != NULL);
31670 +       assert("edward-1030", clust->reserved == 1);
31671 +       assert("edward-1031", clust->nr_pages != 0);
31672 +       assert("edward-1032", clust->pages != NULL);
31673 +       assert("edward-1033", clust->pages[0] != NULL);
31674 +       assert("edward-1446", jnode_is_cluster_page(node));
31675 +       assert("edward-1476", node == jprivate(clust->pages[0]));
31676 +
31677 +       lock_cluster(node);
31678 +       checkin_cluster_size(clust, inode);
31679 +       /* this will unlock cluster */
31680 +       result = checkin_page_cluster(clust, inode);
31681 +       jput(node);
31682 +       clust->node = NULL;
31683 +       return result;
31684 +}
31685 +
31686 +/*
31687 + * Retrieve size of logical cluster that was checked in at the latest
31688 + * modifying session into tc->len; it must be non-zero (lock is required)
31689 + */
31690 +static inline void checkout_cluster_size(struct cluster_handle * clust,
31691 +                                        struct inode * inode)
31692 +{
31693 +       struct tfm_cluster *tc = &clust->tc;
31694 +       /* NOTE(review): checkout_page_cluster() calls this unlocked - confirm */
31695 +       tc->len = lbytes(clust->index, inode);
31696 +       assert("edward-1478", tc->len != 0);
31697 +}
31698 +
31699 +/*
31700 + * Retrieve a page cluster with the latest submitted modifications and copy
31701 + * its pages to the previously allocated input stream; drops the cluster lock.
31702 + */
31703 +static void checkout_page_cluster(struct cluster_handle * clust,
31704 +                                 jnode * node, struct inode * inode)
31705 +{
31706 +       int i;
31707 +       int found;
31708 +       int to_put;
31709 +       struct tfm_cluster *tc = &clust->tc;
31710 +
31711 +       /* find and put checked in pages: cluster is locked,
31712 +        * so we must get expected number (to_put) of pages
31713 +        */
31714 +       to_put = size_in_pages(lbytes(clust->index, inode));
31715 +       found = find_get_pages(inode->i_mapping,
31716 +                              clust_to_pg(clust->index, inode),
31717 +                              to_put, clust->pages);
31718 +       BUG_ON(found != to_put);
31719 +       /* put the checked in pages first, then uncapture and unlock */
31720 +       __put_page_cluster(0, to_put, clust->pages, inode);
31721 +       unlock_cluster_uncapture(node);
31722 +
31723 +       /* Flush found pages.
31724 +        *
31725 +        * Note, that we don't disable modifications while flushing,
31726 +        * moreover, some found pages can be truncated, as we have
31727 +        * released cluster lock.
31728 +        */
31729 +       for (i = 0; i < found; i++) {
31730 +               int in_page;
31731 +               char * data;
31732 +               assert("edward-1479",
31733 +                      clust->pages[i]->index == clust->pages[0]->index + i);
31734 +
31735 +               lock_page(clust->pages[i]);
31736 +               if (!PageUptodate(clust->pages[i])) {
31737 +                       /* page was truncated: stop copying here */
31738 +                       assert("edward-1480",
31739 +                              i_size_read(inode) <= page_offset(clust->pages[i]));
31740 +                       assert("edward-1481",
31741 +                              clust->pages[i]->mapping != inode->i_mapping);
31742 +                       unlock_page(clust->pages[i]);
31743 +                       break;
31744 +               }
31745 +               /* Update the number of bytes in the logical cluster,
31746 +                * as it could be partially truncated. Note, that only
31747 +                * partial truncate is possible (complete truncate can
31748 +                * not go here, as it is performed via ->kill_hook()
31749 +                * called by cut_file_items(), and the last one must
31750 +                * wait for znode locked with parent coord).
31751 +                */
31752 +               checkout_cluster_size(clust, inode);
31753 +
31754 +               /* this can be zero, as new file size is
31755 +                  checked in before truncating pages */
31756 +               in_page = __mbp(tc->len, i);
31757 +               /* copy in_page bytes of page i into the input stream */
31758 +               data = kmap(clust->pages[i]);
31759 +               memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
31760 +                      data, in_page);
31761 +               kunmap(clust->pages[i]);
31762 +
31763 +               if (PageDirty(clust->pages[i]))
31764 +                       cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
31765 +
31766 +               unlock_page(clust->pages[i]);
31767 +
31768 +               if (in_page < PAGE_CACHE_SIZE)
31769 +                       /* end of the file */
31770 +                       break;
31771 +       }
31772 +       put_found_pages(clust->pages, found); /* pairs with find_get_pages */
31773 +       tc->lsize = tc->len;
31774 +       return;
31775 +}
31776 +
31777 +/* Check out modifications of a logical cluster; returns 0 or -E_REPEAT */
31778 +int checkout_logical_cluster(struct cluster_handle * clust,
31779 +                            jnode * node, struct inode *inode)
31780 +{
31781 +       int result;
31782 +       struct tfm_cluster *tc = &clust->tc;
31783 +
31784 +       assert("edward-980", node != NULL);
31785 +       assert("edward-236", inode != NULL);
31786 +       assert("edward-237", clust != NULL);
31787 +       assert("edward-240", !clust->win);
31788 +       assert("edward-241", reiser4_schedulable());
31789 +       assert("edward-718", cryptcompress_inode_ok(inode));
31790 +       /* the input stream is grabbed before the cluster lock is taken */
31791 +       result = grab_tfm_stream(inode, tc, INPUT_STREAM);
31792 +       if (result) {
31793 +               warning("edward-1430", "alloc stream failed with ret=%d",
31794 +                       result);
31795 +               return RETERR(-E_REPEAT);
31796 +       }
31797 +       lock_cluster(node);
31798 +       /* the dirty flag is tested under the cluster lock */
31799 +       if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
31800 +               /* race with another flush */
31801 +               warning("edward-982",
31802 +                       "checking out logical cluster %lu of inode %llu: "
31803 +                       "jnode is not dirty", clust->index,
31804 +                       (unsigned long long)get_inode_oid(inode));
31805 +               unlock_cluster(node);
31806 +               return RETERR(-E_REPEAT);
31807 +       }
31808 +       cluster_reserved2grabbed(estimate_update_cluster(inode));
31809 +
31810 +       /* this will unlock cluster */
31811 +       checkout_page_cluster(clust, node, inode);
31812 +       return 0;
31813 +}
31814 +
31815 +/* Set @hint (seal, offset, lock mode) for the cluster of the index @index */
31816 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
31817 +                            cloff_t index, znode_lock_mode mode)
31818 +{
31819 +       reiser4_key key;
31820 +       file_plugin *fplug;
31821 +       assert("edward-722", cryptcompress_inode_ok(inode));
31822 +       assert("edward-723",
31823 +              inode_file_plugin(inode) ==
31824 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31825 +
31826 +       /* key of the file offset at which cluster @index starts */
31827 +       fplug = inode_file_plugin(inode);
31828 +       fplug->key_by_inode(inode, clust_to_off(index, inode), &key);
31829 +       reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
31830 +       hint->offset = get_key_offset(&key);
31831 +       hint->mode = mode;
31832 +}
31833 +
31834 +void invalidate_hint_cluster(struct cluster_handle * clust)
31835 +{
31836 +       assert("edward-1291", clust != NULL);
31837 +       assert("edward-1292", clust->hint != NULL);
31838 +       /* finish the hint's lock handle, then clear the hint's valid state */
31839 +       done_lh(&clust->hint->lh);
31840 +       hint_clr_valid(clust->hint);
31841 +}
31842 +
31843 +static void put_hint_cluster(struct cluster_handle * clust,
31844 +                            struct inode *inode, znode_lock_mode mode)
31845 +{
31846 +       assert("edward-1286", clust != NULL);
31847 +       assert("edward-1287", clust->hint != NULL);
31848 +       /* aim the hint at the next cluster (index + 1), then invalidate it */
31849 +       set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
31850 +       invalidate_hint_cluster(clust);
31851 +}
31852 +
31853 +static int balance_dirty_page_cluster(struct cluster_handle * clust,
31854 +                                     struct inode *inode, loff_t off,
31855 +                                     loff_t to_file,
31856 +                                     int nr_dirtied)
31857 +{
31858 +       int result;
31859 +       struct cryptcompress_info * info;
31860 +
31861 +       assert("edward-724", inode != NULL);
31862 +       assert("edward-725", cryptcompress_inode_ok(inode));
31863 +       assert("edward-1547",
31864 +              nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode));
31865 +
31866 +       /* set next window params */
31867 +       move_update_window(inode, clust, off, to_file);
31868 +       /* stat data is updated before any throttling; errors end it here */
31869 +       result = update_sd_cryptcompress(inode);
31870 +       if (result)
31871 +               return result;
31872 +       assert("edward-726", clust->hint->lh.owner == NULL);
31873 +       info = cryptcompress_inode_data(inode);
31874 +       /* checkin_mutex is dropped around txn restart and dirty throttling */
31875 +       mutex_unlock(&info->checkin_mutex);
31876 +       reiser4_txn_restart_current();
31877 +       balance_dirty_pages_ratelimited_nr(inode->i_mapping, nr_dirtied);
31878 +       mutex_lock(&info->checkin_mutex);
31879 +       return 0;
31880 +}
31881 +
31882 +/* write zeroes to the hole window of the page cluster, process it, and
31883 +   maybe try to capture its pages; returns 0 or an error code */
31884 +static int write_hole(struct inode *inode, struct cluster_handle * clust,
31885 +                     loff_t file_off, loff_t to_file)
31886 +{
31887 +       int result = 0;
31888 +       unsigned cl_off, cl_count = 0;
31889 +       unsigned to_pg, pg_off;
31890 +       struct reiser4_slide * win;
31891 +
31892 +       assert("edward-190", clust != NULL);
31893 +       assert("edward-1069", clust->win != NULL);
31894 +       assert("edward-191", inode != NULL);
31895 +       assert("edward-727", cryptcompress_inode_ok(inode));
31896 +       assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
31897 +       assert("edward-1154",
31898 +              ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
31899 +
31900 +       win = clust->win;
31901 +
31902 +       assert("edward-1070", win != NULL);
31903 +       assert("edward-201", win->stat == HOLE_WINDOW);
31904 +       assert("edward-192", cluster_ok(clust, inode));
31905 +
31906 +       if (win->off == 0 && win->count == inode_cluster_size(inode)) {
31907 +               /* This part of the hole will be represented by "fake"
31908 +                * logical cluster, i.e. which doesn't have appropriate
31909 +                * disk cluster until someone modifies this logical cluster
31910 +                * and makes it dirty.
31911 +                * So go forward here..
31912 +                */
31913 +               move_update_window(inode, clust, file_off, to_file);
31914 +               return 0;
31915 +       }
31916 +       cl_count = win->count;  /* number of zeroes to write */
31917 +       cl_off = win->off;
31918 +       pg_off = off_to_pgoff(win->off);
31919 +       /* zero the window page by page; only the first page starts mid-page */
31920 +       while (cl_count) {
31921 +               struct page *page;
31922 +               page = clust->pages[off_to_pg(cl_off)];
31923 +
31924 +               assert("edward-284", page != NULL);
31925 +
31926 +               to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
31927 +               lock_page(page);
31928 +               zero_user(page, pg_off, to_pg);
31929 +               SetPageUptodate(page);
31930 +               reiser4_set_page_dirty_internal(page);
31931 +               mark_page_accessed(page);
31932 +               unlock_page(page);
31933 +
31934 +               cl_off += to_pg;
31935 +               cl_count -= to_pg;
31936 +               pg_off = 0;
31937 +       }
31938 +       if (!win->delta) {
31939 +               /* only zeroes in this window, try to capture
31940 +                */
31941 +               result = checkin_logical_cluster(clust, inode);
31942 +               if (result)
31943 +                       return result;
31944 +               put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
31945 +               result = balance_dirty_page_cluster(clust,
31946 +                                                   inode, file_off, to_file,
31947 +                                                   win_count_to_nrpages(win));
31948 +       } else /* data follows the zeroes: only switch the window over */
31949 +               move_update_window(inode, clust, file_off, to_file);
31950 +       return result;
31951 +}
31952 +
31953 +/*
31954 +  The main disk search procedure for cryptcompress plugin, which
31955 +  . scans all items of disk cluster with the lock mode @mode
31956 +  . maybe reads each one (if @read)
31957 +  . maybe makes its znode dirty (if write lock mode was specified)
31958 +
31959 +  NOTE-EDWARD: Callers should handle the case when disk cluster
31960 +  is incomplete (-EIO)
31961 +*/
31962 +int find_disk_cluster(struct cluster_handle * clust,
31963 +                     struct inode *inode, int read, znode_lock_mode mode)
31964 +{
31965 +       flow_t f;
31966 +       hint_t *hint;
31967 +       int result = 0;
31968 +       int was_grabbed;
31969 +       ra_info_t ra_info;
31970 +       file_plugin *fplug;
31971 +       item_plugin *iplug;
31972 +       struct tfm_cluster *tc;
31973 +       struct cryptcompress_info * info;
31974 +
31975 +       assert("edward-138", clust != NULL);
31976 +       assert("edward-728", clust->hint != NULL);
31977 +       assert("edward-226", reiser4_schedulable());
31978 +       assert("edward-137", inode != NULL);
31979 +       assert("edward-729", cryptcompress_inode_ok(inode));
31980 +
31981 +       hint = clust->hint;
31982 +       fplug = inode_file_plugin(inode);
31983 +       was_grabbed = get_current_context()->grabbed_blocks;
31984 +       info = cryptcompress_inode_data(inode);
31985 +       tc = &clust->tc;
31986 +
31987 +       assert("edward-462", !tfm_cluster_is_uptodate(tc));
31988 +       assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
31989 +
31990 +       dclust_init_extension(hint);
31991 +
31992 +       /* set key of the first disk cluster item */
31993 +       fplug->flow_by_inode(inode,
31994 +                            (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
31995 +                            0 /* kernel space */ ,
31996 +                            inode_scaled_cluster_size(inode),
31997 +                            clust_to_off(clust->index, inode), READ_OP, &f);
31998 +       if (mode == ZNODE_WRITE_LOCK) {
31999 +               /* reserve for flush to make dirty all the leaf nodes
32000 +                  which contain disk cluster */
32001 +               result =
32002 +                   reiser4_grab_space_force(estimate_dirty_cluster(inode),
32003 +                                            BA_CAN_COMMIT);
32004 +               if (result)
32005 +                       goto out;
32006 +       }
32007 +
32008 +       ra_info.key_to_stop = f.key;
32009 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32010 +
32011 +       while (f.length) {
32012 +               result = find_cluster_item(hint, &f.key, mode,
32013 +                                          NULL, FIND_EXACT,
32014 +                                          (mode == ZNODE_WRITE_LOCK ?
32015 +                                           CBK_FOR_INSERT : 0));
32016 +               switch (result) {
32017 +               case CBK_COORD_NOTFOUND:
32018 +                       result = 0;
32019 +                       if (inode_scaled_offset
32020 +                           (inode, clust_to_off(clust->index, inode)) ==
32021 +                           get_key_offset(&f.key)) {
32022 +                               /* first item not found, this is treated
32023 +                                  as disk cluster is absent */
32024 +                               clust->dstat = FAKE_DISK_CLUSTER;
32025 +                               goto out;
32026 +                       }
32027 +                       /* we are outside the cluster, stop search here */
32028 +                       assert("edward-146",
32029 +                              f.length != inode_scaled_cluster_size(inode));
32030 +                       goto ok;
32031 +               case CBK_COORD_FOUND:
32032 +                       assert("edward-148",
32033 +                              hint->ext_coord.coord.between == AT_UNIT);
32034 +                       assert("edward-460",
32035 +                              hint->ext_coord.coord.unit_pos == 0);
32036 +
32037 +                       coord_clear_iplug(&hint->ext_coord.coord);
32038 +                       result = zload_ra(hint->ext_coord.coord.node, &ra_info);
32039 +                       if (unlikely(result))
32040 +                               goto out;
32041 +                       iplug = item_plugin_by_coord(&hint->ext_coord.coord);
32042 +                       assert("edward-147",
32043 +                              item_id_by_coord(&hint->ext_coord.coord) ==
32044 +                              CTAIL_ID);
32045 +
32046 +                       result = iplug->s.file.read(NULL, &f, hint);
32047 +                       if (result) {
32048 +                               zrelse(hint->ext_coord.coord.node);
32049 +                               goto out;
32050 +                       }
32051 +                       if (mode == ZNODE_WRITE_LOCK) {
32052 +                               /* Don't make dirty more nodes then it was
32053 +                                  estimated (see comments before
32054 +                                  estimate_dirty_cluster). Missed nodes will be
32055 +                                  read up in flush time if they are evicted from
32056 +                                  memory */
32057 +                               if (dclust_get_extension_ncount(hint) <=
32058 +                                   estimate_dirty_cluster(inode))
32059 +                                  znode_make_dirty(hint->ext_coord.coord.node);
32060 +
32061 +                               znode_set_convertible(hint->ext_coord.coord.
32062 +                                                     node);
32063 +                       }
32064 +                       zrelse(hint->ext_coord.coord.node);
32065 +                       break;
32066 +               default:
32067 +                       goto out;
32068 +               }
32069 +       }
32070 + ok:
32071 +       /* at least one item was found  */
32072 +       /* NOTE-EDWARD: Callers should handle the case
32073 +          when disk cluster is incomplete (-EIO) */
32074 +       tc->len = inode_scaled_cluster_size(inode) - f.length;
32075 +       tc->lsize = lbytes(clust->index, inode);
32076 +       assert("edward-1196", tc->len > 0);
32077 +       assert("edward-1406", tc->lsize > 0);
32078 +
32079 +       if (hint_is_unprepped_dclust(clust->hint)) {
32080 +               clust->dstat = UNPR_DISK_CLUSTER;
32081 +       } else if (clust->index == info->trunc_index) {
32082 +               clust->dstat = TRNC_DISK_CLUSTER;
32083 +       } else {
32084 +               clust->dstat = PREP_DISK_CLUSTER;
32085 +               dclust_set_extension_dsize(clust->hint, tc->len);
32086 +       }
32087 + out:
32088 +       assert("edward-1339",
32089 +              get_current_context()->grabbed_blocks >= was_grabbed);
32090 +       grabbed2free(get_current_context(),
32091 +                    get_current_super_private(),
32092 +                    get_current_context()->grabbed_blocks - was_grabbed);
32093 +       return result;
32094 +}
32095 +
32096 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
32097 +                           znode_lock_mode lock_mode)
32098 +{
32099 +       reiser4_key key;
32100 +       ra_info_t ra_info;
32101 +
32102 +       assert("edward-730", reiser4_schedulable());
32103 +       assert("edward-731", clust != NULL);
32104 +       assert("edward-732", inode != NULL);
32105 +
32106 +       if (hint_is_valid(clust->hint)) {
32107 +               assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
32108 +               assert("edward-1294",
32109 +                      znode_is_write_locked(clust->hint->lh.node));
32110 +               /* hint is valid: answer from the cached state, no search */
32111 +               return (clust->dstat ==
32112 +                       FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
32113 +                       CBK_COORD_FOUND);
32114 +       }
32115 +       key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
32116 +                                  &key);
32117 +       ra_info.key_to_stop = key;
32118 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
32119 +       /* NOTE(review): ra_info is only written here, never read - confirm */
32120 +       return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
32121 +                                CBK_FOR_INSERT);
32122 +}
32123 +
32124 +/* Read the cluster pages that are needed before modifying them.
32125 +   On success, @clust->hint contains a locked position in the tree.
32126 +   Also:
32127 +   . find and set the disk cluster state;
32128 +   . make the disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
32129 +*/
32130 +static int read_some_cluster_pages(struct inode * inode,
32131 +                                  struct cluster_handle * clust)
32132 +{
32133 +       int i;
32134 +       int result = 0;
32135 +       item_plugin *iplug;
32136 +       struct reiser4_slide * win = clust->win;
32137 +       znode_lock_mode mode = ZNODE_WRITE_LOCK;
32138 +
32139 +       iplug = item_plugin_by_id(CTAIL_ID);
32140 +
32141 +       assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
32142 +
32143 +#if REISER4_DEBUG
32144 +       if (clust->nr_pages == 0) {
32145 +               /* a hole write starts from a fake disk cluster */
32146 +               assert("edward-1117", win != NULL);
32147 +               assert("edward-1118", win->stat == HOLE_WINDOW);
32148 +               assert("edward-1119", new_logical_cluster(clust, inode));
32149 +       }
32150 +#endif
32151 +       if (new_logical_cluster(clust, inode)) {
32152 +               /*
32153 +                  new page cluster is about to be written: nothing to read,
32154 +                  only zero the last page from the end of the window on */
32155 +               assert("edward-734", reiser4_schedulable());
32156 +               assert("edward-735", clust->hint->lh.owner == NULL);
32157 +
32158 +               if (clust->nr_pages) {
32159 +                       int off;
32160 +                       struct page * pg;
32161 +                       assert("edward-1419", clust->pages != NULL);
32162 +                       pg = clust->pages[clust->nr_pages - 1];
32163 +                       assert("edward-1420", pg != NULL);
32164 +                       off = off_to_pgoff(win->off+win->count+win->delta);
32165 +                       if (off) {
32166 +                               lock_page(pg);
32167 +                               zero_user_segment(pg, off, PAGE_CACHE_SIZE);
32168 +                               unlock_page(pg);
32169 +                       }
32170 +               }
32171 +               clust->dstat = FAKE_DISK_CLUSTER;
32172 +               return 0;
32173 +       }
32174 +       /*
32175 +          Search for the disk cluster to find out its real state.
32176 +          There is one more important reason to do the disk search: we need
32177 +          to make the disk cluster _dirty_ if it exists.
32178 +        */
32179 +
32180 +       /* if a window is specified, read only the pages
32181 +          that will be partially modified */
32182 +
32183 +       for (i = 0; i < clust->nr_pages; i++) {
32184 +               struct page *pg = clust->pages[i];
32185 +
32186 +               lock_page(pg);
32187 +               if (PageUptodate(pg)) {
32188 +                       unlock_page(pg);
32189 +                       continue;
32190 +               }
32191 +               unlock_page(pg);
32192 +
32193 +               if (win &&
32194 +                   i >= size_in_pages(win->off) &&
32195 +                   i < off_to_pg(win->off + win->count + win->delta))
32196 +                       /* page will be completely overwritten */
32197 +                       continue;
32198 +
32199 +               if (win && (i == clust->nr_pages - 1) &&
32200 +                   /* the last page is
32201 +                      partially modified,
32202 +                      not uptodate .. */
32203 +                   (size_in_pages(i_size_read(inode)) <= pg->index)) {
32204 +                       /* .. and appended,
32205 +                          so fill the rest of it with zeroes */
32206 +                       int offset;
32207 +                       lock_page(pg);
32208 +                       assert("edward-1260",
32209 +                              size_in_pages(win->off + win->count +
32210 +                                            win->delta) - 1 == i);
32211 +
32212 +                       offset =
32213 +                           off_to_pgoff(win->off + win->count + win->delta);
32214 +                       zero_user_segment(pg, offset, PAGE_CACHE_SIZE);
32215 +                       unlock_page(pg);
32216 +                       /* the page is still not uptodate */
32217 +                       break;
32218 +               }
32219 +               lock_page(pg);
32220 +               result = do_readpage_ctail(inode, clust, pg, mode);
32221 +
32222 +               assert("edward-1526", ergo(!result, PageUptodate(pg)));
32223 +               unlock_page(pg);
32224 +               if (result) {
32225 +                       warning("edward-219", "do_readpage_ctail failed");
32226 +                       goto out;
32227 +               }
32228 +       }
32229 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
32230 +               /* disk cluster unclaimed, but we need to make its znodes dirty
32231 +                * to make flush convert (update) its content
32232 +                */
32233 +               result = find_disk_cluster(clust, inode,
32234 +                                          0 /* do not read items */,
32235 +                                          mode);
32236 +       }
32237 + out:
32238 +       tfm_cluster_clr_uptodate(&clust->tc);
32239 +       return result;
32240 +}
32241 +
32242 +static int should_create_unprepped_cluster(struct cluster_handle * clust,
32243 +                                          struct inode * inode)
32244 +{
32245 +       assert("edward-737", clust != NULL);
32246 +       /* nonzero only for a fake disk cluster that is not a pageless hole */
32247 +       switch (clust->dstat) {
32248 +       case PREP_DISK_CLUSTER:
32249 +       case UNPR_DISK_CLUSTER:
32250 +               return 0;
32251 +       case FAKE_DISK_CLUSTER:
32252 +               if (clust->win &&
32253 +                   clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
32254 +                       assert("edward-1172",
32255 +                              new_logical_cluster(clust, inode));
32256 +                       return 0;
32257 +               }
32258 +               return 1;
32259 +       default:
32260 +               impossible("edward-1173", "bad disk cluster state");
32261 +               return 0;
32262 +       }
32263 +}
32264 +
32265 +static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
32266 +                                               struct inode *inode)
32267 +{
32268 +       int result;
32269 +
32270 +       assert("edward-1123", reiser4_schedulable());
32271 +       assert("edward-737", clust != NULL);
32272 +       assert("edward-738", inode != NULL);
32273 +       assert("edward-739", cryptcompress_inode_ok(inode));
32274 +       assert("edward-1053", clust->hint != NULL);
32275 +       /* nothing to create: release the reserved insert estimate */
32276 +       if (!should_create_unprepped_cluster(clust, inode)) {
32277 +               if (clust->reserved) {
32278 +                       cluster_reserved2free(estimate_insert_cluster(inode));
32279 +#if REISER4_DEBUG
32280 +                       assert("edward-1267",
32281 +                              clust->reserved_unprepped ==
32282 +                              estimate_insert_cluster(inode));
32283 +                       clust->reserved_unprepped -=
32284 +                               estimate_insert_cluster(inode);
32285 +#endif
32286 +               }
32287 +               return 0;
32288 +       }
32289 +       assert("edward-1268", clust->reserved);
32290 +       cluster_reserved2grabbed(estimate_insert_cluster(inode));
32291 +#if REISER4_DEBUG
32292 +       assert("edward-1441",
32293 +              clust->reserved_unprepped == estimate_insert_cluster(inode));
32294 +       clust->reserved_unprepped -= estimate_insert_cluster(inode);
32295 +#endif
32296 +       result = ctail_insert_unprepped_cluster(clust, inode);
32297 +       if (result)
32298 +               return result;
32299 +       /* account a whole cluster size in the inode's byte count */
32300 +       inode_add_bytes(inode, inode_cluster_size(inode));
32301 +
32302 +       assert("edward-743", cryptcompress_inode_ok(inode));
32303 +       assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
32304 +
32305 +       clust->dstat = UNPR_DISK_CLUSTER;
32306 +       return 0;
32307 +}
32308 +
32309 +/* . Grab the page cluster for read, write, setattr, etc. operations;
32310 + * . Truncate its complete pages, if needed (NOTE(review): no truncation
32311 + *   is visible in this function itself - confirm it is done by a callee) */
32312 +int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
32313 +                        rw_op rw)
32314 +{
32315 +       assert("edward-177", inode != NULL);
32316 +       assert("edward-741", cryptcompress_inode_ok(inode));
32317 +       assert("edward-740", clust->pages != NULL);
32318 +
32319 +       set_cluster_nrpages(clust, inode);
32320 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
32321 +       return grab_page_cluster(inode, clust, rw);
32322 +}
32323 +
32324 +/* Truncate the complete page cluster of index @index.
32325 + * This is called by the ->kill_hook() method of the item
32326 + * plugin when deleting the disk cluster of that index.
32327 + */
32328 +void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
32329 +                                   int even_cows)
32330 +{
32331 +       int found;
32332 +       int nr_pages;
32333 +       jnode *node;
32334 +       struct page *pages[MAX_CLUSTER_NRPAGES];
32335 +
32336 +       node = jlookup(current_tree, get_inode_oid(inode),
32337 +                      clust_to_pg(index, inode));
32338 +       nr_pages = size_in_pages(lbytes(index, inode));
32339 +       assert("edward-1483", nr_pages != 0);
32340 +       if (!node)
32341 +               goto truncate;
32342 +       found = find_get_pages(inode->i_mapping,
32343 +                              clust_to_pg(index, inode),
32344 +                              cluster_nrpages(inode), pages);
32345 +       if (!found) {
32346 +               assert("edward-1484", jnode_truncate_ok(inode, index));
32347 +               return; /* NOTE(review): no jput(node) on this path */
32348 +       }
32349 +       lock_cluster(node);
32350 +
32351 +       if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
32352 +           && index == 0)
32353 +               /* conversion to unix_file is in progress */
32354 +               JF_CLR(node, JNODE_CLUSTER_PAGE);
32355 +       if (JF_ISSET(node, JNODE_DIRTY)) {
32356 +               /*
32357 +                * @nr_pages pages were checked in, but not yet checked out -
32358 +                * we need to release them. (There can also be pages
32359 +                * attached to the page cache by read(), etc. - don't take
32360 +                * them into account).
32361 +                */
32362 +               assert("edward-1198", found >= nr_pages);
32363 +
32364 +               /* free disk space grabbed for disk cluster conversion */
32365 +               cluster_reserved2grabbed(estimate_update_cluster(inode));
32366 +               grabbed2free(get_current_context(),
32367 +                            get_current_super_private(),
32368 +                            estimate_update_cluster(inode));
32369 +               __put_page_cluster(0, nr_pages, pages, inode);
32370 +
32371 +               /* This will clear dirty bit, uncapture and unlock jnode */
32372 +               unlock_cluster_uncapture(node);
32373 +       } else
32374 +               unlock_cluster(node);
32375 +       jput(node);                         /* jlookup */
32376 +       put_found_pages(pages, found); /* find_get_pages */
32377 + truncate: /* NOTE(review): pages[] is unfilled when reached via !node */
32378 +       if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
32379 +           index == 0)
32380 +               return;
32381 +       truncate_page_cluster_range(inode, pages, index, 0,
32382 +                                   cluster_nrpages(inode),
32383 +                                   even_cows);
32384 +       assert("edward-1201",
32385 +              ergo(!reiser4_inode_get_flag(inode,
32386 +                                           REISER4_FILE_CONV_IN_PROGRESS),
32387 +                   jnode_truncate_ok(inode, index)));
32388 +       return;
32389 +}
32390 +
32391 +/*
32392 + * Set up the cluster handle @clust of a logical cluster before
32393 + * modifications that are supposed to be committed.
32394 + *
32395 + * . grab cluster pages;
32396 + * . reserve disk space;
32397 + * . maybe read pages from disk and set the disk cluster dirty;
32398 + * . maybe write hole and check in (partially zeroed) logical cluster;
32399 + * . create 'unprepped' disk cluster for new or fake logical one.
32400 + */
32401 +static int prepare_logical_cluster(struct inode *inode,
32402 +                                  loff_t file_off, /* write position
32403 +                                                      in the file */
32404 +                                  loff_t to_file, /* bytes of user data
32405 +                                                     to write to the file */
32406 +                                  struct cluster_handle * clust,
32407 +                                  logical_cluster_op op)
32408 +{
32409 +       int result = 0;
32410 +       struct reiser4_slide * win = clust->win;
32411 +
32412 +       reset_cluster_params(clust);
32413 +       cluster_set_tfm_act(&clust->tc, TFMA_READ);
32414 +#if REISER4_DEBUG
32415 +       clust->ctx = get_current_context();
32416 +#endif
32417 +       assert("edward-1190", op != LC_INVAL);
32418 +
32419 +       clust->op = op;
32420 +
32421 +       result = prepare_page_cluster(inode, clust, WRITE_OP);
32422 +       if (result)
32423 +               return result;
32424 +       assert("edward-1447",
32425 +              ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
32426 +       assert("edward-1448",
32427 +              ergo(clust->nr_pages != 0,
32428 +                   jnode_is_cluster_page(jprivate(clust->pages[0]))));
32429 +
32430 +       result = reserve4cluster(inode, clust);
32431 +       if (result)
32432 +               goto err1;
32433 +       result = read_some_cluster_pages(inode, clust);
32434 +       if (result) {
32435 +               free_reserved4cluster(inode,
32436 +                                     clust,
32437 +                                     estimate_update_cluster(inode) +
32438 +                                     estimate_insert_cluster(inode));
32439 +               goto err1;
32440 +       }
32441 +       assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
32442 +
32443 +       result = cryptcompress_make_unprepped_cluster(clust, inode);
32444 +       if (result)
32445 +               goto err2;
32446 +       if (win && win->stat == HOLE_WINDOW) {
32447 +               result = write_hole(inode, clust, file_off, to_file);
32448 +               if (result)
32449 +                       goto err2;
32450 +       }
32451 +       return 0;
32452 + err2:
32453 +       free_reserved4cluster(inode, clust,
32454 +                             estimate_update_cluster(inode));
32455 + err1:
32456 +       put_page_cluster(clust, inode, WRITE_OP);
32457 +       assert("edward-1125", result == -ENOSPC); /* NOTE(review): sole error? */
32458 +       return result;
32459 +}
32460 +
32461 +/* set window @win and @clust->index from file offsets @o1 <= @o2 */
32462 +static void set_window(struct cluster_handle * clust,
32463 +                      struct reiser4_slide * win, struct inode *inode,
32464 +                      loff_t o1, loff_t o2)
32465 +{
32466 +       assert("edward-295", clust != NULL);
32467 +       assert("edward-296", inode != NULL);
32468 +       assert("edward-1071", win != NULL);
32469 +       assert("edward-297", o1 <= o2);
32470 +
32471 +       clust->index = off_to_clust(o1, inode);
32472 +       /* the window never extends past the end of the cluster */
32473 +       win->off = off_to_cloff(o1, inode);
32474 +       win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
32475 +                        o2 - o1);
32476 +       win->delta = 0;
32477 +
32478 +       clust->win = win;
32479 +}
32480 +
32481 +static int set_cluster_by_window(struct inode *inode,
32482 +                                struct cluster_handle * clust,
32483 +                                struct reiser4_slide * win, size_t length,
32484 +                                loff_t file_off)
32485 +{
32486 +       int result;
32487 +
32488 +       assert("edward-197", clust != NULL);
32489 +       assert("edward-1072", win != NULL);
32490 +       assert("edward-198", inode != NULL);
32491 +
32492 +       result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
32493 +       if (result)
32494 +               return result;
32495 +
32496 +       if (file_off > i_size_read(inode)) {
32497 +               /* write starts beyond i_size: hole in cryptcompress file */
32498 +               loff_t hole_size;
32499 +               hole_size = file_off - inode->i_size;
32500 +
32501 +               set_window(clust, win, inode, inode->i_size, file_off);
32502 +               win->stat = HOLE_WINDOW;
32503 +               if (win->off + hole_size < inode_cluster_size(inode))
32504 +                       /* there is also user data to append after the hole */
32505 +                       win->delta = min(inode_cluster_size(inode) -
32506 +                                        (win->off + win->count), length);
32507 +               return 0;
32508 +       }
32509 +       set_window(clust, win, inode, file_off, file_off + length);
32510 +       win->stat = DATA_WINDOW;
32511 +       return 0;
32512 +}
32513 +
32514 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
32515 +                       int count)
32516 +{
32517 +       int result = 0;
32518 +       int (*setting_actor)(struct cluster_handle * clust, int count);
32519 +
32520 +       assert("edward-1358", clust != NULL);
32521 +       assert("edward-1359", page != NULL);
32522 +       assert("edward-1360", page->mapping != NULL);
32523 +       assert("edward-1361", page->mapping->host != NULL);
32524 +       /* reuse an existing page set, otherwise allocate a new one */
32525 +       setting_actor =
32526 +               (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
32527 +       result = setting_actor(clust, count);
32528 +       clust->index = pg_to_clust(page->index, page->mapping->host);
32529 +       return result;
32530 +}
32531 +
32532 +/* reset the params that do not get updated: dstat, tc.uptodate, tc.len */
32533 +void reset_cluster_params(struct cluster_handle * clust)
32534 +{
32535 +       assert("edward-197", clust != NULL);
32536 +
32537 +       clust->dstat = INVAL_DISK_CLUSTER;
32538 +       clust->tc.uptodate = 0;
32539 +       clust->tc.len = 0;
32540 +}
32541 +
32542 +/* the heart of write_cryptcompress; returns bytes written, else @result */
32543 +static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
32544 +                                    const char __user *buf, size_t to_write,
32545 +                                    loff_t pos, struct psched_context *cont)
32546 +{
32547 +       int i;
32548 +       hint_t *hint;
32549 +       int result = 0;
32550 +       size_t count;
32551 +       struct reiser4_slide win;
32552 +       struct cluster_handle clust;
32553 +       struct cryptcompress_info * info;
32554 +
32555 +       assert("edward-154", buf != NULL);
32556 +       assert("edward-161", reiser4_schedulable());
32557 +       assert("edward-748", cryptcompress_inode_ok(inode));
32558 +       assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
32559 +       assert("edward-1274", get_current_context()->grabbed_blocks == 0);
32560 +
32561 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32562 +       if (hint == NULL)
32563 +               return RETERR(-ENOMEM);
32564 +
32565 +       result = load_file_hint(file, hint);
32566 +       if (result) {
32567 +               kfree(hint);
32568 +               return result;
32569 +       }
32570 +       count = to_write;
32571 +
32572 +       reiser4_slide_init(&win);
32573 +       cluster_init_read(&clust, &win);
32574 +       clust.hint = hint;
32575 +       info = cryptcompress_inode_data(inode);
32576 +
32577 +       mutex_lock(&info->checkin_mutex);
32578 +
32579 +       result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
32580 +       if (result)
32581 +               goto out;
32582 +
32583 +       if (next_window_stat(&win) == HOLE_WINDOW) {
32584 +               /* the hole is written in this iteration,
32585 +                  separately from the loop below */
32586 +               result = write_pschedule_hook(file, inode,
32587 +                                             pos,
32588 +                                             &clust,
32589 +                                             cont);
32590 +               if (result)
32591 +                       goto out;
32592 +               result = prepare_logical_cluster(inode, pos, count, &clust,
32593 +                                                LC_APPOV);
32594 +               if (result)
32595 +                       goto out;
32596 +       }
32597 +       do {
32598 +               const char __user * src;
32599 +               unsigned page_off, to_page;
32600 +
32601 +               assert("edward-750", reiser4_schedulable());
32602 +
32603 +               result = write_pschedule_hook(file, inode,
32604 +                                             pos + to_write - count,
32605 +                                             &clust,
32606 +                                             cont);
32607 +               if (result)
32608 +                       goto out;
32609 +               if (cont->state == PSCHED_ASSIGNED_NEW)
32610 +                       /* done_lh was called in write_pschedule_hook */
32611 +                       goto out_no_longterm_lock;
32612 +
32613 +               result = prepare_logical_cluster(inode, pos, count, &clust,
32614 +                                                LC_APPOV);
32615 +               if (result)
32616 +                       goto out;
32617 +
32618 +               assert("edward-751", cryptcompress_inode_ok(inode));
32619 +               assert("edward-204", win.stat == DATA_WINDOW);
32620 +               assert("edward-1288", hint_is_valid(clust.hint));
32621 +               assert("edward-752",
32622 +                      znode_is_write_locked(hint->ext_coord.coord.node));
32623 +               put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
32624 +
32625 +               /* set write position within the first page of the window */
32626 +               page_off = off_to_pgoff(win.off);
32627 +
32628 +               /* copy user data to the cluster pages */
32629 +               for (i = off_to_pg(win.off), src = buf;
32630 +                    i < size_in_pages(win.off + win.count);
32631 +                    i++, src += to_page) {
32632 +                       to_page = __mbp(win.off + win.count, i) - page_off;
32633 +                       assert("edward-1039",
32634 +                              page_off + to_page <= PAGE_CACHE_SIZE);
32635 +                       assert("edward-287", clust.pages[i] != NULL);
32636 +
32637 +                       fault_in_pages_readable(src, to_page);
32638 +
32639 +                       lock_page(clust.pages[i]);
32640 +                       result =
32641 +                           __copy_from_user((char *)kmap(clust.pages[i]) +
32642 +                                            page_off, src, to_page);
32643 +                       kunmap(clust.pages[i]);
32644 +                       if (unlikely(result)) {
32645 +                               unlock_page(clust.pages[i]);
32646 +                               result = -EFAULT;
32647 +                               goto err2;
32648 +                       }
32649 +                       SetPageUptodate(clust.pages[i]);
32650 +                       reiser4_set_page_dirty_internal(clust.pages[i]);
32651 +                       flush_dcache_page(clust.pages[i]);
32652 +                       mark_page_accessed(clust.pages[i]);
32653 +                       unlock_page(clust.pages[i]);
32654 +                       page_off = 0;
32655 +               }
32656 +               assert("edward-753", cryptcompress_inode_ok(inode));
32657 +
32658 +               result = checkin_logical_cluster(&clust, inode);
32659 +               if (result)
32660 +                       goto err2;
32661 +
32662 +               buf   += win.count;
32663 +               count -= win.count;
32664 +
32665 +               result = balance_dirty_page_cluster(&clust, inode, 0, count,
32666 +                                                   win_count_to_nrpages(&win));
32667 +               if (result)
32668 +                       goto err1;
32669 +               assert("edward-755", hint->lh.owner == NULL);
32670 +               reset_cluster_params(&clust);
32671 +               continue;
32672 +       err2:
32673 +               put_page_cluster(&clust, inode, WRITE_OP);
32674 +       err1:
32675 +               if (clust.reserved)
32676 +                       free_reserved4cluster(inode,
32677 +                                             &clust,
32678 +                                             estimate_update_cluster(inode));
32679 +               break;
32680 +       } while (count);
32681 + out:
32682 +       done_lh(&hint->lh);
32683 +       save_file_hint(file, hint);
32684 + out_no_longterm_lock:
32685 +       mutex_unlock(&info->checkin_mutex);
32686 +       kfree(hint);
32687 +       put_cluster_handle(&clust);
32688 +       assert("edward-195",
32689 +              ergo((to_write == count),
32690 +                   (result < 0 || cont->state == PSCHED_ASSIGNED_NEW)));
32691 +       return (to_write - count) ? (to_write - count) : result;
32692 +}
32693 +
32694 +/**
32695 + * plugin->write(); @cont is handed on to do_write_cryptcompress()
32696 + * @file: file to write to
32697 + * @buf: address of user-space buffer
32698 + * @count: number of bytes to write
32699 + * @off: position in file to write to; advanced by the bytes written
32700 + */
32701 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
32702 +                           size_t count, loff_t *off,
32703 +                           struct psched_context *cont)
32704 +{
32705 +       ssize_t result;
32706 +       struct inode *inode;
32707 +       reiser4_context *ctx;
32708 +       loff_t pos = *off;
32709 +       struct cryptcompress_info *info;
32710 +
32711 +       assert("edward-1449", cont->state == PSCHED_INVAL_STATE);
32712 +
32713 +       inode = file->f_dentry->d_inode;
32714 +       assert("edward-196", cryptcompress_inode_ok(inode));
32715 +
32716 +       info = cryptcompress_inode_data(inode); /* NOTE(review): unused */
32717 +       ctx = get_current_context();
32718 +
32719 +       result = generic_write_checks(file, &pos, &count, 0);
32720 +       if (unlikely(result != 0)) {
32721 +               context_set_commit_async(ctx);
32722 +               return result;
32723 +       }
32724 +       if (unlikely(count == 0))
32725 +               return 0;
32726 +       result = file_remove_suid(file);
32727 +       if (unlikely(result != 0)) {
32728 +               context_set_commit_async(ctx);
32729 +               return result;
32730 +       }
32731 +       /* file_remove_suid() might have created a transaction */
32732 +       reiser4_txn_restart(ctx);
32733 +
32734 +       result = do_write_cryptcompress(file, inode, buf, count, pos, cont);
32735 +
32736 +       if (unlikely(result < 0)) {
32737 +               context_set_commit_async(ctx);
32738 +               return result;
32739 +       }
32740 +       /* update the position in the file */
32741 +       *off = pos + result;
32742 +       return result;
32743 +}
32744 +
32745 +/* plugin->readpages (@nr_pages is not used in this function) */
32746 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
32747 +                           struct list_head *pages, unsigned nr_pages)
32748 +{
32749 +       reiser4_context * ctx;
32750 +       int ret;
32751 +
32752 +       ctx = reiser4_init_context(mapping->host->i_sb);
32753 +       if (IS_ERR(ctx)) {
32754 +               ret = PTR_ERR(ctx);
32755 +               goto err;
32756 +       }
32757 +       /* a cryptcompress file can be built of ctail items only */
32758 +       ret = readpages_ctail(file, mapping, pages);
32759 +       reiser4_txn_restart(ctx);
32760 +       reiser4_exit_context(ctx);
32761 +       if (ret) {
32762 +err: /* also entered by goto when context initialization fails */
32763 +               put_pages_list(pages);
32764 +       }
32765 +       return ret;
32766 +}
32767 +
32768 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
32769 +{
32770 +       /* reserve space to update the stat data item: common estimate */
32771 +       assert("edward-1193",
32772 +              inode_file_plugin(inode)->estimate.update ==
32773 +              estimate_update_common);
32774 +       return estimate_update_common(inode);
32775 +}
32776 +
32777 +/**
32778 + * plugin->read
32779 + * @file: file to read from
32780 + * @buf: address of user-space buffer
32781 + * @size: number of bytes to read
32782 + * @off: position in file to read from
32783 + * Grabs space for a stat data update, then reads via do_sync_read().
32784 + * Returns do_sync_read()'s result, or the error that prevented the read.
32785 + */
32786 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
32787 +                          loff_t * off)
32788 +{
32789 +       ssize_t result;
32790 +       struct inode *inode;
32791 +       reiser4_context *ctx;
32792 +       reiser4_block_nr needed;
32793 +
32794 +       inode = file->f_dentry->d_inode;
32795 +       assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
32796 +
32797 +       ctx = reiser4_init_context(inode->i_sb);
32798 +       if (IS_ERR(ctx))
32799 +               return PTR_ERR(ctx);
32800 +
32801 +       needed = cryptcompress_estimate_read(inode);
32802 +
32803 +       result = reiser4_grab_space(needed, BA_CAN_COMMIT);
32804 +       if (result != 0) {
32805 +               reiser4_exit_context(ctx);
32806 +               return result;
32807 +       }
32808 +       result = do_sync_read(file, buf, size, off);
32809 +
32810 +       context_set_commit_async(ctx);
32811 +       reiser4_exit_context(ctx);
32812 +
32813 +       return result;
32814 +}
32815 +
32816 +/* Find a disk cluster of @inode and report it through @found.
32817 + * @index > 0: look for exactly the disk cluster with index (@index - 1);
32818 + * @index == 0: look for the rightmost disk cluster of the file.
32819 + * On success *@found holds the index of the found disk cluster plus one,
32820 + * or 0 if there is no such disk cluster (for @index == 0 this means the
32821 + * file has no disk clusters at all). Returns 0 or the lookup/load error.
32822 + */
32823 +static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
32824 +                              cloff_t index)
32825 +{
32826 +       int result;
32827 +       reiser4_key key;
32828 +       loff_t offset;
32829 +       hint_t *hint;
32830 +       lock_handle *lh;
32831 +       lookup_bias bias;
32832 +       coord_t *coord;
32833 +       item_plugin *iplug;
32834 +
32835 +       assert("edward-1131", inode != NULL);
32836 +       assert("edward-95", cryptcompress_inode_ok(inode));
32837 +
32838 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32839 +       if (hint == NULL)
32840 +               return RETERR(-ENOMEM);
32841 +       hint_init_zero(hint);
32842 +       lh = &hint->lh;
32843 +
32844 +       /* exact lookup for a given index, rightmost item otherwise */
32845 +       if (index) {
32846 +               bias = FIND_EXACT;
32847 +               offset = clust_to_off(index, inode) - 1;
32848 +       } else {
32849 +               bias = FIND_MAX_NOT_MORE_THAN;
32850 +               offset = get_key_offset(reiser4_max_key());
32851 +       }
32852 +
32853 +       key_by_inode_cryptcompress(inode, offset, &key);
32854 +
32855 +       /* find the last item of this object */
32856 +       result = find_cluster_item(hint, &key, ZNODE_READ_LOCK,
32857 +                                  NULL /* ra_info */, bias, 0);
32858 +       if (cbk_errored(result))
32859 +               goto out;
32860 +       if (result == CBK_COORD_NOTFOUND) {
32861 +               /* no real disk clusters */
32862 +               *found = 0;
32863 +               result = 0;
32864 +               goto out;
32865 +       }
32866 +       /* disk cluster is found */
32867 +       coord = &hint->ext_coord.coord;
32868 +       coord_clear_iplug(coord);
32869 +       result = zload(coord->node);
32870 +       if (unlikely(result))
32871 +               goto out;
32872 +       iplug = item_plugin_by_coord(coord);
32873 +       assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
32874 +       assert("edward-1202", ctail_ok(coord));
32875 +
32876 +       item_key_by_coord(coord, &key);
32877 +       *found = off_to_clust(get_key_offset(&key), inode) + 1;
32878 +
32879 +       assert("edward-1132", ergo(index, index == *found));
32880 +
32881 +       /* zload() succeeded, so result is 0 on this path */
32882 +       zrelse(coord->node);
32883 +
32884 + out:
32885 +       /* common exit: release the lock handle and the hint */
32886 +       done_lh(lh);
32887 +       kfree(hint);
32888 +       return result;
32889 +}
32890 +
32891 +static int find_fake_appended(struct inode *inode, cloff_t * index)
32892 +{
32893 +       /* index 0 makes lookup_disk_cluster() find the rightmost real one */
32894 +       return lookup_disk_cluster(inode, index, 0);
32895 +}
32896 +
32897 +/* Fix up @left_coord after node_lookup() did not find the unit.
32898 +   Holes in the sequence of disk clusters make this a normal case:
32899 +   a position after a unit is turned into a position after its item */
32900 +
32901 +static void adjust_left_coord(coord_t * left_coord)
32902 +{
32903 +       if (left_coord->between == AFTER_UNIT) {
32904 +               left_coord->between = AFTER_ITEM;
32905 +               return;
32906 +       }
32907 +       /* AFTER_ITEM and BEFORE_UNIT are left as they are */
32908 +       if (left_coord->between != AFTER_ITEM &&
32909 +           left_coord->between != BEFORE_UNIT) {
32910 +               /* no other position is expected here */
32911 +               impossible("edward-1204", "bad left coord to cut");
32912 +       }
32913 +}
32914 +
32915 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
32916 +
32917 +/* plugin->cut_tree_worker: cut items with keys from @from_key to @to_key,
32918 + * moving from node to node leftwards; *@progress counts the nodes cut. */
32919 +int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
32920 +                                 const reiser4_key * to_key,
32921 +                                 reiser4_key * smallest_removed,
32922 +                                 struct inode *object, int truncate,
32923 +                                 int *progress)
32924 +{
32925 +       lock_handle next_node_lock;
32926 +       coord_t left_coord;
32927 +       int result;
32928 +
32929 +       assert("edward-1158", tap->coord->node != NULL);
32930 +       assert("edward-1159", znode_is_write_locked(tap->coord->node));
32931 +       assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
32932 +
32933 +       *progress = 0;
32934 +       init_lh(&next_node_lock);
32935 +
32936 +       while (1) {
32937 +               znode *node;    /* node from which items are cut */
32938 +               node_plugin *nplug;     /* node plugin for @node */
32939 +
32940 +               node = tap->coord->node;
32941 +
32942 +               /* Move next_node_lock to the next node on the left. */
32943 +               result =
32944 +                   reiser4_get_left_neighbor(&next_node_lock, node,
32945 +                                             ZNODE_WRITE_LOCK,
32946 +                                             GN_CAN_USE_UPPER_LEVELS);
32947 +               if (result != 0 && result != -E_NO_NEIGHBOR)
32948 +                       break;
32949 +               /* FIXME-EDWARD: Check can we delete the node as a whole. */
32950 +               result = reiser4_tap_load(tap);
32951 +               if (result)
32952 +                       break;  /* not return: done_lh() below must run */
32953 +
32954 +               /* Prepare the second (right) point for cut_node() */
32955 +               if (*progress)
32956 +                       coord_init_last_unit(tap->coord, node);
32957 +               else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
32958 +                       /* set rightmost unit for the items without lookup method */
32959 +                       tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
32960 +
32961 +               nplug = node->nplug;
32962 +
32963 +               assert("edward-1161", nplug);
32964 +               assert("edward-1162", nplug->lookup);
32965 +
32966 +               /* left_coord is leftmost unit cut from @node */
32967 +               result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
32968 +
32969 +               if (IS_CBKERR(result))
32970 +                       break;
32971 +
32972 +               if (result == CBK_COORD_NOTFOUND)
32973 +                       adjust_left_coord(&left_coord);
32974 +
32975 +               /* adjust coordinates so that they are set to existing units */
32976 +               if (coord_set_to_right(&left_coord)
32977 +                   || coord_set_to_left(tap->coord)) {
32978 +                       result = 0;
32979 +                       break;
32980 +               }
32981 +
32982 +               if (coord_compare(&left_coord, tap->coord) ==
32983 +                   COORD_CMP_ON_RIGHT) {
32984 +                       /* keys from @from_key to @to_key are not in the tree */
32985 +                       result = 0;
32986 +                       break;
32987 +               }
32988 +
32989 +               /* cut data from one node */
32990 +               *smallest_removed = *reiser4_min_key();
32991 +               result = kill_node_content(&left_coord,
32992 +                                          tap->coord,
32993 +                                          from_key,
32994 +                                          to_key,
32995 +                                          smallest_removed,
32996 +                                          next_node_lock.node,
32997 +                                          object, truncate);
32998 +               reiser4_tap_relse(tap);
32999 +
33000 +               if (result)
33001 +                       break;
33002 +
33003 +               ++(*progress);
33004 +
33005 +               /* Check whether all items with keys >= from_key were removed
33006 +                * from the tree. */
33007 +               if (keyle(smallest_removed, from_key))
33008 +                       /* result = 0; */
33009 +                       break;
33010 +
33011 +               if (next_node_lock.node == NULL)
33012 +                       break;
33013 +
33014 +               result = reiser4_tap_move(tap, &next_node_lock);
33015 +               done_lh(&next_node_lock);
33016 +               if (result)
33017 +                       break;
33018 +
33019 +               /* Break long cut_tree operation (deletion of a large file) if
33020 +                * atom requires commit. */
33021 +               if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
33022 +                   && current_atom_should_commit()) {
33023 +                       result = -E_REPEAT;
33024 +                       break;
33025 +               }
33026 +       }
33027 +       done_lh(&next_node_lock);
33028 +       return result;
33029 +}
33030 +
33031 +/* Append or expand a hole up to @new_size in two steps:
33032 + * 1) set zeroes to the rightmost page of the rightmost non-fake
33033 + *    logical cluster;
33034 + * 2) expand hole via fake logical clusters (just increase i_size).
33035 + */
33036 +static int cryptcompress_append_hole(struct inode *inode /* with old size */,
33037 +                                    loff_t new_size)
33038 +{
33039 +       int result = 0;
33040 +       hint_t *hint;
33041 +       lock_handle *lh;
33042 +       loff_t hole_size;       /* bytes between the old i_size and @new_size */
33043 +       int nr_zeroes;  /* bytes up to the cluster end, capped by hole_size */
33044 +       struct reiser4_slide win;
33045 +       struct cluster_handle clust;
33046 +
33047 +       assert("edward-1133", inode->i_size < new_size);
33048 +       assert("edward-1134", reiser4_schedulable());
33049 +       assert("edward-1135", cryptcompress_inode_ok(inode));
33050 +       assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
33051 +       assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
33052 +
33053 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33054 +       if (hint == NULL)
33055 +               return RETERR(-ENOMEM);
33056 +       hint_init_zero(hint);
33057 +       lh = &hint->lh;
33058 +
33059 +       reiser4_slide_init(&win);
33060 +       cluster_init_read(&clust, &win);
33061 +       clust.hint = hint;
33062 +
33063 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33064 +       if (result)
33065 +               goto out;
33066 +       if (off_to_cloff(inode->i_size, inode) == 0)
33067 +               goto append_fake; /* NOTE(review): assert edward-1333 above expects this never to be taken */
33068 +       hole_size = new_size - inode->i_size;
33069 +       nr_zeroes =
33070 +               inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
33071 +       if (hole_size < nr_zeroes)
33072 +               nr_zeroes = hole_size;
33073 +       set_window(&clust, &win, inode, inode->i_size,
33074 +                  inode->i_size + nr_zeroes);
33075 +       win.stat = HOLE_WINDOW;
33076 +
33077 +       assert("edward-1137",
33078 +              clust.index == off_to_clust(inode->i_size, inode));
33079 +
33080 +       result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
33081 +
33082 +       assert("edward-1271", !result || result == -ENOSPC);
33083 +       if (result)
33084 +               goto out;
33085 +       assert("edward-1139",
33086 +              clust.dstat == PREP_DISK_CLUSTER ||
33087 +              clust.dstat == UNPR_DISK_CLUSTER);
33088 +
33089 +       assert("edward-1431", hole_size >= nr_zeroes);
33090 +       if (hole_size == nr_zeroes)
33091 +       /* the hole ends inside this logical cluster: nothing to append anymore */
33092 +               goto out;
33093 + append_fake:
33094 +       INODE_SET_SIZE(inode, new_size);
33095 + out:
33096 +       done_lh(lh);
33097 +       kfree(hint);
33098 +       put_cluster_handle(&clust);
33099 +       return result;
33100 +}
33101 +
33102 +static int update_cryptcompress_size(struct inode *inode, loff_t new_size,
33103 +                                    int update_sd)
33104 +{
33105 +       loff_t rem = new_size & ((loff_t) (inode_cluster_size(inode)) - 1);
33106 +       return rem ? 0 : reiser4_update_file_size(inode, new_size, update_sd);
33107 +}
33108 +
33109 +/* Prune cryptcompress file in two steps:
33110 + * 1) cut all nominated logical clusters except the leftmost one which
33111 + *    is to be partially truncated. Note, that there can be "holes"
33112 + *    represented by fake logical clusters.
33113 + * 2) set zeroes and capture leftmost partially truncated logical
33114 + *    cluster, if it is not fake; otherwise prune fake logical cluster
33115 + *    (just decrease i_size).
33116 + *
33117 + * @new_size: target size, not larger than the current i_size;
33118 + * @update_sd: passed through to cut_file_items();
33119 + * @aidx: incremented index of the rightmost disk cluster (0 if there is
33120 + *        none), as computed by find_fake_appended().
33121 + *
33122 + * Returns 0 on success or an error code.
33123 + */
33124 +static int prune_cryptcompress(struct inode *inode, loff_t new_size,
33125 +                              int update_sd, cloff_t aidx)
33126 +{
33127 +       int result = 0;
33128 +       unsigned nr_zeroes;
33129 +       cloff_t ridx;
33130 +
33131 +       hint_t *hint;
33132 +       lock_handle *lh;
33133 +       struct reiser4_slide win;
33134 +       struct cluster_handle clust;
33135 +
33136 +       assert("edward-1140", inode->i_size >= new_size);
33137 +       assert("edward-1141", reiser4_schedulable());
33138 +       assert("edward-1142", cryptcompress_inode_ok(inode));
33139 +       assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
33140 +
33141 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33142 +       if (hint == NULL)
33143 +               return RETERR(-ENOMEM);
33144 +       hint_init_zero(hint);
33145 +       lh = &hint->lh;
33146 +
33147 +       reiser4_slide_init(&win);
33148 +       cluster_init_read(&clust, &win);
33149 +       clust.hint = hint;
33150 +
33151 +       /* calculate index of the rightmost logical cluster
33152 +          that will be completely truncated */
33153 +       ridx = size_in_lc(new_size, inode);
33154 +
33155 +       /* truncate all disk clusters starting from @ridx */
33156 +       assert("edward-1174", ridx <= aidx);
33157 +       if (ridx != aidx) {
33158 +               struct cryptcompress_info * info;
33159 +               info = cryptcompress_inode_data(inode);
33160 +               result = cut_file_items(inode,
33161 +                                       clust_to_off(ridx, inode),
33162 +                                       update_sd,
33163 +                                       clust_to_off(aidx, inode),
33164 +                                       update_cryptcompress_size);
33165 +               info->trunc_index = ULONG_MAX;
33166 +               if (result)
33167 +                       goto out;
33168 +       }
33169 +       /*
33170 +        * there can be pages of fake logical clusters, truncate them
33171 +        */
33172 +       truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
33173 +       assert("edward-1524",
33174 +              pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
33175 +       /*
33176 +        * now perform partial truncate of last logical cluster
33177 +        */
33178 +       if (!off_to_cloff(new_size, inode)) {
33179 +               /* no partial truncate is needed */
33180 +               assert("edward-1145", inode->i_size == new_size);
33181 +               goto truncate_fake;
33182 +       }
33183 +       assert("edward-1146", new_size < inode->i_size);
33184 +
33185 +       /* check if the last logical cluster is fake */
33186 +       result = lookup_disk_cluster(inode, &aidx, ridx);
33187 +       if (result)
33188 +               goto out;
33189 +       if (!aidx)
33190 +               /* yup, this is fake one */
33191 +               goto truncate_fake;
33192 +
33193 +       assert("edward-1148", aidx == ridx);
33194 +
33195 +       /* do partial truncate of the last page cluster,
33196 +          and try to capture this one */
33197 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33198 +       if (result)
33199 +               goto out;
33200 +       nr_zeroes = (off_to_pgoff(new_size) ?
33201 +                    PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
33202 +       set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
33203 +       win.stat = HOLE_WINDOW;
33204 +
33205 +       assert("edward-1149", clust.index == ridx - 1);
33206 +
33207 +       result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
33208 +       if (result)
33209 +               goto out;
33210 +       assert("edward-1151",
33211 +              clust.dstat == PREP_DISK_CLUSTER ||
33212 +              clust.dstat == UNPR_DISK_CLUSTER);
33213 +
33214 +       assert("edward-1191", inode->i_size == new_size);
33215 +       assert("edward-1206", body_truncate_ok(inode, ridx));
33216 + truncate_fake:
33217 +       /* drop all the pages that don't have jnodes (i.e. pages
33218 +          which can not be truncated by cut_file_items() because
33219 +          of holes represented by fake disk clusters) including
33220 +          the pages of partially truncated cluster which was
33221 +          released by prepare_logical_cluster() */
33222 +       INODE_SET_SIZE(inode, new_size);
33223 +       truncate_inode_pages(inode->i_mapping, new_size);
33224 + out:
33225 +       assert("edward-1334", !result || result == -ENOSPC);
33226 +       assert("edward-1497",
33227 +              pages_truncate_ok(inode, size_in_pages(new_size)));
33228 +
33229 +       done_lh(lh);
33230 +       kfree(hint);
33231 +       put_cluster_handle(&clust);
33232 +       return result;
33233 +}
33234 +
33235 +/* Prepare cryptcompress file for truncate:
33236 + * prune or append rightmost fake logical clusters (if any).
33237 + * @aidx: incremented index of the rightmost disk cluster (0 if none).
33238 + * Only i_size (and cached pages on prune) changes here; the stat data
33239 + * is updated only if @update_sd is set.  Returns 0 or an error code.
33240 + */
33241 +static int start_truncate_fake(struct inode *inode, cloff_t aidx,
33242 +                              loff_t new_size, int update_sd)
33243 +{
33244 +       int result = 0;
33245 +       loff_t bytes;   /* loff_t: the size delta may not fit into int */
33246 +
33247 +       if (new_size > inode->i_size) {
33248 +               /* append */
33249 +               if (inode->i_size < clust_to_off(aidx, inode))
33250 +                       /* no fake bytes */
33251 +                       return 0;
33252 +               bytes = new_size - inode->i_size;
33253 +               INODE_SET_SIZE(inode, inode->i_size + bytes);
33254 +       } else {
33255 +               /* prune */
33256 +               if (inode->i_size <= clust_to_off(aidx, inode))
33257 +                       /* no fake bytes */
33258 +                       return 0;
33259 +               bytes = inode->i_size -
33260 +                       max(new_size, clust_to_off(aidx, inode));
33261 +               if (!bytes)
33262 +                       return 0;
33263 +               INODE_SET_SIZE(inode, inode->i_size - bytes);
33264 +               /* In the case of fake prune we need to drop page cluster.
33265 +                  A partially truncated page is either dirty, hence
33266 +                  anonymous (dirtied via mmap) and captured later via
33267 +                  ->capture(), or clean, hence filled by zeroes. Either
33268 +                  way we don't need to make it dirty and capture here.
33269 +                */
33270 +               truncate_inode_pages(inode->i_mapping, inode->i_size);
33271 +       }
33272 +       if (update_sd)
33273 +               result = update_sd_cryptcompress(inode);
33274 +       return result;
33275 +}
33276 +
33277 +/**
33278 + * Truncate (shrink or extend) a cryptcompress file to @new_size.
33279 + * Called by setattr_cryptcompress and delete_object_cryptcompress.
33280 + */
33281 +static int cryptcompress_truncate(struct inode *inode, /* old size */
33282 +                                 loff_t new_size,      /* new size */
33283 +                                 int update_sd)
33284 +{
33285 +       int ret;
33286 +       cloff_t aidx;
33287 +
33288 +       ret = find_fake_appended(inode, &aidx);
33289 +       if (ret)
33290 +               return ret;
33291 +       assert("edward-1208",
33292 +              ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
33293 +
33294 +       ret = start_truncate_fake(inode, aidx, new_size, update_sd);
33295 +       if (ret)
33296 +               return ret;
33297 +       if (inode->i_size == new_size)
33298 +               return 0;       /* nothing to truncate anymore */
33299 +       if (inode->i_size < new_size)
33300 +               ret = cryptcompress_append_hole(inode, new_size);
33301 +       else
33302 +               ret = prune_cryptcompress(inode, new_size, update_sd, aidx);
33303 +       if (ret || !update_sd)
33304 +               return ret;
33305 +       return update_sd_cryptcompress(inode);
33306 +}
33307 +
33308 +/* Capture an anonymous page cluster. (A page cluster is
33309 + * anonymous if it contains at least one anonymous page.)
33310 + */
33311 +static int capture_anon_page_cluster(struct cluster_handle * clust,
33312 +                                    struct inode * inode)
33313 +{
33314 +       int ret;
33315 +
33316 +       assert("edward-1073", clust != NULL);
33317 +       assert("edward-1074", inode != NULL);
33318 +       assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
33319 +
33320 +       ret = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
33321 +       if (ret != 0)
33322 +               return ret;
33323 +       set_cluster_pages_dirty(clust, inode);
33324 +       ret = checkin_logical_cluster(clust, inode);
33325 +       put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
33326 +       if (unlikely(ret != 0))
33327 +               put_page_cluster(clust, inode, WRITE_OP);
33328 +       return ret;
33329 +}
33330 +
33331 +/* Collect into @pages the tagged pages of one page cluster, starting the
33332 + * search at *@index. Each page found gets a reference, its tag is cleared,
33333 + * and *@index moves past it. Returns the number of pages collected. */
33334 +static int find_anon_page_cluster(struct address_space * mapping,
33335 +                                 pgoff_t * index, struct page ** pages)
33336 +{
33337 +       int nr = 0;
33338 +
33339 +       spin_lock_irq(&mapping->tree_lock);
33340 +       for (;;) {
33341 +               struct page *pg;
33342 +               /* look up a single tagged page, searching from *index */
33343 +               if (!radix_tree_gang_lookup_tag(&mapping->page_tree,
33344 +                                               (void **)&pages[nr],
33345 +                                               *index, 1,
33346 +                                               PAGECACHE_TAG_REISER4_MOVED))
33347 +                       break;
33348 +               /* a page of another page cluster ends the scan */
33349 +               if (!same_page_cluster(pages[0], pages[nr]))
33350 +                       break;
33351 +
33352 +               /* found: take a reference and step past the page */
33353 +               pg = pages[nr++];
33354 +               page_cache_get(pg);
33355 +               *index = pg->index + 1;
33356 +               radix_tree_tag_clear(&mapping->page_tree, pg->index,
33357 +                                    PAGECACHE_TAG_REISER4_MOVED);
33358 +               if (last_page_in_cluster(pg))
33359 +                       break;
33360 +       }
33361 +       spin_unlock_irq(&mapping->tree_lock);
33362 +       return nr;
33363 +}
33364 +
33365 +#define MAX_PAGES_TO_CAPTURE  (1024) /* cap on to_capture chosen by writepages_cryptcompress() */
33366 +
33367 +/* Capture anonymous page clusters of @mapping, starting from *@index */
33368 +static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
33369 +                             int to_capture)
33370 +{
33371 +       int count = 0;  /* sum of clust.nr_pages over captured clusters */
33372 +       int found = 0;
33373 +       int result = 0;
33374 +       hint_t *hint;
33375 +       lock_handle *lh;
33376 +       struct inode * inode;
33377 +       struct cluster_handle clust;
33378 +       struct page * pages[MAX_CLUSTER_NRPAGES];
33379 +
33380 +       assert("edward-1127", mapping != NULL);
33381 +       assert("edward-1128", mapping->host != NULL);
33382 +       assert("edward-1440", mapping->host->i_mapping == mapping);
33383 +
33384 +       inode = mapping->host;
33385 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33386 +       if (hint == NULL)
33387 +               return RETERR(-ENOMEM);
33388 +       hint_init_zero(hint);
33389 +       lh = &hint->lh;
33390 +
33391 +       cluster_init_read(&clust, NULL);
33392 +       clust.hint = hint;
33393 +
33394 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
33395 +       if (result)
33396 +               goto out;
33397 +
33398 +       while (to_capture > 0) {
33399 +               found = find_anon_page_cluster(mapping, index, pages);
33400 +               if (!found) {
33401 +                       *index = (pgoff_t) - 1; /* no tagged page found: push *index to the maximum */
33402 +                       break;
33403 +               }
33404 +               move_cluster_forward(&clust, inode, pages[0]->index);
33405 +               result = capture_anon_page_cluster(&clust, inode);
33406 +
33407 +               put_found_pages(pages, found); /* drop the references taken by find_anon_page_cluster */
33408 +               if (result)
33409 +                       break;
33410 +               to_capture -= clust.nr_pages;
33411 +               count += clust.nr_pages;
33412 +       }
33413 +       if (result) {
33414 +               warning("edward-1077",
33415 +                       "Capture failed (inode %llu, result=%i, captured=%d)\n",
33416 +                       (unsigned long long)get_inode_oid(inode), result, count);
33417 +       } else {
33418 +               assert("edward-1078", ergo(found > 0, count > 0));
33419 +               if (to_capture <= 0)
33420 +                       /* the budget is used up, so more tagged pages may be left */
33421 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
33422 +               result = count; /* success: return the number of captured pages */
33423 +       }
33424 +      out:
33425 +       done_lh(lh);
33426 +       kfree(hint);
33427 +       put_cluster_handle(&clust);
33428 +       return result;
33429 +}
33430 +
33431 +/* True if @inode's mapping has dirty pages that belong to no atom */
33432 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
33433 +{
33434 +       struct address_space *mapping = inode->i_mapping;
33435 +       int tagged;
33436 +       spin_lock_irq(&mapping->tree_lock);
33437 +       tagged = radix_tree_tagged(&mapping->page_tree,
33438 +                                  PAGECACHE_TAG_REISER4_MOVED);
33439 +       spin_unlock_irq(&mapping->tree_lock);
33440 +       return tagged;
33441 +}
33442 +
33443 +/* plugin->writepages: capture anonymous pages of @mapping, commit for WB_SYNC_ALL */
33444 +int writepages_cryptcompress(struct address_space *mapping,
33445 +                            struct writeback_control *wbc)
33446 +{
33447 +       int result = 0;
33448 +       long to_capture;        /* page budget for one capture_anon_pages() call */
33449 +       pgoff_t nrpages;        /* file size in pages, sampled once before the loop */
33450 +       pgoff_t index = 0;      /* scan position, advanced by capture_anon_pages() */
33451 +       struct inode *inode;
33452 +       struct cryptcompress_info *info;
33453 +
33454 +       inode = mapping->host;
33455 +       if (!cryptcompress_inode_has_anon_pages(inode))
33456 +               goto end;
33457 +       info = cryptcompress_inode_data(inode);
33458 +       nrpages = size_in_pages(i_size_read(inode));
33459 +
33460 +       if (wbc->sync_mode != WB_SYNC_ALL)
33461 +               to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
33462 +       else
33463 +               to_capture = MAX_PAGES_TO_CAPTURE;
33464 +       do {
33465 +               reiser4_context *ctx;
33466 +
33467 +               ctx = reiser4_init_context(inode->i_sb);
33468 +               if (IS_ERR(ctx)) {
33469 +                       result = PTR_ERR(ctx);
33470 +                       break;
33471 +               }
33472 +               /* avoid recursive calls to ->sync_inodes */
33473 +               ctx->nobalance = 1;
33474 +
33475 +               assert("edward-1079",
33476 +                      lock_stack_isclean(get_current_lock_stack()));
33477 +
33478 +               reiser4_txn_restart_current();
33479 +
33480 +               if (get_current_context()->entd) {
33481 +                       if (mutex_trylock(&info->checkin_mutex) == 0) {
33482 +                               /* the mutex might be occupied by
33483 +                                  entd caller */
33484 +                               result = RETERR(-EBUSY);
33485 +                               reiser4_exit_context(ctx);
33486 +                               break;
33487 +                       }
33488 +               } else
33489 +                       mutex_lock(&info->checkin_mutex);
33490 +
33491 +               result = capture_anon_pages(inode->i_mapping, &index,
33492 +                                           to_capture);
33493 +               mutex_unlock(&info->checkin_mutex);
33494 +
33495 +               if (result < 0) {
33496 +                       reiser4_exit_context(ctx);
33497 +                       break;
33498 +               }
33499 +               wbc->nr_to_write -= result;     /* result is the number of pages captured */
33500 +               if (wbc->sync_mode != WB_SYNC_ALL) {
33501 +                       reiser4_exit_context(ctx);
33502 +                       break; /* NOTE(review): result still holds the (possibly positive) capture count here and is returned as is - confirm callers accept that */
33503 +               }
33504 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
33505 +               reiser4_exit_context(ctx);
33506 +       } while (result >= 0 && index < nrpages);
33507 +
33508 + end:
33509 +       if (is_in_reiser4_context()) {
33510 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
33511 +                       /* there are already pages to flush, flush them out,
33512 +                          do not delay until end of reiser4_sync_inodes */
33513 +                       reiser4_writeout(inode->i_sb, wbc);
33514 +                       get_current_context()->nr_captured = 0;
33515 +               }
33516 +       }
33517 +       return result;
33518 +}
33519 +
33520 +/* plugin->ioctl: no command is handled, every call returns RETERR(-ENOSYS) */
33521 +int ioctl_cryptcompress(struct inode *inode, struct file *filp,
33522 +                       unsigned int cmd, unsigned long arg)
33523 +{
33524 +       return RETERR(-ENOSYS);
33525 +}
33526 +
33527 +/* plugin->mmap: reserve space for the atime update, then map generically */
33528 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
33529 +{
33530 +       int ret;
33531 +       reiser4_context *ctx;
33532 +       struct inode *inode = file->f_dentry->d_inode;
33533 +
33534 +       ctx = reiser4_init_context(inode->i_sb);
33535 +       if (IS_ERR(ctx))
33536 +               return PTR_ERR(ctx);
33537 +
33538 +       /*
33539 +        * generic_file_mmap() does update_atime, which needs a stat data
33540 +        * update: grab space for that update in advance.
33541 +        */
33542 +       ret = reiser4_grab_space_force
33543 +               (inode_file_plugin(inode)->estimate.update(inode),
33544 +                BA_CAN_COMMIT);
33545 +       if (ret == 0)
33546 +               /* space is grabbed: hand over to the generic mmap code */
33547 +               ret = generic_file_mmap(file, vma);
33548 +
33549 +       /* the context is left on the success and the failure path alike */
33550 +       reiser4_exit_context(ctx);
33551 +       return ret;
33552 +}
33553 +
33554 +/* plugin->delete_object: truncate the body to 0, then remove stat data */
33555 +int delete_object_cryptcompress(struct inode *inode)
33556 +{
33557 +       int result;
33558 +       struct cryptcompress_info * info;
33559 +
33560 +       assert("edward-429", inode->i_nlink == 0);
33561 +
33562 +       reiser4_txn_restart_current();
33563 +       info = cryptcompress_inode_data(inode);
33564 +
33565 +       mutex_lock(&info->checkin_mutex);
33566 +       result = cryptcompress_truncate(inode, 0, 0);
33567 +       mutex_unlock(&info->checkin_mutex);
33568 +
33569 +       if (result) {
33570 +               warning("edward-430",
33571 +                       "cannot truncate cryptcompress file  %lli: %i",
33572 +                       (unsigned long long)get_inode_oid(inode),
33573 +                       result);
33574 +       }
33575 +       truncate_inode_pages(inode->i_mapping, 0);
33576 +       assert("edward-1487", pages_truncate_ok(inode, 0));
33577 +       /* a failed truncate is only logged above; remove stat data anyway */
33578 +       return reiser4_delete_object_common(inode);
33579 +}
33580 +
33581 +/*
33582 + * plugin->setattr: with ATTR_SIZE only the size is handled, which is the
33583 + * actual truncate (see reiser4/page_cache.c); else reiser4_setattr_common()
33584 + */
33585 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
33586 +{
33587 +       int result;
33588 +       struct inode *inode;
33589 +       struct cryptcompress_info * info;
33590 +
33591 +       inode = dentry->d_inode;
33592 +       info = cryptcompress_inode_data(inode);
33593 +
33594 +       if (attr->ia_valid & ATTR_SIZE) {
33595 +               if (i_size_read(inode) != attr->ia_size) {
33596 +                       reiser4_context *ctx;
33597 +                       loff_t old_size;
33598 +
33599 +                       ctx = reiser4_init_context(dentry->d_inode->i_sb);
33600 +                       if (IS_ERR(ctx))
33601 +                               return PTR_ERR(ctx);
33602 +                       result = setattr_pschedule_hook(inode);
33603 +                       if (result) {
33604 +                               context_set_commit_async(ctx);
33605 +                               reiser4_exit_context(ctx);
33606 +                               return result;
33607 +                       }
33608 +                       old_size = i_size_read(inode);
33609 +                       inode_check_scale(inode, old_size, attr->ia_size);
33610 +                       /* checkin_mutex serializes checkin_logical_cluster */
33611 +                       mutex_lock(&info->checkin_mutex);
33612 +                       result = cryptcompress_truncate(inode,
33613 +                                                       attr->ia_size,
33614 +                                                       1/* update sd */);
33615 +                       mutex_unlock(&info->checkin_mutex);
33616 +                       if (result) {
33617 +                            warning("edward-1192",
33618 +                                    "truncate_cryptcompress failed: oid %lli, "
33619 +                                    "old size %lld, new size %lld, retval %d",
33620 +                                    (unsigned long long)
33621 +                                    get_inode_oid(inode), old_size,
33622 +                                    attr->ia_size, result);
33623 +                       }
33624 +                       context_set_commit_async(ctx);
33625 +                       reiser4_exit_context(ctx);
33626 +               } else /* size is unchanged: nothing to do */
33627 +                       result = 0;
33628 +       } else
33629 +               result = reiser4_setattr_common(dentry, attr);
33630 +       return result;
33631 +}
33632 +
33633 +/* plugin->release: free the file's fsdata within a reiser4 context */
33634 +int release_cryptcompress(struct inode *inode, struct file *file)
33635 +{
33636 +       reiser4_context *ctx = reiser4_init_context(inode->i_sb);
33637 +
33638 +       if (IS_ERR(ctx))
33639 +               return PTR_ERR(ctx);
33640 +       reiser4_free_file_fsdata(file);
33641 +       reiser4_exit_context(ctx);
33642 +       return 0;
33643 +}
33644 +
33645 +/* plugin->prepare_write: not supported, always returns -EINVAL */
33646 +int prepare_write_cryptcompress(struct file *file, struct page *page,
33647 +                               unsigned from, unsigned to)
33648 +{
33649 +       return -EINVAL;
33650 +}
33651 +
33652 +/* plugin->commit_write: unconditionally hits BUG() when called */
33653 +int commit_write_cryptcompress(struct file *file, struct page *page,
33654 +                              unsigned from, unsigned to)
33655 +{
33656 +       BUG();
33657 +       return 0;
33658 +}
33659 +
33660 +/* plugin->bmap: no block mapping is provided; 0 means "not mapped" */
33661 +sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
33662 +{
33663 +       return 0;
33664 +}
33665 +
33666 +/*
33667 +  Local variables:
33668 +  c-indentation-style: "K&R"
33669 +  mode-name: "LC"
33670 +  c-basic-offset: 8
33671 +  tab-width: 8
33672 +  fill-column: 80
33673 +  scroll-step: 1
33674 +  End:
33675 +*/
33676 diff -puN /dev/null fs/reiser4/plugin/file/cryptcompress.h
33677 --- /dev/null
33678 +++ a/fs/reiser4/plugin/file/cryptcompress.h
33679 @@ -0,0 +1,616 @@
33680 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
33681 +/* See http://www.namesys.com/cryptcompress_design.html */
33682 +
33683 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
33684 +#define __FS_REISER4_CRYPTCOMPRESS_H__
33685 +
33686 +#include "../../page_cache.h"
33687 +#include "../compress/compress.h"
33688 +#include "../crypto/cipher.h"
33689 +
33690 +#include <linux/pagemap.h>
33691 +
33692 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
33693 +#define MAX_CLUSTER_SHIFT 16
33694 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
33695 +#define DC_CHECKSUM_SIZE 4
33696 +
33697 +#define MIN_LATTICE_FACTOR 1
33698 +#define MAX_LATTICE_FACTOR 32
33699 +
33700 +/* this mask contains all non-standard plugins that might
33701 +   be present in reiser4-specific part of inode managed by
33702 +   cryptcompress file plugin */
33703 +#define cryptcompress_mask                             \
33704 +       ((1 << PSET_FILE) |                             \
33705 +        (1 << PSET_CLUSTER) |                          \
33706 +        (1 << PSET_CIPHER) |                           \
33707 +        (1 << PSET_DIGEST) |                           \
33708 +        (1 << PSET_COMPRESSION) |                      \
33709 +        (1 << PSET_COMPRESSION_MODE))
33710 +
33711 +#if REISER4_DEBUG
33712 +static inline int cluster_shift_ok(int shift)
33713 +{
33714 +       return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
33715 +}
33716 +#endif
33717 +
33718 +#if REISER4_DEBUG
33719 +#define INODE_PGCOUNT(inode)                                           \
33720 +({                                                                     \
33721 +       assert("edward-1530", inode_file_plugin(inode) ==               \
33722 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));        \
33723 +       atomic_read(&cryptcompress_inode_data(inode)->pgcount);         \
33724 + })
33725 +#define INODE_PGCOUNT_INC(inode)                                       \
33726 +do {                                                                   \
33727 +       assert("edward-1531", inode_file_plugin(inode) ==               \
33728 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));        \
33729 +       atomic_inc(&cryptcompress_inode_data(inode)->pgcount);          \
33730 +} while (0)
33731 +#define INODE_PGCOUNT_DEC(inode)                                       \
33732 +do {                                                                   \
33733 +       if (inode_file_plugin(inode) ==                                 \
33734 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))            \
33735 +               atomic_dec(&cryptcompress_inode_data(inode)->pgcount);  \
33736 +} while (0)
33737 +#else
33738 +#define INODE_PGCOUNT(inode) (0)
33739 +#define INODE_PGCOUNT_INC(inode)
33740 +#define INODE_PGCOUNT_DEC(inode)
33741 +#endif /* REISER4_DEBUG */
33742 +
33743 +struct tfm_stream {
33744 +       __u8 *data;
33745 +       size_t size;
33746 +};
33747 +
33748 +typedef enum {
33749 +       INPUT_STREAM,
33750 +       OUTPUT_STREAM,
33751 +       LAST_STREAM
33752 +} tfm_stream_id;
33753 +
33754 +typedef struct tfm_stream * tfm_unit[LAST_STREAM];
33755 +
33756 +static inline __u8 *ts_data(struct tfm_stream * stm)
33757 +{
33758 +       assert("edward-928", stm != NULL);
33759 +       return stm->data;
33760 +}
33761 +
33762 +static inline size_t ts_size(struct tfm_stream * stm)
33763 +{
33764 +       assert("edward-929", stm != NULL);
33765 +       return stm->size;
33766 +}
33767 +
33768 +static inline void set_ts_size(struct tfm_stream * stm, size_t size)
33769 +{
33770 +       assert("edward-930", stm != NULL);
33771 +
33772 +       stm->size = size;
33773 +}
33774 +
33775 +static inline int alloc_ts(struct tfm_stream ** stm)
33776 +{
33777 +       assert("edward-931", stm);
33778 +       assert("edward-932", *stm == NULL);
33779 +
33780 +       *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
33781 +       if (!*stm)
33782 +               return -ENOMEM;
33783 +       return 0;
33784 +}
33785 +
33786 +static inline void free_ts(struct tfm_stream * stm)
33787 +{
33788 +       assert("edward-933", !ts_data(stm));
33789 +       assert("edward-934", !ts_size(stm));
33790 +
33791 +       kfree(stm);
33792 +}
33793 +
33794 +static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
33795 +{
33796 +       assert("edward-935", !ts_data(stm));
33797 +       assert("edward-936", !ts_size(stm));
33798 +       assert("edward-937", size != 0);
33799 +
33800 +       stm->data = reiser4_vmalloc(size);
33801 +       if (!stm->data)
33802 +               return -ENOMEM;
33803 +       set_ts_size(stm, size);
33804 +       return 0;
33805 +}
33806 +
33807 +static inline void free_ts_data(struct tfm_stream * stm)
33808 +{
33809 +       assert("edward-938", equi(ts_data(stm), ts_size(stm)));
33810 +
33811 +       if (ts_data(stm))
33812 +               vfree(ts_data(stm));
33813 +       memset(stm, 0, sizeof *stm);
33814 +}
33815 +
33816 +/* Write modes for item conversion in flush convert phase */
33817 +typedef enum {
33818 +       CRC_APPEND_ITEM = 1,
33819 +       CRC_OVERWRITE_ITEM = 2,
33820 +       CRC_CUT_ITEM = 3
33821 +} cryptcompress_write_mode_t;
33822 +
33823 +typedef enum {
33824 +       LC_INVAL  = 0,   /* invalid value */
33825 +       LC_APPOV = 1,    /* append and/or overwrite */
33826 +       LC_TRUNC = 2     /* truncate */
33827 +} logical_cluster_op;
33828 +
33829 +/* Transform cluster.
33830 + * Intermediate state between page cluster and disk cluster.
33831 + * It is used for data transform (compression/encryption).
33832 + */
33833 +struct tfm_cluster {
33834 +       coa_set coa;      /* compression algorithms info */
33835 +       tfm_unit tun;     /* plain and transformed streams */
33836 +       tfm_action act;   /* action used to index @coa (see alloc_coa) */
33837 +       int uptodate;     /* 0 or 1, see tfm_cluster_is_uptodate() */
33838 +       int lsize;        /* number of bytes in logical cluster */
33839 +       int len;          /* length of the transform stream */
33840 +};
33841 +
33842 +static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
33843 +                           tfm_action act)
33844 +{
33845 +       return tc->coa[id][act];
33846 +}
33847 +
33848 +static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
33849 +                          tfm_action act, coa_t coa)
33850 +{
33851 +       tc->coa[id][act] = coa;
33852 +}
33853 +
33854 +static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
33855 +{
33856 +       coa_t coa;
33857 +
33858 +       coa = cplug->alloc(tc->act);
33859 +       if (IS_ERR(coa))
33860 +               return PTR_ERR(coa);
33861 +       set_coa(tc, cplug->h.id, tc->act, coa);
33862 +       return 0;
33863 +}
33864 +
33865 +static inline int
33866 +grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
33867 +{
33868 +       return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
33869 +               alloc_coa(tc, cplug) : 0);
33870 +}
33871 +
33872 +static inline void free_coa_set(struct tfm_cluster * tc)
33873 +{
33874 +       tfm_action j;
33875 +       reiser4_compression_id i;
33876 +       compression_plugin *cplug;
33877 +
33878 +       assert("edward-810", tc != NULL);
33879 +       /* free every allocated (compression id, action) slot and empty it */
33880 +       for (j = 0; j < TFMA_LAST; j++)
33881 +               for (i = 0; i < LAST_COMPRESSION_ID; i++) {
33882 +                       if (!get_coa(tc, i, j))
33883 +                               continue;
33884 +                       cplug = compression_plugin_by_id(i);
33885 +                       assert("edward-812", cplug->free != NULL);
33886 +                       cplug->free(get_coa(tc, i, j), j);
33887 +                       set_coa(tc, i, j, NULL);
33888 +               }
33889 +       return;
33890 +}
33891 +
33892 +static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
33893 +                                                tfm_stream_id id)
33894 +{
33895 +       return tc->tun[id];
33896 +}
33897 +
33898 +static inline void set_tfm_stream(struct tfm_cluster * tc,
33899 +                                 tfm_stream_id id, struct tfm_stream * ts)
33900 +{
33901 +       tc->tun[id] = ts;
33902 +}
33903 +
33904 +static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
33905 +{
33906 +       return ts_data(get_tfm_stream(tc, id));
33907 +}
33908 +
33909 +static inline void set_tfm_stream_data(struct tfm_cluster * tc,
33910 +                                      tfm_stream_id id, __u8 * data)
33911 +{
33912 +       get_tfm_stream(tc, id)->data = data;
33913 +}
33914 +
33915 +static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
33916 +{
33917 +       return ts_size(get_tfm_stream(tc, id));
33918 +}
33919 +
33920 +static inline void
33921 +set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
33922 +{
33923 +       get_tfm_stream(tc, id)->size = size;
33924 +}
33925 +
33926 +static inline int
33927 +alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
33928 +{
33929 +       assert("edward-939", tc != NULL);
33930 +       assert("edward-940", !get_tfm_stream(tc, id));
33931 +
33932 +       tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
33933 +                             reiser4_ctx_gfp_mask_get());
33934 +       if (!tc->tun[id])
33935 +               return -ENOMEM;
33936 +       return alloc_ts_data(get_tfm_stream(tc, id), size);
33937 +}
33938 +
33939 +static inline int
33940 +realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
33941 +{
33942 +       assert("edward-941", tfm_stream_size(tc, id) < size);
33943 +       free_ts_data(get_tfm_stream(tc, id));
33944 +       return alloc_ts_data(get_tfm_stream(tc, id), size);
33945 +}
33946 +
33947 +static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
33948 +{
33949 +       free_ts_data(get_tfm_stream(tc, id));   /* data buffer first */
33950 +       free_ts(get_tfm_stream(tc, id));        /* then the descriptor */
33951 +       set_tfm_stream(tc, id, NULL);           /* slot holds no stream now */
33952 +}
33953 +
33954 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
33955 +{
33956 +       return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
33957 +}
33958 +
33959 +static inline void free_tfm_unit(struct tfm_cluster * tc)
33960 +{
33961 +       tfm_stream_id id;
33962 +       for (id = 0; id < LAST_STREAM; id++) {
33963 +               if (!get_tfm_stream(tc, id))
33964 +                       continue;
33965 +               free_tfm_stream(tc, id);
33966 +       }
33967 +}
33968 +
33969 +static inline void put_tfm_cluster(struct tfm_cluster * tc)
33970 +{
33971 +       assert("edward-942", tc != NULL);
33972 +       free_coa_set(tc);
33973 +       free_tfm_unit(tc);
33974 +}
33975 +
33976 +static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
33977 +{
33978 +       assert("edward-943", tc != NULL);
33979 +       assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
33980 +       return (tc->uptodate == 1);
33981 +}
33982 +
33983 +static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
33984 +{
33985 +       assert("edward-945", tc != NULL);
33986 +       assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
33987 +       tc->uptodate = 1;
33988 +       return;
33989 +}
33990 +
33991 +static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
33992 +{
33993 +       assert("edward-947", tc != NULL);
33994 +       assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
33995 +       tc->uptodate = 0;
33996 +       return;
33997 +}
33998 +
33999 +static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
34000 +{
34001 +       return (get_tfm_stream(tc, id) &&
34002 +               tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
34003 +}
34004 +
34005 +static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
34006 +{
34007 +       int i;
34008 +       for (i = 0; i < LAST_STREAM; i++)
34009 +               if (!tfm_stream_is_set(tc, i))
34010 +                       return 0;
34011 +       return 1;
34012 +}
34013 +
34014 +static inline void alternate_streams(struct tfm_cluster * tc)
34015 +{
34016 +       struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
34017 +
34018 +       set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
34019 +       set_tfm_stream(tc, OUTPUT_STREAM, tmp);
34020 +}
34021 +
34022 +/* Set of states indicating the kind of data
34023 + * that will be written to the window */
34024 +typedef enum {
34025 +       DATA_WINDOW,            /* user's data */
34026 +       HOLE_WINDOW             /* zeroes (this kind of data is written
34027 +                                * if a write starts from offset > i_size) */
34028 +} window_stat;
34029 +
34030 +/* Window (of logical cluster size) discretely sliding along a file.
34031 + * It is used to locate the hole region of a logical cluster, so that
34032 + * the hole is properly represented on disk.
34033 + * A write to a cryptcompress file is split into writes to its logical
34034 + * clusters. Before writing to a logical cluster we set a window, i.e.
34035 + * calculate values of the following fields:
34036 + */
34037 +struct reiser4_slide {
34038 +       unsigned off;           /* offset to write from */
34039 +       unsigned count;         /* number of bytes to write */
34040 +       unsigned delta;         /* number of bytes to append to the hole */
34041 +       window_stat stat;       /* what kind of data will be written starting
34042 +                                  from @off */
34043 +};
34044 +
34045 +/* Possible states of a disk cluster */
34046 +typedef enum {
34047 +       INVAL_DISK_CLUSTER,     /* unknown state */
34048 +       PREP_DISK_CLUSTER,      /* disk cluster has been converted by flush
34049 +                                * at least once */
34050 +       UNPR_DISK_CLUSTER,      /* disk cluster was just created and should be
34051 +                                * converted by flush */
34052 +       FAKE_DISK_CLUSTER,      /* disk cluster exists neither in memory
34053 +                                * nor on disk */
34054 +       TRNC_DISK_CLUSTER       /* disk cluster is partially truncated */
34055 +} disk_cluster_stat;
34056 +
34057 +/* The following structure represents various stages of the same logical
34058 + * cluster of index @index:
34059 + * . fixed slide
34060 + * . page cluster         (stage in primary cache)
34061 + * . transform cluster    (transition stage)
34062 + * . disk cluster         (stage in secondary cache)
34063 + * This structure is used in transition and synchronizing operations, e.g.
34064 + * transform cluster is a transition state when synchronizing page cluster
34065 + * and disk cluster.
34066 + * FIXME: Encapsulate page cluster, disk cluster.
34067 + */
34068 +struct cluster_handle {
34069 +       cloff_t index;           /* offset in a file (unit is a cluster size) */
34070 +       int index_valid;         /* for validating the index above, if needed */
34071 +       struct file *file;       /* host file */
34072 +
34073 +       /* logical cluster */
34074 +       struct reiser4_slide *win; /* sliding window to locate holes */
34075 +       logical_cluster_op op;   /* logical cluster operation (truncate or
34076 +                                   append/overwrite) */
34077 +       /* transform cluster */
34078 +       struct tfm_cluster tc;   /* contains all needed info to synchronize
34079 +                                   page cluster and disk cluster */
34080 +       /* page cluster */
34081 +       int nr_pages;            /* number of pages of current checkin action */
34082 +       int old_nrpages;         /* number of pages of last checkin action */
34083 +       struct page **pages;     /* attached pages */
34084 +       jnode * node;            /* jnode for capture */
34085 +
34086 +       /* disk cluster */
34087 +       hint_t *hint;            /* current position in the tree */
34088 +       disk_cluster_stat dstat; /* state of the current disk cluster */
34089 +       int reserved;            /* is space for disk cluster reserved */
34090 +#if REISER4_DEBUG
34091 +       reiser4_context *ctx;
34092 +       int reserved_prepped;
34093 +       int reserved_unprepped;
34094 +#endif
34095 +
34096 +};
34097 +
34098 +static inline __u8 * tfm_input_data (struct cluster_handle * clust)
34099 +{
34100 +       return tfm_stream_data(&clust->tc, INPUT_STREAM);
34101 +}
34102 +
34103 +static inline __u8 * tfm_output_data (struct cluster_handle * clust)
34104 +{
34105 +       return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
34106 +}
34107 +
34108 +static inline int reset_cluster_pgset(struct cluster_handle * clust,
34109 +                                     int nrpages)
34110 +{
34111 +       assert("edward-1057", clust->pages != NULL);
34112 +       memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
34113 +       return 0;
34114 +}
34115 +
34116 +static inline int alloc_cluster_pgset(struct cluster_handle * clust,
34117 +                                     int nrpages)
34118 +{
34119 +       assert("edward-949", clust != NULL);
34120 +       assert("edward-1362", clust->pages == NULL);
34121 +       assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
34122 +       /* kcalloc: zeroed array, count * element size checked for overflow */
34123 +       clust->pages = kcalloc(nrpages, sizeof(*clust->pages),
34124 +                              reiser4_ctx_gfp_mask_get());
34125 +       if (!clust->pages)
34126 +               return RETERR(-ENOMEM);
34127 +       return 0;
34128 +}
34129 +
34130 +static inline void move_cluster_pgset(struct cluster_handle *clust,
34131 +                                     struct page ***pages, int * nr_pages)
34132 +{
34133 +       assert("edward-1545", clust != NULL && clust->pages != NULL);
34134 +       assert("edward-1546", pages != NULL && *pages == NULL);
34135 +       *pages = clust->pages;
34136 +       *nr_pages = clust->nr_pages;
34137 +       clust->pages = NULL;
34138 +}
34139 +
34140 +static inline void free_cluster_pgset(struct cluster_handle * clust)
34141 +{
34142 +       assert("edward-951", clust->pages != NULL);
34143 +       kfree(clust->pages);
34144 +       clust->pages = NULL;
34145 +}
34146 +
34147 +static inline void put_cluster_handle(struct cluster_handle * clust)
34148 +{
34149 +       assert("edward-435", clust != NULL);
34150 +
34151 +       put_tfm_cluster(&clust->tc);
34152 +       if (clust->pages)
34153 +               free_cluster_pgset(clust);
34154 +       memset(clust, 0, sizeof *clust);
34155 +}
34156 +
34157 +static inline void inc_keyload_count(struct reiser4_crypto_info * data)
34158 +{
34159 +       assert("edward-1410", data != NULL);
34160 +       data->keyload_count++;
34161 +}
34162 +
34163 +static inline void dec_keyload_count(struct reiser4_crypto_info * data)
34164 +{
34165 +       assert("edward-1411", data != NULL);
34166 +       assert("edward-1412", data->keyload_count > 0);
34167 +       data->keyload_count--;
34168 +}
34169 +
34170 +static inline int capture_cluster_jnode(jnode * node)
34171 +{
34172 +       return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
34173 +}
34174 +
34175 +/* cryptcompress specific part of reiser4_inode */
34176 +struct cryptcompress_info {
34177 +       struct mutex checkin_mutex;  /* This is to serialize
34178 +                                     * checkin_logical_cluster operations */
34179 +       cloff_t trunc_index;         /* Index of the leftmost truncated disk
34180 +                                     * cluster (to resolve races with read) */
34181 +       struct reiser4_crypto_info *crypt;
34182 +       /*
34183 +        * the following 2 fields are controlled by compression mode plugin
34184 +        */
34185 +       int compress_toggle;          /* Current status of compressibility */
34186 +       int lattice_factor;           /* Factor of dynamic lattice. FIXME: Have
34187 +                                      * a compression_toggle to keep the factor
34188 +                                      */
34189 +#if REISER4_DEBUG
34190 +       atomic_t pgcount;             /* number of grabbed pages */
34191 +#endif
34192 +};
34193 +
34194 +static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
34195 +{
34196 +       info->compress_toggle = val;
34197 +}
34198 +
34199 +static inline int get_compression_toggle (struct cryptcompress_info * info)
34200 +{
34201 +       return info->compress_toggle;
34202 +}
34203 +
34204 +static inline int compression_is_on(struct cryptcompress_info * info)
34205 +{
34206 +       return get_compression_toggle(info) == 1;
34207 +}
34208 +
34209 +static inline void turn_on_compression(struct cryptcompress_info * info)
34210 +{
34211 +       set_compression_toggle(info, 1);
34212 +}
34213 +
34214 +static inline void turn_off_compression(struct cryptcompress_info * info)
34215 +{
34216 +       set_compression_toggle(info, 0);
34217 +}
34218 +
34219 +static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
34220 +{
34221 +       info->lattice_factor = val;
34222 +}
34223 +
34224 +static inline int get_lattice_factor(struct cryptcompress_info * info)
34225 +{
34226 +       return info->lattice_factor;
34227 +}
34228 +
34229 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
34230 +int equal_to_rdk(znode *, const reiser4_key *);
34231 +int goto_right_neighbor(coord_t *, lock_handle *);
34232 +int cryptcompress_inode_ok(struct inode *inode);
34233 +int coord_is_unprepped_ctail(const coord_t * coord);
34234 +extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
34235 +                            struct page * page, znode_lock_mode mode);
34236 +extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
34237 +                                         struct inode * inode);
34238 +extern int readpages_cryptcompress(struct file*, struct address_space*,
34239 +                                  struct list_head*, unsigned);
34240 +int bind_cryptcompress(struct inode *child, struct inode *parent);
34241 +void destroy_inode_cryptcompress(struct inode * inode);
34242 +int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
34243 +                     rw_op rw);
34244 +int write_pschedule_hook(struct file *file, struct inode * inode,
34245 +                        loff_t pos, struct cluster_handle * clust,
34246 +                        struct psched_context * cont);
34247 +int setattr_pschedule_hook(struct inode * inode);
34248 +struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
34249 +void inherit_crypto_info_common(struct inode * parent, struct inode * object,
34250 +                               int (*can_inherit)(struct inode * child,
34251 +                                                  struct inode * parent));
34252 +void reiser4_attach_crypto_info(struct inode * inode,
34253 +                               struct reiser4_crypto_info * info);
34254 +void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
34255 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
34256 +
34257 +static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
34258 +{
34259 +       return info->cipher;
34260 +}
34261 +
34262 +static inline void info_set_cipher(struct reiser4_crypto_info * info,
34263 +                                  struct crypto_blkcipher * tfm)
34264 +{
34265 +       info->cipher = tfm;
34266 +}
34267 +
34268 +static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
34269 +{
34270 +       return info->digest;
34271 +}
34272 +
34273 +static inline void info_set_digest(struct reiser4_crypto_info * info,
34274 +                                  struct crypto_hash * tfm)
34275 +{
34276 +       info->digest = tfm;
34277 +}
34278 +
34279 +static inline void put_cluster_page(struct page * page)
34280 +{
34281 +       page_cache_release(page);
34282 +}
34283 +
34284 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
34285 +
34286 +/* Make Linus happy.
34287 +   Local variables:
34288 +   c-indentation-style: "K&R"
34289 +   mode-name: "LC"
34290 +   c-basic-offset: 8
34291 +   tab-width: 8
34292 +   fill-column: 120
34293 +   scroll-step: 1
34294 +   End:
34295 +*/
34296 diff -puN /dev/null fs/reiser4/plugin/file/file.c
34297 --- /dev/null
34298 +++ a/fs/reiser4/plugin/file/file.c
34299 @@ -0,0 +1,2728 @@
34300 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
34301 + * reiser4/README */
34302 +
34303 +/*
34304 + * This file contains implementations of inode/file/address_space/file plugin
34305 + * operations specific to the "unix file plugin" (plugin id is
34306 + * UNIX_FILE_PLUGIN_ID). A "unix file" is either built of tail items only
34307 + * (FORMATTING_ID), or of extent items only (EXTENT_POINTER_ID), or is empty
34308 + * (has no items but stat data).
34309 + */
34310 +
34311 +#include "../../inode.h"
34312 +#include "../../super.h"
34313 +#include "../../tree_walk.h"
34314 +#include "../../carry.h"
34315 +#include "../../page_cache.h"
34316 +#include "../../ioctl.h"
34317 +#include "../object.h"
34318 +#include "../cluster.h"
34319 +#include "../../safe_link.h"
34320 +
34321 +#include <linux/writeback.h>
34322 +#include <linux/pagevec.h>
34323 +#include <linux/syscalls.h>
34324 +
34325 +
34326 +static int unpack(struct file *file, struct inode *inode, int forever);
34327 +static void drop_access(struct unix_file_info *);
34328 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
34329 +                        znode_lock_mode lock_mode);
34330 +
34331 +/* Get exclusive access and make sure that the file is not partially
34332 + * converted: while REISER4_PART_IN_CONV is set (another process is doing
34333 + * tail conversion), drop the access, schedule() and try again.
34334 + */
34335 +static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
34336 +                                               struct inode *inode)
34337 +{
34338 +        do {
34339 +               get_exclusive_access(uf_info);
34340 +               if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
34341 +                       break;
34342 +               drop_exclusive_access(uf_info);
34343 +               schedule();
34344 +       } while (1);
34345 +}
34346 +
34347 +/* get unix file plugin specific portion of inode */
34348 +struct unix_file_info *unix_file_inode_data(const struct inode *inode)
34349 +{
34350 +       return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
34351 +}
34352 +
34353 +/**
34354 + * equal_to_rdk - compare key and znode's right delimiting key
34355 + * @node: node whose right delimiting key to compare with @key
34356 + * @key: key to compare with @node's right delimiting key
34357 + *
34358 + * Returns true if @key is equal to right delimiting key of @node.
34359 + */
34360 +int equal_to_rdk(znode *node, const reiser4_key *key)
34361 +{
34362 +       int result;
34363 +
34364 +       read_lock_dk(znode_get_tree(node));
34365 +       result = keyeq(key, znode_get_rd_key(node));
34366 +       read_unlock_dk(znode_get_tree(node));
34367 +       return result;
34368 +}
34369 +
34370 +#if REISER4_DEBUG
34371 +
34372 +/**
34373 + * equal_to_ldk - compare key and znode's left delimiting key
34374 + * @node: node whose left delimiting key to compare with @key
34375 + * @key: key to compare with @node's left delimiting key
34376 + *
34377 + * Returns true if @key is equal to left delimiting key of @node.
34378 + */
34379 +int equal_to_ldk(znode *node, const reiser4_key *key)
34380 +{
34381 +       int result;
34382 +
34383 +       read_lock_dk(znode_get_tree(node));
34384 +       result = keyeq(key, znode_get_ld_key(node));
34385 +       read_unlock_dk(znode_get_tree(node));
34386 +       return result;
34387 +}
34388 +
34389 +/**
34390 + * check_coord - check whether coord corresponds to key
34391 + * @coord: coord to check
34392 + * @key: key @coord has to correspond to
34393 + *
34394 + * Returns true if @coord is set as if it was set as result of lookup with @key
34395 + * in coord->node.
34396 + */
34397 +static int check_coord(const coord_t *coord, const reiser4_key *key)
34398 +{
34399 +       coord_t twin;
34400 +
34401 +       node_plugin_by_node(coord->node)->lookup(coord->node, key,
34402 +                                                FIND_MAX_NOT_MORE_THAN, &twin);
34403 +       return coords_equal(coord, &twin);
34404 +}
34405 +
34406 +#endif /* REISER4_DEBUG */
34407 +
34408 +/**
34409 + * init_uf_coord - initialize extended coord
34410 + * @uf_coord: extended coord to reset
34411 + * @lh: lock handle to initialize and attach to @uf_coord
34412 + *
34413 + * Resets the coord, zeroes the extension and marks the extension as not valid.
34414 + */
34415 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
34416 +{
34417 +       coord_init_zero(&uf_coord->coord);
34418 +       coord_clear_iplug(&uf_coord->coord);
34419 +       uf_coord->lh = lh;
34420 +       init_lh(lh);
34421 +       memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
34422 +       uf_coord->valid = 0;
34423 +}
34424 +
34425 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
34426 +{
34427 +       assert("vs-1333", uf_coord->valid == 0);
34428 +
34429 +       if (coord_is_between_items(&uf_coord->coord))
34430 +               return;
34431 +
34432 +       assert("vs-1348",
34433 +              item_plugin_by_coord(&uf_coord->coord)->s.file.
34434 +              init_coord_extension);
34435 +
34436 +       item_body_by_coord(&uf_coord->coord);
34437 +       item_plugin_by_coord(&uf_coord->coord)->s.file.
34438 +           init_coord_extension(uf_coord, offset);
34439 +}
34440 +
34441 +/**
34442 + * goto_right_neighbor - lock right neighbor, drop current node lock
34443 + * @coord: coord in the locked node; re-initialized in the neighbor on success
34444 + * @lh: lock handle holding the current node; holds the neighbor on success
34445 + *
34446 + * Returns 0 on success; on error @coord and @lh are left as they were.
34447 + */
34448 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
34449 +{
34450 +       int result;
34451 +       lock_handle lh_right;
34452 +
34453 +       assert("vs-1100", znode_is_locked(coord->node));
34454 +
34455 +       init_lh(&lh_right);
34456 +       result = reiser4_get_right_neighbor(&lh_right, coord->node,
34457 +                                           znode_is_wlocked(coord->node) ?
34458 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
34459 +                                           GN_CAN_USE_UPPER_LEVELS);
34460 +       if (result) {
34461 +               done_lh(&lh_right);
34462 +               return result;
34463 +       }
34464 +
34465 +       /*
34466 +        * we hold two longterm locks on neighboring nodes. Unlock left of
34467 +        * them
34468 +        */
34469 +       done_lh(lh);
34470 +
34471 +       coord_init_first_unit_nocheck(coord, lh_right.node);
34472 +       move_lh(lh, &lh_right);
34473 +
34474 +       return 0;
34475 +
34476 +}
34477 +
34478 +/**
34479 + * set_file_state - record which kind of items the file is built of
34480 + * @uf_info: unix file info whose ->container is set if it is still unknown
34481 + * @cbk_result: result of the tree lookup; nothing is done if it is an error
34482 + * @level: level of the node the lookup stopped at (LEAF_LEVEL or TWIG_LEVEL)
34483 + *
34484 + * This is to be used by find_file_item and in find_file_state to
34485 + * determine real state of file
34486 + */
34487 +static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
34488 +                          tree_level level)
34489 +{
34490 +       if (cbk_errored(cbk_result))
34491 +               /* error happened in find_file_item */
34492 +               return;
34493 +
34494 +       assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
34495 +
34496 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34497 +               if (cbk_result == CBK_COORD_NOTFOUND)
34498 +                       uf_info->container = UF_CONTAINER_EMPTY;
34499 +               else if (level == LEAF_LEVEL)
34500 +                       uf_info->container = UF_CONTAINER_TAILS;
34501 +               else
34502 +                       uf_info->container = UF_CONTAINER_EXTENTS;
34503 +       } else {
34504 +               /*
34505 +                * file state is known, check whether it is set correctly if
34506 +                * file is not being tail converted
34507 +                */
34508 +               if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
34509 +                                           REISER4_PART_IN_CONV)) {
34510 +                       assert("vs-1162",
34511 +                              ergo(level == LEAF_LEVEL &&
34512 +                                   cbk_result == CBK_COORD_FOUND,
34513 +                                   uf_info->container == UF_CONTAINER_TAILS));
34514 +                       assert("vs-1165",
34515 +                              ergo(level == TWIG_LEVEL &&
34516 +                                   cbk_result == CBK_COORD_FOUND,
34517 +                                   uf_info->container == UF_CONTAINER_EXTENTS));
34518 +               }
34519 +       }
34520 +}
34521 +
34522 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
34523 +                         const reiser4_key *key, znode_lock_mode lock_mode,
34524 +                         struct inode *inode)
34525 +{
34526 +       return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
34527 +                                    FIND_MAX_NOT_MORE_THAN,
34528 +                                    TWIG_LEVEL, LEAF_LEVEL,
34529 +                                    (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
34530 +                                    (CBK_UNIQUE | CBK_FOR_INSERT),
34531 +                                    NULL /* ra_info */ );
34532 +}
34533 +
34534 +/**
34535 + * find_file_item - look for file item in the tree
34536 + * @hint: provides coordinate, lock handle, seal
34537 + * @key: key for search
34538 + * @lock_mode: mode of lock to put on returned node
34539 + * @inode: inode of the file whose item is looked up
34540 + *
34541 + * This finds position in the tree corresponding to @key. It first tries to use
34542 + * @hint's seal if it is set; if that fails a tree lookup is done instead.
34543 + * Returns CBK_COORD_FOUND, the result of the tree lookup, or an error.
34544 + */
34545 +int find_file_item(hint_t *hint, const reiser4_key *key,
34546 +                  znode_lock_mode lock_mode,
34547 +                  struct inode *inode)
34548 +{
34549 +       int result;
34550 +       coord_t *coord;
34551 +       lock_handle *lh;
34552 +
34553 +       assert("nikita-3030", reiser4_schedulable());
34554 +       assert("vs-1707", hint != NULL);
34555 +       assert("vs-47", inode != NULL);
34556 +
34557 +       coord = &hint->ext_coord.coord;
34558 +       lh = hint->ext_coord.lh;
34559 +       init_lh(lh);
34560 +
34561 +       result = hint_validate(hint, key, 1 /* check key */, lock_mode);
34562 +       if (!result) {
34563 +               if (coord->between == AFTER_UNIT &&
34564 +                   equal_to_rdk(coord->node, key)) {
34565 +                       result = goto_right_neighbor(coord, lh);
34566 +                       if (result == -E_NO_NEIGHBOR)
34567 +                               return RETERR(-EIO);
34568 +                       if (result)
34569 +                               return result;
34570 +                       assert("vs-1152", equal_to_ldk(coord->node, key));
34571 +                       /*
34572 +                        * we moved to different node. Invalidate coord
34573 +                        * extension, zload is necessary to init it again
34574 +                        */
34575 +                       hint->ext_coord.valid = 0;
34576 +               }
34577 +
34578 +               set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
34579 +                              znode_get_level(coord->node));
34580 +
34581 +               return CBK_COORD_FOUND;
34582 +       }
34583 +
34584 +       coord_init_zero(coord);
34585 +       result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
34586 +       set_file_state(unix_file_inode_data(inode), result,
34587 +                      znode_get_level(coord->node));
34588 +
34589 +       /* FIXME: we might already have coord extension initialized */
34590 +       hint->ext_coord.valid = 0;
34591 +       return result;
34592 +}
34593 +
34594 +/* plugin->u.file.write_flow = NULL
34595 +   plugin->u.file.read_flow = NULL */
34596 +
34597 +void hint_init_zero(hint_t * hint)
34598 +{
34599 +       memset(hint, 0, sizeof(*hint));
34600 +       init_lh(&hint->lh);
34601 +       hint->ext_coord.lh = &hint->lh;
34602 +}
34603 +
34604 +static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
34605 +{
34606 +       int result = 0;
34607 +       reiser4_key key;
34608 +       coord_t coord;
34609 +       lock_handle lh;
34610 +
34611 +       assert("vs-1628", ea_obtained(uf_info));
34612 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34613 +               key_by_inode_and_offset_common(inode, 0, &key);
34614 +               init_lh(&lh);
34615 +               result = find_file_item_nohint(&coord, &lh, &key,
34616 +                                              ZNODE_READ_LOCK, inode);
34617 +               if (!cbk_errored(result)) { /* coord may be unset on error */
34618 +                       set_file_state(uf_info, result,
34619 +                                      znode_get_level(coord.node));
34620 +                       result = 0;
34621 +               }
34622 +               done_lh(&lh);
34623 +       }
34624 +       assert("vs-1074",
34625 +              ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
34626 +       reiser4_txn_restart_current();
34627 +       return result;
34628 +}
34629 +
34630 +/**
34631 + * Estimate and reserve space needed to truncate page
34632 + * which gets partially truncated: one block for page
34633 + * itself, stat-data update (estimate_one_insert_into_item)
34634 + * and one item insertion (estimate_one_insert_into_item)
34635 + * which may happen if page corresponds to hole extent and
34636 + * unallocated one will have to be created
34637 + */
34638 +static int reserve_partial_page(reiser4_tree * tree)
34639 +{
34640 +       grab_space_enable();
34641 +       return reiser4_grab_reserved(reiser4_get_current_sb(),
34642 +                                    1 +
34643 +                                    2 * estimate_one_insert_into_item(tree),
34644 +                                    BA_CAN_COMMIT);
34645 +}
34646 +
34647 +/* estimate and reserve space needed to cut one item and update one stat data */
34648 +static int reserve_cut_iteration(reiser4_tree * tree)
34649 +{
34650 +       __u64 estimate = estimate_one_item_removal(tree)
34651 +           + estimate_one_insert_into_item(tree);
34652 +
34653 +       assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
34654 +
34655 +       grab_space_enable();
34656 +       /* We need to double our estimate now that we can delete more than one
34657 +          node. */
34658 +       return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
34659 +                                    BA_CAN_COMMIT);
34660 +}
34661 +
34662 +int reiser4_update_file_size(struct inode *inode, loff_t new_size,
34663 +                            int update_sd)
34664 +{
34665 +       int result = 0;
34666 +
34667 +       INODE_SET_SIZE(inode, new_size);
34668 +       if (update_sd) {
34669 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
34670 +               result = reiser4_update_sd(inode);
34671 +       }
34672 +       return result;
34673 +}
34674 +
34675 +/**
34676 + * Cut file items one by one starting from the last one until
34677 + * new file size (inode->i_size) is reached. Reserve space
34678 + * and update file stat data on every single cut from the tree
34679 + */
34680 +int cut_file_items(struct inode *inode, loff_t new_size,
34681 +                  int update_sd, loff_t cur_size,
34682 +                  int (*update_actor) (struct inode *, loff_t, int))
34683 +{
34684 +       reiser4_key from_key, to_key;
34685 +       reiser4_key smallest_removed;
34686 +       file_plugin *fplug = inode_file_plugin(inode);
34687 +       int result;
34688 +       int progress = 0;
34689 +
34690 +       assert("vs-1248",
34691 +              fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
34692 +              fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
34693 +
34694 +       fplug->key_by_inode(inode, new_size, &from_key);
34695 +       to_key = from_key;
34696 +       set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
34697 +       /* this loop normally runs just once */
34698 +       while (1) {
34699 +               result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
34700 +               if (result)
34701 +                       break;
34702 +
34703 +               result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
34704 +                                                &smallest_removed, inode, 1,
34705 +                                                &progress);
34706 +               if (result == -E_REPEAT) {
34707 +                       /**
34708 +                        * -E_REPEAT is a signal to interrupt a long
34709 +                        * file truncation process
34710 +                        */
34711 +                       if (progress) {
34712 +                               result = update_actor(inode,
34713 +                                             get_key_offset(&smallest_removed),
34714 +                                             update_sd);
34715 +                               if (result)
34716 +                                       break;
34717 +                       }
34718 +                       /* the below does up(sbinfo->delete_mutex).
34719 +                        * Do not get fooled */
34720 +                       reiser4_release_reserved(inode->i_sb);
34721 +                       /**
34722 +                        * reiser4_cut_tree_object() was interrupted probably
34723 +                        * because current atom requires commit, we have to
34724 +                        * release transaction handle to allow atom commit.
34725 +                        */
34726 +                       reiser4_txn_restart_current();
34727 +                       continue;
34728 +               }
34729 +               if (result
34730 +                   && !(result == CBK_COORD_NOTFOUND && new_size == 0
34731 +                        && inode->i_size == 0))
34732 +                       break;
34733 +
34734 +               set_key_offset(&smallest_removed, new_size);
34735 +               /* Final sd update after the file gets its correct size */
34736 +               result = update_actor(inode, get_key_offset(&smallest_removed),
34737 +                                     update_sd);
34738 +               break;
34739 +       }
34740 +
34741 +       /* the below does up(sbinfo->delete_mutex). Do not get fooled */
34742 +       reiser4_release_reserved(inode->i_sb);
34743 +
34744 +       return result;
34745 +}
34746 +
34747 +int find_or_create_extent(struct page *page);
34748 +
34749 +/* part of truncate_file_body: it is called when truncate is used to make file
34750 +   shorter */
34751 +static int shorten_file(struct inode *inode, loff_t new_size)
34752 +{
34753 +       int result;
34754 +       struct page *page;
34755 +       int padd_from;
34756 +       unsigned long index;
34757 +       struct unix_file_info *uf_info;
34758 +
34759 +       /*
34760 +        * all items of ordinary reiser4 file are grouped together. That is why
34761 +        * we can use reiser4_cut_tree. Plan B files (for instance) can not be
34762 +        * truncated that simply
34763 +        */
34764 +       result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
34765 +                               get_key_offset(reiser4_max_key()),
34766 +                               reiser4_update_file_size);
34767 +       if (result)
34768 +               return result;
34769 +
34770 +       uf_info = unix_file_inode_data(inode);
34771 +       assert("vs-1105", new_size == inode->i_size);
34772 +       if (new_size == 0) {
34773 +               uf_info->container = UF_CONTAINER_EMPTY;
34774 +               return 0;
34775 +       }
34776 +
34777 +       result = find_file_state(inode, uf_info);
34778 +       if (result)
34779 +               return result;
34780 +       if (uf_info->container == UF_CONTAINER_TAILS)
34781 +               /*
34782 +                * No need to worry about zeroing last page after new file
34783 +                * end
34784 +                */
34785 +               return 0;
34786 +
34787 +       padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
34788 +       if (!padd_from)
34789 +               /* file is truncated to page boundary */
34790 +               return 0;
34791 +
34792 +       result = reserve_partial_page(reiser4_tree_by_inode(inode));
34793 +       if (result) {
34794 +               reiser4_release_reserved(inode->i_sb);
34795 +               return result;
34796 +       }
34797 +
34798 +       /* last page is partially truncated - zero its content */
34799 +       index = (inode->i_size >> PAGE_CACHE_SHIFT);
34800 +       page = read_mapping_page(inode->i_mapping, index, NULL);
34801 +       if (IS_ERR(page)) {
34802 +               /*
34803 +                * the below does up(sbinfo->delete_mutex). Do not get
34804 +                * confused
34805 +                */
34806 +               reiser4_release_reserved(inode->i_sb);
34807 +               if (likely(PTR_ERR(page) == -EINVAL)) {
34808 +                       /* looks like file is built of tail items */
34809 +                       return 0;
34810 +               }
34811 +               return PTR_ERR(page);
34812 +       }
34813 +       wait_on_page_locked(page);
34814 +       if (!PageUptodate(page)) {
34815 +               page_cache_release(page);
34816 +               /*
34817 +                * the below does up(sbinfo->delete_mutex). Do not get
34818 +                * confused
34819 +                */
34820 +               reiser4_release_reserved(inode->i_sb);
34821 +               return RETERR(-EIO);
34822 +       }
34823 +
34824 +       /*
34825 +        * if page corresponds to hole extent unit - unallocated one will be
34826 +        * created here. This is not necessary
34827 +        */
34828 +       result = find_or_create_extent(page);
34829 +
34830 +       /*
34831 +        * FIXME: cut_file_items has already updated inode. Probably it would
34832 +        * be better to update it here when file is really truncated
34833 +        */
34834 +       if (result) {
34835 +               page_cache_release(page);
34836 +               /*
34837 +                * the below does up(sbinfo->delete_mutex). Do not get
34838 +                * confused
34839 +                */
34840 +               reiser4_release_reserved(inode->i_sb);
34841 +               return result;
34842 +       }
34843 +
34844 +       lock_page(page);
34845 +       assert("vs-1066", PageLocked(page));
34846 +       zero_user_segment(page, padd_from, PAGE_CACHE_SIZE);
34847 +       unlock_page(page);
34848 +       page_cache_release(page);
34849 +       /* the below does up(sbinfo->delete_mutex). Do not get confused */
34850 +       reiser4_release_reserved(inode->i_sb);
34851 +       return 0;
34852 +}
34853 +
34854 +/**
34855 + * should_have_notail - decide whether a file must be stored without tails
34856 + * @uf_info: unix file info; its ->tplug is the formatting plugin consulted
34857 + * @new_size: file size to ask the formatting plugin about
34858 + *
34859 + * Asks the formatting plugin whether a file of size @new_size goes to tail items
34860 + * (0 is returned) or to unformatted nodes (1). Returns 1 if no plugin is set.
34861 + */
34862 +static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
34863 +{
34864 +       if (!uf_info->tplug)
34865 +               return 1;
34866 +       return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
34867 +                                         new_size);
34868 +
34869 +}
34870 +
34871 +/**
34872 + * truncate_file_body - change length of file
34873 + * @inode: inode of file
34874 + * @attr: attributes to set; only @attr->ia_size (new file length) is used
34875 + *
34876 + * Adjusts items file @inode is built of to match @attr->ia_size. It may either
34877 + * cut items or add them to represent a hole at the end of file. The caller has
34878 + * to obtain exclusive access to the file.
34879 + */
34880 +static int truncate_file_body(struct inode *inode, struct iattr *attr)
34881 +{
34882 +       int result;
34883 +       loff_t new_size = attr->ia_size;
34884 +
34885 +       if (inode->i_size < new_size) {
34886 +               /* expanding truncate */
34887 +               struct unix_file_info *uf_info = unix_file_inode_data(inode);
34888 +
34889 +               result = find_file_state(inode, uf_info);
34890 +               if (result)
34891 +                       return result;
34892 +
34893 +               if (should_have_notail(uf_info, new_size)) {
34894 +                       /*
34895 +                        * file of size @new_size has to be built of
34896 +                        * extents. If it is built of tails - convert to
34897 +                        * extents
34898 +                        */
34899 +                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
34900 +                               /*
34901 +                                * if file is being converted by another process
34902 +                                * - wait until it completes
34903 +                                */
34904 +                               while (1) {
34905 +                                       if (reiser4_inode_get_flag(inode,
34906 +                                                                  REISER4_PART_IN_CONV)) {
34907 +                                               drop_exclusive_access(uf_info);
34908 +                                               schedule();
34909 +                                               get_exclusive_access(uf_info);
34910 +                                               continue;
34911 +                                       }
34912 +                                       break;
34913 +                               }
34914 +
34915 +                               if (uf_info->container ==  UF_CONTAINER_TAILS) {
34916 +                                       result = tail2extent(uf_info);
34917 +                                       if (result)
34918 +                                               return result;
34919 +                               }
34920 +                       }
34921 +                       result = reiser4_write_extent(NULL, inode, NULL,
34922 +                                                     0, &new_size);
34923 +                       if (result)
34924 +                               return result;
34925 +                       uf_info->container = UF_CONTAINER_EXTENTS;
34926 +               } else {
34927 +                       if (uf_info->container ==  UF_CONTAINER_EXTENTS) {
34928 +                               result = reiser4_write_extent(NULL, inode, NULL,
34929 +                                                             0, &new_size);
34930 +                               if (result)
34931 +                                       return result;
34932 +                       } else {
34933 +                               result = reiser4_write_tail(NULL, inode, NULL,
34934 +                                                           0, &new_size);
34935 +                               if (result)
34936 +                                       return result;
34937 +                               uf_info->container = UF_CONTAINER_TAILS;
34938 +                       }
34939 +               }
34940 +               BUG_ON(result > 0);
34941 +               result = reiser4_update_file_size(inode, new_size, 1);
34942 +               BUG_ON(result != 0);
34943 +       } else
34944 +               result = shorten_file(inode, new_size);
34945 +       return result;
34946 +}
34947 +
34948 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
34949 +
34950 +/**
34951 + * load_file_hint - copy hint from struct file to local variable
34952 + * @file: file to get hint from
34953 + * @hint: structure to fill
34954 + *
34955 + * Reiser4 specific portion of struct file may contain information (hint)
34956 + * stored on exiting from previous read or write. That information includes
34957 + * seal of znode and coord within that znode where previous read or write
34958 + * stopped. This function copies that information to @hint if it was stored or
34959 + * initializes @hint by 0s otherwise.
34960 + */
34961 +int load_file_hint(struct file *file, hint_t *hint)
34962 +{
34963 +       reiser4_file_fsdata *fsdata;
34964 +
34965 +       if (file) {
34966 +               fsdata = reiser4_get_file_fsdata(file);
34967 +               if (IS_ERR(fsdata))
34968 +                       return PTR_ERR(fsdata);
34969 +
34970 +               spin_lock_inode(file->f_dentry->d_inode);
34971 +               if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
34972 +                       *hint = fsdata->reg.hint;
34973 +                       init_lh(&hint->lh);
34974 +                       hint->ext_coord.lh = &hint->lh;
34975 +                       spin_unlock_inode(file->f_dentry->d_inode);
34976 +                       /*
34977 +                        * force re-validation of the coord on the first
34978 +                        * iteration of the read/write loop.
34979 +                        */
34980 +                       hint->ext_coord.valid = 0;
34981 +                       assert("nikita-19892", coords_equal(&hint->seal.coord1,
34982 +                                                           &hint->ext_coord.
34983 +                                                           coord));
34984 +                       return 0;
34985 +               }
34986 +               memset(&fsdata->reg.hint, 0, sizeof(hint_t));
34987 +               spin_unlock_inode(file->f_dentry->d_inode);
34988 +       }
34989 +       hint_init_zero(hint);
34990 +       return 0;
34991 +}
34992 +
34993 +/**
34994 + * save_file_hint - copy hint to reiser4 private struct file's part
34995 + * @file: file to save hint in
34996 + * @hint: hint to save
34997 + *
34998 + * Copies @hint to reiser4 private part of struct file to speed up later accesses.
34999 + * Nothing is saved if @file is NULL, @hint has no seal or fsdata is unavailable.
35000 + */
35001 +void save_file_hint(struct file *file, const hint_t *hint)
35002 +{
35003 +       reiser4_file_fsdata *fsdata;
35004 +
35005 +       assert("edward-1337", hint != NULL);
35006 +
35007 +       if (!file || !reiser4_seal_is_set(&hint->seal))
35008 +               return;
35009 +       fsdata = reiser4_get_file_fsdata(file);
35010 +       if (IS_ERR(fsdata))
35011 +               return; /* no fsdata: the hint is only an optimization */
35012 +       assert("nikita-19891",
35013 +              coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
35014 +       assert("vs-30", hint->lh.owner == NULL);
35015 +       spin_lock_inode(file->f_dentry->d_inode);
35016 +       fsdata->reg.hint = *hint;
35017 +       spin_unlock_inode(file->f_dentry->d_inode);
35018 +}
35019 +
35020 +void reiser4_unset_hint(hint_t * hint)
35021 +{
35022 +       assert("vs-1315", hint);
35023 +       hint->ext_coord.valid = 0;
35024 +       reiser4_seal_done(&hint->seal);
35025 +       done_lh(&hint->lh);
35026 +}
35027 +
35028 +/* seal @hint at its coord (which must already be set properly) for @key,
35029 +   remember the key offset and lock @mode, then release the hint's lock handle */
35030 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
35031 +                     znode_lock_mode mode)
35032 +{
35033 +       ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
35034 +       assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
35035 +
35036 +       reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
35037 +       hint->offset = get_key_offset(key);
35038 +       hint->mode = mode;
35039 +       done_lh(&hint->lh);
35040 +}
35041 +
35042 +int hint_is_set(const hint_t * hint)
35043 +{
35044 +       return reiser4_seal_is_set(&hint->seal);
35045 +}
35046 +
35047 +#if REISER4_DEBUG
35048 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
35049 +{
35050 +       return (get_key_locality(k1) == get_key_locality(k2) &&
35051 +               get_key_type(k1) == get_key_type(k2) &&
35052 +               get_key_band(k1) == get_key_band(k2) &&
35053 +               get_key_ordering(k1) == get_key_ordering(k2) &&
35054 +               get_key_objectid(k1) == get_key_objectid(k2));
35055 +}
35056 +#endif
35057 +
35058 +static int
35059 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35060 +             znode_lock_mode lock_mode)
35061 +{
35062 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
35063 +               /* hint either not set or set by different operation */
35064 +               return RETERR(-E_REPEAT);
35065 +
35066 +       assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
35067 +
35068 +       if (check_key && get_key_offset(key) != hint->offset)
35069 +               /* hint is set for different key */
35070 +               return RETERR(-E_REPEAT);
35071 +
35072 +       assert("vs-31", hint->ext_coord.lh == &hint->lh);
35073 +       return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
35074 +                                    hint->ext_coord.lh, lock_mode,
35075 +                                    ZNODE_LOCK_LOPRI);
35076 +}
35077 +
35078 +/**
35079 + * Look for place at twig level for extent corresponding to page,
35080 + * call extent's writepage method to create unallocated extent if
35081 + * it does not exist yet, initialize jnode, capture page
35082 + */
35083 +int find_or_create_extent(struct page *page)
35084 +{
35085 +       int result;
35086 +       struct inode *inode;
35087 +       int plugged_hole;
35088 +
35089 +       jnode *node;
35090 +
35091 +       assert("vs-1065", page->mapping && page->mapping->host);
35092 +       inode = page->mapping->host;
35093 +
35094 +       lock_page(page);
35095 +       node = jnode_of_page(page);
35096 +       if (IS_ERR(node)) {
35097 +               unlock_page(page);
35098 +               return PTR_ERR(node);
35099 +       }
35100 +       JF_SET(node, JNODE_WRITE_PREPARED);
35101 +       unlock_page(page);
35102 +
35103 +       if (node->blocknr == 0) {
35104 +               plugged_hole = 0;
35105 +               result = reiser4_update_extent(inode, node, page_offset(page),
35106 +                                              &plugged_hole);
35107 +               if (result) {
35108 +                       JF_CLR(node, JNODE_WRITE_PREPARED);
35109 +                       jput(node);
35110 +                       warning("edward-1549",
35111 +                               "reiser4_update_extent failed: %d", result);
35112 +                       return result;
35113 +               }
35114 +               if (plugged_hole)
35115 +                       reiser4_update_sd(inode);
35116 +       } else {
35117 +               spin_lock_jnode(node);
35118 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
35119 +               BUG_ON(result != 0);
35120 +               jnode_make_dirty_locked(node);
35121 +               spin_unlock_jnode(node);
35122 +       }
35123 +
35124 +       BUG_ON(node->atom == NULL);
35125 +       JF_CLR(node, JNODE_WRITE_PREPARED);
35126 +       /* node->tree is read below: keep our reference until that is done */
35127 +       if (get_current_context()->entd) {
35128 +               entd_context *ent = get_entd_context(node->tree->super);
35129 +
35130 +               if (ent->cur_request->page == page)
35131 +                       ent->cur_request->node = node;
35132 +       }
35133 +       jput(node);
35134 +       return 0;
35135 +}
35136 +
35137 +/**
35138 + * has_anonymous_pages - check whether inode has pages waiting for capture
35139 + * @inode: inode whose page cache is examined
35140 + *
35141 + * Returns non-zero if the page tree of @inode's mapping has at least one page
35142 + * tagged PAGECACHE_TAG_REISER4_MOVED. Only the page tree is consulted.
35143 + */
35144 +static int has_anonymous_pages(struct inode *inode)
35145 +{
35146 +       struct address_space *mapping = inode->i_mapping;
35147 +       int tagged;
35148 +
35149 +       spin_lock_irq(&mapping->tree_lock);
35150 +       tagged = radix_tree_tagged(&mapping->page_tree,
35151 +                                  PAGECACHE_TAG_REISER4_MOVED);
35152 +       spin_unlock_irq(&mapping->tree_lock);
35153 +       return tagged;
35154 +}
35155 +
35156 +/**
35157 + * capture_page_and_create_extent - reserve space and capture @page
35158 + * @page: page to be captured
35159 + *
35160 + * Grabs space for extent creation and stat data update, then calls
35161 + * find_or_create_extent(). Sets PageError and returns non-zero on failure.
35162 + */
35163 +static int capture_page_and_create_extent(struct page *page)
35164 +{
35165 +       int result;
35166 +       struct inode *inode;
35167 +
35168 +       assert("vs-1084", page->mapping && page->mapping->host);
35169 +       inode = page->mapping->host;
35170 +       assert("vs-1139",
35171 +              unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
35172 +       /* page belongs to file */
35173 +       assert("vs-1393",
35174 +              inode->i_size > page_offset(page));
35175 +
35176 +       /* page capture may require extent creation (if it does not exist yet)
35177 +          and stat data's update (number of blocks changes on extent
35178 +          creation), hence space for two insertions is grabbed */
35179 +       grab_space_enable();
35180 +       result = reiser4_grab_space(2 * estimate_one_insert_into_item
35181 +                                   (reiser4_tree_by_inode(inode)),
35182 +                                   BA_CAN_COMMIT);
35183 +       if (likely(!result))
35184 +               result = find_or_create_extent(page);
35185 +
35186 +       if (result != 0)
35187 +               SetPageError(page);
35188 +       return result;
35189 +}
35190 +
35191 +/* commit_write of struct address_space_operations for unix file plugin:
35192 +   marks @page uptodate and captures it; @from and @to are not used here */
35193 +int
35194 +commit_write_unix_file(struct file *file, struct page *page,
35195 +                      unsigned from, unsigned to)
35196 +{
35197 +       reiser4_context *ctx;
35198 +       struct inode *inode;
35199 +       int result;
35200 +
35201 +       assert("umka-3101", file != NULL);
35202 +       assert("umka-3102", page != NULL);
35203 +       assert("umka-3093", PageLocked(page));
35204 +
35205 +       SetPageUptodate(page);
35206 +
35207 +       inode = page->mapping->host;
35208 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
35209 +       if (IS_ERR(ctx))
35210 +               return PTR_ERR(ctx);
35211 +       page_cache_get(page); /* hold a reference while page is unlocked */
35212 +       unlock_page(page);
35213 +       result = capture_page_and_create_extent(page);
35214 +       lock_page(page); /* page is returned to the caller locked again */
35215 +       page_cache_release(page);
35216 +
35217 +       /* don't commit transaction under inode semaphore */
35218 +       context_set_commit_async(ctx);
35219 +       reiser4_exit_context(ctx);
35220 +       return result;
35221 +}
35222 +
35223 +/*
35224 + * Support for "anonymous" pages and jnodes.
35225 + *
35226 + * When a file is write-accessed through mmap, pages can be dirtied from user
35227 + * level. The kernel is then not notified until one of the following happens:
35228 + *
35229 + *     (1) msync()
35230 + *
35231 + *     (2) truncate() (either explicit or through unlink)
35232 + *
35233 + *     (3) VM scanner starts reclaiming mapped pages, dirtying them before
35234 + *     starting write-back.
35235 + *
35236 + * As a result of (3) ->writepage may be called on a dirty page without
35237 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
35238 + * (iozone) generate huge number of anonymous pages. Emergency flush handles
35239 + * this situation by creating jnode for anonymous page, starting IO on the
35240 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
35241 + * memory. Such jnode is also called anonymous.
35242 + *
35243 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
35244 + * tree. This is done by capture_anonymous_*() functions below.
35245 + */
35246 +
35247 +/**
35248 + * capture_anonymous_page - try to capture one page
35249 + * @page: page to capture
35250 + *
35251 + * Returns 0 without doing anything if @page is under writeback, 1 if
35252 + * capture_page_and_create_extent() succeeded, or its error code (logged).
35253 + */
35254 +static int capture_anonymous_page(struct page *page)
35255 +{
35256 +       int err;
35257 +
35258 +       if (PageWriteback(page))
35259 +               /* FIXME: do nothing? */
35260 +               return 0;
35261 +
35262 +       err = capture_page_and_create_extent(page);
35263 +       if (err != 0) {
35264 +               warning("nikita-3329",
35265 +                               "Cannot capture anon page: %i", err);
35266 +               return err;
35267 +       }
35268 +
35269 +       return 1;
35270 +}
35271 +
35272 +/**
35273 + * capture_anonymous_pages - find and capture pages dirtied via mmap
35274 + * @mapping: address space where to look for pages
35275 + * @index: in: start index; out: where to resume, (pgoff_t)-1 if none found
35276 + * @to_capture: maximum number of pages to capture (capped by pagevec size)
35277 + *
35278 + * Looks up at most one pagevec of tagged pages starting from *@index, clears
35279 + * their tag and tries to capture each. Returns the number of captured pages,
35280 + * or the negative result of a failed capture (uncaptured pages are retagged).
35281 + */
35282 +static int
35283 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
35284 +                       unsigned int to_capture)
35285 +{
35286 +       int result;
35287 +       struct pagevec pvec;
35288 +       unsigned int i, count;
35289 +       int nr;
35290 +
35291 +       pagevec_init(&pvec, 0);
35292 +       count = min(pagevec_space(&pvec), to_capture);
35293 +       nr = 0;
35294 +
35295 +       /* find pages tagged MOVED */
35296 +       spin_lock_irq(&mapping->tree_lock);
35297 +       pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
35298 +                                            (void **)pvec.pages, *index, count,
35299 +                                            PAGECACHE_TAG_REISER4_MOVED);
35300 +       if (pagevec_count(&pvec) == 0) {
35301 +               /*
35302 +                * there are no pages tagged MOVED in mapping->page_tree
35303 +                * starting from *index
35304 +                */
35305 +               spin_unlock_irq(&mapping->tree_lock);
35306 +               *index = (pgoff_t)-1;
35307 +               return 0;
35308 +       }
35309 +
35310 +       /* take a reference on each found page and clear its MOVED tag */
35311 +       for (i = 0; i < pagevec_count(&pvec); i++) {
35312 +               page_cache_get(pvec.pages[i]);
35313 +               radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
35314 +                                    PAGECACHE_TAG_REISER4_MOVED);
35315 +       }
35316 +       spin_unlock_irq(&mapping->tree_lock);
35317 +
35318 +       /* by default resume right after the last page found */
35319 +       *index = pvec.pages[i - 1]->index + 1;
35320 +
35321 +       for (i = 0; i < pagevec_count(&pvec); i++) {
35322 +               /*
35323 +                * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
35324 +                * reiser4_set_page_dirty_internal which is called when jnode is
35325 +                * captured
35326 +                */
35327 +               result = capture_anonymous_page(pvec.pages[i]);
35328 +               if (result == 1)
35329 +                       nr++;
35330 +               else {
35331 +                       if (result < 0) {
35332 +                               warning("vs-1454",
35333 +                                       "failed to capture page: "
35334 +                                       "result=%d, captured=%d)\n",
35335 +                                       result, i);
35336 +
35337 +                               /*
35338 +                                * set MOVED tag back on the failed page and
35339 +                                * on all pages after it
35340 +                                */
35341 +                               spin_lock_irq(&mapping->tree_lock);
35342 +                               for (; i < pagevec_count(&pvec); i ++) {
35343 +                                       radix_tree_tag_set(&mapping->page_tree,
35344 +                                                          pvec.pages[i]->index,
35345 +                                                          PAGECACHE_TAG_REISER4_MOVED);
35346 +                               }
35347 +                               spin_unlock_irq(&mapping->tree_lock);
35348 +
35349 +                               pagevec_release(&pvec);
35350 +                               return result;
35351 +                       } else {
35352 +                               /*
35353 +                                * result == 0. capture_anonymous_page returns
35354 +                                * 0 for Writeback-ed page. Set MOVED tag on
35355 +                                * that page and rewind *index
35356 +                                */
35357 +                               spin_lock_irq(&mapping->tree_lock);
35358 +                               radix_tree_tag_set(&mapping->page_tree,
35359 +                                                  pvec.pages[i]->index,
35360 +                                                  PAGECACHE_TAG_REISER4_MOVED);
35361 +                               spin_unlock_irq(&mapping->tree_lock);
35362 +                               if (i == 0)
35363 +                                       *index = pvec.pages[0]->index;
35364 +                               else
35365 +                                       *index = pvec.pages[i - 1]->index + 1;
35366 +                       }
35367 +               }
35368 +       }
35369 +       pagevec_release(&pvec);
35370 +       return nr;
35371 +}
35372 +
35373 +/**
35374 + * capture_anonymous_jnodes - placeholder for capturing anonymous jnodes
35375 + * @mapping: address space (unused)
35376 + * @from: start index; set to @to on return
35377 + * @to: end index
35378 + * @to_capture: maximum number of jnodes to capture (unused)
35379 + *
35380 + * This implementation captures nothing: it only advances *@from to @to and
35381 + * returns 0, so callers see the whole range as processed with no jnodes
35382 + * captured.
35383 + */
35384 +static int
35385 +capture_anonymous_jnodes(struct address_space *mapping,
35386 +                        pgoff_t *from, pgoff_t to, int to_capture)
35387 +{
35388 +       *from = to;
35389 +       return 0;
35390 +}
35391 +
35392 +/*
35393 + * Commit atom of the jnode of @page; repeat while sync returns -E_REPEAT.
35394 + */
35395 +static int sync_page(struct page *page)
35396 +{
35397 +       int result;
35398 +       do {
35399 +               jnode *node;
35400 +               txn_atom *atom;
35401 +
35402 +               lock_page(page);
35403 +               node = jprivate(page);
35404 +               if (node != NULL) {
35405 +                       spin_lock_jnode(node);
35406 +                       atom = jnode_get_atom(node);
35407 +                       spin_unlock_jnode(node);
35408 +               } else
35409 +                       atom = NULL; /* page has no jnode */
35410 +               unlock_page(page);
35411 +               result = reiser4_sync_atom(atom);
35412 +       } while (result == -E_REPEAT);
35413 +       /*
35414 +        * ZAM-FIXME-HANS: document the logic of this loop, is it just to
35415 +        * handle the case where more pages get added to the atom while we are
35416 +        * syncing it?
35417 +        */
35418 +       assert("nikita-3485", ergo(result == 0,
35419 +                                  get_current_context()->trans->atom == NULL));
35420 +       return result;
35421 +}
35422 +
35423 +/*
35424 + * Commit atoms of all pages of @inode: look pages up in the mapping's page
35425 + * tree one at a time and call sync_page(); stop at the first non-zero result
35426 + */
35427 +static int sync_page_list(struct inode *inode)
35428 +{
35429 +       int result;
35430 +       struct address_space *mapping;
35431 +       unsigned long from;     /* start index for radix_tree_gang_lookup */
35432 +       unsigned int found;     /* return value for radix_tree_gang_lookup */
35433 +
35434 +       mapping = inode->i_mapping;
35435 +       from = 0;
35436 +       result = 0;
35437 +       spin_lock_irq(&mapping->tree_lock);
35438 +       while (result == 0) {
35439 +               struct page *page;
35440 +
35441 +               found =
35442 +                   radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
35443 +                                          from, 1);
35444 +               assert("edward-1550", found < 2);
35445 +               if (found == 0)
35446 +                       break;
35447 +               /**
35448 +                * page may not leave radix tree because it is protected from
35449 +                * truncating by inode->i_mutex locked by sys_fsync
35450 +                */
35451 +               page_cache_get(page);
35452 +               spin_unlock_irq(&mapping->tree_lock);
35453 +               /* tree_lock is dropped for the duration of sync_page() */
35454 +               from = page->index + 1;
35455 +
35456 +               result = sync_page(page);
35457 +
35458 +               page_cache_release(page);
35459 +               spin_lock_irq(&mapping->tree_lock);
35460 +       }
35461 +
35462 +       spin_unlock_irq(&mapping->tree_lock);
35463 +       return result;
35464 +}
35465 +
35466 +/* Commit atoms of @inode's data; how depends on the file's container type */
35467 +static int commit_file_atoms(struct inode *inode)
35468 +{
35469 +       int result;
35470 +       struct unix_file_info *uf_info;
35471 +
35472 +       uf_info = unix_file_inode_data(inode);
35473 +       get_exclusive_access(uf_info);
35474 +       /*
35475 +        * find what items file is made from
35476 +        */
35477 +       result = find_file_state(inode, uf_info);
35478 +       drop_exclusive_access(uf_info);
35479 +       if (result != 0)
35480 +               return result;
35481 +
35482 +       /*
35483 +        * file state cannot change because we are under ->i_mutex
35484 +        */
35485 +       switch (uf_info->container) {
35486 +       case UF_CONTAINER_EXTENTS:
35487 +               /* find_file_state might open or join an atom */
35488 +               reiser4_txn_restart_current();
35489 +               result =
35490 +                   /*
35491 +                    * when we are called by
35492 +                    * filemap_fdatawrite->
35493 +                    *    do_writepages()->
35494 +                    *       reiser4_writepages()
35495 +                    *
35496 +                    * inode->i_mapping->dirty_pages are spliced into
35497 +                    * ->io_pages, leaving ->dirty_pages dirty.
35498 +                    *
35499 +                    * When we are called from
35500 +                    * reiser4_fsync()->sync_unix_file(), we have to
35501 +                    * commit atoms of all pages on the ->dirty_list.
35502 +                    *
35503 +                    * So for simplicity we just commit ->io_pages and
35504 +                    * ->dirty_pages.
35505 +                    */
35506 +                   sync_page_list(inode);
35507 +               break;
35508 +       case UF_CONTAINER_TAILS:
35509 +               /*
35510 +                * NOTE-NIKITA probably we can be smarter for tails. For now
35511 +                * just commit all existing atoms.
35512 +                */
35513 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
35514 +               break;
35515 +       case UF_CONTAINER_EMPTY:
35516 +               result = 0;
35517 +               break;
35518 +       case UF_CONTAINER_UNKNOWN:
35519 +       default:
35520 +               result = -EIO;
35521 +               break;
35522 +       }
35523 +
35524 +       /*
35525 +        * commit current transaction: there can be captured nodes from
35526 +        * find_file_state() and finish_conversion().
35527 +        */
35528 +       reiser4_txn_restart_current();
35529 +       return result;
35530 +}
35531 +
35532 +/**
35533 + * writepages_unix_file - writepages of struct address_space_operations
35534 + * @mapping: address space whose anonymous pages are to be captured
35535 + * @wbc: writeback control; range_start, sync_mode and nr_to_write are used
35536 + *
35537 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
35538 + * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
35539 + * created by reiser4_writepage.
35540 + */
35541 +int writepages_unix_file(struct address_space *mapping,
35542 +                    struct writeback_control *wbc)
35543 +{
35544 +       int result;
35545 +       struct unix_file_info *uf_info;
35546 +       pgoff_t pindex, jindex, nr_pages;
35547 +       long to_capture;
35548 +       struct inode *inode;
35549 +
35550 +       inode = mapping->host;
35551 +       if (!has_anonymous_pages(inode)) {
35552 +               result = 0;
35553 +               goto end;
35554 +       }
35555 +       jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
35556 +       result = 0;
35557 +       nr_pages = size_in_pages(i_size_read(inode));
35558 +
35559 +       uf_info = unix_file_inode_data(inode);
35560 +
35561 +       do {
35562 +               reiser4_context *ctx;
35563 +
35564 +               if (wbc->sync_mode != WB_SYNC_ALL)
35565 +                       to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
35566 +               else
35567 +                       to_capture = CAPTURE_APAGE_BURST;
35568 +
35569 +               ctx = reiser4_init_context(inode->i_sb);
35570 +               if (IS_ERR(ctx)) {
35571 +                       result = PTR_ERR(ctx);
35572 +                       break;
35573 +               }
35574 +               /* avoid recursive calls to ->sync_inodes */
35575 +               ctx->nobalance = 1;
35576 +               assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
35577 +               assert("edward-1551", LOCK_CNT_NIL(inode_sem_w));
35578 +               assert("edward-1552", LOCK_CNT_NIL(inode_sem_r));
35579 +
35580 +               reiser4_txn_restart_current();
35581 +
35582 +               /* we have to get nonexclusive access to the file */
35583 +               if (get_current_context()->entd) {
35584 +                       /*
35585 +                        * use nonblocking version of nonexclusive_access to
35586 +                        * avoid deadlock which might look like the following:
35587 +                        * process P1 holds NEA on file F1 and called entd to
35588 +                        * reclaim some memory. Entd works for P1 and is going
35589 +                        * to capture pages of file F2. To do that entd has to
35590 +                        * get NEA to F2. F2 is held by process P2 which also
35591 +                        * called entd. But entd is serving P1 at the moment
35592 +                        * and P2 has to wait. Process P3 is trying to get EA
35593 +                        * to file F2. Existence of pending EA request to file
35594 +                        * F2 makes it impossible for entd to get NEA to file
35595 +                        * F2. None of these processes can continue. Using
35596 +                        * nonblocking version of getting NEA is supposed to
35597 +                        * avoid this deadlock.
35598 +                        */
35599 +                       if (try_to_get_nonexclusive_access(uf_info) == 0) {
35600 +                               result = RETERR(-EBUSY);
35601 +                               reiser4_exit_context(ctx);
35602 +                               break;
35603 +                       }
35604 +               } else
35605 +                       get_nonexclusive_access(uf_info);
35606 +
35607 +               while (to_capture > 0) {
35608 +                       pgoff_t start;
35609 +
35610 +                       assert("vs-1727", jindex <= pindex);
35611 +                       if (pindex == jindex) {
35612 +                               start = pindex;
35613 +                               result =
35614 +                                   capture_anonymous_pages(inode->i_mapping,
35615 +                                                           &pindex,
35616 +                                                           to_capture);
35617 +                               if (result <= 0)
35618 +                                       break;
35619 +                               to_capture -= result;
35620 +                               wbc->nr_to_write -= result;
35621 +                               if (start + result == pindex) {
35622 +                                       jindex = pindex;
35623 +                                       continue;
35624 +                               }
35625 +                               if (to_capture <= 0)
35626 +                                       break;
35627 +                       }
35628 +                       /* deal with anonymous jnodes between jindex and pindex */
35629 +                       result =
35630 +                           capture_anonymous_jnodes(inode->i_mapping, &jindex,
35631 +                                                    pindex, to_capture);
35632 +                       if (result < 0)
35633 +                               break;
35634 +                       to_capture -= result;
35635 +                       get_current_context()->nr_captured += result;
35636 +
35637 +                       if (jindex == (pgoff_t) - 1) {
35638 +                               assert("vs-1728", pindex == (pgoff_t) - 1);
35639 +                               break;
35640 +                       }
35641 +               }
35642 +               if (to_capture <= 0)
35643 +                       /* there may be more pages left */
35644 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
35645 +
35646 +               drop_nonexclusive_access(uf_info);
35647 +               if (result < 0) {
35648 +                       /* error happened */
35649 +                       reiser4_exit_context(ctx);
35650 +                       return result;
35651 +               }
35652 +               if (wbc->sync_mode != WB_SYNC_ALL) {
35653 +                       reiser4_exit_context(ctx);
35654 +                       return 0;
35655 +               }
35656 +               result = commit_file_atoms(inode);
35657 +               reiser4_exit_context(ctx);
35658 +               if (pindex >= nr_pages && jindex == pindex)
35659 +                       break;
35660 +       } while (1);
35661 +
35662 +      end:
35663 +       if (is_in_reiser4_context()) {
35664 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
35665 +                       /*
35666 +                        * there are already pages to flush, flush them out, do
35667 +                        * not delay until end of reiser4_sync_inodes
35668 +                        */
35669 +                       reiser4_writeout(inode->i_sb, wbc);
35670 +                       get_current_context()->nr_captured = 0;
35671 +               }
35672 +       }
35673 +       return result;
35674 +}
35675 +
35676 +/**
35677 + * readpage_unix_file - readpage of struct address_space_operations
35678 + * @file: file whose hint is loaded before and saved after the search
35679 + * @page: locked, not uptodate page to read
35680 + *
35681 + * Compose a key and search for item containing information about @page
35682 + * data. If item is found - its readpage method is called.
35683 + */
35684 +int readpage_unix_file(struct file *file, struct page *page)
35685 +{
35686 +       reiser4_context *ctx;
35687 +       int result;
35688 +       struct inode *inode;
35689 +       reiser4_key key;
35690 +       item_plugin *iplug;
35691 +       hint_t *hint;
35692 +       lock_handle *lh;
35693 +       coord_t *coord;
35694 +
35695 +       assert("vs-1062", PageLocked(page));
35696 +       assert("vs-976", !PageUptodate(page));
35697 +       assert("vs-1061", page->mapping && page->mapping->host);
35698 +
35699 +       if (page->mapping->host->i_size <= page_offset(page)) {
35700 +               /* page is out of file */
35701 +               zero_user(page, 0, PAGE_CACHE_SIZE);
35702 +               SetPageUptodate(page);
35703 +               unlock_page(page);
35704 +               return 0;
35705 +       }
35706 +
35707 +       inode = page->mapping->host;
35708 +       ctx = reiser4_init_context(inode->i_sb);
35709 +       if (IS_ERR(ctx)) {
35710 +               unlock_page(page);
35711 +               return PTR_ERR(ctx);
35712 +       }
35713 +
35714 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
35715 +       if (hint == NULL) {
35716 +               unlock_page(page);
35717 +               reiser4_exit_context(ctx);
35718 +               return RETERR(-ENOMEM);
35719 +       }
35720 +
35721 +       result = load_file_hint(file, hint);
35722 +       if (result) {
35723 +               kfree(hint);
35724 +               unlock_page(page);
35725 +               reiser4_exit_context(ctx);
35726 +               return result;
35727 +       }
35728 +       lh = &hint->lh;
35729 +
35730 +       /* get key of first byte of the page */
35731 +       key_by_inode_and_offset_common(inode, page_offset(page), &key);
35732 +
35733 +       /* look for file metadata corresponding to first byte of page */
35734 +       page_cache_get(page);
35735 +       unlock_page(page);
35736 +       result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
35737 +       lock_page(page);
35738 +       page_cache_release(page);
35739 +
35740 +       if (page->mapping == NULL) {
35741 +               /*
35742 +                * readpage allows truncate to run concurrently. Page was
35743 +                * truncated while it was not locked
35744 +                */
35745 +               done_lh(lh);
35746 +               kfree(hint);
35747 +               unlock_page(page);
35748 +               reiser4_txn_restart(ctx);
35749 +               reiser4_exit_context(ctx);
35750 +               return -EINVAL;
35751 +       }
35752 +
35753 +       if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
35754 +               if (result == CBK_COORD_FOUND &&
35755 +                   hint->ext_coord.coord.between != AT_UNIT)
35756 +                       /* file is truncated */
35757 +                       result = -EINVAL;
35758 +               done_lh(lh);
35759 +               kfree(hint);
35760 +               unlock_page(page);
35761 +               reiser4_txn_restart(ctx);
35762 +               reiser4_exit_context(ctx);
35763 +               return result;
35764 +       }
35765 +
35766 +       /*
35767 +        * item corresponding to page is found. It can not be removed because
35768 +        * znode lock is held
35769 +        */
35770 +       if (PageUptodate(page)) {
35771 +               done_lh(lh);
35772 +               kfree(hint);
35773 +               unlock_page(page);
35774 +               reiser4_txn_restart(ctx);
35775 +               reiser4_exit_context(ctx);
35776 +               return 0;
35777 +       }
35778 +
35779 +       coord = &hint->ext_coord.coord;
35780 +       result = zload(coord->node);
35781 +       if (result) {
35782 +               done_lh(lh);
35783 +               kfree(hint);
35784 +               unlock_page(page);
35785 +               reiser4_txn_restart(ctx);
35786 +               reiser4_exit_context(ctx);
35787 +               return result;
35788 +       }
35789 +
35790 +       validate_extended_coord(&hint->ext_coord, page_offset(page));
35791 +
35792 +       if (!coord_is_existing_unit(coord)) {
35793 +               /* this indicates corruption */
35794 +               warning("vs-280",
35795 +                       "Looking for page %lu of file %llu (size %lli). "
35796 +                       "No file items found (%d). File is corrupted?\n",
35797 +                       page->index, (unsigned long long)get_inode_oid(inode),
35798 +                       inode->i_size, result);
35799 +               zrelse(coord->node);
35800 +               done_lh(lh);
35801 +               kfree(hint);
35802 +               unlock_page(page);
35803 +               reiser4_txn_restart(ctx);
35804 +               reiser4_exit_context(ctx);
35805 +               return RETERR(-EIO);
35806 +       }
35807 +
35808 +       /*
35809 +        * call readpage method of the plugin of the found item; if the plugin
35810 +        * has no such method, fail with -EINVAL
35811 +        */
35812 +       iplug = item_plugin_by_coord(coord);
35813 +       if (iplug->s.file.readpage)
35814 +               result = iplug->s.file.readpage(coord, page);
35815 +       else
35816 +               result = RETERR(-EINVAL);
35817 +
35818 +       if (!result) {
35819 +               set_key_offset(&key,
35820 +                              (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
35821 +               /* FIXME should call reiser4_set_hint() */
35822 +               reiser4_unset_hint(hint);
35823 +       } else {
35824 +               unlock_page(page);
35825 +               reiser4_unset_hint(hint);
35826 +       }
35827 +       assert("vs-979",
35828 +              ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
35829 +       assert("vs-9791", ergo(result != 0, !PageLocked(page)));
35830 +
35831 +       zrelse(coord->node);
35832 +       done_lh(lh);
35833 +
35834 +       save_file_hint(file, hint);
35835 +       kfree(hint);
35836 +
35837 +       /*
35838 +        * FIXME: explain why it is needed. HINT: page allocation in write can
35839 +        * not be done when atom is not NULL because reiser4_writepage can not
35840 +        * kick entd and have to eflush
35841 +        */
35842 +       reiser4_txn_restart(ctx);
35843 +       reiser4_exit_context(ctx);
35844 +       return result;
35845 +}
35846 +
35847 +struct uf_readpages_context {
35848 +       lock_handle lh; /* twig lock saved between filler calls */
35849 +       coord_t coord;  /* coord saved between filler calls */
35850 +};
35851 +
35852 +/* Per-page callback handed to read_cache_pages() by readpages_unix_file().
35853 + * If the file is built of tails, an error is returned (-ENOENT according
35854 + * to the original note; not verifiable from this function alone).
35855 + * @data -- a pointer to a struct uf_readpages_context object, used
35856 + *            to save the twig lock and the coord between
35857 + *            read_cache_pages iterations.
35858 + * @page -- page to read; it is unlocked here on every error path.
35859 + */
35860 +static int uf_readpages_filler(void * data, struct page * page)
35861 +{
35862 +       struct uf_readpages_context *rc = data;
35863 +       jnode * node;
35864 +       int ret = 0;
35865 +       reiser4_extent *ext;
35866 +       __u64 ext_index;
35867 +       int cbk_done = 0; /* set after a tree search was done for this page */
35868 +       struct address_space * mapping = page->mapping;
35869 +
35870 +       if (PageUptodate(page)) {
35871 +               unlock_page(page);
35872 +               return 0;
35873 +       }
35874 +       page_cache_get(page); /* paired with page_cache_release() at exit */
35875 +
35876 +       if (rc->lh.node == 0) {
35877 +               /* no twig lock - have to do tree search. */
35878 +               reiser4_key key;
35879 +       repeat:
35880 +               unlock_page(page); /* page stays unlocked during the search */
35881 +               key_by_inode_and_offset_common(
35882 +                       mapping->host, page_offset(page), &key);
35883 +               ret = coord_by_key(
35884 +                       &get_super_private(mapping->host->i_sb)->tree,
35885 +                       &key, &rc->coord, &rc->lh,
35886 +                       ZNODE_READ_LOCK, FIND_EXACT,
35887 +                       TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
35888 +               if (unlikely(ret))
35889 +                       goto exit; /* page is already unlocked here */
35890 +               lock_page(page);
35891 +               if (PageUptodate(page)) /* re-check after re-locking */
35892 +                       goto unlock;
35893 +               cbk_done = 1;
35894 +       }
35895 +       ret = zload(rc->coord.node);
35896 +       if (unlikely(ret))
35897 +               goto unlock;
35898 +       if (!coord_is_existing_item(&rc->coord) ||
35899 +           !item_is_extent(&rc->coord)) {
35900 +               zrelse(rc->coord.node);
35901 +               ret = RETERR(-EIO);
35902 +               goto unlock;
35903 +       }
35904 +       ext = extent_by_coord(&rc->coord);
35905 +       ext_index = extent_unit_index(&rc->coord);
35906 +       if (page->index < ext_index ||
35907 +           page->index >= ext_index + extent_get_width(ext)) {
35908 +               /* the page index doesn't belong to the extent unit
35909 +                  which the coord points to - release the lock and
35910 +                  repeat with a tree search. */
35911 +               zrelse(rc->coord.node);
35912 +               done_lh(&rc->lh);
35913 +               /* we can be here after a CBK call only in case of
35914 +                  corruption of the tree or a bug in the tree lookup algorithm. */
35915 +               if (unlikely(cbk_done)) {
35916 +                       ret = RETERR(-EIO);
35917 +                       goto unlock;
35918 +               }
35919 +               goto repeat; /* jumps into the search block above; page is locked */
35920 +       }
35921 +       node = jnode_of_page(page);
35922 +       if (unlikely(IS_ERR(node))) {
35923 +               zrelse(rc->coord.node);
35924 +               ret = PTR_ERR(node);
35925 +               goto unlock;
35926 +       }
35927 +       ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
35928 +       jput(node);
35929 +       zrelse(rc->coord.node);
35930 +       if (likely(!ret))
35931 +               goto exit; /* success: unlock_page() below is skipped */
35932 + unlock:
35933 +       unlock_page(page);
35934 + exit:
35935 +       page_cache_release(page);
35936 +       return ret;
35937 +}
35938 +
35939 +/**
35940 + * readpages_unix_file - called by the readahead code; feeds each page of
35941 + * the given list of pages to uf_readpages_filler() via read_cache_pages()
35942 + */
35943 +int readpages_unix_file(
35944 +       struct file *file, struct address_space *mapping,
35945 +       struct list_head *pages, unsigned nr_pages)
35946 +{
35947 +       struct uf_readpages_context filler_ctx;
35948 +       reiser4_context *ctx = reiser4_init_context(mapping->host->i_sb);
35949 +       int err;
35950 +
35951 +       if (IS_ERR(ctx)) {
35952 +               /* nothing will be read: hand the list to put_pages_list() */
35953 +               put_pages_list(pages);
35954 +               return PTR_ERR(ctx);
35955 +       }
35956 +       init_lh(&filler_ctx.lh);
35957 +       err = read_cache_pages(mapping, pages,  uf_readpages_filler, &filler_ctx);
35958 +       done_lh(&filler_ctx.lh);
35959 +       context_set_commit_async(ctx);
35960 +       /* the transaction is closed to protect further page allocation from deadlocks */
35961 +       reiser4_txn_restart(ctx);
35962 +       reiser4_exit_context(ctx);
35963 +       return err;
35964 +}
35965 +
35966 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
35967 +                                               loff_t count UNUSED_ARG)
35968 +{
35969 +       /* We should reserve one block, because of updating of the stat data
35970 +          item; the estimate is taken from estimate_update_common() */
35971 +       assert("vs-1249",
35972 +              inode_file_plugin(inode)->estimate.update ==
35973 +              estimate_update_common);
35974 +       return estimate_update_common(inode);
35975 +}
35976 +
35977 +/* reads via item plugins; called with nonexclusive access obtained, file's container can not change */
35978 +static ssize_t read_file(hint_t *hint, struct file *file,      /* file to read from */
35979 +                        char __user *buf,      /* address of user-space buffer */
35980 +                        size_t count,  /* number of bytes to read */
35981 +                        loff_t *off) /* position in file to read from */
35982 +{
35983 +       int result;
35984 +       struct inode *inode;
35985 +       flow_t flow;
35986 +       int (*read_f) (struct file *, flow_t *, hint_t *);
35987 +       coord_t *coord;
35988 +       znode *loaded;
35989 +
35990 +       inode = file->f_dentry->d_inode;
35991 +
35992 +       /* build flow */
35993 +       assert("vs-1250",
35994 +              inode_file_plugin(inode)->flow_by_inode ==
35995 +              flow_by_inode_unix_file);
35996 +       result =
35997 +           flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
35998 +                                   *off, READ_OP, &flow);
35999 +       if (unlikely(result))
36000 +               return result;
36001 +
36002 +       /* get seal and coord sealed with it from reiser4 private data
36003 +          of struct file.  The coord will tell us where our last read
36004 +          of this file finished, and the seal will help to determine
36005 +          if that location is still valid.
36006 +        */
36007 +       coord = &hint->ext_coord.coord;
36008 +       while (flow.length && result == 0) {
36009 +               result =
36010 +                       find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
36011 +               if (cbk_errored(result))
36012 +                       /* error happened */
36013 +                       break;
36014 +
36015 +               if (coord->between != AT_UNIT) {
36016 +                       /* there were no items corresponding to given offset */
36017 +                       done_lh(hint->ext_coord.lh);
36018 +                       break;
36019 +               }
36020 +
36021 +               loaded = coord->node; /* remembered so that zrelse() gets the same node */
36022 +               result = zload(loaded);
36023 +               if (unlikely(result)) {
36024 +                       done_lh(hint->ext_coord.lh);
36025 +                       break;
36026 +               }
36027 +
36028 +               if (hint->ext_coord.valid == 0)
36029 +                       validate_extended_coord(&hint->ext_coord,
36030 +                                               get_key_offset(&flow.key));
36031 +
36032 +               assert("vs-4", hint->ext_coord.valid == 1);
36033 +               assert("vs-33", hint->ext_coord.lh == &hint->lh);
36034 +               /* call item's read method */
36035 +               read_f = item_plugin_by_coord(coord)->s.file.read;
36036 +               result = read_f(file, &flow, hint);
36037 +               zrelse(loaded);
36038 +               done_lh(hint->ext_coord.lh);
36039 +       }
36040 +
36041 +       return (count - flow.length) ? (count - flow.length) : result; /* bytes taken from the flow, else result */
36042 +}
36043 +
36044 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*); /* forward declaration, used by read_unix_file() */
36045 +
36046 +/**
36047 + * read_unix_file - read of struct file_operations
36048 + * @file: file to read from
36049 + * @buf: address of user-space buffer
36050 + * @read_amount: number of bytes to read; 0 returns 0 immediately
36051 + * @off: position in file to read from
36052 + *
36053 + * Implementation of the vfs read method for the unix file plugin. Returns the
36054 + * result of the container-specific read path, or an error code on failure.
36055 + */
36056 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
36057 +                      loff_t *off)
36058 +{
36059 +       reiser4_context *ctx;
36060 +       ssize_t result;
36061 +       struct inode *inode;
36062 +       struct unix_file_info *uf_info;
36063 +
36064 +       if (unlikely(read_amount == 0))
36065 +               return 0;
36066 +
36067 +       assert("umka-072", file != NULL);
36068 +       assert("umka-074", off != NULL);
36069 +       inode = file->f_dentry->d_inode;
36070 +       assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36071 +
36072 +       ctx = reiser4_init_context(inode->i_sb);
36073 +       if (IS_ERR(ctx))
36074 +               return PTR_ERR(ctx);
36075 +       uf_info = unix_file_inode_data(inode);
36076 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
36077 +               get_exclusive_access(uf_info); /* held while find_file_state() runs */
36078 +               result = find_file_state(inode, uf_info);
36079 +               if (unlikely(result != 0))
36080 +                       goto out;
36081 +       } else
36082 +               get_nonexclusive_access(uf_info);
36083 +       result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
36084 +                                         BA_CAN_COMMIT);
36085 +       if (unlikely(result != 0))
36086 +               goto out;
36087 +       if (uf_info->container == UF_CONTAINER_EXTENTS){
36088 +               result = do_sync_read(file, buf, read_amount, off); /* extent files use do_sync_read() */
36089 +       } else if (uf_info->container == UF_CONTAINER_TAILS ||
36090 +                  reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
36091 +                  reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
36092 +               result = read_unix_file_container_tails(file, buf, read_amount, off);
36093 +       } else {
36094 +               assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
36095 +               result = 0;
36096 +       }
36097 +out:
36098 +       drop_access(uf_info); /* drops whichever access was obtained above */
36099 +       context_set_commit_async(ctx);
36100 +       reiser4_exit_context(ctx);
36101 +       return result;
36102 +}
36103 +
36104 +/* Read a file built of tail items, at most one page per read_file() call.
36105 + * Returns the number of bytes read, or an error code if nothing was read. */
36106 +static ssize_t read_unix_file_container_tails(
36107 +       struct file *file, char __user *buf, size_t read_amount, loff_t *off)
36108 +{
36109 +       int result;
36110 +       struct inode *inode;
36111 +       hint_t *hint;
36112 +       struct unix_file_info *uf_info;
36113 +       size_t count, left, chunk;
36114 +       ssize_t read; /* signed: read_file() returns negative error codes */
36115 +       loff_t size;
36116 +
36117 +       assert("umka-072", file != NULL);
36118 +       assert("umka-074", off != NULL);
36119 +       inode = file->f_dentry->d_inode;
36120 +       assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36121 +
36122 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36123 +       if (hint == NULL)
36124 +               return RETERR(-ENOMEM);
36125 +
36126 +       result = load_file_hint(file, hint);
36127 +       if (result) {
36128 +               kfree(hint);
36129 +               return result;
36130 +       }
36131 +
36132 +       left = read_amount;
36133 +       count = 0;
36134 +       uf_info = unix_file_inode_data(inode);
36135 +       while (left > 0) {
36136 +               reiser4_txn_restart_current();
36137 +               size = i_size_read(inode);
36138 +               if (*off >= size)
36139 +                       /* position to read from is past the end of file */
36140 +                       break;
36141 +               if (*off + left > size)
36142 +                       left = size - *off;
36143 +               chunk = left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left;
36144 +               /* faultin user page; on failure leave via the common exit to free hint */
36145 +               if (fault_in_pages_writeable(buf, chunk)) {
36146 +                       result = RETERR(-EFAULT);
36147 +                       break;
36148 +               }
36149 +               read = read_file(hint, file, buf, chunk, off);
36150 +               if (read < 0) {
36151 +                       result = read;
36152 +                       break;
36153 +               }
36154 +               left -= read;
36155 +               buf += read;
36156 +               /* update position in a file and total number of read bytes */
36157 +               *off += read;
36158 +               count += read;
36159 +       }
36160 +       done_lh(&hint->lh);
36161 +       save_file_hint(file, hint);
36162 +       kfree(hint);
36163 +       if (count)
36164 +               file_accessed(file);
36165 +       /* return number of read bytes or error code if nothing is read */
36166 +       return count ? count : result;
36167 +}
36168 +
36169 +/* Prepares @file's pages for being written or mapped for write when the
36170 +   file consists of tails. All pages of the file, from index 0 up to the
36171 +   page containing the last byte (per inode->i_size), are passed to
36172 +   reiser4_invalidate_pages(). This is done in order to not manage few
36173 +   copies of the data (first in page cache and second one in tails
36174 +   themselves) for the case of mapping files consisting of tails.
36175 +
36176 +   Then unpack() is called (not "forever"); per the original note this is
36177 +   where tail2extent conversion is performed. The only caller visible here
36178 +   is mmap_unix_file(). Returns the result of unpack(). */
36179 +static int check_pages_unix_file(struct file *file, struct inode *inode)
36180 +{
36181 +       reiser4_invalidate_pages(inode->i_mapping, 0,
36182 +                                (inode->i_size + PAGE_CACHE_SIZE -
36183 +                                 1) >> PAGE_CACHE_SHIFT, 0);
36184 +       return unpack(file, inode, 0 /* not forever */ );
36185 +}
36186 +
36187 +/**
36188 + * mmap_unix_file - mmap of struct file_operations
36189 + * @file: file to mmap
36190 + * @vma: memory area being mapped; its vm_flags decide whether to convert
36191 + *
36192 + * Implementation of the vfs mmap method for the unix file plugin. A file built
36193 + * of tails is converted to extents if necessary. On success of
36194 + * generic_file_mmap() the inode flag REISER4_HAS_MMAP is set.
36195 + */
36196 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
36197 +{
36198 +       reiser4_context *ctx;
36199 +       int result;
36200 +       struct inode *inode;
36201 +       struct unix_file_info *uf_info;
36202 +       reiser4_block_nr needed;
36203 +
36204 +       inode = file->f_dentry->d_inode;
36205 +       ctx = reiser4_init_context(inode->i_sb);
36206 +       if (IS_ERR(ctx))
36207 +               return PTR_ERR(ctx);
36208 +
36209 +       uf_info = unix_file_inode_data(inode);
36210 +
36211 +       get_exclusive_access_careful(uf_info, inode);
36212 +
36213 +       if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
36214 +               /*
36215 +                * either VM_MAYWRITE or VM_SHARED is set: we need file built
36216 +                * of extent items. If it is still built of tail items we have
36217 +                * to convert it. Find what items the file is built of
36218 +                */
36219 +               result = find_file_state(inode, uf_info);
36220 +               if (result != 0) {
36221 +                       drop_exclusive_access(uf_info);
36222 +                       reiser4_exit_context(ctx);
36223 +                       return result;
36224 +               }
36225 +
36226 +               assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
36227 +                                  uf_info->container == UF_CONTAINER_EXTENTS ||
36228 +                                  uf_info->container == UF_CONTAINER_EMPTY));
36229 +               if (uf_info->container == UF_CONTAINER_TAILS) {
36230 +                       /*
36231 +                        * invalidate all pages and convert file from tails to
36232 +                        * extents
36233 +                        */
36234 +                       result = check_pages_unix_file(file, inode);
36235 +                       if (result) {
36236 +                               drop_exclusive_access(uf_info);
36237 +                               reiser4_exit_context(ctx);
36238 +                               return result;
36239 +                       }
36240 +               }
36241 +       }
36242 +
36243 +       /*
36244 +        * generic_file_mmap will do update_atime. Grab space for stat data
36245 +        * update.
36246 +        */
36247 +       needed = inode_file_plugin(inode)->estimate.update(inode);
36248 +       result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
36249 +       if (result) {
36250 +               drop_exclusive_access(uf_info);
36251 +               reiser4_exit_context(ctx);
36252 +               return result;
36253 +       }
36254 +
36255 +       result = generic_file_mmap(file, vma);
36256 +       if (result == 0) {
36257 +               /* mark file as having mapping. */
36258 +               reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
36259 +       }
36260 +
36261 +       drop_exclusive_access(uf_info);
36262 +       reiser4_exit_context(ctx);
36263 +       return result;
36264 +}
36265 +
36266 +/**
36267 + * find_first_item - look up the item holding the first byte of a file
36268 + * @inode: inode of the file
36269 + *
36270 + * Returns EXTENT_POINTER_ID or FORMATTING_ID, else a lookup/zload() result or -EIO.
36271 + */
36272 +static int find_first_item(struct inode *inode)
36273 +{
36274 +       reiser4_key key;
36275 +       lock_handle lh;
36276 +       coord_t coord;
36277 +       int ret;
36278 +
36279 +       coord_init_zero(&coord);
36280 +       init_lh(&lh);
36281 +       inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
36282 +       ret = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
36283 +       if (ret != CBK_COORD_FOUND)
36284 +               goto out;
36285 +       if (coord.between != AT_UNIT) {
36286 +               ret = RETERR(-EIO);
36287 +               goto out;
36288 +       }
36289 +       ret = zload(coord.node);
36290 +       if (ret != 0)
36291 +               goto out;
36292 +       ret = item_id_by_coord(&coord);
36293 +       zrelse(coord.node);
36294 +       if (!(ret == EXTENT_POINTER_ID || ret == FORMATTING_ID))
36295 +               ret = RETERR(-EIO);
36296 +out:
36297 +       done_lh(&lh);
36298 +       return ret;
36299 +}
36300 +
36301 +/**
36302 + * open_unix_file
36303 + * @inode: inode of the file
36304 + * @file: file; passed on to extent2tail() when that conversion is completed
36305 + *
36306 + * If filesystem is not readonly - complete uncompleted tail conversion if
36307 + * there was one. Returns 0 if nothing is to be done, else the conversion result.
36308 + */
36309 +int open_unix_file(struct inode *inode, struct file *file)
36310 +{
36311 +       int result;
36312 +       reiser4_context *ctx;
36313 +       struct unix_file_info *uf_info;
36314 +
36315 +       if (IS_RDONLY(inode))
36316 +               return 0;
36317 +
36318 +       if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) /* checked again under exclusive access */
36319 +               return 0;
36320 +
36321 +       ctx = reiser4_init_context(inode->i_sb);
36322 +       if (IS_ERR(ctx))
36323 +               return PTR_ERR(ctx);
36324 +
36325 +       uf_info = unix_file_inode_data(inode);
36326 +
36327 +       get_exclusive_access_careful(uf_info, inode);
36328 +
36329 +       if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
36330 +               /*
36331 +                * other process completed the conversion
36332 +                */
36333 +               drop_exclusive_access(uf_info);
36334 +               reiser4_exit_context(ctx);
36335 +               return 0;
36336 +       }
36337 +
36338 +       /*
36339 +        * file left in semi converted state after unclean shutdown or another
36340 +        * thread is doing conversion and dropped exclusive access while doing
36341 +        * balance dirty pages. Complete the conversion
36342 +        */
36343 +       result = find_first_item(inode);
36344 +       if (result == EXTENT_POINTER_ID)
36345 +               /*
36346 +                * first item is extent, therefore there was incomplete
36347 +                * tail2extent conversion. Complete it
36348 +                */
36349 +               result = tail2extent(unix_file_inode_data(inode));
36350 +       else if (result == FORMATTING_ID)
36351 +               /*
36352 +                * first item is formatting item, therefore there was
36353 +                * incomplete extent2tail conversion. Complete it
36354 +                */
36355 +               result = extent2tail(file, unix_file_inode_data(inode));
36356 +       else
36357 +               result = -EIO; /* any other find_first_item() result is reported as -EIO */
36358 +
36359 +       assert("vs-1712",
36360 +              ergo(result == 0,
36361 +                   (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
36362 +                    !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
36363 +       drop_exclusive_access(uf_info);
36364 +       reiser4_exit_context(ctx);
36365 +       return result;
36366 +}
36367 +
36368 +#define NEITHER_OBTAINED 0 /* values of the local "ea" in write_unix_file(): no access held */
36369 +#define EA_OBTAINED 1 /* set after get_exclusive_access() */
36370 +#define NEA_OBTAINED 2 /* set after get_nonexclusive_access() */
36371 +
36372 +static void drop_access(struct unix_file_info *uf_info)
36373 +{
36374 +       if (!uf_info->exclusive_use) /* ->exclusive_use selects which drop is called */
36375 +               drop_nonexclusive_access(uf_info);
36376 +       else
36377 +               drop_exclusive_access(uf_info);
36378 +}
36379 +
36380 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
36381 +                             __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) /* printk() prefixed with file, line and function */
36382 +
36383 +/**
36384 + * write_unix_file - private ->write() method of unix_file plugin.
36385 + *
36386 + * @file: file to write to
36387 + * @buf: address of user-space buffer
36388 + * @count: number of bytes to write
36389 + * @pos: position in file to write to; advanced by each chunk written
36390 + * @cont: unused argument, as we don't perform plugin conversion when being
36391 + * managed by unix_file plugin.
36392 + */
36393 +ssize_t write_unix_file(struct file *file, const char __user *buf,
36394 +                       size_t count, loff_t *pos, struct psched_context *cont)
36395 +{
36396 +       int result;
36397 +       reiser4_context *ctx;
36398 +       struct inode *inode;
36399 +       struct unix_file_info *uf_info;
36400 +       ssize_t written;
36401 +       int try_free_space; /* NOTE(review): only ever assigned 0 in this function */
36402 +       int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY; /* bytes per iteration */
36403 +       size_t left;
36404 +       ssize_t (*write_op)(struct file *, struct inode *,
36405 +                           const char __user *, size_t,
36406 +                           loff_t *pos);
36407 +       int ea; /* NEITHER_OBTAINED, EA_OBTAINED or NEA_OBTAINED */
36408 +       loff_t new_size; /* file size if the whole write succeeds */
36409 +
36410 +       ctx = get_current_context();
36411 +       inode = file->f_dentry->d_inode;
36412 +
36413 +       assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
36414 +       assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
36415 +
36416 +       /* check amount of bytes to write and writing position */
36417 +       result = generic_write_checks(file, pos, &count, 0);
36418 +       if (result) {
36419 +               context_set_commit_async(ctx);
36420 +               return result;
36421 +       }
36422 +
36423 +       result = file_remove_suid(file);
36424 +       if (result) {
36425 +               context_set_commit_async(ctx);
36426 +               return result;
36427 +       }
36428 +       /* remove_suid might create a transaction */
36429 +       reiser4_txn_restart(ctx);
36430 +
36431 +       uf_info = unix_file_inode_data(inode);
36432 +
36433 +       current->backing_dev_info = inode->i_mapping->backing_dev_info; /* reset to NULL before returning */
36434 +       written = 0;
36435 +       try_free_space = 0;
36436 +       left = count;
36437 +       ea = NEITHER_OBTAINED;
36438 +
36439 +       new_size = i_size_read(inode);
36440 +       if (*pos + count > new_size)
36441 +               new_size = *pos + count;
36442 +
36443 +       while (left) {
36444 +               if (left < to_write)
36445 +                       to_write = left;
36446 +
36447 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
36448 +                       get_exclusive_access(uf_info);
36449 +                       ea = EA_OBTAINED;
36450 +                       if (uf_info->container != UF_CONTAINER_EMPTY) {
36451 +                               /* file is made not empty by another process */
36452 +                               drop_exclusive_access(uf_info);
36453 +                               ea = NEITHER_OBTAINED;
36454 +                               continue;
36455 +                       }
36456 +               } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
36457 +                       /*
36458 +                        * get exclusive access directly just to not have to
36459 +                        * re-obtain it if file will appear empty
36460 +                        */
36461 +                       get_exclusive_access(uf_info);
36462 +                       ea = EA_OBTAINED;
36463 +                       result = find_file_state(inode, uf_info);
36464 +                       if (result) {
36465 +                               drop_exclusive_access(uf_info);
36466 +                               ea = NEITHER_OBTAINED;
36467 +                               break;
36468 +                       }
36469 +               } else {
36470 +                       get_nonexclusive_access(uf_info);
36471 +                       ea = NEA_OBTAINED;
36472 +               }
36473 +
36474 +               /* either EA or NEA is obtained. Choose item write method */
36475 +               if (uf_info->container == UF_CONTAINER_EXTENTS) {
36476 +                       /* file is built of extent items */
36477 +                       write_op = reiser4_write_extent;
36478 +               } else if (uf_info->container == UF_CONTAINER_EMPTY) {
36479 +                       /* file is empty */
36480 +                       if (should_have_notail(uf_info, new_size))
36481 +                               write_op = reiser4_write_extent;
36482 +                       else
36483 +                               write_op = reiser4_write_tail;
36484 +               } else {
36485 +                       /* file is built of tail items */
36486 +                       if (should_have_notail(uf_info, new_size)) {
36487 +                               if (ea == NEA_OBTAINED) {
36488 +                                       drop_nonexclusive_access(uf_info);
36489 +                                       get_exclusive_access(uf_info);
36490 +                                       ea = EA_OBTAINED;
36491 +                               }
36492 +                               if (uf_info->container == UF_CONTAINER_TAILS) {
36493 +                                       /*
36494 +                                        * if file is being converted by another
36495 +                                        * process - wait until it completes
36496 +                                        */
36497 +                                       while (1) {
36498 +                                               if (reiser4_inode_get_flag(inode,
36499 +                                                                          REISER4_PART_IN_CONV)) {
36500 +                                                       drop_exclusive_access(uf_info);
36501 +                                                       schedule();
36502 +                                                       get_exclusive_access(uf_info);
36503 +                                                       continue;
36504 +                                               }
36505 +                                               break;
36506 +                                       }
36507 +                                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
36508 +                                               result = tail2extent(uf_info);
36509 +                                               if (result) {
36510 +                                                       drop_exclusive_access(uf_info);
36511 +                                                       context_set_commit_async(ctx);
36512 +                                                       break; /* leaves the outer while (left) loop */
36513 +                                               }
36514 +                                       }
36515 +                               }
36516 +                               drop_exclusive_access(uf_info);
36517 +                               ea = NEITHER_OBTAINED;
36518 +                               continue; /* start over: the container is re-examined */
36519 +                       }
36520 +                       write_op = reiser4_write_tail;
36521 +               }
36522 +
36523 +               written = write_op(file, inode, buf, to_write, pos);
36524 +               if (written == -ENOSPC && try_free_space) { /* NOTE(review): try_free_space is always 0 here */
36525 +                       drop_access(uf_info);
36526 +                       txnmgr_force_commit_all(inode->i_sb, 0);
36527 +                       try_free_space = 0;
36528 +                       continue;
36529 +               }
36530 +               if (written < 0) {
36531 +                       drop_access(uf_info);
36532 +                       result = written;
36533 +                       break;
36534 +               }
36535 +               /* something is written. */
36536 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
36537 +                       assert("edward-1553", ea == EA_OBTAINED);
36538 +                       uf_info->container =
36539 +                               (write_op == reiser4_write_extent) ?
36540 +                               UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
36541 +               } else {
36542 +                       assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
36543 +                                       write_op == reiser4_write_extent));
36544 +                       assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS,
36545 +                                       write_op == reiser4_write_tail));
36546 +               }
36547 +               if (*pos + written > inode->i_size)
36548 +                       INODE_SET_FIELD(inode, i_size, *pos + written);
36549 +               file_update_time(file);
36550 +               result = reiser4_update_sd(inode);
36551 +               if (result) {
36552 +                       current->backing_dev_info = NULL;
36553 +                       drop_access(uf_info);
36554 +                       context_set_commit_async(ctx);
36555 +                       break; /* left and *pos are not advanced for this chunk */
36556 +               }
36557 +               drop_access(uf_info);
36558 +               ea = NEITHER_OBTAINED;
36559 +               reiser4_txn_restart(ctx);
36560 +               current->journal_info = NULL; /* set back to ctx right after balancing */
36561 +               /*
36562 +                * tell VM how many pages were dirtied. Maybe number of pages
36563 +                * which were dirty already should not be counted
36564 +                */
36565 +               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
36566 +                                                  (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
36567 +               current->journal_info = ctx;
36568 +
36569 +               left -= written;
36570 +               buf += written;
36571 +               *pos += written;
36572 +       }
36573 +       if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
36574 +               reiser4_txn_restart_current();
36575 +               grab_space_enable();
36576 +               result = reiser4_sync_file_common(file, file->f_dentry,
36577 +                                                 0 /* data and stat data */);
36578 +               if (result)
36579 +                       warning("reiser4-7", "failed to sync file %llu",
36580 +                               (unsigned long long)get_inode_oid(inode));
36581 +       }
36582 +
36583 +       current->backing_dev_info = NULL;
36584 +
36585 +       /*
36586 +        * return number of written bytes or error code if nothing is
36587 +        * written. Note, that it does not work correctly in case when
36588 +        * reiser4_sync_file_common returns error
36589 +        */
36590 +       return (count - left) ? (count - left) : result;
36591 +}
36592 +
36593 +/**
36594 + * release_unix_file - release of struct file_operations
36595 + * @inode: inode of released file
36596 + * @file: file to release
36597 + *
36598 + * Implementation of release method of struct file_operations for unix file
36599 + * plugin. If the dentry has a single reference left (d_count == 1), converts
36600 + * extent items into tail items if necessary. Frees reiser4 file data.
36601 + */
36602 +int release_unix_file(struct inode *inode, struct file *file)
36603 +{
36604 +       reiser4_context *ctx;
36605 +       struct unix_file_info *uf_info;
36606 +       int result;
36607 +       int in_reiser4;
36608 +
36609 +       in_reiser4 = is_in_reiser4_context();
36610 +
36611 +       ctx = reiser4_init_context(inode->i_sb);
36612 +       if (IS_ERR(ctx))
36613 +               return PTR_ERR(ctx);
36614 +
36615 +       result = 0;
36616 +       if (in_reiser4 == 0) {
36617 +               uf_info = unix_file_inode_data(inode);
36618 +
36619 +               get_exclusive_access_careful(uf_info, inode);
36620 +               if (atomic_read(&file->f_dentry->d_count) == 1 &&
36621 +                   uf_info->container == UF_CONTAINER_EXTENTS &&
36622 +                   !should_have_notail(uf_info, inode->i_size) &&
36623 +                   !rofs_inode(inode)) {
36624 +                       result = extent2tail(file, uf_info);
36625 +                       if (result != 0) {
36626 +                               context_set_commit_async(ctx);
36627 +                               warning("nikita-3233",
36628 +                                       "Failed (%d) to convert in %s (%llu)",
36629 +                                       result, __FUNCTION__,
36630 +                                       (unsigned long long)
36631 +                                       get_inode_oid(inode));
36632 +                       }
36633 +               }
36634 +               drop_exclusive_access(uf_info);
36635 +       } else {
36636 +               /*
36637 +                  we are within reiser4 context already. How is that
36638 +                  possible? Simple:
36639 +
36640 +                  (gdb) bt
36641 +                  #0  get_exclusive_access ()
36642 +                  #2  0xc01e56d3 in release_unix_file ()
36643 +                  #3  0xc01c3643 in reiser4_release ()
36644 +                  #4  0xc014cae0 in __fput ()
36645 +                  #5  0xc013ffc3 in remove_vm_struct ()
36646 +                  #6  0xc0141786 in exit_mmap ()
36647 +                  #7  0xc0118480 in mmput ()
36648 +                  #8  0xc0133205 in oom_kill ()
36649 +                  #9  0xc01332d1 in out_of_memory ()
36650 +                  #10 0xc013bc1d in try_to_free_pages ()
36651 +                  #11 0xc013427b in __alloc_pages ()
36652 +                  #12 0xc013f058 in do_anonymous_page ()
36653 +                  #13 0xc013f19d in do_no_page ()
36654 +                  #14 0xc013f60e in handle_mm_fault ()
36655 +                  #15 0xc01131e5 in do_page_fault ()
36656 +                  #16 0xc0104935 in error_code ()
36657 +                  #17 0xc025c0c6 in __copy_to_user_ll ()
36658 +                  #18 0xc01d496f in reiser4_read_tail ()
36659 +                  #19 0xc01e4def in read_unix_file ()
36660 +                  #20 0xc01c3504 in reiser4_read ()
36661 +                  #21 0xc014bd4f in vfs_read ()
36662 +                  #22 0xc014bf66 in sys_read ()
36663 +                */
36664 +               warning("vs-44", "out of memory?");
36665 +       }
36666 +
36667 +       reiser4_free_file_fsdata(file);
36668 +
36669 +       reiser4_exit_context(ctx);
36670 +       return result; /* 0 or the extent2tail() error */
36671 +}
36672 +
36673 +/* install the NEVER_TAILS formatting plugin into @inode's plugin set; the
36674 +   caller (unpack) calls reiser4_update_sd() afterwards */
36675 +static void set_file_notail(struct inode *inode)
36676 +{
36677 +       formatting_plugin *tplug;
36678 +
36679 +       tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
36680 +       force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
36681 +}
36682 +
36683 +/*
36684 + * if file is built of tails - convert it to extents. Caller holds exclusive
36685 + * access. If @forever is set, the NEVER_TAILS formatting plugin is installed
36686 + * and saved in stat data. @filp is not used. Returns 0 or an error code.
36687 + */
36688 +static int unpack(struct file *filp, struct inode *inode, int forever)
36689 +{
36690 +       int result = 0;
36691 +       struct unix_file_info *uf_info;
36692 +
36693 +       uf_info = unix_file_inode_data(inode);
36694 +       assert("vs-1628", ea_obtained(uf_info));
36695 +
36696 +       result = find_file_state(inode, uf_info);
36697 +       if (result)
36698 +               return result;
36699 +       assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
36700 +
36701 +       if (uf_info->container == UF_CONTAINER_TAILS) {
36702 +               /*
36703 +                * if file is being converted by another process - wait until it
36704 +                * completes; the container is rechecked afterwards
36705 +                */
36706 +               while (reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) {
36707 +                       drop_exclusive_access(uf_info);
36708 +                       schedule();
36709 +                       get_exclusive_access(uf_info);
36710 +               }
36711 +               if (uf_info->container == UF_CONTAINER_TAILS) {
36712 +                       result = tail2extent(uf_info);
36713 +                       if (result)
36714 +                               return result;
36715 +               }
36716 +       }
36717 +       if (forever) {
36718 +               /* save new formatting plugin in stat data */
36719 +               __u64 tograb;
36720 +
36721 +               set_file_notail(inode);
36722 +               grab_space_enable();
36723 +               tograb = inode_file_plugin(inode)->estimate.update(inode);
36724 +               /* do not touch stat data if the reservation failed */
36725 +               result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
36726 +               if (result == 0)
36727 +                       result = reiser4_update_sd(inode);
36728 +       }
36729 +
36730 +       return result;
36731 +}
36732 +
36733 +/* implementation of vfs' ioctl method of struct file_operations for unix
36734 +   file plugin. Only REISER4_IOC_UNPACK is handled; the rest gets -ENOSYS
36735 +*/
36736 +int
36737 +ioctl_unix_file(struct inode *inode, struct file *filp,
36738 +               unsigned int cmd, unsigned long arg UNUSED_ARG)
36739 +{
36740 +       reiser4_context *ctx;
36741 +       int result;
36742 +
36743 +       ctx = reiser4_init_context(inode->i_sb);
36744 +       if (IS_ERR(ctx))
36745 +               return PTR_ERR(ctx);
36746 +
36747 +       switch (cmd) {
36748 +       case REISER4_IOC_UNPACK:
36749 +               get_exclusive_access(unix_file_inode_data(inode));
36750 +               result = unpack(filp, inode, 1 /* forever */ );
36751 +               drop_exclusive_access(unix_file_inode_data(inode));
36752 +               break;
36753 +
36754 +       default:
36755 +               result = RETERR(-ENOSYS);
36756 +               break;
36757 +       }
36758 +       reiser4_exit_context(ctx);
36759 +       return result;
36760 +}
36761 +
36762 +/* implementation of vfs' bmap method of struct address_space_operations for
36763 +   unix file plugin. NOTE(review): error codes are returned through sector_t
36764 +*/
36765 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
36766 +{
36767 +       reiser4_context *ctx;
36768 +       sector_t result;
36769 +       reiser4_key key;
36770 +       coord_t coord;
36771 +       lock_handle lh;
36772 +       struct inode *inode;
36773 +       item_plugin *iplug;
36774 +       sector_t block;
36775 +
36776 +       inode = mapping->host;
36777 +
36778 +       ctx = reiser4_init_context(inode->i_sb);
36779 +       if (IS_ERR(ctx))
36780 +               return PTR_ERR(ctx);
36781 +       key_by_inode_and_offset_common(inode,
36782 +                                      (loff_t) lblock * current_blocksize,
36783 +                                      &key);
36784 +
36785 +       init_lh(&lh);
36786 +       result =
36787 +           find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
36788 +       if (cbk_errored(result)) {
36789 +               done_lh(&lh);
36790 +               reiser4_exit_context(ctx);
36791 +               return result;
36792 +       }
36793 +
36794 +       result = zload(coord.node);
36795 +       if (result) {
36796 +               done_lh(&lh);
36797 +               reiser4_exit_context(ctx);
36798 +               return result;
36799 +       }
36800 +
36801 +       iplug = item_plugin_by_coord(&coord);
36802 +       if (iplug->s.file.get_block) {
36803 +               result = iplug->s.file.get_block(&coord, lblock, &block);
36804 +               if (result == 0)
36805 +                       result = block;
36806 +       } else
36807 +               result = RETERR(-EINVAL);
36808 +
36809 +       zrelse(coord.node);
36810 +       done_lh(&lh);
36811 +       reiser4_exit_context(ctx);
36812 +       return result;
36813 +}
36814 +
36815 +/**
36816 + * flow_by_inode_unix_file - initialize structure flow
36817 + * @inode: inode of file for which read or write is about to be performed
36818 + * @buf: buffer to perform read to or write from
36819 + * @user: flag showing whether @buf is user space or kernel space
36820 + * @size: size of buffer @buf
36821 + * @off: start offset for read or write
36822 + * @op: READ or WRITE
36823 + * @flow: flow to initialize
36824 + *
36825 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
36826 + */
36827 +int flow_by_inode_unix_file(struct inode *inode,
36828 +                           const char __user *buf, int user,
36829 +                           loff_t size, loff_t off,
36830 +                           rw_op op, flow_t *flow)
36831 +{
36832 +       assert("nikita-1100", inode != NULL);
36833 +
36834 +       flow->length = size;
36835 +       memcpy(&flow->data, &buf, sizeof(buf)); /* copies the pointer itself */
36836 +       flow->user = user;
36837 +       flow->op = op;
36838 +       assert("nikita-1931", inode_file_plugin(inode) != NULL);
36839 +       assert("nikita-1932",
36840 +              inode_file_plugin(inode)->key_by_inode ==
36841 +              key_by_inode_and_offset_common);
36842 +       /* calculate key of write position and insert it into flow->key */
36843 +       return key_by_inode_and_offset_common(inode, off, &flow->key);
36844 +}
36845 +
36846 +/* plugin->u.file.set_plug_in_sd = NULL
36847 +   plugin->u.file.set_plug_in_inode = NULL
36848 +   plugin->u.file.create_blank_sd = NULL */
36849 +/* plugin->u.file.delete */
36850 +/*
36851 +   plugin->u.file.add_link = reiser4_add_link_common
36852 +   plugin->u.file.rem_link = NULL */
36853 +
36854 +/* plugin->u.file.owns_item: owns_item_common plus a check of the item plugin
36855 +   group, with assertion on item id. Returns 1 if item is owned, 0 otherwise */
36856 +/* Audited by: green(2002.06.15) */
36857 +int
36858 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
36859 +                   const coord_t * coord /* coord to check */ )
36860 +{
36861 +       int result;
36862 +
36863 +       result = owns_item_common(inode, coord);
36864 +       if (!result)
36865 +               return 0;
36866 +       if (!plugin_of_group(item_plugin_by_coord(coord),
36867 +                            UNIX_FILE_METADATA_ITEM_TYPE))
36868 +               return 0;
36869 +       assert("vs-547",
36870 +              item_id_by_coord(coord) == EXTENT_POINTER_ID ||
36871 +              item_id_by_coord(coord) == FORMATTING_ID);
36872 +       return 1;
36873 +}
36874 +
36875 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
36876 +{
36877 +       int result;
36878 +       int s_result;
36879 +       loff_t old_size;
36880 +       reiser4_tree *tree;
36881 +
36882 +       inode_check_scale(inode, inode->i_size, attr->ia_size);
36883 +
36884 +       old_size = inode->i_size;
36885 +       tree = reiser4_tree_by_inode(inode);
36886 +
36887 +       result = safe_link_grab(tree, BA_CAN_COMMIT);
36888 +       if (result == 0)
36889 +               result = safe_link_add(inode, SAFE_TRUNCATE);
36890 +       if (result == 0)
36891 +               result = truncate_file_body(inode, attr);
36892 +       if (result)
36893 +               warning("vs-1588", "truncate_file failed: oid %lli, "
36894 +                       "old size %lld, new size %lld, retval %d",
36895 +                       (unsigned long long)get_inode_oid(inode),
36896 +                       old_size, attr->ia_size, result);
36897 +       /* always try to remove the safe link; a failure here is only logged */
36898 +       s_result = safe_link_grab(tree, BA_CAN_COMMIT);
36899 +       if (s_result == 0)
36900 +               s_result =
36901 +                   safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
36902 +       if (s_result != 0) {
36903 +               warning("nikita-3417", "Cannot kill safelink %lli: %i",
36904 +                       (unsigned long long)get_inode_oid(inode), s_result);
36905 +       }
36906 +       safe_link_release(tree);
36907 +       return result;
36908 +}
36909 +
36910 +/* plugin->u.file.setattr method */
36911 +/* Size changes (ATTR_SIZE) are done by setattr_truncate() under exclusive
36912 +   access to avoid races; everything else goes to reiser4_setattr_common() */
36913 +int setattr_unix_file(struct dentry *dentry,   /* Object to change attributes */
36914 +                     struct iattr *attr /* change description */ )
36915 +{
36916 +       int result;
36917 +
36918 +       if (attr->ia_valid & ATTR_SIZE) {
36919 +               reiser4_context *ctx;
36920 +               struct unix_file_info *uf_info;
36921 +
36922 +               /* truncate does reservation itself and requires exclusive
36923 +                  access obtained */
36924 +               ctx = reiser4_init_context(dentry->d_inode->i_sb);
36925 +               if (IS_ERR(ctx))
36926 +                       return PTR_ERR(ctx);
36927 +
36928 +               uf_info = unix_file_inode_data(dentry->d_inode);
36929 +               get_exclusive_access_careful(uf_info, dentry->d_inode);
36930 +               result = setattr_truncate(dentry->d_inode, attr);
36931 +               drop_exclusive_access(uf_info);
36932 +               context_set_commit_async(ctx);
36933 +               reiser4_exit_context(ctx);
36934 +       } else
36935 +               result = reiser4_setattr_common(dentry, attr);
36936 +
36937 +       return result;
36938 +}
36939 +
36940 +/* plugin->u.file.init_inode_data: initialize unix file part of the inode */
36941 +void
36942 +init_inode_data_unix_file(struct inode *inode,
36943 +                         reiser4_object_create_data * crd, int create)
36944 +{
36945 +       struct unix_file_info *data;
36946 +
36947 +       data = unix_file_inode_data(inode);
36948 +       data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
36949 +       init_rwsem(&data->latch);
36950 +       data->tplug = inode_formatting_plugin(inode);
36951 +       data->exclusive_use = 0;
36952 +
36953 +#if REISER4_DEBUG
36954 +       data->ea_owner = NULL;
36955 +       atomic_set(&data->nr_neas, 0);
36956 +#endif
36957 +       init_inode_ordering(inode, crd, create);
36958 +}
36959 +
36960 +/**
36961 + * delete_object_unix_file - delete_object of file_plugin
36962 + * @inode: inode to be deleted
36963 + *
36964 + * Truncates file to 0 (failure is only logged), removes stat data, safe link.
36965 + */
36966 +int delete_object_unix_file(struct inode *inode)
36967 +{
36968 +       struct unix_file_info *uf_info;
36969 +       int result;
36970 +
36971 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
36972 +               return 0;
36973 +
36974 +       /* truncate file body first */
36975 +       uf_info = unix_file_inode_data(inode);
36976 +       get_exclusive_access(uf_info);
36977 +       result = shorten_file(inode, 0 /* size */ );
36978 +       drop_exclusive_access(uf_info);
36979 +
36980 +       if (result)
36981 +               warning("edward-1556",
36982 +                       "failed to truncate file (%llu) on removal: %d",
36983 +                       (unsigned long long)get_inode_oid(inode), result);
36984 +
36985 +       /* remove stat data and safe link */
36986 +       return reiser4_delete_object_common(inode);
36987 +}
36988 +
36989 +int
36990 +prepare_write_unix_file(struct file *file, struct page *page,
36991 +                       unsigned from, unsigned to)
36992 +{
36993 +       reiser4_context *ctx;
36994 +       struct unix_file_info *uf_info;
36995 +       int ret;
36996 +
36997 +       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
36998 +       if (IS_ERR(ctx))
36999 +               return PTR_ERR(ctx);
37000 +
37001 +       uf_info = unix_file_inode_data(file->f_dentry->d_inode);
37002 +       get_exclusive_access(uf_info);
37003 +       ret = find_file_state(file->f_dentry->d_inode, uf_info);
37004 +       if (ret == 0) {
37005 +               if (uf_info->container == UF_CONTAINER_TAILS)
37006 +                       ret = -EINVAL; /* file is built of tails */
37007 +               else
37008 +                       ret = do_prepare_write(file, page, from, to);
37009 +       }
37010 +       drop_exclusive_access(uf_info);
37011 +
37012 +       /* don't commit transaction under inode semaphore */
37013 +       context_set_commit_async(ctx);
37014 +       reiser4_exit_context(ctx);
37015 +       return ret;
37016 +}
37017 +
37018 +/*
37019 + * Local variables:
37020 + * c-indentation-style: "K&R"
37021 + * mode-name: "LC"
37022 + * c-basic-offset: 8
37023 + * tab-width: 8
37024 + * fill-column: 79
37025 + * scroll-step: 1
37026 + * End:
37027 + */
37028 diff -puN /dev/null fs/reiser4/plugin/file/file.h
37029 --- /dev/null
37030 +++ a/fs/reiser4/plugin/file/file.h
37031 @@ -0,0 +1,331 @@
37032 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
37033 + * reiser4/README */
37034 +
37035 +/* this file contains declarations of methods implementing
37036 +   file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
37037 +   and SYMLINK_FILE_PLUGIN_ID) */
37038 +
37039 +#if !defined( __REISER4_FILE_H__ )
37040 +#define __REISER4_FILE_H__
37041 +
37042 +/* possible states when scheduling a new file plugin */
37043 +typedef enum {
37044 +       PSCHED_INVAL_STATE,    /* invalid state */
37045 +       PSCHED_SCHED_POINT,    /* scheduling point has been achieved */
37046 +       PSCHED_REMAINS_OLD,    /* made a decision to be managed by old plugin */
37047 +       PSCHED_ASSIGNED_NEW    /* new plugin has been scheduled */
37048 +} psched_state;
37049 +
37050 +struct psched_context {
37051 +       int nr_pages;
37052 +       struct page **pages;
37053 +       psched_state state;
37054 +};
37055 +
37056 +/**
37057 + * Declarations of common/careful/generic methods.
37058 + * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops);
37059 + * Then common reiser4 method for foo looks like reiser4_foo_common;
37060 + * careful method looks like reiser4_foo_careful;
37061 + * generic method looks like reiser4_foo.
37062 + *
37063 + * Common method is a simple instruction set eligible for more
37064 + * than one plugin id.
37065 + *
37066 + * Generic method looks at the plugin installed in inode's
37067 + * plugin set and calls its appropriate method.
37068 + *
37069 + * Careful method looks like generic method with protected pset
37070 + * (see plugin/file/file_conversion.c for details).
37071 + */
37072 +
37073 +/* inode operations */
37074 +int reiser4_setattr(struct dentry *, struct iattr *);
37075 +
37076 +/* file operations */
37077 +ssize_t reiser4_read_careful(struct file *, char __user *buf,
37078 +                            size_t count, loff_t *off);
37079 +ssize_t reiser4_write_careful(struct file *, const char __user *buf,
37080 +                             size_t count, loff_t * off);
37081 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
37082 +                         unsigned int cmd, unsigned long arg);
37083 +int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
37084 +int reiser4_open_careful(struct inode *inode, struct file *file);
37085 +int reiser4_release_careful(struct inode *, struct file *);
37086 +int reiser4_sync_file_common(struct file *, struct dentry *, int datasync);
37087 +
37088 +/* address space operations */
37089 +int reiser4_readpage(struct file *, struct page *);
37090 +int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
37091 +                     unsigned);
37092 +int reiser4_writepages(struct address_space *, struct writeback_control *);
37093 +int reiser4_prepare_write(struct file *, struct page *, unsigned from,
37094 +                         unsigned to);
37095 +int reiser4_commit_write(struct file *, struct page *, unsigned from,
37096 +                        unsigned to);
37097 +sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
37098 +
37099 +/*
37100 + * Private methods of unix-file plugin
37101 + * (UNIX_FILE_PLUGIN_ID)
37102 + */
37103 +
37104 +/* private inode operations */
37105 +int setattr_unix_file(struct dentry *, struct iattr *);
37106 +
37107 +/* private file operations */
37108 +
37109 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
37110 +                      loff_t *off);
37111 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
37112 +                       loff_t * off, struct psched_context * cont);
37113 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
37114 +                   unsigned long arg);
37115 +int mmap_unix_file(struct file *, struct vm_area_struct *);
37116 +int open_unix_file(struct inode *, struct file *);
37117 +int release_unix_file(struct inode *, struct file *);
37118 +
37119 +/* private address space operations */
37120 +int readpage_unix_file(struct file *, struct page *);
37121 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
37122 +int writepages_unix_file(struct address_space *, struct writeback_control *);
37123 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
37124 +                           unsigned to);
37125 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
37126 +                          unsigned to);
37127 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
37128 +
37129 +/* other private methods */
37130 +int delete_object_unix_file(struct inode *);
37131 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
37132 +                           int user, loff_t, loff_t, rw_op, flow_t *);
37133 +int owns_item_unix_file(const struct inode *, const coord_t *);
37134 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
37135 +                              int create);
37136 +
37137 +/*
37138 + * Private methods of cryptcompress file plugin
37139 + * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
37140 + */
37141 +
37142 +/* private inode operations */
37143 +int setattr_cryptcompress(struct dentry *, struct iattr *);
37144 +
37145 +/* private file operations */
37146 +ssize_t read_cryptcompress(struct file *, char __user *buf,
37147 +                          size_t count, loff_t *off);
37148 +ssize_t write_cryptcompress(struct file *, const char __user *buf,
37149 +                           size_t count, loff_t * off,
37150 +                           struct psched_context *cont);
37151 +int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
37152 +                       unsigned long arg);
37153 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
37154 +int open_cryptcompress(struct inode *, struct file *);
37155 +int release_cryptcompress(struct inode *, struct file *);
37156 +
37157 +/* private address space operations */
37158 +int readpage_cryptcompress(struct file *, struct page *);
37159 +int readpages_cryptcompress(struct file*, struct address_space*,
37160 +                           struct list_head*, unsigned);
37161 +int writepages_cryptcompress(struct address_space *,
37162 +                            struct writeback_control *);
37163 +int prepare_write_cryptcompress(struct file *, struct page *, unsigned from,
37164 +                               unsigned to);
37165 +int commit_write_cryptcompress(struct file *, struct page *, unsigned from,
37166 +                              unsigned to);
37167 +sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
37168 +
37169 +/* other private methods */
37170 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
37171 +                               int user, loff_t, loff_t, rw_op, flow_t *);
37172 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
37173 +int create_object_cryptcompress(struct inode *, struct inode *,
37174 +                               reiser4_object_create_data *);
37175 +int delete_object_cryptcompress(struct inode *);
37176 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
37177 +                                  int create);
37178 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
37179 +                                 const reiser4_key * to_key,
37180 +                                 reiser4_key * smallest_removed,
37181 +                                 struct inode *object, int truncate,
37182 +                                 int *progress);
37183 +void destroy_inode_cryptcompress(struct inode *);
37184 +
37185 +/*
37186 + * Private methods of symlink file plugin
37187 + * (SYMLINK_FILE_PLUGIN_ID)
37188 + */
37189 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
37190 +                          reiser4_object_create_data *);
37191 +void destroy_inode_symlink(struct inode *);
37192 +
37193 +/*
37194 + * all the write into unix file is performed by item write method. Write method
37195 + * of unix file plugin only decides which item plugin (extent or tail) and in
37196 + * which mode (one from the enum below) to call
37197 + */
37198 +typedef enum {
37199 +       FIRST_ITEM = 1,
37200 +       APPEND_ITEM = 2,
37201 +       OVERWRITE_ITEM = 3
37202 +} write_mode_t;
37203 +
37204 +/* unix file may be in one of the following states */
37205 +typedef enum {
37206 +       UF_CONTAINER_UNKNOWN = 0,       /* not determined yet */
37207 +       UF_CONTAINER_TAILS = 1,         /* file is built of tail items */
37208 +       UF_CONTAINER_EXTENTS = 2,       /* file is built of extent items */
37209 +       UF_CONTAINER_EMPTY = 3          /* set for a newly created file */
37210 +} file_container_t;
37211 +
37212 +struct formatting_plugin;
37213 +struct inode;
37214 +
37215 +/* unix file plugin specific part of reiser4 inode */
37216 +struct unix_file_info {
37217 +       /*
37218 +        * this read-write lock protects file containerization change. Accesses
37219 +        * which do not change file containerization (see file_container_t)
37220 +        * (read, readpage, writepage, write (until tail conversion is
37221 +        * involved)) take read-lock. Accesses which modify file
37222 +        * containerization (truncate, conversion from tail to extent and back)
37223 +        * take write-lock.
37224 +        */
37225 +       struct rw_semaphore latch;
37226 +       /* this enum specifies which items are used to build the file */
37227 +       file_container_t container;
37228 +       /*
37229 +        * plugin which controls when file is to be converted to extents and
37230 +        * back to tail
37231 +        */
37232 +       struct formatting_plugin *tplug;
37233 +       /* if this is set, file is in exclusive use */
37234 +       int exclusive_use;
37235 +#if REISER4_DEBUG
37236 +       /* pointer to task struct of thread owning exclusive access to file */
37237 +       void *ea_owner;
37238 +       atomic_t nr_neas;
37239 +       void *last_reader;
37240 +#endif
37241 +};
37242 +
37243 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
37244 +void get_exclusive_access(struct unix_file_info *);
37245 +void drop_exclusive_access(struct unix_file_info *);
37246 +void get_nonexclusive_access(struct unix_file_info *);
37247 +void drop_nonexclusive_access(struct unix_file_info *);
37248 +int try_to_get_nonexclusive_access(struct unix_file_info *);
37249 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
37250 +                  struct inode *);
37251 +int find_file_item_nohint(coord_t *, lock_handle *,
37252 +                         const reiser4_key *, znode_lock_mode,
37253 +                         struct inode *);
37254 +
37255 +int load_file_hint(struct file *, hint_t *);
37256 +void save_file_hint(struct file *, const hint_t *);
37257 +
37258 +#include "../item/extent.h"
37259 +#include "../item/tail.h"
37260 +#include "../item/ctail.h"
37261 +
37262 +struct uf_coord {
37263 +       coord_t coord;
37264 +       lock_handle *lh;
37265 +       int valid;
37266 +       union {
37267 +               struct extent_coord_extension extent;
37268 +               struct tail_coord_extension tail;
37269 +               struct ctail_coord_extension ctail;
37270 +       } extension;
37271 +};
37272 +
37273 +#include "../../forward.h"
37274 +#include "../../seal.h"
37275 +#include "../../lock.h"
37276 +
37277 +/*
37278 + * This structure is used to speed up file operations (reads and writes).  A
37279 + * hint is a suggestion about where a key resolved to last time.  A seal
37280 + * indicates whether a node has been modified since a hint was last recorded.
37281 + * You check the seal, and if the seal is still valid, you can use the hint
37282 + * without traversing the tree again.
37283 + */
37284 +struct hint {
37285 +       seal_t seal; /* a seal over last file item accessed */
37286 +       uf_coord_t ext_coord;
37287 +       loff_t offset;
37288 +       znode_lock_mode mode;
37289 +       lock_handle lh;
37290 +};
37291 +
37292 +static inline int hint_is_valid(hint_t * hint)
37293 +{
37294 +       return hint->ext_coord.valid; /* flag kept in the embedded uf_coord */
37295 +}
37296 +
37297 +static inline void hint_set_valid(hint_t * hint)
37298 +{
37299 +       hint->ext_coord.valid = 1; /* hint_is_valid() now returns 1 */
37300 +}
37301 +
37302 +static inline void hint_clr_valid(hint_t * hint)
37303 +{
37304 +       hint->ext_coord.valid = 0; /* hint_is_valid() now returns 0 */
37305 +}
37306 +
37307 +int load_file_hint(struct file *, hint_t *);
37308 +void save_file_hint(struct file *, const hint_t *);
37309 +void hint_init_zero(hint_t *);
37310 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
37311 +int hint_is_set(const hint_t *);
37312 +void reiser4_unset_hint(hint_t *);
37313 +
37314 +int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
37315 +int cut_file_items(struct inode *, loff_t new_size,
37316 +                  int update_sd, loff_t cur_size,
37317 +                  int (*update_actor) (struct inode *, loff_t, int));
37318 +#if REISER4_DEBUG
37319 +
37320 +/* return 1 if exclusive access is obtained, 0 - otherwise. "Obtained" is
37321 +   deduced from a failed attempt to take the latch for read */
37322 +static inline int ea_obtained(struct unix_file_info * uf_info)
37323 +{
37324 +       if (!down_read_trylock(&uf_info->latch))
37325 +               return 1;
37326 +       /* read lock was granted: give it back and report "not obtained" */
37327 +       up_read(&uf_info->latch);
37328 +       return 0;
37329 +}
37330 +
37331 +#endif
37332 +
37333 +#define WRITE_GRANULARITY 32
37334 +
37335 +int tail2extent(struct unix_file_info *);
37336 +int extent2tail(struct file *, struct unix_file_info *);
37337 +
37338 +int goto_right_neighbor(coord_t *, lock_handle *);
37339 +int find_or_create_extent(struct page *);
37340 +int equal_to_ldk(znode *, const reiser4_key *);
37341 +
37342 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
37343 +
37344 +static inline int cbk_errored(int cbk_result)
37345 +{
37346 +       return (cbk_result != CBK_COORD_NOTFOUND
37347 +               && cbk_result != CBK_COORD_FOUND);
37348 +}
37349 +
37350 +/* __REISER4_FILE_H__ */
37351 +#endif
37352 +
37353 +/*
37354 + * Local variables:
37355 + * c-indentation-style: "K&R"
37356 + * mode-name: "LC"
37357 + * c-basic-offset: 8
37358 + * tab-width: 8
37359 + * fill-column: 79
37360 + * scroll-step: 1
37361 + * End:
37362 +*/
37363 diff -puN /dev/null fs/reiser4/plugin/file/file_conversion.c
37364 --- /dev/null
37365 +++ a/fs/reiser4/plugin/file/file_conversion.c
37366 @@ -0,0 +1,689 @@
37367 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
37368 +   licensing governed by reiser4/README */
37369 +
37370 +/**
37371 + * This file contains plugin schedule hooks, and plugin conversion methods.
37372 + *
37373 + * Plugin schedule hook makes a decision (at plugin schedule point) about the
37374 + * most reasonable plugins for managing a regular file. Usually such a decision
37375 + * is made by some O(1)-heuristic.
37376 + *
37377 + * By default we assign a unix_file plugin id when writing incompressible file
37378 + * managed by cryptcompress plugin id. Currently used heuristic for estimating
37379 + * compressibility is very simple: if first complete logical cluster (64K by
37380 + * default) of a file is incompressible, then we make a decision, that the whole
37381 + * file is incompressible (*).
37382 + *
37383 + * To enable a conversion we install a special "magic" compression mode plugin
37384 + * (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for details)
37385 + * at file creation time (**).
37386 + *
37387 + * Note, that we don't perform back conversion (unix_file->cryptcompress)
37388 + * because of compatibility reasons (see http://dev.namesys.com/Version4.X.Y
37389 + * for details).
37390 + *
37391 + * The conversion is accompanied by rebuilding disk structures of a file, so it
37392 + * is important to protect them from interaction with other plugins which
37393 + * don't expect them to be in such an inconsistent state. To ensure this,
37394 + * we serialize readers and writers of a file's conversion set (FCS).
37395 + *
37396 + * We define FCS as a file plugin installed in inode's pset plus file's data
37397 + * and metadata that this file plugin manipulates with (items, etc).
37398 + * Note, that FCS is defined per file.
37399 + * FCS reader is defined as a set of instructions of the following type:
37400 + * {inode_file_plugin(inode)->method()} (I.e. retrieving a file plugin id
37401 + * conjoined with all method's instructions should be atomic).
37402 + * FCS writer is a set of instructions that perform file plugin conversion
37403 + * (convert items, update pset, etc).
37404 + * Example:
37405 + * reiser4_write_careful() supplied to VFS as a ->write() file operation is
37406 + * composed of the following (optional) instructions:
37407 + *             1              2                         3
37408 + * *********************** ####### -------------------------------------------->
37409 + *
37410 + * 1) "****" are instructions performed on behalf of cryptcompress file plugin;
37411 + * 2) "####" is a FCS writer (performing a conversion cryptcompress->unix_file);
37412 + * 3) "----" are instructions performed on behalf of unix_file plugin;
37413 + * Here (1) and (3) are FCS readers.
37414 + *
37415 + * In this example FCS readers and writers are already serialized (by design),
37416 + * however there can be readers and writers executing at the same time in
37417 + * different contexts, so we need a common mechanism of serialization.
37418 + *
37419 + * Currently serialization of FCS readers and writers is performed via acquiring
37420 + * a special per-inode rw-semaphore (conv_sem). And yes, {down, up}_read is for
37421 + * FCS readers, and  {down, up}_write is for FCS writers, see the macros below
37422 + * for passive/active protection.
37423 + *
37424 + * ---
37425 + * (*)  This heuristic can be changed to a better one (benchmarking is needed).
37426 + * (**) This technique allows keeping the enable/disable state on disk.
37427 + */
37428 +
37429 +#include "../../inode.h"
37430 +#include "../cluster.h"
37431 +#include "file.h"
37432 +/* True iff the inode's compression mode plugin is the "magic" CONVX one. */
37433 +#define conversion_enabled(inode)                                      \
37434 +        (inode_compression_mode_plugin(inode) ==                      \
37435 +         compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
37436 +
37437 +/**
37438 + * The sections below (readers and writers of @pset) are not permanently
37439 + * critical: a cryptcompress file can be converted only if the conversion
37440 + * is enabled (see the macro above), and we don't perform back conversion.
37441 + * The following helper macro decides if we need the protection (locks always
37442 + * add overhead): true only for a cryptcompress file with conversion enabled.
37443 + */
37444 +#define should_protect(inode)                                          \
37445 +       (inode_file_plugin(inode) ==                                    \
37446 +        file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) &&             \
37447 +        conversion_enabled(inode))
37448 +/**
37449 + * To avoid confusion with read/write file operations, we'll speak about
37450 + * "passive" protection for FCS readers and "active" protection for FCS
37451 + * writers. All methods with active or passive protection have suffix
37452 + * "careful".
37453 + */
37454 +/**
37455 + * Macros for passive protection.
37456 + *
37457 + * Construct invariant operation to be supplied to VFS.
37458 + * The macro accepts the following lexemes:
37459 + * @type - type of the value represented by the compound statement;
37460 + * @method - name of an operation to be supplied to VFS (reiser4 file
37461 + * plugin also should contain a method with such name).
37462 + */
37463 +#define PROT_PASSIVE(type, method, args)                               \
37464 +({                                                                     \
37465 +       type _result;                                                   \
37466 +       struct rw_semaphore * guard =                                   \
37467 +               &reiser4_inode_data(inode)->conv_sem;                   \
37468 +                                                                       \
37469 +       if (should_protect(inode)) {                                    \
37470 +               down_read(guard);                                       \
37471 +               if (!should_protect(inode))                             \
37472 +                       up_read(guard);                                 \
37473 +       }                                                               \
37474 +       _result = inode_file_plugin(inode)->method args;                \
37475 +       if (should_protect(inode))                                      \
37476 +               up_read(guard);                                         \
37477 +       _result;                                                        \
37478 +})
37479 +
37480 +#define PROT_PASSIVE_VOID(method, args)                                        \
37481 +({                                                                     \
37482 +       struct rw_semaphore * guard =                                   \
37483 +               &reiser4_inode_data(inode)->conv_sem;                   \
37484 +                                                                       \
37485 +       if (should_protect(inode)) {                                    \
37486 +               down_read(guard);                                       \
37487 +               if (!should_protect(inode))                             \
37488 +                       up_read(guard);                                 \
37489 +       }                                                               \
37490 +       inode_file_plugin(inode)->method args;                          \
37491 +                                                                       \
37492 +       if (should_protect(inode))                                      \
37493 +               up_read(guard);                                         \
37494 +})
37495 +
37496 +/* Pass management to the unix-file plugin with "notail" policy; 0 or error */
37497 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
37498 +{
37499 +       int result;
37500 +       reiser4_inode *info;
37501 +       struct unix_file_info * uf;
37502 +       info = reiser4_inode_data(inode);
37503 +
37504 +       result = aset_set_unsafe(&info->pset,
37505 +                           PSET_FILE,
37506 +                           (reiser4_plugin *)
37507 +                           file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
37508 +       if (result)
37509 +               return result;
37510 +       result = aset_set_unsafe(&info->pset,
37511 +                           PSET_FORMATTING,
37512 +                           (reiser4_plugin *)
37513 +                           formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
37514 +       if (result)
37515 +               return result; /* NOTE(review): PSET_FILE set above stays */
37516 +       /* get rid of non-standard plugins */
37517 +       info->plugin_mask &= ~cryptcompress_mask;
37518 +       /* get rid of plugin stat-data extension */
37519 +       info->extmask &= ~(1 << PLUGIN_STAT);
37520 +
37521 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
37522 +
37523 +       /* FIXME use init_inode_data_unix_file() instead,
37524 +          but avoid init_inode_ordering() */
37525 +       /* Init unix-file specific part of inode */
37526 +       uf = unix_file_inode_data(inode);
37527 +       uf->container = UF_CONTAINER_UNKNOWN;
37528 +       init_rwsem(&uf->latch);
37529 +       uf->tplug = inode_formatting_plugin(inode);
37530 +       uf->exclusive_use = 0;
37531 +#if REISER4_DEBUG
37532 +       uf->ea_owner = NULL;
37533 +       atomic_set(&uf->nr_neas, 0);
37534 +#endif
37535 +       /**
37536 +        * we were careful for file_ops, inode_ops and as_ops
37537 +        * to be invariant for plugin conversion, so there is
37538 +        * no need to update ones already installed in the
37539 +        * vfs's residence.
37540 +        */
37541 +       return 0;
37542 +}
37543 +/* Debug check: listed stat-data extension bits and mode plugin bit are set. */
37544 +#if REISER4_DEBUG
37545 +static int disabled_conversion_inode_ok(struct inode * inode)
37546 +{
37547 +       __u64 extmask = reiser4_inode_data(inode)->extmask;
37548 +       __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
37549 +
37550 +       return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
37551 +               (extmask & (1 << UNIX_STAT)) &&
37552 +               (extmask & (1 << LARGE_TIMES_STAT)) &&
37553 +               (extmask & (1 << PLUGIN_STAT)) &&
37554 +               (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
37555 +}
37556 +#endif
37557 +
37558 +/**
37559 + * Disable future attempts to schedule/convert file plugin.
37560 + * This function is called by plugin schedule hooks.
37561 + * Returns the result of force_plugin_pset() (zero is treated as success).
37562 + * To disable conversion we assign a compression mode plugin id different
37563 + * from CONVX_COMPRESSION_MODE_ID (LATTD_COMPRESSION_MODE_ID is used here).
37564 + */
37565 +static int disable_conversion(struct inode * inode)
37566 +{
37567 +       int result;
37568 +       result =
37569 +              force_plugin_pset(inode,
37570 +                                PSET_COMPRESSION_MODE,
37571 +                                (reiser4_plugin *)compression_mode_plugin_by_id
37572 +                                (LATTD_COMPRESSION_MODE_ID));
37573 +       assert("edward-1500",
37574 +              ergo(!result, disabled_conversion_inode_ok(inode)));
37575 +       return result;
37576 +}
37577 +
37578 +/** Check if we really have achieved plugin scheduling point.
37579 + * If so, set cont->state to PSCHED_SCHED_POINT. Returns 0, or the result of
37580 + * disable_conversion() when the write starts beyond the end of file. */
37581 +static int check_psched_point(struct inode * inode,
37582 +                             loff_t pos /* position in the
37583 +                                           file to write from */,
37584 +                             struct cluster_handle * clust, /* not used here */
37585 +                             struct psched_context * cont)
37586 +{
37587 +       assert("edward-1505", conversion_enabled(inode));
37588 +       /*
37589 +        * if file size is more than cluster size, then compressible
37590 +        * status must be figured out (i.e. compression was disabled,
37591 +        * or file plugin was converted to unix_file)
37592 +        */
37593 +       assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
37594 +
37595 +       if (pos > inode->i_size)
37596 +               /* first logical cluster will contain a (partial) hole */
37597 +               return disable_conversion(inode);
37598 +       if (pos < inode_cluster_size(inode))
37599 +               /* writing to the first logical cluster */
37600 +               return 0;
37601 +       /*
37602 +        * here we have:
37603 +        * cluster_size <= pos <= i_size <= cluster_size,
37604 +        * and, hence,  pos == i_size == cluster_size
37605 +        */
37606 +       assert("edward-1498",
37607 +              pos == inode->i_size &&
37608 +              pos == inode_cluster_size(inode));
37609 +       assert("edward-1539", cont != NULL);
37610 +       assert("edward-1540", cont->state == PSCHED_INVAL_STATE);
37611 +
37612 +       cont->state = PSCHED_SCHED_POINT; /* checked by write_pschedule_hook() */
37613 +       return 0;
37614 +}
37615 +/* Point @clust to logical cluster #0 and to zeroed @hint for the check. */
37616 +static void start_check_compressibility(struct inode * inode,
37617 +                                       struct cluster_handle * clust,
37618 +                                       hint_t * hint)
37619 +{
37620 +       assert("edward-1507", clust->index == 1);
37621 +       assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
37622 +       assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
37623 +
37624 +       hint_init_zero(hint);
37625 +       clust->hint = hint;
37626 +       clust->index --; /* asserted to be 1 above, so this selects cluster #0 */
37627 +       clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
37628 +
37629 +       /* first logical cluster (of index #0) must be complete */
37630 +       assert("edward-1510", lbytes(clust->index, inode) ==
37631 +              inode_cluster_size(inode));
37632 +}
37633 +/* Unset the hint currently in @clust, install @hint, step index forward. */
37634 +static void finish_check_compressibility(struct inode * inode,
37635 +                                        struct cluster_handle * clust,
37636 +                                        hint_t * hint)
37637 +{
37638 +       reiser4_unset_hint(clust->hint);
37639 +       clust->hint = hint;
37640 +       clust->index ++; /* @inode is not used in this function */
37641 +}
37642 +/* Debug check: @hint's coord is a prepped ctail ending at the cluster dsize. */
37643 +#if REISER4_DEBUG
37644 +static int prepped_dclust_ok(hint_t * hint)
37645 +{
37646 +       reiser4_key key;
37647 +       coord_t * coord = &hint->ext_coord.coord;
37648 +
37649 +       item_key_by_coord(coord, &key);
37650 +       return (item_id_by_coord(coord) == CTAIL_ID &&
37651 +               !coord_is_unprepped_ctail(coord) &&
37652 +               (get_key_offset(&key) + nr_units_ctail(coord) ==
37653 +                dclust_get_extension_dsize(hint)));
37654 +}
37655 +#endif
37656 +/* half of @size; arguments are parenthesized so any expression is safe */
37657 +#define fifty_persent(size) ((size) >> 1)
37658 +/* evaluation of data compressibility: output is under half of the input */
37659 +#define data_is_compressible(osize, isize)             \
37660 +       ((osize) < fifty_persent(isize))
37661 +
37662 +/**
37663 + * A simple O(1)-heuristic for compressibility.
37664 + * This is called not more than one time per file's life.
37665 + * Read first logical cluster (of index #0) and estimate its compressibility.
37666 + * Save estimation result in @cont->state. Returns 0 or an error code.
37667 + * On every exit after start_check_compressibility() @clust gets its hint and
37668 + * index back, so it never keeps a pointer to the on-stack @tmp_hint.
37669 + */
37670 +static int read_check_compressibility(struct inode * inode,
37671 +                                     struct cluster_handle * clust,
37672 +                                     struct psched_context * cont)
37673 +{
37674 +       int i;
37675 +       int result;
37676 +       __u32 dst_len;
37677 +       hint_t tmp_hint;
37678 +       hint_t * cur_hint = clust->hint;
37679 +       assert("edward-1541", cont->state == PSCHED_SCHED_POINT);
37680 +
37681 +       start_check_compressibility(inode, clust, &tmp_hint);
37682 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
37683 +       result = grab_page_cluster(inode, clust, READ_OP);
37684 +       if (result)
37685 +               goto restore;
37686 +       /* Read page cluster here */
37687 +       for (i = 0; i < clust->nr_pages; i++) {
37688 +               struct page *page = clust->pages[i];
37689 +               lock_page(page);
37690 +               result = do_readpage_ctail(inode, clust, page,
37691 +                                          ZNODE_READ_LOCK);
37692 +               unlock_page(page);
37693 +               if (result)
37694 +                       goto error;
37695 +       }
37696 +       tfm_cluster_clr_uptodate(&clust->tc);
37697 +       cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
37698 +
37699 +       if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
37700 +               /* length of compressed data is known, no need to compress */
37701 +               assert("edward-1511", znode_is_any_locked(tmp_hint.lh.node));
37702 +               assert("edward-1512",
37703 +                      WITH_DATA(tmp_hint.ext_coord.coord.node,
37704 +                                prepped_dclust_ok(&tmp_hint)));
37705 +               dst_len = dclust_get_extension_dsize(&tmp_hint);
37706 +       } else {
37707 +               struct tfm_cluster * tc = &clust->tc;
37708 +               compression_plugin * cplug = inode_compression_plugin(inode);
37709 +               result = grab_tfm_stream(inode, tc, INPUT_STREAM);
37710 +               if (result)
37711 +                       goto error;
37712 +               for (i = 0; i < clust->nr_pages; i++) {
37713 +                       char *data;
37714 +                       lock_page(clust->pages[i]);
37715 +                       BUG_ON(!PageUptodate(clust->pages[i]));
37716 +                       data = kmap(clust->pages[i]);
37717 +                       memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
37718 +                              data, PAGE_CACHE_SIZE);
37719 +                       kunmap(clust->pages[i]);
37720 +                       unlock_page(clust->pages[i]);
37721 +               }
37722 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
37723 +               if (result)
37724 +                       goto error;
37725 +               result = grab_coa(tc, cplug);
37726 +               if (result)
37727 +                       goto error;
37728 +               tc->len = tc->lsize = lbytes(clust->index, inode);
37729 +               assert("edward-1513", tc->len == inode_cluster_size(inode));
37730 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
37731 +               cplug->compress(get_coa(tc, cplug->h.id, tc->act),
37732 +                               tfm_input_data(clust), tc->len,
37733 +                               tfm_output_data(clust), &dst_len);
37734 +               assert("edward-1514",
37735 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
37736 +       }
37737 +       finish_check_compressibility(inode, clust, cur_hint);
37738 +       cont->state =
37739 +               (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
37740 +                PSCHED_REMAINS_OLD :
37741 +                PSCHED_ASSIGNED_NEW);
37742 +       return 0;
37743 + error:
37744 +       put_page_cluster(clust, inode, READ_OP);
37745 + restore: /* undo start_check_compressibility() */
37746 +       finish_check_compressibility(inode, clust, cur_hint);
37747 +       return result;
37748 +}
37749 +
37750 +/* Cut disk cluster of index @idx; returns the result of reiser4_cut_tree() */
37751 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
37752 +{
37753 +       reiser4_key from, to; /* both ends of the key range passed to the cut */
37754 +       assert("edward-1515", inode_file_plugin(inode) ==
37755 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
37756 +       key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
37757 +       to = from;
37758 +       set_key_offset(&to,
37759 +                      get_key_offset(&from) + inode_cluster_size(inode) - 1);
37760 +       return reiser4_cut_tree(reiser4_tree_by_inode(inode),
37761 +                               &from, &to, inode, 0);
37762 +}
37763 +/* Grab disk space for the conversion; returns reiser4_grab_space() result. */
37764 +static int reserve_cryptcompress2unixfile(struct inode *inode)
37765 +{
37766 +       reiser4_block_nr unformatted_nodes;
37767 +       reiser4_tree *tree;
37768 +
37769 +       tree = reiser4_tree_by_inode(inode);
37770 +
37771 +       /* number of unformatted nodes which will be created */
37772 +       unformatted_nodes = cluster_nrpages(inode); /* N */
37773 +
37774 +       /*
37775 +        * space required for one iteration of conversion (the list below
37776 +        * describes cryptcompress->unix_file, not extent->tail as was said):
37777 +        *     1. kill ctail items
37778 +        *
37779 +        *     2. insert N unformatted nodes
37780 +        *
37781 +        *     3. insert N (worst-case single-block
37782 +        *     extents) extent units.
37783 +        *
37784 +        *     4. drilling to the leaf level by coord_by_key()
37785 +        *
37786 +        *     5. possible update of stat-data
37787 +        *
37788 +        */
37789 +       grab_space_enable();
37790 +       return reiser4_grab_space
37791 +               (2 * tree->height +
37792 +                unformatted_nodes  +
37793 +                unformatted_nodes * estimate_one_insert_into_item(tree) +
37794 +                1 + estimate_one_insert_item(tree) +
37795 +                inode_file_plugin(inode)->estimate.update(inode),
37796 +                BA_CAN_COMMIT);
37797 +}
37798 +
37799 +/** Convert cryptcompress file plugin to unix_file plugin: cut disk cluster #0,
37800 + * switch the plugins, then insert cont->pages as extents and update stat-data.
37801 + * Returns 0 or an error code; all_grabbed2free() is called on every path. */
37802 +static int cryptcompress2unixfile(struct file * file, struct inode * inode,
37803 +                                 struct psched_context * cont)
37804 +{
37805 +       int i;
37806 +       int result = 0;
37807 +       struct cryptcompress_info *cr_info;
37808 +       struct unix_file_info *uf_info;
37809 +       assert("edward-1516", cont->pages[0]->index == 0);
37810 +
37811 +       /* NOTE(review): cr_info is assigned here but never used below */
37812 +       cr_info = cryptcompress_inode_data(inode);
37813 +       result = reserve_cryptcompress2unixfile(inode);
37814 +       if (result)
37815 +               goto out;
37816 +       /* tell kill_hook to not truncate pages (not cleared in this function) */
37817 +       reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
37818 +       result = cut_disk_cluster(inode, 0);
37819 +       if (result)
37820 +               goto out;
37821 +       /* captured jnode of cluster and associated resources (pages,
37822 +          reserved disk space) were released by ->kill_hook() method
37823 +          of the item plugin */
37824 +
37825 +       result = __cryptcompress2unixfile(file, inode);
37826 +       if (result)
37827 +               goto out;
37828 +       /* At this point file is managed by unix file plugin */
37829 +
37830 +       uf_info = unix_file_inode_data(inode);
37831 +
37832 +       assert("edward-1518",
37833 +              ergo(jprivate(cont->pages[0]),
37834 +                   !jnode_is_cluster_page(jprivate(cont->pages[0]))));
37835 +       for(i = 0; i < cont->nr_pages; i++) {
37836 +               assert("edward-1519", cont->pages[i]);
37837 +               assert("edward-1520", PageUptodate(cont->pages[i]));
37838 +
37839 +               result = find_or_create_extent(cont->pages[i]);
37840 +               if (result)
37841 +                       break;
37842 +       }
37843 +       if (unlikely(result))
37844 +               goto out;
37845 +       uf_info->container = UF_CONTAINER_EXTENTS;
37846 +       result = reiser4_update_sd(inode);
37847 + out:
37848 +       all_grabbed2free();
37849 +       return result;
37850 +}
37851 +
37852 +#define convert_file_plugin cryptcompress2unixfile /* alias used by ->write() */
37853 +
37854 +/**
37855 + * This is called by ->write() method of a cryptcompress file plugin.
37856 + * Make a decision about the most reasonable file plugin id to manage
37857 + * the file. Result is left in cont->state; returns 0 or an error code.
37858 + */
37859 +int write_pschedule_hook(struct file * file, struct inode * inode,
37860 +                        loff_t pos, struct cluster_handle * clust,
37861 +                        struct psched_context * cont)
37862 +{
37863 +       int result;
37864 +       if (!conversion_enabled(inode))
37865 +               return 0;
37866 +       result = check_psched_point(inode, pos, clust, cont);
37867 +       if (result || cont->state != PSCHED_SCHED_POINT)
37868 +               return result; /* error, or not at the scheduling point */
37869 +       result = read_check_compressibility(inode, clust, cont);
37870 +       if (result)
37871 +               return result;
37872 +       if (cont->state == PSCHED_REMAINS_OLD) {
37873 +               put_page_cluster(clust, inode, READ_OP);
37874 +               return disable_conversion(inode);
37875 +       }
37876 +       assert("edward-1543", cont->state == PSCHED_ASSIGNED_NEW);
37877 +       /*
37878 +        * page cluster is grabbed and uptodate. It will be
37879 +        * released with a pgset after plugin conversion is
37880 +        * finished, see put_psched_context().
37881 +        */
37882 +       reiser4_unset_hint(clust->hint);
37883 +       move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
37884 +       return 0;
37885 +}
37886 +
37887 +/**
37888 + * Called by ->setattr() of cryptcompress file plugin: disables conversion.
37889 + */
37890 +int setattr_pschedule_hook(struct inode * inode)
37891 +{
37892 +       if (conversion_enabled(inode))
37893 +               return disable_conversion(inode);
37894 +       return 0; /* conversion is already disabled, nothing to do */
37895 +}
37896 +/* Zero the whole @cont. */
37897 +static inline void init_psched_context(struct psched_context * cont)
37898 +{
37899 +       memset(cont, 0, sizeof(*cont));
37900 +}
37901 +/* Put the pages kept in @cont, if any, and free the page array itself. */
37902 +static inline void done_psched_context(struct psched_context * cont,
37903 +                                      struct inode * inode)
37904 +{
37905 +       if (cont->pages) {
37906 +               __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
37907 +               kfree(cont->pages);
37908 +       }
37909 +}
37910 +/**
37911 + * Here are wrappers with "protection", aka Reiser4 "careful" methods.
37912 + * They are used by vfs (as methods of file_ops, inode_ops or as_ops),
37913 + * which is not aware of plugin conversion performed by Reiser4.
37914 + */
37915 +
37916 +/*
37917 + * Wrappers with active protection for:
37918 + *
37919 + * ->write();
37920 + */
37921 +
37922 +/*
37923 + * ->write() file operation supplied to VFS. Write a file in 3 steps (some of
37924 + * them can be optional). Returns bytes written by both plugins or an error.
37925 + */
37926 +ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
37927 +                             size_t count, loff_t *off)
37928 +{
37929 +       int result;
37930 +       reiser4_context *ctx;
37931 +       ssize_t written_old = 0; /* bytes written with initial plugin */
37932 +       ssize_t written_new = 0; /* bytes written with new plugin */
37933 +       struct psched_context cont;
37934 +       struct inode * inode = file->f_dentry->d_inode;
37935 +
37936 +       ctx = reiser4_init_context(inode->i_sb);
37937 +       if (IS_ERR(ctx))
37938 +               return PTR_ERR(ctx);
37939 +       init_psched_context(&cont);
37940 +       mutex_lock(&inode->i_mutex);
37941 +       /**
37942 +        * First step.
37943 +        * Start write with initial file plugin.
37944 +        * Keep a plugin schedule status at @cont (if any).
37945 +        */
37946 +       written_old = inode_file_plugin(inode)->write(file,
37947 +                                                     buf,
37948 +                                                     count,
37949 +                                                     off,
37950 +                                                     &cont);
37951 +       if (cont.state != PSCHED_ASSIGNED_NEW || written_old < 0)
37952 +               goto exit;
37953 +       /**
37954 +        * Second step. New file plugin has been scheduled: perform conversion.
37955 +        * NOTE(review): conv_sem is taken for read here, while the comment at
37956 +        * the top of this file assigns {down,up}_write to FCS writers - confirm.
37957 +        */
37958 +       down_read(&reiser4_inode_data(inode)->conv_sem);
37959 +       result = convert_file_plugin(file, inode, &cont);
37960 +       up_read(&reiser4_inode_data(inode)->conv_sem);
37961 +       if (result) {
37962 +               warning("edward-1544",
37963 +                       "Inode %llu: file plugin conversion failed (%d)",
37964 +                       (unsigned long long)get_inode_oid(inode),
37965 +                       result);
37966 +               context_set_commit_async(ctx);
37967 +               goto exit; /* only warned about: written_old is still returned */
37968 +       }
37969 +       reiser4_txn_restart(ctx);
37970 +       /**
37971 +        * Third step:
37972 +        * Finish write with the new file plugin.
37973 +        */
37974 +       assert("edward-1536",
37975 +              inode_file_plugin(inode) ==
37976 +              file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
37977 +
37978 +       written_new = inode_file_plugin(inode)->write(file,
37979 +                                                     buf + written_old,
37980 +                                                     count - written_old,
37981 +                                                     off,
37982 +                                                     NULL);
37983 + exit:
37984 +       mutex_unlock(&inode->i_mutex);
37985 +       done_psched_context(&cont, inode);
37986 +       reiser4_exit_context(ctx);
37987 +       /* a negative written_new (third step error) is counted as 0 bytes */
37988 +       return written_old + (written_new < 0 ? 0 : written_new);
37989 +}
37990 +
37991 +/* Wrappers with passive protection for:
37992 + *
37993 + * ->open();
37994 + * ->read();
37995 + * ->ioctl();
37996 + * ->mmap();
37997 + * ->release();
37998 + * ->bmap().
37999 + */
38000 +/* ->open(): call the file plugin's open() under passive protection. */
38001 +int reiser4_open_careful(struct inode *inode, struct file *file)
38002 +{
38003 +       return PROT_PASSIVE(int, open, (inode, file));
38004 +}
38005 +/* ->read(): call the file plugin's read() under passive protection. */
38006 +ssize_t reiser4_read_careful(struct file * file, char __user * buf,
38007 +                            size_t size, loff_t * off)
38008 +{
38009 +       struct inode * inode = file->f_dentry->d_inode; /* used by PROT_PASSIVE */
38010 +       return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
38011 +}
38012 +/* ->ioctl(): call the file plugin's ioctl() under passive protection. */
38013 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
38014 +                         unsigned int cmd, unsigned long arg)
38015 +{
38016 +       return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
38017 +}
38018 +/* ->mmap(): call the file plugin's mmap() under passive protection. */
38019 +int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
38020 +{
38021 +       struct inode *inode = file->f_dentry->d_inode; /* used by PROT_PASSIVE */
38022 +       return PROT_PASSIVE(int, mmap, (file, vma));
38023 +}
38024 +/* ->release(): call the file plugin's release() under passive protection. */
38025 +int reiser4_release_careful(struct inode *inode, struct file *file)
38026 +{
38027 +       return PROT_PASSIVE(int, release, (inode, file));
38028 +}
38029 +/* ->bmap(): call the file plugin's bmap() under passive protection. */
38030 +sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
38031 +{
38032 +       struct inode *inode = mapping->host; /* used by PROT_PASSIVE */
38033 +       return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
38034 +}
38035 +
38036 +/*
38037 + * Wrappers without protection for:
38038 + * ->setattr(): dispatches straight to the current file plugin's setattr();
38039 + * conv_sem is not taken here.
38040 + */
38041 +int reiser4_setattr(struct dentry *dentry, struct iattr *attr)
38042 +{
38043 +       return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr);
38044 +}
38045 +
38046 +/*
38047 +  Local variables:
38048 +  c-indentation-style: "K&R"
38049 +  mode-name: "LC"
38050 +  c-basic-offset: 8
38051 +  tab-width: 8
38052 +  fill-column: 80
38053 +  scroll-step: 1
38054 +  End:
38055 +*/
38056 diff -puN /dev/null fs/reiser4/plugin/file/symfile.c
38057 --- /dev/null
38058 +++ a/fs/reiser4/plugin/file/symfile.c
38059 @@ -0,0 +1,87 @@
38060 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38061 +
38062 +/* Symfiles are a generalization of Unix symlinks.
38063 +
38064 +   A symfile when read behaves as though you took its contents and
38065 +   substituted them into the reiser4 naming system as the right hand side
38066 +   of an assignment, and then read that which you had assigned to it.
38067 +
38068 +   A key issue for symfiles is how to implement writes through to
38069 +   subfiles.  In general, one must have some method of determining what
38070 +   of that which is written to the symfile is written to what subfile.
38071 +   This can be done by use of custom plugin methods written by users, or
38072 +   by using a few general methods we provide for those willing to endure
38073 +   the insertion of delimiters into what is read.
38074 +
38075 +   Writing to symfiles without delimiters to denote what is written to
38076 +   what subfile is not supported by any plugins we provide in this
38077 +   release.  Our most sophisticated support for writes is that embodied
38078 +   by the invert plugin (see invert.c).
38079 +
38080 +   A read only version of the /etc/passwd file might be
38081 +   constructed as a symfile whose contents are as follows:
38082 +
38083 +   /etc/passwd/userlines/*
38084 +
38085 +   or
38086 +
38087 +   /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
38088 +
38089 +   or
38090 +
38091 +   /etc/passwd/userlines/(demidov+edward+reiser+root)
38092 +
38093 +   A symfile with contents
38094 +
38095 +   /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
38096 +
38097 +   will return when read
38098 +
38099 +   The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
38100 +
38101 +   and write of what has been read will not be possible to implement as
38102 +   an identity operation because there are no delimiters denoting the
38103 +   boundaries of what is to be written to what subfile.
38104 +
38105 +   Note that one could make this a read/write symfile if one specified
38106 +   delimiters, and the write method understood those delimiters delimited
38107 +   what was written to subfiles.
38108 +
38109 +   So, specifying the symfile in a manner that allows writes:
38110 +
38111 +   /etc/passwd/userlines/demidov+"(
38112 +   )+/etc/passwd/userlines/edward+"(
38113 +   )+/etc/passwd/userlines/reiser+"(
38114 +   )+/etc/passwd/userlines/root+"(
38115 +   )
38116 +
38117 +   or
38118 +
38119 +   /etc/passwd/userlines/(demidov+"(
38120 +   )+edward+"(
38121 +   )+reiser+"(
38122 +   )+root+"(
38123 +   ))
38124 +
38125 +   and the file demidov might be specified as:
38126 +
38127 +   /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
38128 +
38129 +   or
38130 +
38131 +   /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
38132 +
38133 +   Notice that if the file demidov has a carriage return in it, the
38134 +   parsing fails, but then if you put carriage returns in the wrong place
38135 +   in a normal /etc/passwd file it breaks things also.
38136 +
38137 +   Note that it is forbidden to have no text between two interpolations
38138 +   if one wants to be able to define what parts of a write go to what
38139 +   subfiles referenced in an interpolation.
38140 +
38141 +   If one wants to be able to add new lines by writing to the file, one
38142 +   must either write a custom plugin for /etc/passwd that knows how to
38143 +   name an added line, or one must use an invert, or one must use a more
38144 +   sophisticated symfile syntax that we are not planning to write for
38145 +   version 4.0.
38146 +*/
38147 diff -puN /dev/null fs/reiser4/plugin/file/symlink.c
38148 --- /dev/null
38149 +++ a/fs/reiser4/plugin/file/symlink.c
38150 @@ -0,0 +1,95 @@
38151 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
38152 +
38153 +#include "../../inode.h"
38154 +
38155 +#include <linux/types.h>
38156 +#include <linux/fs.h>
38157 +
38158 +/* file plugin methods specific for symlink files
38159 +   (SYMLINK_FILE_PLUGIN_ID) */
38160 +
38161 +/* this is implementation of create_object method of file plugin for
38162 +   SYMLINK_FILE_PLUGIN_ID
38163 + */
38164 +
38165 +/**
38166 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
38167 + * @symlink: inode of symlink object
38168 + * @dir: inode of parent directory
38169 + * @data: parameters of new object
38170 + *
38171 + * Inserts stat data with symlink extension (the target path) into the tree.
38172 + */
38173 +int reiser4_create_symlink(struct inode *symlink,
38174 +                          struct inode *dir UNUSED_ARG,
38175 +                          reiser4_object_create_data *data /* info passed to us
38176 +                                                            * this is filled by
38177 +                                                            * reiser4() syscall
38178 +                                                            * in particular */)
38179 +{
38180 +       int result;
38181 +
38182 +       assert("nikita-680", symlink != NULL);
38183 +       assert("nikita-681", S_ISLNK(symlink->i_mode));
38184 +       assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
38185 +       assert("nikita-682", dir != NULL);
38186 +       assert("nikita-684", data != NULL);
38187 +       assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
38188 +
38189 +       /*
38190 +        * stat data of symlink has symlink extension in which we store
38191 +        * symlink content, that is, path symlink is pointing to.
38192 +        */
38193 +       reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
38194 +
38195 +       assert("vs-838", symlink->i_private == NULL);
38196 +       symlink->i_private = (void *)data->name;
38197 +
38198 +       assert("vs-843", symlink->i_size == 0);
38199 +       INODE_SET_FIELD(symlink, i_size, strlen(data->name));
38200 +
38201 +       /* insert stat data appended with data->name */
38202 +       result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
38203 +       if (result) {
38204 +               /* FIXME-VS: Make sure that symlink->i_private is not attached
38205 +                  to kmalloced data */
38206 +               INODE_SET_FIELD(symlink, i_size, 0);
38207 +       } else {
38208 +               assert("vs-849", symlink->i_private
38209 +                      && reiser4_inode_get_flag(symlink,
38210 +                                                REISER4_GENERIC_PTR_USED));
38211 +               assert("vs-850",
38212 +                      !memcmp((char *)symlink->i_private, data->name,
38213 +                              (size_t) symlink->i_size + 1));
38214 +       }
38215 +       return result;
38216 +}
38217 +
38218 +/* implementation of destroy_inode method of file plugin for
38219 +   SYMLINK_FILE_PLUGIN_ID: frees the data attached to inode->i_private
38220 +   and clears REISER4_GENERIC_PTR_USED flag */
38221 +void destroy_inode_symlink(struct inode *inode)
38222 +{
38223 +       assert("edward-799",
38224 +              inode_file_plugin(inode) ==
38225 +              file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
38226 +       assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
38227 +       assert("edward-801", reiser4_inode_get_flag(inode,
38228 +                                                   REISER4_GENERIC_PTR_USED));
38229 +       assert("vs-839", S_ISLNK(inode->i_mode));
38230 +
38231 +       kfree(inode->i_private);
38232 +       inode->i_private = NULL;
38233 +       reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
38234 +}
38235 +
38236 +/*
38237 +  Local variables:
38238 +  c-indentation-style: "K&R"
38239 +  mode-name: "LC"
38240 +  c-basic-offset: 8
38241 +  tab-width: 8
38242 +  fill-column: 80
38243 +  scroll-step: 1
38244 +  End:
38245 +*/
38246 diff -puN /dev/null fs/reiser4/plugin/file/tail_conversion.c
38247 --- /dev/null
38248 +++ a/fs/reiser4/plugin/file/tail_conversion.c
38249 @@ -0,0 +1,737 @@
38250 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
38251 +
38252 +#include "../../inode.h"
38253 +#include "../../super.h"
38254 +#include "../../page_cache.h"
38255 +#include "../../carry.h"
38256 +#include "../../safe_link.h"
38257 +#include "../../vfs_ops.h"
38258 +
38259 +#include <linux/writeback.h>
38260 +
38261 +/* this file contains:
38262 +   tail2extent and extent2tail */
38263 +
38264 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
38265 +void get_exclusive_access(struct unix_file_info * uf_info)
38266 +{
38267 +       assert("nikita-3028", reiser4_schedulable());
38268 +       assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
38269 +       assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
38270 +       /*
38271 +        * "deadlock avoidance": sometimes we commit a transaction under
38272 +        * rw-semaphore on a file. Such commit can deadlock with another
38273 +        * thread that captured some block (hence preventing atom from being
38274 +        * committed) and waits on rw-semaphore.
38275 +        */
38276 +       reiser4_txn_restart_current();
38277 +       LOCK_CNT_INC(inode_sem_w);
38278 +       down_write(&uf_info->latch);
38279 +       uf_info->exclusive_use = 1;
38280 +       assert("vs-1713", uf_info->ea_owner == NULL);
38281 +       assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
38282 +       ON_DEBUG(uf_info->ea_owner = current);
38283 +}
38284 +
38285 +void drop_exclusive_access(struct unix_file_info * uf_info)
38286 +{
38287 +       assert("vs-1714", uf_info->ea_owner == current);
38288 +       assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
38289 +       ON_DEBUG(uf_info->ea_owner = NULL);
38290 +       uf_info->exclusive_use = 0;
38291 +       up_write(&uf_info->latch);
38292 +       assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
38293 +       assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
38294 +       LOCK_CNT_DEC(inode_sem_w);
38295 +       reiser4_txn_restart_current();
38296 +}
38297 +
38298 +/**
38299 + * nea_grabbed - do something when file semaphore is down_read-ed
38300 + * @uf_info: unix file specific part of inode the access was obtained on
38301 + *
38302 + * This is called when nonexclusive access is obtained on file. All it does is
38303 + * for debugging purposes: it is a no-op unless REISER4_DEBUG is set.
38304 + */
38305 +static void nea_grabbed(struct unix_file_info *uf_info)
38306 +{
38307 +#if REISER4_DEBUG
38308 +       LOCK_CNT_INC(inode_sem_r);
38309 +       assert("vs-1716", uf_info->ea_owner == NULL);
38310 +       atomic_inc(&uf_info->nr_neas);
38311 +       uf_info->last_reader = current;
38312 +#endif
38313 +}
38314 +
38315 +/**
38316 + * get_nonexclusive_access - get nonexclusive access to a file
38317 + * @uf_info: unix file specific part of inode to obtain access to
38318 + *
38319 + * Nonexclusive access is obtained on a file before read, write, readpage.
38320 + */
38321 +void get_nonexclusive_access(struct unix_file_info *uf_info)
38322 +{
38323 +       assert("nikita-3029", reiser4_schedulable());
38324 +       assert("nikita-3361", get_current_context()->trans->atom == NULL);
38325 +
38326 +       down_read(&uf_info->latch);
38327 +       nea_grabbed(uf_info);
38328 +}
38329 +
38330 +/**
38331 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
38332 + * @uf_info: unix file specific part of inode to obtain access to
38333 + *
38334 + * Non-blocking version: returns nonzero if access was obtained, 0 otherwise.
38335 + */
38336 +int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
38337 +{
38338 +       int result;
38339 +
38340 +       result = down_read_trylock(&uf_info->latch);
38341 +       if (result)
38342 +               nea_grabbed(uf_info);
38343 +       return result;
38344 +}
38345 +
38346 +void drop_nonexclusive_access(struct unix_file_info * uf_info)
38347 +{
38348 +       assert("vs-1718", uf_info->ea_owner == NULL);
38349 +       assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
38350 +       ON_DEBUG(atomic_dec(&uf_info->nr_neas));
38351 +
38352 +       up_read(&uf_info->latch);
38353 +
38354 +       LOCK_CNT_DEC(inode_sem_r);
38355 +       reiser4_txn_restart_current();
38356 +}
38357 +
38358 +/* part of tail2extent. Cut all items covering @count bytes starting from
38359 +   @offset */
38360 +/* Audited by: green(2002.06.15) */
38361 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
38362 +{
38363 +       reiser4_key from, to;
38364 +
38365 +       /* AUDIT: How about putting an assertion here, what would check
38366 +          all provided range is covered by tail items only? */
38367 +       /* key of first byte in the range to be cut  */
38368 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
38369 +
38370 +       /* key of last byte in that range */
38371 +       to = from;
38372 +       set_key_offset(&to, (__u64) (offset + count - 1));
38373 +
38374 +       /* cut everything between those keys */
38375 +       return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
38376 +                               inode, 0);
38377 +}
38378 +
38379 +static void release_all_pages(struct page **pages, unsigned nr_pages)
38380 +{
38381 +       unsigned i;
38382 +
38383 +       for (i = 0; i < nr_pages; i++) {
38384 +               if (pages[i] == NULL) {
38385 +#if REISER4_DEBUG
38386 +                       unsigned j;
38387 +                       for (j = i + 1; j < nr_pages; j++)
38388 +                               assert("vs-1620", pages[j] == NULL);
38389 +#endif
38390 +                       break;
38391 +               }
38392 +               page_cache_release(pages[i]);
38393 +               pages[i] = NULL;
38394 +       }
38395 +}
38396 +
38397 +/* part of tail2extent. replace tail items with extent one. Content of tail
38398 +   items (@count bytes) being cut are copied already into
38399 +   pages. find_or_create_extent is called to create extents corresponding to
38400 +   those pages */
38401 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
38402 +{
38403 +       int result;
38404 +       unsigned i;
38405 +       STORE_COUNTERS;
38406 +
38407 +       if (nr_pages == 0)
38408 +               return 0;
38409 +
38410 +       assert("vs-596", pages[0]);
38411 +
38412 +       /* cut copied items */
38413 +       result = cut_formatting_items(inode, page_offset(pages[0]), count);
38414 +       if (result)
38415 +               return result;
38416 +
38417 +       CHECK_COUNTERS;
38418 +
38419 +       /* put into tree replacement for just removed items: extent item, namely */
38420 +       for (i = 0; i < nr_pages; i++) {
38421 +               result = add_to_page_cache_lru(pages[i], inode->i_mapping,
38422 +                                              pages[i]->index,
38423 +                                              mapping_gfp_mask(inode->
38424 +                                                               i_mapping));
38425 +               if (result)
38426 +                       break;
38427 +               unlock_page(pages[i]);
38428 +               result = find_or_create_extent(pages[i]);
38429 +               if (result)
38430 +                       break;
38431 +               SetPageUptodate(pages[i]);
38432 +       }
38433 +       return result;
38434 +}
38435 +
38436 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
38437 +                                * items */
38438 +
38439 +static int reserve_tail2extent_iteration(struct inode *inode)
38440 +{
38441 +       reiser4_block_nr unformatted_nodes;
38442 +       reiser4_tree *tree;
38443 +
38444 +       tree = reiser4_tree_by_inode(inode);
38445 +
38446 +       /* number of unformatted nodes which will be created (unused below) */
38447 +       unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
38448 +
38449 +       /*
38450 +        * space required for one iteration of tail->extent conversion:
38451 +        *
38452 +        *     1. kill N tail items
38453 +        *
38454 +        *     2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
38455 +        *
38456 +        *     3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
38457 +        *     extents) extent units.
38458 +        *
38459 +        *     4. drilling to the leaf level by coord_by_key()
38460 +        *
38461 +        *     5. possible update of stat-data
38462 +        *
38463 +        */
38464 +       grab_space_enable();
38465 +       return reiser4_grab_space
38466 +           (2 * tree->height +
38467 +            TAIL2EXTENT_PAGE_NUM +
38468 +            TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
38469 +            1 + estimate_one_insert_item(tree) +
38470 +            inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
38471 +}
38472 +
38473 +/* clear REISER4_PART_MIXED in stat data; failure is only warned, returns 0 */
38474 +static int complete_conversion(struct inode *inode)
38475 +{
38476 +       int result;
38477 +
38478 +       grab_space_enable();
38479 +       result =
38480 +           reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
38481 +                              BA_CAN_COMMIT);
38482 +       if (result == 0) {
38483 +               reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
38484 +               result = reiser4_update_sd(inode);
38485 +       }
38486 +       if (result)
38487 +               warning("vs-1696", "Failed to clear converting bit of %llu: %i",
38488 +                       (unsigned long long)get_inode_oid(inode), result);
38489 +       return 0;
38490 +}
38491 +
38492 +/**
38493 + * find_start - find the offset an uncompleted conversion stopped at
38494 + * @inode: inode of the file being converted
38495 + * @id: id of item plugin to look for
38496 + * @offset: in: offset to start search from, out: offset the search ended at
38497 + *
38498 + * this is used by tail2extent and extent2tail to detect where previous
38499 + * uncompleted conversion stopped
38500 + */
38501 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
38502 +{
38503 +       int result;
38504 +       lock_handle lh;
38505 +       coord_t coord;
38506 +       struct unix_file_info *ufo;
38507 +       int found;
38508 +       reiser4_key key;
38509 +
38510 +       ufo = unix_file_inode_data(inode);
38511 +       init_lh(&lh);
38512 +       result = 0;
38513 +       found = 0;
38514 +       inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
38515 +       do {
38516 +               init_lh(&lh);
38517 +               result = find_file_item_nohint(&coord, &lh, &key,
38518 +                                              ZNODE_READ_LOCK, inode);
38519 +
38520 +               if (result == CBK_COORD_FOUND) {
38521 +                       if (coord.between == AT_UNIT) {
38522 +                               /*coord_clear_iplug(&coord); */
38523 +                               result = zload(coord.node);
38524 +                               if (result == 0) {
38525 +                                       if (item_id_by_coord(&coord) == id)
38526 +                                               found = 1;
38527 +                                       else
38528 +                                               item_plugin_by_coord(&coord)->s.
38529 +                                                   file.append_key(&coord,
38530 +                                                                   &key);
38531 +                                       zrelse(coord.node);
38532 +                               }
38533 +                       } else
38534 +                               result = RETERR(-ENOENT);
38535 +               }
38536 +               done_lh(&lh);
38537 +       } while (result == 0 && !found);
38538 +       *offset = get_key_offset(&key);
38539 +       return result;
38540 +}
38541 +
38542 +/**
38543 + * tail2extent - convert tail items of a file into extent items
38544 + * @uf_info: unix file specific part of inode of the file to convert
38545 + *
38546 + * Called with exclusive access held. Returns 0 on success, error otherwise.
38547 + */
38548 +int tail2extent(struct unix_file_info *uf_info)
38549 +{
38550 +       int result;
38551 +       reiser4_key key;        /* key of next byte to be moved to page */
38552 +       char *p_data;           /* data of page */
38553 +       unsigned page_off = 0,  /* offset within the page where to copy data */
38554 +           count;              /* number of bytes of item which can be
38555 +                                * copied to page */
38556 +       struct page *pages[TAIL2EXTENT_PAGE_NUM];
38557 +       struct page *page;
38558 +       int done;               /* set to 1 when all file is read */
38559 +       char *item;
38560 +       int i;
38561 +       struct inode *inode;
38562 +       int first_iteration;
38563 +       int bytes;
38564 +       __u64 offset;
38565 +
38566 +       assert("nikita-3362", ea_obtained(uf_info));
38567 +       inode = unix_file_info_to_inode(uf_info);
38568 +       assert("nikita-3412", !IS_RDONLY(inode));
38569 +       assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
38570 +       assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
38571 +
38572 +       offset = 0;
38573 +       first_iteration = 1;
38574 +       result = 0;
38575 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
38576 +               /*
38577 +                * file is marked on disk as there was a conversion which did
38578 +                * not complete due to either crash or some error. Find which
38579 +                * offset tail conversion stopped at
38580 +                */
38581 +               result = find_start(inode, FORMATTING_ID, &offset);
38582 +               if (result == -ENOENT) {
38583 +                       /* no tail items found, everything is converted */
38584 +                       uf_info->container = UF_CONTAINER_EXTENTS;
38585 +                       complete_conversion(inode);
38586 +                       return 0;
38587 +               } else if (result != 0)
38588 +                       /* some other error */
38589 +                       return result;
38590 +               first_iteration = 0;
38591 +       }
38592 +
38593 +       reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
38594 +
38595 +       /* get key of first byte of a file */
38596 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
38597 +
38598 +       done = 0;
38599 +       while (done == 0) {
38600 +               memset(pages, 0, sizeof(pages));
38601 +               result = reserve_tail2extent_iteration(inode);
38602 +               if (result != 0) {
38603 +                       reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
38604 +                       goto out;
38605 +               }
38606 +               if (first_iteration) {
38607 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
38608 +                       reiser4_update_sd(inode);
38609 +                       first_iteration = 0;
38610 +               }
38611 +               bytes = 0;
38612 +               for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
38613 +                       assert("vs-598",
38614 +                              (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
38615 +                       page = alloc_page(reiser4_ctx_gfp_mask_get());
38616 +                       if (!page) {
38617 +                               result = RETERR(-ENOMEM);
38618 +                               goto error;
38619 +                       }
38620 +
38621 +                       page->index =
38622 +                           (unsigned long)(get_key_offset(&key) >>
38623 +                                           PAGE_CACHE_SHIFT);
38624 +                       /*
38625 +                        * usually when one is going to longterm lock znode (as
38626 +                        * find_file_item does, for instance) he must not hold
38627 +                        * locked pages. However, there is an exception for
38628 +                        * case tail2extent. Pages appearing here are not
38629 +                        * reachable to everyone else, they are clean, they do
38630 +                        * not have jnodes attached so keeping them locked do
38631 +                        * not risk deadlock appearance
38632 +                        */
38633 +                       assert("vs-983", !PagePrivate(page));
38634 +                       reiser4_invalidate_pages(inode->i_mapping, page->index,
38635 +                                                1, 0);
38636 +
38637 +                       for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
38638 +                               coord_t coord;
38639 +                               lock_handle lh;
38640 +
38641 +                               /* get next item */
38642 +                               /* FIXME: we might want to readahead here */
38643 +                               init_lh(&lh);
38644 +                               result =
38645 +                                   find_file_item_nohint(&coord, &lh, &key,
38646 +                                                         ZNODE_READ_LOCK,
38647 +                                                         inode);
38648 +                               if (result != CBK_COORD_FOUND) {
38649 +                                       /*
38650 +                                        * error happened or no items of file
38651 +                                        * were found
38652 +                                        */
38653 +                                       done_lh(&lh);
38654 +                                       page_cache_release(page);
38655 +                                       goto error;
38656 +                               }
38657 +
38658 +                               if (coord.between == AFTER_UNIT) {
38659 +                                       /*
38660 +                                        * end of file is reached. Pad page
38661 +                                        * with zeros
38662 +                                        */
38663 +                                       done_lh(&lh);
38664 +                                       done = 1;
38665 +                                       p_data = kmap_atomic(page, KM_USER0);
38666 +                                       memset(p_data + page_off, 0,
38667 +                                              PAGE_CACHE_SIZE - page_off);
38668 +                                       kunmap_atomic(p_data, KM_USER0);
38669 +                                       break;
38670 +                               }
38671 +
38672 +                               result = zload(coord.node);
38673 +                               if (result) {
38674 +                                       page_cache_release(page);
38675 +                                       done_lh(&lh);
38676 +                                       goto error;
38677 +                               }
38678 +                               assert("vs-856", coord.between == AT_UNIT);
38679 +                               item = ((char *)item_body_by_coord(&coord)) +
38680 +                                       coord.unit_pos;
38681 +
38682 +                               /* how many bytes to copy */
38683 +                               count =
38684 +                                   item_length_by_coord(&coord) -
38685 +                                   coord.unit_pos;
38686 +                               /* limit length of copy to end of page */
38687 +                               if (count > PAGE_CACHE_SIZE - page_off)
38688 +                                       count = PAGE_CACHE_SIZE - page_off;
38689 +
38690 +                               /*
38691 +                                * copy item (as much as will fit starting from
38692 +                                * the beginning of the item) into the page
38693 +                                */
38694 +                               p_data = kmap_atomic(page, KM_USER0);
38695 +                               memcpy(p_data + page_off, item, count);
38696 +                               kunmap_atomic(p_data, KM_USER0);
38697 +
38698 +                               page_off += count;
38699 +                               bytes += count;
38700 +                               set_key_offset(&key,
38701 +                                              get_key_offset(&key) + count);
38702 +
38703 +                               zrelse(coord.node);
38704 +                               done_lh(&lh);
38705 +                       } /* end of loop which fills one page by content of
38706 +                          * formatting items */
38707 +
38708 +                       if (page_off) {
38709 +                               /* something was copied into page */
38710 +                               pages[i] = page;
38711 +                       } else {
38712 +                               page_cache_release(page);
38713 +                               assert("vs-1648", done == 1);
38714 +                               break;
38715 +                       }
38716 +               } /* end of loop through pages of one conversion iteration */
38717 +
38718 +               if (i > 0) {
38719 +                       result = replace(inode, pages, i, bytes);
38720 +                       release_all_pages(pages, sizeof_array(pages));
38721 +                       if (result)
38722 +                               goto error;
38723 +                       /*
38724 +                        * We have to drop exclusive access to avoid deadlock
38725 +                        * which may happen because called by reiser4_writepages
38726 +                        * capture_unix_file requires to get non-exclusive
38727 +                        * access to a file. It is safe to drop EA in the middle
38728 +                        * of tail2extent conversion because write_unix_file,
38729 +                        * setattr_unix_file(truncate), mmap_unix_file,
38730 +                        * release_unix_file(extent2tail) checks if conversion
38731 +                        * is not in progress (see comments before
38732 +                        * get_exclusive_access_careful()).
38733 +                        * Other processes that acquire non-exclusive access
38734 +                        * (read_unix_file, reiser4_writepages, etc) should work
38735 +                        * on partially converted files.
38736 +                        */
38737 +                       drop_exclusive_access(uf_info);
38738 +                       /* throttle the conversion */
38739 +                       reiser4_throttle_write(inode);
38740 +                       get_exclusive_access(uf_info);
38741 +
38742 +                       /*
38743 +                        * nobody is allowed to complete conversion but a
38744 +                        * process which started it
38745 +                        */
38746 +                       assert("", reiser4_inode_get_flag(inode,
38747 +                                                         REISER4_PART_MIXED));
38748 +               }
38749 +       }
38750 +       if (result == 0) {
38751 +               /* file is converted to extent items */
38752 +               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
38753 +               assert("vs-1697", reiser4_inode_get_flag(inode,
38754 +                                                        REISER4_PART_MIXED));
38755 +
38756 +               uf_info->container = UF_CONTAINER_EXTENTS;
38757 +               complete_conversion(inode);
38758 +       } else {
38759 +               /*
38760 +                * conversion is not complete. Inode was already marked as
38761 +                * REISER4_PART_MIXED and stat-data were updated at the first
38762 +                * iteration of the loop above.
38763 +                */
38764 +       error:
38765 +               release_all_pages(pages, sizeof_array(pages));
38766 +               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
38767 +               warning("edward-1548", "Partial conversion of %llu: %i",
38768 +                       (unsigned long long)get_inode_oid(inode), result);
38769 +       }
38770 +
38771 + out:
38772 +       /* this flag should be cleared, otherwise get_exclusive_access_careful()
38773 +          will fall into infinite loop */
38774 +       assert("edward-1549", !reiser4_inode_get_flag(inode,
38775 +                                                     REISER4_PART_IN_CONV));
38776 +       return result;
38777 +}
38778 +
38779 +static int reserve_extent2tail_iteration(struct inode *inode)
38780 +{
38781 +       reiser4_tree *tree = reiser4_tree_by_inode(inode);
38782 +
38783 +       /*
38784 +        * Space needed by one iteration of extent->tail conversion; the
38785 +        * terms of the sum below follow this order:
38786 +        *
38787 +        *     1. removal of extent item
38788 +        *
38789 +        *     2. insertion of tail by insert_flow()
38790 +        *
38791 +        *     3. drilling to the leaf level by coord_by_key()
38792 +        *
38793 +        *     4. possible update of stat-data
38794 +        */
38795 +       grab_space_enable();
38796 +       return reiser4_grab_space(estimate_one_item_removal(tree) +
38797 +                                 estimate_insert_flow(tree->height) +
38798 +                                 1 + estimate_one_insert_item(tree) +
38799 +                                 inode_file_plugin(inode)->estimate.update(inode),
38800 +                                 BA_CAN_COMMIT);
38801 +}
38802 +
38803 +/* for every page of file: read page, cut part of extent pointing to this page,
38804 +   put data of page into the tree as tail items. Returns 0 or an error code */
38805 +int extent2tail(struct file * file, struct unix_file_info *uf_info)
38806 +{
38807 +       int result;
38808 +       struct inode *inode;
38809 +       struct page *page;
38810 +       unsigned long num_pages, i;
38811 +       unsigned long start_page;
38812 +       reiser4_key from;
38813 +       reiser4_key to;
38814 +       unsigned count, done;
38815 +       __u64 offset;
38816 +
38817 +       assert("nikita-3362", ea_obtained(uf_info));
38818 +       inode = unix_file_info_to_inode(uf_info);
38819 +       assert("nikita-3412", !IS_RDONLY(inode));
38820 +       assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
38821 +       assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
38822 +
38823 +       offset = 0;
38824 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
38825 +               /*
38826 +                * file is marked on disk as there was a conversion which did
38827 +                * not complete due to either crash or some error. Find which
38828 +                * offset tail conversion stopped at
38829 +                */
38830 +               result = find_start(inode, EXTENT_POINTER_ID, &offset);
38831 +               if (result == -ENOENT) {
38832 +                       /* no extent found, everything is converted */
38833 +                       uf_info->container = UF_CONTAINER_TAILS;
38834 +                       complete_conversion(inode);
38835 +                       return 0;
38836 +               } else if (result != 0)
38837 +                       /* some other error */
38838 +                       return result;
38839 +       }
38840 +
38841 +       reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
38842 +
38843 +       /* number of pages still to be converted, counting from @offset */
38844 +       num_pages =
38845 +           (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
38846 +       start_page = offset >> PAGE_CACHE_SHIFT;
38847 +
38848 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
38849 +       to = from;
38850 +
38851 +       result = 0;
38852 +       for (i = 0; i < num_pages; i++) {
38853 +               __u64 start_byte;
38854 +
38855 +               result = reserve_extent2tail_iteration(inode);
38856 +               if (result != 0)
38857 +                       break;
38858 +               if (i == 0 && offset == 0) {
38859 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
38860 +                       reiser4_update_sd(inode);
38861 +               }
38862 +
38863 +               page = read_mapping_page(inode->i_mapping,
38864 +                                        i + start_page, NULL);
38865 +               if (IS_ERR(page)) {
38866 +                       result = PTR_ERR(page);
38867 +                       break;
38868 +               }
38869 +
38870 +               wait_on_page_locked(page);
38871 +               if (!PageUptodate(page)) {
38872 +                       page_cache_release(page);
38873 +                       result = RETERR(-EIO);
38874 +                       break;
38875 +               }
38876 +
38877 +               /* cut part of file we have read; widen before shifting */
38878 +               start_byte = (__u64)(i + start_page) << PAGE_CACHE_SHIFT;
38879 +               set_key_offset(&from, start_byte);
38880 +               set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
38881 +               /*
38882 +                * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
38883 +                * commits during over-long truncates. But
38884 +                * extent->tail conversion should be performed in one
38885 +                * transaction.
38886 +                */
38887 +               result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
38888 +                                         &to, inode, 0);
38889 +               if (result) {
38890 +                       page_cache_release(page);
38891 +                       break;
38892 +               }
38893 +
38894 +               /* put page data into tree, resuming after short writes */
38895 +               count = PAGE_CACHE_SIZE;
38896 +               if ((i == (num_pages - 1)) &&
38897 +                   (inode->i_size & ~PAGE_CACHE_MASK))
38898 +                       /* last page can be incompleted */
38899 +                       count = (inode->i_size & ~PAGE_CACHE_MASK);
38900 +               done = 0;
38901 +               while (count) {
38902 +                       loff_t pos = start_byte + done;
38903 +
38904 +                       assert("edward-1537",
38905 +                              file != NULL && file->f_dentry != NULL);
38906 +                       assert("edward-1538",
38907 +                              file->f_dentry->d_inode == inode);
38908 +                       result = reiser4_write_tail(file, inode,
38909 +                                                   (char __user *)kmap(page) + done,
38910 +                                                   count, &pos);
38911 +                       kunmap(page);
38912 +                       reiser4_free_file_fsdata(file);
38913 +                       if (result <= 0) {
38914 +                               warning("", "reiser4_write_tail failed");
38915 +                               page_cache_release(page);
38916 +                               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
38917 +                               return result;
38918 +                       }
38919 +                       count -= result;
38920 +                       done += result;
38921 +               }
38922 +
38923 +               /* release page */
38924 +               lock_page(page);
38925 +               /* page is already detached from jnode and mapping. */
38926 +               assert("vs-1086", page->mapping == NULL);
38927 +               assert("nikita-2690",
38928 +                      (!PagePrivate(page) && jprivate(page) == 0));
38929 +               /* waiting for writeback completion with page lock held is
38930 +                * perfectly valid. */
38931 +               wait_on_page_writeback(page);
38932 +               reiser4_drop_page(page);
38933 +               /* release reference taken by read_mapping_page() above */
38934 +               page_cache_release(page);
38935 +
38936 +               drop_exclusive_access(uf_info);
38937 +               /* throttle the conversion */
38938 +               reiser4_throttle_write(inode);
38939 +               get_exclusive_access(uf_info);
38940 +               /*
38941 +                * nobody is allowed to complete conversion but a process which
38942 +                * started it
38943 +                */
38944 +               assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
38945 +       }
38946 +
38947 +       reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
38948 +
38949 +       if (i == num_pages) {
38950 +               /* file is converted to formatted items */
38951 +               assert("vs-1698", reiser4_inode_get_flag(inode,
38952 +                                                        REISER4_PART_MIXED));
38953 +               assert("vs-1260",
38954 +                      inode_has_no_jnodes(reiser4_inode_data(inode)));
38955 +
38956 +               uf_info->container = UF_CONTAINER_TAILS;
38957 +               complete_conversion(inode);
38958 +               return 0;
38959 +       }
38960 +       /*
38961 +        * conversion is not complete. Inode was already marked as
38962 +        * REISER4_PART_MIXED and stat-data were updated at the first
38963 +        * iteration of the loop above.
38964 +        */
38965 +       warning("nikita-2282",
38966 +               "Partial conversion of %llu: %lu of %lu: %i",
38967 +               (unsigned long long)get_inode_oid(inode), i,
38968 +               num_pages, result);
38969 +
38970 +       /* this flag should be cleared, otherwise get_exclusive_access_careful()
38971 +          will fall into infinite loop */
38972 +       assert("edward-1550", !reiser4_inode_get_flag(inode,
38973 +                                                     REISER4_PART_IN_CONV));
38974 +       return result;
38975 +}
38976 +
38977 +/*
38978 + * Local variables:
38979 + * c-indentation-style: "K&R"
38980 + * mode-name: "LC"
38981 + * c-basic-offset: 8
38982 + * tab-width: 8
38983 + * fill-column: 79
38984 + * scroll-step: 1
38985 + * End:
38986 + */
38987 diff -puN /dev/null fs/reiser4/plugin/file_ops.c
38988 --- /dev/null
38989 +++ a/fs/reiser4/plugin/file_ops.c
38990 @@ -0,0 +1,205 @@
38991 +/* Copyright 2005 by Hans Reiser, licensing governed by
38992 +   reiser4/README */
38993 +
38994 +/* this file contains typical implementations for some of methods of
38995 +   struct file_operations and of struct address_space_operations
38996 +*/
38997 +
38998 +#include "../inode.h"
38999 +#include "object.h"
39000 +
39001 +/* file operations */
39002 +
39003 +/* implementation of vfs's llseek method of struct file_operations for
39004 +   typical directory can be found in readdir_common.c
39005 +*/
39006 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
39007 +
39008 +/* implementation of vfs's readdir method of struct file_operations for
39009 +   typical directory can be found in readdir_common.c
39010 +*/
39011 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
39012 +
39013 +/**
39014 + * reiser4_release_dir_common - release of struct file_operations
39015 + * @inode: inode of released file
39016 + * @file: file to release
39017 + *
39018 + * Release method of struct file_operations for a typical directory. Frees
39019 + * the reiser4 specific data of @file; fails only if context setup fails.
39020 +*/
39021 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
39022 +{
39023 +       reiser4_context *ctx = reiser4_init_context(inode->i_sb);
39024 +
39025 +       if (IS_ERR(ctx))
39026 +               return PTR_ERR(ctx);
39027 +       /* the only work to do: drop reiser4 specific data of @file */
39028 +       reiser4_free_file_fsdata(file);
39029 +       reiser4_exit_context(ctx);
39030 +       return 0;
39031 +}
39032 +
39033 +/* Common implementation of the VFS fsync method of struct file_operations:
39034 +   calls txnmgr_force_commit_all() for the super block of @dentry.
39035 +*/
39036 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
39037 +{
39038 +       struct super_block *sb = dentry->d_inode->i_sb;
39039 +       reiser4_context *ctx = reiser4_init_context(sb);
39040 +       int result;
39041 +
39042 +       if (IS_ERR(ctx))
39043 +               return PTR_ERR(ctx);
39044 +
39045 +       result = txnmgr_force_commit_all(sb, 0);
39046 +       context_set_commit_async(ctx);
39047 +       reiser4_exit_context(ctx);
39048 +       return result;
39049 +}
39050 +
39051 +/*
39052 + * common sync method for regular files.
39053 + *
39054 + * We are trying to be smart here. Instead of committing all atoms (original
39055 + * solution), grab space for a stat-data update, write stat-data and call
39056 + * force_commit_atom() on the transaction handle of the current context.
39057 + * Returns 0, the context init error, or -ENOSPC if space cannot be grabbed.
39058 + * Situation is complicated by anonymous pages: i.e., extent-less pages
39059 + * dirtied through mmap. Fortunately sys_fsync() first calls
39060 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
39061 + * all missing extents and capture anonymous pages.
39062 + */
39063 +int reiser4_sync_file_common(struct file *file,
39064 +                            struct dentry *dentry, int datasync)
39065 +{
39066 +       reiser4_context *ctx;
39067 +       txn_atom *atom;
39068 +       reiser4_block_nr reserve;
39069 +
39070 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
39071 +       if (IS_ERR(ctx))
39072 +               return PTR_ERR(ctx);
39073 +
39074 +       reserve = estimate_update_common(dentry->d_inode);
39075 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
39076 +               reiser4_exit_context(ctx);
39077 +               return RETERR(-ENOSPC);
39078 +       }
39079 +       write_sd_by_inode_common(dentry->d_inode);
39080 +
39081 +       atom = get_current_atom_locked();
39082 +       spin_lock_txnh(ctx->trans);
39083 +       force_commit_atom(ctx->trans);
39084 +       reiser4_exit_context(ctx);
39085 +       return 0;
39086 +}
39087 +
39088 +/* Common implementation of the VFS sendfile method of struct
39089 +   file_operations. Compiled out: the definition below is under #if 0.
39090 +
39091 +   Reads @count bytes from @file and calls @actor for every page read. This is
39092 +   needed for loop back devices support.
39093 +*/
39094 +#if 0
39095 +ssize_t
39096 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
39097 +               read_actor_t actor, void *target)
39098 +{
39099 +       reiser4_context *ctx;
39100 +       ssize_t result;
39101 +
39102 +       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
39103 +       if (IS_ERR(ctx))
39104 +               return PTR_ERR(ctx);
39105 +       result = generic_file_sendfile(file, ppos, count, actor, target);
39106 +       reiser4_exit_context(ctx);
39107 +       return result;
39108 +}
39109 +#endif  /*  0  */
39110 +
39111 +/* address space operations */
39112 +
39113 +/* Common implementation of the VFS prepare_write method of struct
39114 +   address_space_operations. Returns the context init error, otherwise
39115 +   the result of do_prepare_write(). */
39116 +int
39117 +prepare_write_common(struct file *file, struct page *page, unsigned from,
39118 +                    unsigned to)
39119 +{
39120 +       reiser4_context *ctx;
39121 +       int result;
39122 +
39123 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
39124 +       if (IS_ERR(ctx))
39125 +               return PTR_ERR(ctx);
39126 +       result = do_prepare_write(file, page, from, to);
39127 +       /* don't commit transaction under inode semaphore */
39128 +       context_set_commit_async(ctx);
39129 +       reiser4_exit_context(ctx);
39130 +       return result;
39131 +}
39132 +
39133 +/* helper for prepare_write_common and prepare_write_unix_file: brings locked
39134 +   @page uptodate via ->readpage() unless the whole page is being written */
39135 +int
39136 +do_prepare_write(struct file *file, struct page *page, unsigned from,
39137 +                unsigned to)
39138 +{
39139 +       int result;
39140 +       file_plugin *fplug;
39141 +       struct inode *inode;
39142 +
39143 +       assert("umka-3099", file != NULL);
39144 +       assert("umka-3100", page != NULL);
39145 +       assert("umka-3095", PageLocked(page));
39146 +
39147 +       if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
39148 +               return 0;
39149 +
39150 +       inode = page->mapping->host;
39151 +       fplug = inode_file_plugin(inode);
39152 +
39153 +       if (page->mapping->a_ops->readpage == NULL)
39154 +               return RETERR(-EINVAL);
39155 +
39156 +       result = page->mapping->a_ops->readpage(file, page);
39157 +       if (result != 0) {
39158 +               SetPageError(page);
39159 +               ClearPageUptodate(page);
39160 +               /* All reiser4 readpage() implementations should return the
39161 +                * page locked in case of error. */
39162 +               assert("nikita-3472", PageLocked(page));
39163 +       } else {
39164 +               /*
39165 +                * ->readpage() either:
39166 +                *
39167 +                *     1. starts IO against @page. @page is locked for IO in
39168 +                *     this case.
39169 +                *
39170 +                *     2. doesn't start IO. @page is unlocked.
39171 +                *
39172 +                * In either case, the page has to be locked again here.
39173 +                */
39174 +               lock_page(page);
39175 +               /*
39176 +                * IO (if any) is completed at this point. Check for IO
39177 +                * errors.
39178 +                */
39179 +               if (!PageUptodate(page))
39180 +                       result = RETERR(-EIO);
39181 +       }
39182 +       assert("umka-3098", PageLocked(page));
39183 +       return result;
39184 +}
39185 +
39186 +/*
39187 + * Local variables:
39188 + * c-indentation-style: "K&R"
39189 + * mode-name: "LC"
39190 + * c-basic-offset: 8
39191 + * tab-width: 8
39192 + * fill-column: 79
39193 + * scroll-step: 1
39194 + * End:
39195 + */
39196 diff -puN /dev/null fs/reiser4/plugin/file_ops_readdir.c
39197 --- /dev/null
39198 +++ a/fs/reiser4/plugin/file_ops_readdir.c
39199 @@ -0,0 +1,658 @@
39200 +/* Copyright 2005 by Hans Reiser, licensing governed by
39201 + * reiser4/README */
39202 +
39203 +#include "../inode.h"
39204 +
39205 +/* true iff the item at @coord is a directory entry item and the file
39206 + * plugin of @inode reports it as owned by @inode. */
39207 +static int is_valid_dir_coord(struct inode *inode, coord_t *coord)
39208 +{
39209 +       if (!plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
39210 +               return 0;
39211 +       return inode_file_plugin(inode)->owns_item(inode, coord) != 0;
39212 +}
39213 +
39214 +/* order two logical positions within one directory: by key, then by ->pos */
39215 +static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2)
39216 +{
39217 +       cmp_t by_key;
39218 +       int diff;
39219 +
39220 +       assert("nikita-2534", p1 != NULL);
39221 +       assert("nikita-2535", p2 != NULL);
39222 +
39223 +       by_key = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
39224 +       if (by_key != EQUAL_TO)
39225 +               return by_key;
39226 +       /* identical keys: fall back to comparing the ->pos counters */
39227 +       diff = p1->pos - p2->pos;
39228 +       if (diff < 0)
39229 +               return LESS_THAN;
39230 +       return diff ? GREATER_THAN : EQUAL_TO;
39231 +}
39232 +
39233 +/* update @readdir_spot after an entry was added/removed at @mod_point; see
39234 + * comment before reiser4_readdir_common() for why this is necessary. */
39235 +static void
39236 +adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot,
39237 +              const struct dir_pos *mod_point, int adj)
39238 +{
39239 +       struct dir_pos *pos;
39240 +
39241 +       /*
39242 +        * new directory entry was added (adj == +1) or removed (adj == -1) at
39243 +        * the @mod_point. Directory file descriptor @dir is doing readdir and
39244 +        * is currently positioned at @readdir_spot. Latter has to be updated
39245 +        * to maintain stable readdir.
39246 +        */
39247 +       /* directory is positioned to the beginning. */
39248 +       if (readdir_spot->entry_no == 0)
39249 +               return;
39250 +
39251 +       pos = &readdir_spot->position;
39252 +       switch (dir_pos_cmp(mod_point, pos)) {
39253 +       case LESS_THAN:
39254 +               /* @mod_point is _before_ @readdir_spot, that is, entry was
39255 +                * added/removed on the left (in key order) of current
39256 +                * position. */
39257 +               /* logical number of directory entry readdir is "looking" at
39258 +                * changes */
39259 +               readdir_spot->entry_no += adj;
39260 +               assert("nikita-2577",
39261 +                      ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
39262 +               if (de_id_cmp(&pos->dir_entry_key,
39263 +                             &mod_point->dir_entry_key) == EQUAL_TO) {
39264 +                       assert("nikita-2575", mod_point->pos < pos->pos);
39265 +                       /*
39266 +                        * if entry added/removed has the same key as current
39267 +                        * for readdir, update counter of duplicate keys in
39268 +                        * @readdir_spot.
39269 +                        */
39270 +                       pos->pos += adj;
39271 +               }
39272 +               break;
39273 +       case GREATER_THAN:
39274 +               /* directory is modified after @pos: nothing to do. */
39275 +               break;
39276 +       case EQUAL_TO:
39277 +               /* cannot insert an entry readdir is looking at, because it
39278 +                  already exists. */
39279 +               assert("nikita-2576", adj < 0);
39280 +               /* directory entry to which @pos points to is being
39281 +                  removed.
39282 +
39283 +                  NOTE-NIKITA: Right thing to do is to update @pos to point
39284 +                  to the next entry. This is complex (we are under spin-lock
39285 +                  for one thing). Just rewind it to the beginning. Next
39286 +                  readdir will have to scan the beginning of
39287 +                  directory. Proper solution is to use semaphore in
39288 +                  spin lock's stead and use rewind_right() here.
39289 +
39290 +                  NOTE-NIKITA: now, semaphore is used, so...
39291 +                */
39292 +               memset(readdir_spot, 0, sizeof *readdir_spot);
39293 +       }
39294 +}
39295 +
39296 +/* scan all file-descriptors of directory @dir and adjust their readdir
39297 +   positions for entry @de added (@adj > 0) or removed (@adj < 0) at @offset.
39298 +   Should be used by add_entry and rem_entry implementations of dir plugin */
39299 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
39300 +                            int offset, int adj)
39301 +{
39302 +       reiser4_file_fsdata *scan;
39303 +       struct dir_pos mod_point;
39304 +
39305 +       assert("nikita-2536", dir != NULL);
39306 +       assert("nikita-2538", de != NULL);
39307 +       assert("nikita-2539", adj != 0);
39308 +
39309 +       build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
39310 +       mod_point.pos = offset;
39311 +
39312 +       spin_lock_inode(dir);
39313 +
39314 +       /*
39315 +        * new entry was added/removed in directory @dir. Scan all file
39316 +        * descriptors for @dir that are currently involved into @readdir and
39317 +        * update them.
39318 +        */
39319 +
39320 +       list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
39321 +               adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
39322 +
39323 +       spin_unlock_inode(dir);
39324 +}
39325 +
39326 +/*
39327 + * traverse tree to start/continue readdir from the readdir position @pos;
39328 + * a lookup result other than CBK_COORD_FOUND is reported as -EIO. */
39329 +static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap)
39330 +{
39331 +       reiser4_key key;
39332 +       int result;
39333 +       struct inode *inode;
39334 +
39335 +       assert("nikita-2554", pos != NULL);
39336 +
39337 +       inode = dir->f_dentry->d_inode;
39338 +       result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
39339 +       if (result != 0)
39340 +               return result;
39341 +       result = reiser4_object_lookup(inode,
39342 +                                      &key,
39343 +                                      tap->coord,
39344 +                                      tap->lh,
39345 +                                      tap->mode,
39346 +                                      FIND_EXACT,
39347 +                                      LEAF_LEVEL, LEAF_LEVEL,
39348 +                                      0, &tap->ra_info);
39349 +       if (result == CBK_COORD_FOUND)
39350 +               result = rewind_right(tap, (int)pos->position.pos);
39351 +       else {
39352 +               tap->coord->node = NULL;
39353 +               done_lh(tap->lh);
39354 +               result = RETERR(-EIO);
39355 +       }
39356 +       return result;
39357 +}
39358 +
39359 +/*
39360 + * handling of non-unique keys: scan backwards from @tap, counting preceding
39361 + * units with the same key, and store that ordinal in @pos->position.pos.
39362 + */
39363 +static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap)
39364 +{
39365 +       int result;
39366 +       coord_t coord;
39367 +       lock_handle lh;
39368 +       tap_t scan;
39369 +       de_id *did;
39370 +       reiser4_key de_key;
39371 +
39372 +       coord_init_zero(&coord);
39373 +       init_lh(&lh);
39374 +       reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
39375 +       reiser4_tap_copy(&scan, tap);
39376 +       reiser4_tap_load(&scan);
39377 +       pos->position.pos = 0;
39378 +
39379 +       did = &pos->position.dir_entry_key;
39380 +
39381 +       if (is_valid_dir_coord(inode, scan.coord)) {
39382 +
39383 +               build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
39384 +
39385 +               while (1) {
39386 +
39387 +                       result = go_prev_unit(&scan);
39388 +                       if (result != 0)
39389 +                               break;
39390 +
39391 +                       if (!is_valid_dir_coord(inode, scan.coord)) {
39392 +                               result = -EINVAL;
39393 +                               break;
39394 +                       }
39395 +
39396 +                       /* get key of directory entry */
39397 +                       unit_key_by_coord(scan.coord, &de_key);
39398 +                       if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
39399 +                               /* duplicate-sequence is over */
39400 +                               break;
39401 +                       }
39402 +                       pos->position.pos++;
39403 +               }
39404 +       } else
39405 +               result = RETERR(-ENOENT);
39406 +       reiser4_tap_relse(&scan);
39407 +       reiser4_tap_done(&scan);
39408 +       return result;
39409 +}
39410 +
39411 +/*
39412 + * "rewind" directory to the current file position of @dir, i.e., set @pos and
39413 + * @tap correspondingly. */
39414 +static int dir_rewind(struct file *dir, struct readdir_pos *pos, tap_t *tap)
39415 +{
39416 +       __u64 destination;
39417 +       __s64 shift;
39418 +       int result;
39419 +       struct inode *inode;
39420 +       loff_t dirpos;
39421 +
39422 +       assert("nikita-2553", dir != NULL);
39423 +       assert("nikita-2548", pos != NULL);
39424 +       assert("nikita-2551", tap->coord != NULL);
39425 +       assert("nikita-2552", tap->lh != NULL);
39426 +
39427 +       dirpos = reiser4_get_dir_fpos(dir);
39428 +       shift = dirpos - pos->fpos;
39429 +       /* this is logical directory entry within @dir which we are rewinding
39430 +        * to */
39431 +       destination = pos->entry_no + shift;
39432 +
39433 +       inode = dir->f_dentry->d_inode;
39434 +       if (dirpos < 0)
39435 +               return RETERR(-EINVAL);
39436 +       else if (destination == 0ll || dirpos == 0) {
39437 +               /* rewind to the beginning of directory */
39438 +               memset(pos, 0, sizeof *pos);
39439 +               return dir_go_to(dir, pos, tap);
39440 +       } else if (destination >= inode->i_size)
39441 +               return RETERR(-ENOENT);
39442 +
39443 +       if (shift < 0) {
39444 +               /* I am afraid of negative numbers */
39445 +               shift = -shift;
39446 +               /* rewinding to the left */
39447 +               if (shift <= (int)pos->position.pos) {
39448 +                       /* destination is within sequence of entries with
39449 +                          duplicate keys. */
39450 +                       result = dir_go_to(dir, pos, tap);
39451 +               } else {
39452 +                       shift -= pos->position.pos;
39453 +                       while (1) {
39454 +                               /* repetitions: deadlock is possible when
39455 +                                  going to the left. */
39456 +                               result = dir_go_to(dir, pos, tap);
39457 +                               if (result == 0) {
39458 +                                       result = rewind_left(tap, shift);
39459 +                                       if (result == -E_DEADLOCK) {
39460 +                                               reiser4_tap_done(tap);
39461 +                                               continue;
39462 +                                       }
39463 +                               }
39464 +                               break;
39465 +                       }
39466 +               }
39467 +       } else {
39468 +               /* rewinding to the right */
39469 +               result = dir_go_to(dir, pos, tap);
39470 +               if (result == 0)
39471 +                       result = rewind_right(tap, shift);
39472 +       }
39473 +       if (result == 0) {
39474 +               result = set_pos(inode, pos, tap);
39475 +               if (result == 0) {
39476 +                       /* set_pos() filled pos->position; record the rest */
39477 +                       pos->entry_no = destination;
39478 +                       pos->fpos = dirpos;
39479 +               }
39480 +       }
39481 +       return result;
39482 +}
39483 +
39484 +/*
39485 + * Function that is called by common_readdir() on each directory entry while
39486 + * doing readdir. ->filldir callback may block, so we had to release long term
39487 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
39488 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
39489 + *
39490 + * Whether node is unlocked in case of any other error is undefined. It is
39491 + * guaranteed to be still locked if success (0) is returned.
39492 + *
39493 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
39494 + * unlocked.
39495 + */
39496 +static int
39497 +feed_entry(struct file *f, struct readdir_pos *pos, tap_t *tap,
39498 +          filldir_t filldir, void *dirent)
39499 +{
39500 +       item_plugin *iplug;
39501 +       char *name;
39502 +       reiser4_key sd_key;
39503 +       int result;
39504 +       char buf[DE_NAME_BUF_LEN];
39505 +       char name_buf[32];
39506 +       char *local_name;
39507 +       unsigned file_type;
39508 +       seal_t seal;
39509 +       coord_t *coord;
39510 +       reiser4_key entry_key;
39511 +
39512 +       coord = tap->coord;
39513 +       iplug = item_plugin_by_coord(coord);
39514 +
39515 +       /* pointer to name within the node */
39516 +       name = iplug->s.dir.extract_name(coord, buf);
39517 +       assert("nikita-1371", name != NULL);
39518 +
39519 +       /* key of object the entry points to */
39520 +       if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
39521 +               return RETERR(-EIO);
39522 +
39523 +       /* we must release longterm znode lock before calling filldir to avoid
39524 +          deadlock which may happen if filldir causes page fault. So, copy
39525 +          name to intermediate buffer and hand only that copy to filldir */
39526 +       if (strlen(name) + 1 > sizeof(name_buf)) {
39527 +               local_name = kmalloc(strlen(name) + 1,
39528 +                                    reiser4_ctx_gfp_mask_get());
39529 +               if (local_name == NULL)
39530 +                       return RETERR(-ENOMEM);
39531 +       } else
39532 +               local_name = name_buf;
39533 +
39534 +       strcpy(local_name, name);
39535 +       file_type = iplug->s.dir.extract_file_type(coord);
39536 +
39537 +       unit_key_by_coord(coord, &entry_key);
39538 +       reiser4_seal_init(&seal, coord, &entry_key);
39539 +
39540 +       longterm_unlock_znode(tap->lh);
39541 +
39542 +       /*
39543 +        * send information about directory entry to the ->filldir() filler
39544 +        * supplied to us by caller (VFS).
39545 +        *
39546 +        * ->filldir is entitled to do weird things. For example, ->filldir
39547 +        * supplied by knfsd re-enters file system. Make sure no locks are
39548 +        * held.
39549 +        */
39550 +       assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
39551 +
39552 +       reiser4_txn_restart_current();
39553 +       result = filldir(dirent, local_name, (int)strlen(local_name),
39554 +                        /* offset of this entry */
39555 +                        f->f_pos,
39556 +                        /* inode number of object bounden by this entry */
39557 +                        oid_to_uino(get_key_objectid(&sd_key)), file_type);
39558 +       if (local_name != name_buf)
39559 +               kfree(local_name);
39560 +       if (result < 0)
39561 +               /* ->filldir() is satisfied. (no space in buffer, IOW) */
39562 +               result = 1;
39563 +       else
39564 +               result = reiser4_seal_validate(&seal, coord, &entry_key,
39565 +                                              tap->lh, tap->mode,
39566 +                                              ZNODE_LOCK_HIPRI);
39567 +       return result;
39568 +}
39569 +
39570 +static void move_entry(struct readdir_pos *pos, coord_t *coord)
39571 +{
39572 +       reiser4_key de_key;
39573 +       de_id *did;
39574 +
39575 +       /* update @pos to describe the directory entry at @coord */
39576 +       ++pos->entry_no;
39577 +       did = &pos->position.dir_entry_key;
39578 +
39579 +       /* get key of directory entry at @coord */
39580 +       unit_key_by_coord(coord, &de_key);
39581 +
39582 +       if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
39583 +               /* we are within sequence of directory entries
39584 +                  with duplicate keys. */
39585 +               ++pos->position.pos;
39586 +       else { /* different key: remember it, restart duplicate count */
39587 +               pos->position.pos = 0;
39588 +               build_de_id_by_key(&de_key, did);
39589 +       }
39590 +       ++pos->fpos;
39591 +}
39592 +
39593 +/*
39594 + *     STATELESS READDIR
39595 + *
39596 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
39597 + * into reiser4_file_fsdata on each directory modification (name insertion and
39598 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
39599 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
39600 + * across client READDIR requests for the same directory.
39601 + *
39602 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
39603 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
39604 + * find detached reiser4_file_fsdata corresponding to previous readdir
39605 + * request. In other words, additional state is maintained on the
39606 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
39607 + *
39608 + * To efficiently detect when our ->readdir() method is called by NFS server,
39609 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
39610 + * file_is_stateless() function).
39611 + *
39612 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
39613 + * bits of NFS readdir cookie: when first readdir request comes to the given
39614 + * directory from the given client, cookie is set to 0. This situation is
39615 + * detected, global cid_counter is incremented, and stored in highest bits of
39616 + * all direntry offsets returned to the client, including last one. As the
39617 + * only valid readdir cookie is one obtained as direntry->offset, we are
39618 + * guaranteed that next readdir request (continuing current one) will have
39619 + * current cid in the highest bits of starting readdir cookie. All d_cursors
39620 + * are hashed into per-super-block hash table by (oid, cid) key.
39621 + *
39622 + * In addition d_cursors are placed into per-super-block radix tree where they
39623 + * are keyed by oid alone. This is necessary to efficiently remove them during
39624 + * rmdir.
39625 + *
39626 + * Finally, currently unused d_cursors are linked into a special list. This
39627 + * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
39628 + *
39629 + */
39630 +
39631 +/*
39632 + * prepare for readdir: attach fsdata to @f, return its readdir_pos in *@pos
39633 + * and rewind @tap to it. Returns 0, -ENOTDIR for a non-directory or error. */
39634 +static int dir_readdir_init(struct file *f, tap_t *tap,
39635 +                           struct readdir_pos **pos)
39636 +{
39637 +       struct inode *inode;
39638 +       reiser4_file_fsdata *fsdata;
39639 +       int result;
39640 +
39641 +       assert("nikita-1359", f != NULL);
39642 +       inode = f->f_dentry->d_inode;
39643 +       assert("nikita-1360", inode != NULL);
39644 +
39645 +       if (!S_ISDIR(inode->i_mode))
39646 +               return RETERR(-ENOTDIR);
39647 +
39648 +       /* try to find detached readdir state */
39649 +       result = reiser4_attach_fsdata(f, inode);
39650 +       if (result != 0)
39651 +               return result;
39652 +
39653 +       fsdata = reiser4_get_file_fsdata(f);
39654 +       assert("nikita-2571", fsdata != NULL);
39655 +       if (IS_ERR(fsdata))
39656 +               return PTR_ERR(fsdata);
39657 +
39658 +       /* add file descriptor to the readdir list hanging off directory
39659 +        * inode. This list is used to scan "readdirs-in-progress" while
39660 +        * inserting or removing names in the directory. */
39661 +       spin_lock_inode(inode);
39662 +       if (list_empty_careful(&fsdata->dir.linkage))
39663 +               list_add(&fsdata->dir.linkage, get_readdir_list(inode));
39664 +       *pos = &fsdata->dir.readdir;
39665 +       spin_unlock_inode(inode);
39666 +
39667 +       /* move @tap to the current position */
39668 +       return dir_rewind(f, *pos, tap);
39669 +}
39670 +
39671 +/* this is implementation of vfs's llseek method of struct file_operations for
39672 +   typical directory: default_llseek() under ->i_mutex, then readdir state is
39673 +   set up again. See comment before reiser4_readdir_common() for explanation.
39674 +*/
39675 +loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin)
39676 +{
39677 +       reiser4_context *ctx;
39678 +       loff_t result;
39679 +       struct inode *inode;
39680 +
39681 +       inode = file->f_dentry->d_inode;
39682 +
39683 +       ctx = reiser4_init_context(inode->i_sb);
39684 +       if (IS_ERR(ctx))
39685 +               return PTR_ERR(ctx);
39686 +
39687 +       mutex_lock(&inode->i_mutex);
39688 +
39689 +       /* update ->f_pos */
39690 +       result = default_llseek(file, off, origin);
39691 +       if (result >= 0) {
39692 +               int ff;
39693 +               coord_t coord;
39694 +               lock_handle lh;
39695 +               tap_t tap;
39696 +               struct readdir_pos *pos;
39697 +
39698 +               coord_init_zero(&coord);
39699 +               init_lh(&lh);
39700 +               reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
39701 +
39702 +               ff = dir_readdir_init(file, &tap, &pos);
39703 +               reiser4_detach_fsdata(file);
39704 +               if (ff != 0) /* seek succeeded but re-init failed */
39705 +                       result = (loff_t) ff;
39706 +               reiser4_tap_done(&tap);
39707 +       }
39708 +       reiser4_detach_fsdata(file); /* NOTE(review): repeats detach above */
39709 +       mutex_unlock(&inode->i_mutex);
39710 +
39711 +       reiser4_exit_context(ctx);
39712 +       return result;
39713 +}
39714 +
39715 +/* this is common implementation of vfs's readdir method of struct
39716 +   file_operations
39717 +
39718 +   readdir problems:
39719 +
39720 +   readdir(2)/getdents(2) interface is based on implicit assumption that
39721 +   readdir can be restarted from any particular point by supplying file system
39722 +   with off_t-full of data. That is, file system fills ->d_off field in struct
39723 +   dirent and later user passes ->d_off to the seekdir(3), which is, actually,
39724 +   implemented by glibc as lseek(2) on directory.
39725 +
39726 +   Reiser4 cannot restart readdir from 64 bits of data, because two last
39727 +   components of the key of directory entry are unknown, which gives 128 bits:
39728 +   locality and type fields in the key of directory entry are always known,
39729 +   but to start readdir() from given point objectid and offset fields have to
39730 +   be filled.
39731 +
39732 +   Traditional UNIX API for scanning through directory
39733 +   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on
39734 +   the assumption that directory is structured very much like regular file,
39735 +   in particular, it is implied that each name within given directory
39736 +   (directory entry) can be uniquely identified by scalar offset and that such
39737 +   offset is stable across the life-time of the name it identifies.
39738 +
39739 +   This is manifestly not so for reiser4. In reiser4 the only stable unique
39740 +   identifier for the directory entry is its key, which doesn't fit into the
39741 +   seekdir/telldir API.
39742 +
39743 +   solution:
39744 +
39745 +   Within each file descriptor participating in readdir-ing of directory
39746 +   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
39747 +   the "current" directory entry that file descriptor looks at. It contains a
39748 +   key of directory entry (plus some additional info to deal with non-unique
39749 +   keys that we wouldn't dwell onto here) and a logical position of this
39750 +   directory entry starting from the beginning of the directory, that is
39751 +   ordinal number of this entry in the readdir order.
39752 +
39753 +   Obviously this logical position is not stable in the face of directory
39754 +   modifications. To work around this, on each addition or removal of directory
39755 +   entry all file descriptors for directory inode are scanned and their
39756 +   readdir_pos are updated accordingly (adjust_dir_pos()).
39757 +*/
39758 +int reiser4_readdir_common(struct file *f /* directory file being read */,
39759 +                          void *dirent /* opaque data passed to us by VFS */,
39760 +                          filldir_t filld /* filler function passed to us
39761 +                                           * by VFS */)
39762 +{
39763 +       reiser4_context *ctx;
39764 +       int result;
39765 +       struct inode *inode;
39766 +       coord_t coord;
39767 +       lock_handle lh;
39768 +       tap_t tap;
39769 +       struct readdir_pos *pos;
39770 +
39771 +       assert("nikita-1359", f != NULL);
39772 +       inode = f->f_dentry->d_inode;
39773 +       assert("nikita-1360", inode != NULL);
39774 +
39775 +       if (!S_ISDIR(inode->i_mode))
39776 +               return RETERR(-ENOTDIR);
39777 +
39778 +       ctx = reiser4_init_context(inode->i_sb);
39779 +       if (IS_ERR(ctx))
39780 +               return PTR_ERR(ctx);
39781 +
39782 +       coord_init_zero(&coord);
39783 +       init_lh(&lh);
39784 +       reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
39785 +
39786 +       reiser4_readdir_readahead_init(inode, &tap);
39787 +
39788 +repeat:
39789 +       result = dir_readdir_init(f, &tap, &pos);
39790 +       if (result == 0) {
39791 +               result = reiser4_tap_load(&tap);
39792 +               /* scan entries one by one feeding them to @filld */
39793 +               while (result == 0) {
39794 +                       coord_t *coord;
39795 +
39796 +                       coord = tap.coord;
39797 +                       assert("nikita-2572", coord_is_existing_unit(coord));
39798 +                       assert("nikita-3227", is_valid_dir_coord(inode, coord));
39799 +
39800 +                       result = feed_entry(f, pos, &tap, filld, dirent);
39801 +                       if (result > 0) { /* filler wants no more */
39802 +                               break;
39803 +                       } else if (result == 0) {
39804 +                               ++f->f_pos;
39805 +                               result = go_next_unit(&tap);
39806 +                               if (result == -E_NO_NEIGHBOR ||
39807 +                                   result == -ENOENT) {
39808 +                                       result = 0;
39809 +                                       break;
39810 +                               } else if (result == 0) {
39811 +                                       if (is_valid_dir_coord(inode, coord))
39812 +                                               move_entry(pos, coord);
39813 +                                       else
39814 +                                               break;
39815 +                               }
39816 +                       } else if (result == -E_REPEAT) {
39817 +                               /* feed_entry() had to restart. */
39818 +                               ++f->f_pos;
39819 +                               reiser4_tap_relse(&tap);
39820 +                               goto repeat;
39821 +                       } else
39822 +                               warning("vs-1617",
39823 +                                       "reiser4_readdir_common: unexpected error %d",
39824 +                                       result);
39825 +               }
39826 +               reiser4_tap_relse(&tap);
39827 +
39828 +               if (result >= 0)
39829 +                       f->f_version = inode->i_version;
39830 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
39831 +               result = 0;
39832 +       reiser4_tap_done(&tap);
39833 +       reiser4_detach_fsdata(f);
39834 +
39835 +       /* try to update directory's atime */
39836 +       if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
39837 +                              BA_CAN_COMMIT) != 0)
39838 +               warning("", "failed to update atime on readdir: %llu",
39839 +                       (unsigned long long)get_inode_oid(inode));
39840 +       else
39841 +               file_accessed(f);
39842 +
39843 +       context_set_commit_async(ctx);
39844 +       reiser4_exit_context(ctx);
39845 +
39846 +       return (result <= 0) ? result : 0; /* positive result is not an error */
39847 +}
39848 +
39849 +/*
39850 + * Local variables:
39851 + * c-indentation-style: "K&R"
39852 + * mode-name: "LC"
39853 + * c-basic-offset: 8
39854 + * tab-width: 8
39855 + * fill-column: 79
39856 + * End:
39857 + */
39858 diff -puN /dev/null fs/reiser4/plugin/file_plugin_common.c
39859 --- /dev/null
39860 +++ a/fs/reiser4/plugin/file_plugin_common.c
39861 @@ -0,0 +1,1011 @@
39862 +/* Copyright 2005 by Hans Reiser, licensing governed by
39863 +   reiser4/README */
39864 +
39865 +/* this file contains typical implementations for most of methods of
39866 +   file plugin
39867 +*/
39868 +
39869 +#include "../inode.h"
39870 +#include "object.h"
39871 +#include "../safe_link.h"
39872 +
39873 +#include <linux/quotaops.h>
39874 +
39875 +static int insert_new_sd(struct inode *inode);
39876 +static int update_sd(struct inode *inode);
39877 +
39878 +/* this is common implementation of write_sd_by_inode method of file plugin:
39879 +   inserts stat data if REISER4_NO_SD is set on @inode, updates it otherwise
39880 + */
39881 +int write_sd_by_inode_common(struct inode *inode/* object to save */)
39882 +{
39883 +       int result;
39884 +
39885 +       assert("nikita-730", inode != NULL);
39886 +
39887 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
39888 +               /* object doesn't have stat-data yet */
39889 +               result = insert_new_sd(inode);
39890 +       else
39891 +               result = update_sd(inode);
39892 +       if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
39893 +               /* No warning for "name is too long" and -ENOMEM */
39894 +               warning("nikita-2221", "Failed to save sd for %llu: %i",
39895 +                       (unsigned long long)get_inode_oid(inode), result);
39896 +       return result;
39897 +}
39898 +
39899 +/* this is common implementation of key_by_inode method of file plugin: builds
39900 + * in @key a KEY_BODY_MINOR key for @inode at offset @off. Always returns 0 */
39901 +int
39902 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
39903 +                              reiser4_key * key)
39904 +{
39905 +       reiser4_key_init(key);
39906 +       set_key_locality(key, reiser4_inode_data(inode)->locality_id);
39907 +       set_key_ordering(key, get_inode_ordering(inode));
39908 +       set_key_objectid(key, get_inode_oid(inode));    /*FIXME: inode->i_ino */
39909 +       set_key_type(key, KEY_BODY_MINOR);
39910 +       set_key_offset(key, (__u64) off);
39911 +       return 0;
39912 +}
39913 +
39914 +/* this is common implementation of set_plug_in_inode method of file plugin:
39915 + * sets initial mode, owner, group, times, nlink and extmask; returns 0 */
39916 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
39917 +                            struct inode *parent /* parent object */ ,
39918 +                            reiser4_object_create_data * data  /* creational
39919 +                                                                * data */ )
39920 +{
39921 +       __u64 mask;
39922 +
39923 +       object->i_mode = data->mode;
39924 +       /* this should be plugin decision */
39925 +       object->i_uid = current_fsuid();
39926 +       object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
39927 +
39928 +       /* support for BSD style group-id assignment. See mount's manual page
39929 +          description of bsdgroups ext2 mount options for more details */
39930 +       if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
39931 +               object->i_gid = parent->i_gid;
39932 +       else if (parent->i_mode & S_ISGID) {
39933 +               /* parent directory has S_ISGID (set-group-id) bit */
39934 +               object->i_gid = parent->i_gid;
39935 +               if (S_ISDIR(object->i_mode))
39936 +                       /* S_ISGID is inherited by sub-directories */
39937 +                       object->i_mode |= S_ISGID;
39938 +       } else
39939 +               object->i_gid = current_fsgid();
39940 +
39941 +       /* this object doesn't have stat-data yet */
39942 +       reiser4_inode_set_flag(object, REISER4_NO_SD);
39943 +#if 0
39944 +       /* this is now called after all inode plugins are initialized:
39945 +          do_create_vfs_child after adjust_to_parent */
39946 +       /* setup inode and file-operations for this inode */
39947 +       setup_inode_ops(object, data);
39948 +#endif
39949 +       object->i_nlink = 0;
39950 +       reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
39951 +       mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
39952 +       if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
39953 +               mask |= (1 << LARGE_TIMES_STAT);
39954 +
39955 +       reiser4_inode_data(object)->extmask = mask;
39956 +       return 0;
39957 +}
39958 +
39959 +/* this is common implementation of adjust_to_parent method of file plugin for
39960 +   regular files; @root stands in for a NULL @parent. Always returns 0.
39961 + */
39962 +int adjust_to_parent_common(struct inode *object /* new object */ ,
39963 +                           struct inode *parent /* parent directory */ ,
39964 +                           struct inode *root/* root directory */)
39965 +{
39966 +       assert("nikita-2165", object != NULL);
39967 +       if (parent == NULL)
39968 +               parent = root;
39969 +       assert("nikita-2069", parent != NULL);
39970 +
39971 +       /*
39972 +        * inherit missing plugins from parent (grab results are not checked)
39973 +        */
39974 +
39975 +       grab_plugin_pset(object, parent, PSET_FILE);
39976 +       grab_plugin_pset(object, parent, PSET_SD);
39977 +       grab_plugin_pset(object, parent, PSET_FORMATTING);
39978 +       grab_plugin_pset(object, parent, PSET_PERM);
39979 +       return 0;
39980 +}
39981 +
39982 +/* this is common implementation of adjust_to_parent method of file plugin for
39983 +   typical directories; @root stands in for a NULL @parent
39984 + */
39985 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
39986 +                               struct inode *parent /* parent directory */ ,
39987 +                               struct inode *root/* root directory */)
39988 +{
39989 +       int result = 0;
39990 +       pset_member memb;
39991 +
39992 +       assert("nikita-2166", object != NULL);
39993 +       if (parent == NULL)
39994 +               parent = root;
39995 +       assert("nikita-2167", parent != NULL);
39996 +
39997 +       /*
39998 +        * inherit missing plugins from parent, stopping at the first failure
39999 +        */
40000 +       for (memb = 0; memb < PSET_LAST; ++memb) {
40001 +               result = grab_plugin_pset(object, parent, memb);
40002 +               if (result != 0)
40003 +                       break;
40004 +       }
40005 +       return result;
40006 +}
40007 +
40008 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
40009 +                                  struct inode *parent /* parent directory */,
40010 +                                  struct inode *root/* root directory */)
40011 +{
40012 +       int result;
40013 +       result = adjust_to_parent_common(object, parent, root);
40014 +       if (result)
40015 +               return result;
40016 +       assert("edward-1416", parent != NULL); /* NB: no @root fallback here */
40017 +       /* cryptcompress-specific plugins; grab results are not checked */
40018 +       grab_plugin_pset(object, parent, PSET_CLUSTER);
40019 +       grab_plugin_pset(object, parent, PSET_CIPHER);
40020 +       grab_plugin_pset(object, parent, PSET_DIGEST);
40021 +       grab_plugin_pset(object, parent, PSET_COMPRESSION);
40022 +       grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
40023 +
40024 +       return 0;
40025 +}
40026 +
40027 +/* this is common implementation of create_object method of file plugin:
40028 + * grab space (-ENOSPC on failure), then write stat data of @object */
40029 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
40030 +                                reiser4_object_create_data * data)
40031 +{
40032 +       reiser4_block_nr reserve;
40033 +       assert("nikita-744", object != NULL);
40034 +       assert("nikita-745", parent != NULL);
40035 +       assert("nikita-747", data != NULL);
40036 +       assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
40037 +
40038 +       reserve = estimate_create_common(object);
40039 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
40040 +               return RETERR(-ENOSPC);
40041 +       return write_sd_by_inode_common(object);
40042 +}
40044 +static int common_object_delete_no_reserve(struct inode *inode);
40045 +
40046 +/**
40047 + * reiser4_delete_object_common - delete_object of file_plugin
40048 + * @inode: inode to be deleted
40049 + *
40050 + * This is common implementation of delete_object method of file_plugin. It
40051 + * applies to objects whose deletion consists of removing two items - stat
40052 + * data and safe-link. Nothing is done if @inode has REISER4_NO_SD set.
40053 + */
40054 +int reiser4_delete_object_common(struct inode *inode)
40055 +{
40056 +       int result;
40057 +
40058 +       assert("nikita-1477", inode != NULL);
40059 +       /* FIXME: if file body deletion failed (i/o error, for instance),
40060 +          inode->i_size can be != 0 here */
40061 +       assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
40062 +       assert("nikita-3421", inode->i_nlink == 0);
40063 +
40064 +       if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
40065 +               reiser4_block_nr reserve;
40066 +
40067 +               /* grab space which is needed to remove 2 items from the tree:
40068 +                  stat data and safe-link */
40069 +               reserve = 2 *
40070 +                 estimate_one_item_removal(reiser4_tree_by_inode(inode));
40071 +               if (reiser4_grab_space_force(reserve,
40072 +                                            BA_RESERVED | BA_CAN_COMMIT))
40073 +                       return RETERR(-ENOSPC);
40074 +               result = common_object_delete_no_reserve(inode);
40075 +       } else
40076 +               result = 0;
40077 +       return result;
40078 +}
40079 +
40080 +/**
40081 + * reiser4_delete_dir_common - delete_object of file_plugin
40082 + * @inode: inode to be deleted
40083 + *
40084 + * This is common implementation of delete_object method of file_plugin for
40085 + * typical directory. It calls done method of dir_plugin to remove "." and, if
40086 + * that succeeds, removes stat data and safe-link. -ENOSPC if no space.
40087 + */
40088 +int reiser4_delete_dir_common(struct inode *inode)
40089 +{
40090 +       int result;
40091 +       dir_plugin *dplug;
40092 +
40093 +       assert("", (get_current_context() &&
40094 +                   get_current_context()->trans->atom == NULL));
40095 +
40096 +       dplug = inode_dir_plugin(inode);
40097 +       assert("vs-1101", dplug && dplug->done);
40098 +
40099 +       /* kill cursors which might be attached to inode */
40100 +       reiser4_kill_cursors(inode);
40101 +
40102 +       /* grab space enough for removing two items */
40103 +       if (reiser4_grab_space
40104 +           (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
40105 +            BA_RESERVED | BA_CAN_COMMIT))
40106 +               return RETERR(-ENOSPC);
40107 +
40108 +       result = dplug->done(inode);
40109 +       if (!result)
40110 +               result = common_object_delete_no_reserve(inode);
40111 +       return result;
40112 +}
40113 +
40114 +/* this is common implementation of add_link method of file plugin;
40115 + * @parent is not used, 0 is always returned */
40116 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
40117 +{
40118 +       /*
40119 +        * increment ->i_nlink and set ->i_ctime to CURRENT_TIME
40120 +        */
40121 +
40122 +       INODE_INC_FIELD(object, i_nlink);
40123 +       object->i_ctime = CURRENT_TIME;
40124 +       return 0;
40125 +}
40126 +
40127 +/* this is common implementation of rem_link method of file plugin;
40128 + * @parent is not used, 0 is always returned */
40129 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
40130 +{
40131 +       assert("nikita-2021", object != NULL);
40132 +       assert("nikita-2163", object->i_nlink > 0);
40133 +
40134 +       /*
40135 +        * decrement ->i_nlink and set ->i_ctime to CURRENT_TIME
40136 +        */
40137 +
40138 +       INODE_DEC_FIELD(object, i_nlink);
40139 +       object->i_ctime = CURRENT_TIME;
40140 +       return 0;
40141 +}
40142 +
40143 +/* this is common implementation of rem_link method of file plugin for typical
40144 +   directory; always returns 0
40145 +*/
40146 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
40147 +{
40148 +       assert("nikita-20211", object != NULL);
40149 +       assert("nikita-21631", object->i_nlink > 0);
40150 +
40151 +       /*
40152 +        * decrement ->i_nlink (once more if it drops to 1), update ->i_ctime
40153 +        */
40154 +       INODE_DEC_FIELD(object, i_nlink);
40155 +       if (object->i_nlink == 1)
40156 +               INODE_DEC_FIELD(object, i_nlink);
40157 +       object->i_ctime = CURRENT_TIME;
40158 +       return 0;
40159 +}
40160 +
40161 +/* this is common implementation of owns_item method of file plugin: non-zero
40162 +   iff @coord is an existing item with the objectid of @inode's sd key */
40163 +int owns_item_common(const struct inode *inode,        /* object to check
40164 +                                                * against */
40165 +                    const coord_t *coord/* coord to check */)
40166 +{
40167 +       reiser4_key item_key;
40168 +       reiser4_key file_key;
40169 +
40170 +       assert("nikita-760", inode != NULL);
40171 +       assert("nikita-761", coord != NULL);
40172 +
40173 +       return coord_is_existing_item(coord) &&
40174 +           (get_key_objectid(build_sd_key(inode, &file_key)) ==
40175 +            get_key_objectid(item_key_by_coord(coord, &item_key)));
40176 +}
40177 +
40178 +/* this is common implementation of owns_item method of file plugin
40179 +   for typical directory: directory entry items are matched by key locality
40180 +   against @inode's oid, all other items go to owns_item_common() */
40181 +int owns_item_common_dir(const struct inode *inode,/* object to check against */
40182 +                        const coord_t *coord/* coord of item to check */)
40183 +{
40184 +       reiser4_key item_key;
40185 +
40186 +       assert("nikita-1335", inode != NULL);
40187 +       assert("nikita-1334", coord != NULL);
40188 +
40189 +       if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
40190 +               return get_key_locality(item_key_by_coord(coord, &item_key)) ==
40191 +                   get_inode_oid(inode);
40192 +       else
40193 +               return owns_item_common(inode, coord);
40194 +}
40195 +
40196 +/* this is common implementation of can_add_link method of file plugin
40197 +   checks whether yet another hard link to this object can be added
40198 +*/
40199 +int can_add_link_common(const struct inode *object/* object to check */)
40200 +{
40201 +       assert("nikita-732", object != NULL);
40202 +
40203 +       /* inode->i_nlink is unsigned int, so just check for integer
40204 +          overflow: non-zero is returned unless ->i_nlink + 1 wraps to 0 */
40205 +       return object->i_nlink + 1 != 0;
40206 +}
40207 +
40208 +/* this is common implementation of can_rem_link method of file plugin for
40209 +   typical directory: non-zero iff is_dir_empty() reports @inode as empty
40210 +*/
40211 +int can_rem_link_common_dir(const struct inode *inode)
40212 +{
40213 +       /* is_dir_empty() returns 0 if dir is empty */
40214 +       return !is_dir_empty(inode);
40215 +}
40216 +
40217 +/* this is common implementation of detach method of file plugin for typical
40218 +   directory: forwards to ->detach of @child's dir plugin
40219 +*/
40220 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
40221 +{
40222 +       dir_plugin *dplug;
40223 +
40224 +       dplug = inode_dir_plugin(child);
40225 +       assert("nikita-2883", dplug != NULL);
40226 +       assert("nikita-2884", dplug->detach != NULL);
40227 +       return dplug->detach(child, parent);
40228 +}
40229 +
40230 +/* this is common implementation of bind method of file plugin for typical
40231 +   directory: forwards to ->attach of @child's dir plugin
40232 +*/
40233 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
40234 +{
40235 +       dir_plugin *dplug;
40236 +
40237 +       dplug = inode_dir_plugin(child);
40238 +       assert("nikita-2646", dplug != NULL);
40239 +       return dplug->attach(child, parent); /* ->attach itself is not asserted */
40240 +}
40241 +
40242 +static int process_truncate(struct inode *, __u64 size);
40243 +
40244 +/* this is common implementation of safelink method of file plugin; returns 0
40245 + * for SAFE_UNLINK, process_truncate() result for SAFE_TRUNCATE, else -EIO */
40246 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
40247 +{
40248 +       int result;
40249 +
40250 +       assert("vs-1705", get_current_context()->trans->atom == NULL);
40251 +       if (link == SAFE_UNLINK)
40252 +               /* nothing to do. iput() in the caller (process_safelink) will
40253 +                * finish with file */
40254 +               result = 0;
40255 +       else if (link == SAFE_TRUNCATE)
40256 +               result = process_truncate(object, value);
40257 +       else {
40258 +               warning("nikita-3438", "Unrecognized safe-link type: %i", link);
40259 +               result = RETERR(-EIO);
40260 +       }
40261 +       return result;
40262 +}
40263 +
40264 +/* this is common implementation of estimate.create method of file plugin
40265 +   can be used when object creation involves insertion of one item (usually stat
40266 +   data) into tree; returns estimate_one_insert_item() for @object's tree
40267 +*/
40268 +reiser4_block_nr estimate_create_common(const struct inode *object)
40269 +{
40270 +       return estimate_one_insert_item(reiser4_tree_by_inode(object));
40271 +}
40272 +
40273 +/* this is common implementation of estimate.create method of file plugin for
40274 +   typical directory: twice the estimate of a single item insertion;
40275 +   can be used when directory creation involves insertion of two items (usually
40276 +   stat data and item containing "." and "..") into tree
40277 +*/
40278 +reiser4_block_nr estimate_create_common_dir(const struct inode *object)
40279 +{
40280 +       return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
40281 +}
40282 +
40283 +/* Common implementation of the estimate.update method of file plugin.
40284 +   Can be used when stat data update does not do more than inserting a unit
40285 +   into a stat data item, which is probably true for most cases.
40286 +*/
40287 +reiser4_block_nr estimate_update_common(const struct inode *inode)
40288 +{
40289 +       return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
40290 +}
40291 +
40292 +/* Common implementation of the estimate.unlink method of file plugin:
40293 +   both arguments are ignored and the estimate is always 0. */
40294 +reiser4_block_nr
40295 +estimate_unlink_common(const struct inode *object UNUSED_ARG,
40296 +                      const struct inode *parent UNUSED_ARG)
40297 +{
40298 +       return 0;
40299 +}
40300 +
40301 +/* Common implementation of the estimate.unlink method of file plugin for a
40302 +   typical directory: the estimate is delegated to the estimate.unlink method
40303 +   of the directory plugin of @object. */
40304 +reiser4_block_nr
40305 +estimate_unlink_common_dir(const struct inode *object,
40306 +                          const struct inode *parent)
40307 +{
40308 +       dir_plugin *plug = inode_dir_plugin(object);
40309 +
40310 +       assert("nikita-2888", plug != NULL);
40311 +       assert("nikita-2887", plug->estimate.unlink != NULL);
40312 +
40313 +       return plug->estimate.unlink(object, parent);
40314 +}
40315 +
40316 +char *wire_write_common(struct inode *inode, char *start)
40317 +{
40318 +       return build_inode_onwire(inode, start); /* helper does all the work */
40319 +}
40320 +
40321 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
40322 +{
40323 +       if (obj != NULL) /* caller wants the key id stored into @obj */
40324 +               return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
40325 +       return locate_obj_key_id_onwire(addr);
40326 +}
40327 +
40328 +/* Common wire_get: turn the key id decoded into @obj into a dentry on @sb.
40329 +   Returns the dentry, or ERR_PTR(): -ESTALE if no object is found at the key
40330 +   encoded in the file handle, otherwise the error of reiser4_iget() or
40331 +   d_obtain_alias(). */
40332 +struct dentry *wire_get_common(struct super_block *sb,
40333 +                              reiser4_object_on_wire * obj)
40334 +{
40335 +       struct inode *inode;
40336 +       struct dentry *dentry;
40337 +       reiser4_key key;
40338 +
40339 +       extract_key_from_id(&obj->u.std.key_id, &key);
40340 +       inode = reiser4_iget(sb, &key, 1);
40341 +       if (!IS_ERR(inode)) {
40342 +               reiser4_iget_complete(inode);
40343 +               /* on failure d_obtain_alias() returns ERR_PTR(), never NULL,
40344 +                * and has already dropped the inode reference itself */
40345 +               dentry = d_obtain_alias(inode);
40346 +               if (!IS_ERR(dentry))
40347 +                       dentry->d_op = &get_super_private(sb)->ops.dentry;
40348 +       } else if (PTR_ERR(inode) == -ENOENT)
40349 +               /* nothing at the key from the file handle: handle is stale */
40350 +               dentry = ERR_PTR(RETERR(-ESTALE));
40351 +       else
40352 +               dentry = (void *)inode;
40353 +       return dentry;
40354 +}
40355 +
40356 +int wire_size_common(struct inode *inode)
40357 +{
40358 +       return inode_onwire_size(inode); /* helper does all the work */
40359 +}
40360 +
40361 +void wire_done_common(reiser4_object_on_wire * obj)
40362 +{
40363 +       /* nothing to do: @obj is left untouched */
40364 +}
40365 +
40366 +/* helper function to print errors */
40367 +static void key_warning(const reiser4_key * key /* key to print */ ,
40368 +                       const struct inode *inode,
40369 +                       int code/* error code to print */)
40370 +{
40371 +       assert("nikita-716", key != NULL);
40372 +
40373 +       if (code != -ENOMEM) {
40374 +               warning("nikita-717", "Error for inode %llu (%i)",
40375 +                       (unsigned long long)get_key_objectid(key), code);
40376 +               reiser4_print_key("for key", key);
40377 +       }
40378 +}
40379 +
40380 +/* NIKITA-FIXME-HANS: perhaps these debug-only checks belong in another file? */
40381 +#if REISER4_DEBUG
40382 +static void
40383 +check_inode_seal(const struct inode *inode,
40384 +                const coord_t *coord, const reiser4_key * key)
40385 +{
40386 +       reiser4_key unit_key;
40387 +
40388 +       unit_key_by_coord(coord, &unit_key);
40389 +       assert("nikita-2752",
40390 +              WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
40391 +       assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
40392 +}
40393 +
40394 +static void check_sd_coord(coord_t *coord, const reiser4_key * key)
40395 +{
40396 +       reiser4_key ukey;
40397 +
40398 +       coord_clear_iplug(coord);
40399 +       if (zload(coord->node))
40400 +               return; /* node could not be loaded: skip the check */
40401 +
40402 +       if (!coord_is_existing_unit(coord) ||
40403 +           !item_plugin_by_coord(coord) ||
40404 +           !keyeq(unit_key_by_coord(coord, &ukey), key) ||
40405 +           (znode_get_level(coord->node) != LEAF_LEVEL) ||
40406 +           !item_is_statdata(coord)) {
40407 +               warning("nikita-1901", "Conspicuous seal");
40408 +               reiser4_print_key("key", key);
40409 +               print_coord("coord", coord, 1);
40410 +               impossible("nikita-2877", "no way");
40411 +       }
40412 +       zrelse(coord->node);
40413 +}
40414 +
40415 +#else /* !REISER4_DEBUG: both checks compile to noop */
40416 +#define check_inode_seal(inode, coord, key) noop
40417 +#define check_sd_coord(coord, key) noop
40418 +#endif
40419 +
40420 +/* Insert new stat-data into tree: allocate oid, insert item, save inode into
40421 +    it. Said to be called with and return inode state locked - TODO confirm. */
40422 +static int insert_new_sd(struct inode *inode/* inode to create sd for */)
40423 +{
40424 +       int result;
40425 +       reiser4_key key;
40426 +       coord_t coord;
40427 +       reiser4_item_data data;
40428 +       char *area;
40429 +       reiser4_inode *ref;
40430 +       lock_handle lh;
40431 +       oid_t oid;
40432 +
40433 +       assert("nikita-723", inode != NULL);
40434 +       assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
40435 +
40436 +       ref = reiser4_inode_data(inode);
40437 +       spin_lock_inode(inode);
40438 +
40439 +       if (ref->plugin_mask != 0)
40440 +               /* inode has non-standard plugins */
40441 +               inode_set_extension(inode, PLUGIN_STAT);
40442 +       /*
40443 +        * prepare specification of new item to be inserted
40444 +        */
40445 +
40446 +       data.iplug = inode_sd_plugin(inode);
40447 +       data.length = data.iplug->s.sd.save_len(inode);
40448 +       spin_unlock_inode(inode);
40449 +
40450 +       data.data = NULL;
40451 +       data.user = 0;
40452 +/* could be optimized for case where there is only one node format in
40453 + * use in the filesystem, probably there are lots of such
40454 + * places we could optimize for only one node layout.... -Hans */
40455 +       if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()) {
40456 +               /* This is a silly check, but we don't know the actual node
40457 +                  the insertion will go into. */
40458 +               return RETERR(-ENAMETOOLONG);
40459 +       }
40460 +       oid = oid_allocate(inode->i_sb);
40461 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be
40462 + * encapsulated into oid_allocate? */
40463 +       if (oid == ABSOLUTE_MAX_OID)
40464 +               return RETERR(-EOVERFLOW);
40465 +
40466 +       set_inode_oid(inode, oid);
40467 +
40468 +       coord_init_zero(&coord);
40469 +       init_lh(&lh);
40470 +
40471 +       result = insert_by_key(reiser4_tree_by_inode(inode),
40472 +                              build_sd_key(inode, &key), &data, &coord, &lh,
40473 +                              /* stat data lives on a leaf level */
40474 +                              LEAF_LEVEL, CBK_UNIQUE);
40475 +
40476 +       /* we don't want to re-check that somebody didn't insert
40477 +          stat-data while we were doing io, because if it did,
40478 +          insert_by_key() returned error. */
40479 +       /* but what _is_ possible is that plugin for inode's stat-data,
40480 +          list of non-standard plugins or their state would change
40481 +          during io, so that stat-data wouldn't fit into sd. To avoid
40482 +          this race we keep inode_state lock. This lock has to be
40483 +          taken each time you access inode in a way that would cause
40484 +          changes in sd size: changing plugins etc.
40485 +        */
40486 +
40487 +       if (result == IBK_INSERT_OK) {
40488 +               coord_clear_iplug(&coord);
40489 +               result = zload(coord.node);
40490 +               if (result == 0) {
40491 +                       /* have we really inserted stat data? */
40492 +                       assert("nikita-725", item_is_statdata(&coord));
40493 +
40494 +                       /* inode was just created. It is inserted into hash
40495 +                          table, but no directory entry was yet inserted into
40496 +                          parent. So, inode is inaccessible through
40497 +                          ->lookup(). All places that directly grab inode
40498 +                          from hash-table (like old knfsd), should check
40499 +                          IMMUTABLE flag that is set by common_create_child.
40500 +                        */
40501 +                       assert("nikita-3240", data.iplug != NULL);
40502 +                       assert("nikita-3241", data.iplug->s.sd.save != NULL);
40503 +                       area = item_body_by_coord(&coord);
40504 +                       result = data.iplug->s.sd.save(inode, &area);
40505 +                       znode_make_dirty(coord.node);
40506 +                       if (result == 0) {
40507 +                               /* object has stat-data now */
40508 +                               reiser4_inode_clr_flag(inode, REISER4_NO_SD);
40509 +                               reiser4_inode_set_flag(inode,
40510 +                                                      REISER4_SDLEN_KNOWN);
40511 +                               /* initialise stat-data seal */
40512 +                               reiser4_seal_init(&ref->sd_seal, &coord, &key);
40513 +                               ref->sd_coord = coord;
40514 +                               check_inode_seal(inode, &coord, &key);
40515 +                       } else if (result != -ENOMEM)
40516 +                               /*
40517 +                                * convert any other error code to -EIO to
40518 +                                * avoid confusing user level with unexpected
40519 +                                * errors.
40520 +                                */
40521 +                               result = RETERR(-EIO);
40522 +                       zrelse(coord.node);
40523 +               }
40524 +       }
40525 +       done_lh(&lh);
40526 +
40527 +       if (result != 0)
40528 +               key_warning(&key, inode, result);
40529 +       else
40530 +               oid_count_allocated();
40531 +
40532 +       return result;
40533 +}
40534 +
40535 +/* find sd of inode in a tree; unless @silent, warn about lookup errors */
40536 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
40537 +             znode_lock_mode lock_mode /* lock mode */ ,
40538 +             coord_t *coord /* resulting coord */ ,
40539 +             lock_handle * lh /* resulting lock handle */ ,
40540 +             const reiser4_key * key /* key of sd to look for */ ,
40541 +             int silent /* if set, don't warn about errors */)
40542 +{
40543 +       int result;
40544 +       __u32 flags;
40545 +
40546 +       assert("nikita-1692", inode != NULL);
40547 +       assert("nikita-1693", coord != NULL);
40548 +       assert("nikita-1694", key != NULL);
40549 +
40550 +       /* look for the object's stat data in a tree.
40551 +          This returns in "node" pointer to a locked znode and in "pos"
40552 +          position of an item found in node. Both are only valid if
40553 +          coord_found is returned. */
40554 +       flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
40555 +       flags |= CBK_UNIQUE;
40556 +       /*
40557 +        * traverse tree to find stat data. We cannot use vroot here, because
40558 +        * it only covers _body_ of the file, and stat data don't belong
40559 +        * there.
40560 +        */
40561 +       result = coord_by_key(reiser4_tree_by_inode(inode),
40562 +                             key,
40563 +                             coord,
40564 +                             lh,
40565 +                             lock_mode,
40566 +                             FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
40567 +       if (REISER4_DEBUG && result == 0)
40568 +               check_sd_coord(coord, key);
40569 +
40570 +       if (result != 0 && !silent)
40571 +               key_warning(key, inode, result);
40572 +       return result;
40573 +}
40574 +
40575 +/* Locate stat-data of @inode with a write lock: the seal remembered in the
40576 +   inode is tried first, a tree lookup by the stat-data key built into @key
40577 +   is the fallback. Returns 0 on success, error code otherwise. */
40578 +static int
40579 +locate_inode_sd(struct inode *inode,
40580 +               reiser4_key * key, coord_t *coord, lock_handle * lh)
40581 +{
40582 +       reiser4_inode *info;
40583 +       seal_t cached;
40584 +       int result;
40585 +
40586 +       assert("nikita-3483", inode != NULL);
40587 +
40588 +       /* snapshot coord and seal under the inode spin lock */
40589 +       info = reiser4_inode_data(inode);
40590 +       spin_lock_inode(inode);
40591 +       *coord = info->sd_coord;
40592 +       coord_clear_iplug(coord);
40593 +       cached = info->sd_seal;
40594 +       spin_unlock_inode(inode);
40595 +
40596 +       build_sd_key(inode, key);
40597 +       if (reiser4_seal_is_set(&cached)) {
40598 +               /* first, try to use seal */
40599 +               result = reiser4_seal_validate(&cached, coord, key, lh,
40600 +                                              ZNODE_WRITE_LOCK,
40601 +                                              ZNODE_LOCK_LOPRI);
40602 +               if (result == 0) {
40603 +                       check_sd_coord(coord, key);
40604 +                       return 0;
40605 +               }
40606 +       }
40607 +       /* no seal, or seal validation failed: search the tree from scratch */
40608 +       coord_init_zero(coord);
40609 +       return lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
40610 +}
40611 +
40612 +#if REISER4_DEBUG
40613 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
40614 +{
40615 +       return (get_key_locality(k1) == get_key_locality(k2) &&
40616 +               get_key_type(k1) == get_key_type(k2) &&
40617 +               get_key_band(k1) == get_key_band(k2) &&
40618 +               get_key_ordering(k1) == get_key_ordering(k2) &&
40619 +               get_key_objectid(k1) == get_key_objectid(k2));
40620 +}
40621 +
40622 +#include "../tree_walk.h"
40623 +
40624 +/* checks made before and after stat-data resize; returns 1 on corruption */
40625 +static int check_sd_resize(struct inode *inode, coord_t *coord,
40626 +                          int length, int progress/* 1 means after resize; not used */)
40627 +{
40628 +       int ret = 0;
40629 +       lock_handle left_lock;
40630 +       coord_t left_coord;
40631 +       reiser4_key left_key;
40632 +       reiser4_key key;
40633 +
40634 +       if (inode_file_plugin(inode) !=
40635 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
40636 +               return 0;
40637 +       if (!length)
40638 +               return 0;
40639 +       if (coord->item_pos != 0)
40640 +               return 0;
40641 +
40642 +       init_lh(&left_lock);
40643 +       ret = reiser4_get_left_neighbor(&left_lock,
40644 +                                       coord->node,
40645 +                                       ZNODE_WRITE_LOCK,
40646 +                                       GN_CAN_USE_UPPER_LEVELS);
40647 +       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
40648 +           ret == -ENOENT || ret == -EINVAL
40649 +           || ret == -E_DEADLOCK) {
40650 +               ret = 0;
40651 +               goto exit;
40652 +       }
40653 +       ret = zload(left_lock.node);
40654 +       if (ret)
40655 +               goto exit;
40656 +       coord_init_last_unit(&left_coord, left_lock.node);
40657 +       item_key_by_coord(&left_coord, &left_key);
40658 +       item_key_by_coord(coord, &key);
40659 +
40660 +       if (all_but_offset_key_eq(&key, &left_key))
40661 +               /* corruption occurred */
40662 +               ret = 1;
40663 +       zrelse(left_lock.node);
40664 + exit:
40665 +       done_lh(&left_lock);
40666 +       return ret;
40667 +}
40668 +#endif
40669 +
40670 +/* update stat-data at @coord, resizing the item first if its length changed */
40671 +static int
40672 +update_sd_at(struct inode *inode, coord_t *coord, reiser4_key * key,
40673 +            lock_handle * lh)
40674 +{
40675 +       int result;
40676 +       reiser4_item_data data;
40677 +       char *area;
40678 +       reiser4_inode *state;
40679 +       znode *loaded;
40680 +
40681 +       state = reiser4_inode_data(inode);
40682 +
40683 +       coord_clear_iplug(coord);
40684 +       result = zload(coord->node);
40685 +       if (result != 0)
40686 +               return result;
40687 +       loaded = coord->node;
40688 +
40689 +       spin_lock_inode(inode);
40690 +       assert("nikita-728", inode_sd_plugin(inode) != NULL);
40691 +       data.iplug = inode_sd_plugin(inode);
40692 +
40693 +       /* if inode has non-standard plugins, add appropriate stat data
40694 +        * extension; drop the extension once it is no longer needed */
40695 +       if (state->extmask & (1 << PLUGIN_STAT)) {
40696 +               if (state->plugin_mask == 0)
40697 +                       inode_clr_extension(inode, PLUGIN_STAT);
40698 +       } else if (state->plugin_mask != 0)
40699 +               inode_set_extension(inode, PLUGIN_STAT);
40700 +
40701 +       if (state->extmask & (1 << HEIR_STAT)) {
40702 +               if (state->heir_mask == 0)
40703 +                       inode_clr_extension(inode, HEIR_STAT);
40704 +       } else if (state->heir_mask != 0)
40705 +                       inode_set_extension(inode, HEIR_STAT);
40706 +
40707 +       /* data.length is how much space to add to (or remove
40708 +          from if negative) sd */
40709 +       if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
40710 +               /* recalculate stat-data length */
40711 +               data.length =
40712 +                   data.iplug->s.sd.save_len(inode) -
40713 +                   item_length_by_coord(coord);
40714 +               reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
40715 +       } else
40716 +               data.length = 0;
40717 +       spin_unlock_inode(inode);
40718 +
40719 +       /* if on-disk stat data is of different length than required
40720 +          for this inode, resize it */
40721 +
40722 +       if (data.length != 0) {
40723 +               data.data = NULL;
40724 +               data.user = 0;
40725 +
40726 +               assert("edward-1441",
40727 +                      !check_sd_resize(inode, coord,
40728 +                                       data.length, 0/* before resize */));
40729 +
40730 +               /* insertion code requires that insertion point (coord) was
40731 +                * between units. */
40732 +               coord->between = AFTER_UNIT;
40733 +               result = reiser4_resize_item(coord, &data, key, lh,
40734 +                                            COPI_DONT_SHIFT_LEFT);
40735 +               if (result != 0) {
40736 +                       key_warning(key, inode, result);
40737 +                       zrelse(loaded);
40738 +                       return result;
40739 +               }
40740 +               if (loaded != coord->node) {
40741 +                       /* reiser4_resize_item moved coord to another node.
40742 +                          Zload it */
40743 +                       zrelse(loaded);
40744 +                       coord_clear_iplug(coord);
40745 +                       result = zload(coord->node);
40746 +                       if (result != 0)
40747 +                               return result;
40748 +                       loaded = coord->node;
40749 +               }
40750 +               assert("edward-1442",
40751 +                      !check_sd_resize(inode, coord,
40752 +                                       data.length, 1/* after resize */));
40753 +       }
40754 +       area = item_body_by_coord(coord);
40755 +       spin_lock_inode(inode);
40756 +       result = data.iplug->s.sd.save(inode, &area);
40757 +       znode_make_dirty(coord->node);
40758 +
40759 +       /* re-initialise stat-data seal */
40760 +
40761 +       /*
40762 +        * coord.between was possibly skewed from AT_UNIT when stat-data size
40763 +        * was changed and new extensions were pasted into item.
40764 +        */
40765 +       coord->between = AT_UNIT;
40766 +       reiser4_seal_init(&state->sd_seal, coord, key);
40767 +       state->sd_coord = *coord;
40768 +       spin_unlock_inode(inode);
40769 +       check_inode_seal(inode, coord, key);
40770 +       zrelse(loaded);
40771 +       return result;
40772 +}
40773 +
40774 +/* Update existing stat-data in a tree: locate it, then update it there.
40775 +   Said to be called with and return inode state locked - TODO confirm. */
40776 +static int update_sd(struct inode *inode/* inode to update sd for */)
40777 +{
40778 +       int result;
40779 +       reiser4_key key;
40780 +       coord_t coord;
40781 +       lock_handle lh;
40782 +
40783 +       assert("nikita-726", inode != NULL);
40784 +
40785 +       /* no stat-data, nothing to update?! */
40786 +       assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
40787 +
40788 +       init_lh(&lh);
40789 +
40790 +       result = locate_inode_sd(inode, &key, &coord, &lh);
40791 +       if (result == 0)
40792 +               result = update_sd_at(inode, &coord, &key, &lh);
40793 +       done_lh(&lh);
40794 +
40795 +       return result;
40796 +}
40797 +
40798 +/* Helper for reiser4_delete_object_common and reiser4_delete_dir_common:
40799 +   removes stat data of @inode from the tree, then releases its oid and
40800 +   deletes its SAFE_UNLINK safe-link. Space for that must be reserved by
40801 +   the caller beforehand. Returns 0 (also when there is no stat data) or
40802 +   the first error met. */
40803 +static int
40804 +common_object_delete_no_reserve(struct inode *inode/* object to remove */)
40805 +{
40806 +       reiser4_key sd_key;
40807 +       int ret;
40808 +
40809 +       assert("nikita-1477", inode != NULL);
40810 +
40811 +       /* object without stat data: nothing to remove */
40812 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
40813 +               return 0;
40814 +
40815 +       DQUOT_FREE_INODE(inode);
40816 +       DQUOT_DROP(inode);
40817 +
40818 +       build_sd_key(inode, &sd_key);
40819 +       ret = reiser4_cut_tree(reiser4_tree_by_inode(inode),
40820 +                              &sd_key, &sd_key, NULL, 0);
40821 +       if (ret != 0)
40822 +               return ret;
40823 +
40824 +       reiser4_inode_set_flag(inode, REISER4_NO_SD);
40825 +       ret = oid_release(inode->i_sb, get_inode_oid(inode));
40826 +       if (ret != 0)
40827 +               return ret;
40828 +
40829 +       oid_count_released();
40830 +       return safe_link_del(reiser4_tree_by_inode(inode),
40831 +                            get_inode_oid(inode), SAFE_UNLINK);
40832 +}
40833 +
40834 +/* helper for safelink_common: set size of @inode to @size via ->setattr() */
40835 +static int process_truncate(struct inode *inode, __u64 size)
40836 +{
40837 +       int result;
40838 +       struct iattr attr;
40839 +       file_plugin *fplug;
40840 +       reiser4_context *ctx;
40841 +       struct dentry dentry; /* on-stack dummy: only ->d_inode is set below */
40842 +
40843 +       assert("vs-21", is_in_reiser4_context());
40844 +       ctx = reiser4_init_context(inode->i_sb);
40845 +       assert("vs-22", !IS_ERR(ctx));
40846 +
40847 +       attr.ia_size = size;
40848 +       attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
40849 +       fplug = inode_file_plugin(inode); /* NOTE(review): fplug is not used */
40850 +
40851 +       mutex_lock(&inode->i_mutex);
40852 +       assert("vs-1704", get_current_context()->trans->atom == NULL);
40853 +       dentry.d_inode = inode;
40854 +       result = inode->i_op->setattr(&dentry, &attr);
40855 +       mutex_unlock(&inode->i_mutex);
40856 +
40857 +       context_set_commit_async(ctx);
40858 +       reiser4_exit_context(ctx);
40859 +
40860 +       return result;
40861 +}
40862 +
40863 +/*
40864 +  Local variables:
40865 +  c-indentation-style: "K&R"
40866 +  mode-name: "LC"
40867 +  c-basic-offset: 8
40868 +  tab-width: 8
40869 +  fill-column: 80
40870 +  scroll-step: 1
40871 +  End:
40872 +*/
40873 diff -puN /dev/null fs/reiser4/plugin/hash.c
40874 --- /dev/null
40875 +++ a/fs/reiser4/plugin/hash.c
40876 @@ -0,0 +1,352 @@
40877 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
40878 + * reiser4/README */
40879 +
40880 +/* Hash functions */
40881 +
40882 +#include "../debug.h"
40883 +#include "plugin_header.h"
40884 +#include "plugin.h"
40885 +#include "../super.h"
40886 +#include "../inode.h"
40887 +
40888 +#include <linux/types.h>
40889 +
40890 +/* old rupasov (yura) hash */
40891 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
40892 +                         int len/* @name's length */)
40893 +{
40894 +       int i;
40895 +       int j;
40896 +       int pow;
40897 +       __u64 a;
40898 +       __u64 c;
40899 +
40900 +       assert("nikita-672", name != NULL);
40901 +       assert("nikita-673", len >= 0);
40902 +
40903 +       for (pow = 1, i = 1; i < len; ++i)
40904 +               pow = pow * 10;
40905 +
40906 +       if (len == 1)
40907 +               a = name[0] - 48;
40908 +       else
40909 +               a = (name[0] - 48) * pow;
40910 +
40911 +       for (i = 1; i < len; ++i) {
40912 +               c = name[i] - 48;
40913 +               for (pow = 1, j = i; j < len - 1; ++j)
40914 +                       pow = pow * 10;
40915 +               a = a + c * pow;
40916 +       }
40917 +       for (; i < 40; ++i) {
40918 +               c = '0' - 48;
40919 +               for (pow = 1, j = i; j < len - 1; ++j)
40920 +                       pow = pow * 10;
40921 +               a = a + c * pow;
40922 +       }
40923 +
40924 +       for (; i < 256; ++i) {
40925 +               c = i;
40926 +               for (pow = 1, j = i; j < len - 1; ++j)
40927 +                       pow = pow * 10;
40928 +               a = a + c * pow;
40929 +       }
40930 +
40931 +       a = a << 7;
40932 +       return a;
40933 +}
40934 +
40935 +/* r5 hash. @name is hashed up to its terminating NUL; @len is only asserted */
40936 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
40937 +                    int len UNUSED_ARG/* @name's length */)
40938 +{
40939 +       __u64 a = 0;
40940 +
40941 +       assert("nikita-674", name != NULL);
40942 +       assert("nikita-675", len >= 0);
40943 +
40944 +       while (*name) {
40945 +               a += *name << 4;
40946 +               a += *name >> 4;
40947 +               a *= 11;
40948 +               name++;
40949 +       }
40950 +       return a;
40951 +}
40952 +
40953 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer function
40954 +     H0 = Key
40955 +     Hi = E Mi(Hi-1) + Hi-1
40956 +
40957 +   (see Applied Cryptography, 2nd edition, p448).
40958 +
40959 +   Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
40960 +
40961 +   Jeremy has agreed to the contents of reiserfs/README. -Hans
40962 +
40963 +   This code was blindly upgraded to __u64 by s/__u32/__u64/g.
40964 +*/
40965 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
40966 +                     int len/* @name's length */)
40967 +{
40968 +       __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
40969 +
40970 +       __u64 h0 = k[0], h1 = k[1];
40971 +       __u64 a, b, c, d;
40972 +       __u64 pad;
40973 +       int i;
40974 +
40975 +       assert("nikita-676", name != NULL);
40976 +       assert("nikita-677", len >= 0);
40977 +
40978 +#define DELTA 0x9E3779B9u
40979 +#define FULLROUNDS 10          /* 32 is overkill, 16 is strong crypto */
40980 +#define PARTROUNDS 6           /* 6 gets complete mixing */
40981 +
40982 +/* a, b, c, d - data; h0, h1 - accumulated hash */
40983 +#define TEACORE(rounds)                                                        \
40984 +       do {                                                            \
40985 +               __u64 sum = 0;                                          \
40986 +               int n = rounds;                                         \
40987 +               __u64 b0, b1;                                           \
40988 +                                                                       \
40989 +               b0 = h0;                                                \
40990 +               b1 = h1;                                                \
40991 +                                                                       \
40992 +               do {                                                    \
40993 +                       sum += DELTA;                                   \
40994 +                       b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
40995 +                       b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
40996 +               } while (--n);                                          \
40997 +                                                                       \
40998 +               h0 += b0;                                               \
40999 +               h1 += b1;                                               \
41000 +       } while (0)
41001 +
41002 +       pad = (__u64) len | ((__u64) len << 8);
41003 +       pad |= pad << 16;
41004 +
41005 +       while (len >= 16) {
41006 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41007 +                   16 | (__u64) name[3] << 24;
41008 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41009 +                   16 | (__u64) name[7] << 24;
41010 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41011 +                   16 | (__u64) name[11] << 24;
41012 +               d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
41013 +                   << 16 | (__u64) name[15] << 24;
41014 +
41015 +               TEACORE(PARTROUNDS);
41016 +
41017 +               len -= 16;
41018 +               name += 16;
41019 +       }
41020 +
41021 +       if (len >= 12) {
41022 +               /* poor man's assert(len < 16): store through NULL if violated */
41023 +               if (len >= 16)
41024 +                       *(int *)0 = 0;
41025 +
41026 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41027 +                   16 | (__u64) name[3] << 24;
41028 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41029 +                   16 | (__u64) name[7] << 24;
41030 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
41031 +                   16 | (__u64) name[11] << 24;
41032 +
41033 +               d = pad;
41034 +               for (i = 12; i < len; i++) {
41035 +                       d <<= 8;
41036 +                       d |= name[i];
41037 +               }
41038 +       } else if (len >= 8) {
41039 +               /* poor man's assert(len < 12): store through NULL if violated */
41040 +               if (len >= 12)
41041 +                       *(int *)0 = 0;
41042 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41043 +                   16 | (__u64) name[3] << 24;
41044 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
41045 +                   16 | (__u64) name[7] << 24;
41046 +
41047 +               c = d = pad;
41048 +               for (i = 8; i < len; i++) {
41049 +                       c <<= 8;
41050 +                       c |= name[i];
41051 +               }
41052 +       } else if (len >= 4) {
41053 +               /* poor man's assert(len < 8): store through NULL if violated */
41054 +               if (len >= 8)
41055 +                       *(int *)0 = 0;
41056 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
41057 +                   16 | (__u64) name[3] << 24;
41058 +
41059 +               b = c = d = pad;
41060 +               for (i = 4; i < len; i++) {
41061 +                       b <<= 8;
41062 +                       b |= name[i];
41063 +               }
41064 +       } else {
41065 +               /* poor man's assert(len < 4): store through NULL if violated */
41066 +               if (len >= 4)
41067 +                       *(int *)0 = 0;
41068 +               a = b = c = d = pad;
41069 +               for (i = 0; i < len; i++) {
41070 +                       a <<= 8;
41071 +                       a |= name[i];
41072 +               }
41073 +       }
41074 +
41075 +       TEACORE(FULLROUNDS);
41076 +
41077 +       /* fold both halves of the accumulated hash into the result */
41078 +       return h0 ^ h1;
41079 +
41080 +}
41081 +
41082 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
41083 +
41084 +   See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
41085 +
41086 +   Excerpts:
41087 +
41088 +     FNV hashes are designed to be fast while maintaining a low collision
41089 +     rate.
41090 +
41091 +     [This version also seems to preserve lexicographical order locally.]
41092 +
41093 +     FNV hash algorithms and source code have been released into the public
41094 +     domain.
41095 +   NOTE: hashing stops at the first zero octet of @name; @len is only asserted.
41096 +*/
41097 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
41098 +                      int len UNUSED_ARG/* @name's length */)
41099 +{
41100 +       unsigned long long a = 0xcbf29ce484222325ull;
41101 +       const unsigned long long fnv_64_prime = 0x100000001b3ull;
41102 +
41103 +       assert("nikita-678", name != NULL);
41104 +       assert("nikita-679", len >= 0);
41105 +
41106 +       /* FNV-1 hash each octet of @name up to the terminating zero octet */
41107 +       for (; *name; ++name) {
41108 +               /* multiply by the 64 bit FNV magic prime mod 2^64 */
41109 +               a *= fnv_64_prime;
41110 +               /* xor the bottom with the current octet */
41111 +               a ^= (unsigned long long)(*name);
41112 +       }
41113 +       /* return our new hash value */
41114 +       return a;
41115 +}
41116 +
41117 +/* degenerate hash function: returns the same constant for every name, which
41118 +   simplifies testing of non-unique key handling */
41119 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
41120 +                     int len UNUSED_ARG/* @name's length */)
41121 +{
41122 +       return 0xc0c0c0c010101010ull;
41123 +}
41124 +/* ->change() of hash plugins: store @plugin in PSET_HASH slot of @inode */
41125 +static int change_hash(struct inode *inode,
41126 +                      reiser4_plugin * plugin,
41127 +                      pset_member memb /* not used: PSET_HASH is hard-coded */)
41128 +{
41129 +       int result;
41130 +
41131 +       assert("nikita-3503", inode != NULL);
41132 +       assert("nikita-3504", plugin != NULL);
41133 +
41134 +       assert("nikita-3505", is_reiser4_inode(inode));
41135 +       assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
41136 +       /* only objects whose file plugin is in the directory group qualify */
41137 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
41138 +               return RETERR(-EINVAL);
41139 +       /* nothing to do when @inode already has a hash plugin with this id */
41140 +       result = 0;
41141 +       if (inode_hash_plugin(inode) == NULL ||
41142 +           inode_hash_plugin(inode)->h.id != plugin->h.id) {
41143 +               if (is_dir_empty(inode) == 0)
41144 +                       result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
41145 +                                                PSET_HASH, plugin);
41146 +               else
41147 +                       result = RETERR(-ENOTEMPTY);
41148 +
41149 +       }
41150 +       return result;
41151 +}
41152 +/* plugin operations used by the hash plugins; only ->change is provided */
41153 +static reiser4_plugin_ops hash_plugin_ops = {
41154 +       .init = NULL,
41155 +       .load = NULL,
41156 +       .save_len = NULL,
41157 +       .save = NULL,
41158 +       .change = change_hash
41159 +};
41160 +
41161 +/* table of hash plugins, indexed by hash id; all share hash_plugin_ops */
41162 +hash_plugin hash_plugins[LAST_HASH_ID] = {
41163 +       [RUPASOV_HASH_ID] = {
41164 +               .h = {
41165 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
41166 +                       .id = RUPASOV_HASH_ID,
41167 +                       .pops = &hash_plugin_ops,
41168 +                       .label = "rupasov",
41169 +                       .desc = "Original Yura's hash",
41170 +                       .linkage = {NULL, NULL}
41171 +               },
41172 +               .hash = hash_rupasov
41173 +       },
41174 +       [R5_HASH_ID] = {
41175 +               .h = {
41176 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
41177 +                       .id = R5_HASH_ID,
41178 +                       .pops = &hash_plugin_ops,
41179 +                       .label = "r5",
41180 +                       .desc = "r5 hash",
41181 +                       .linkage = {NULL, NULL}
41182 +               },
41183 +               .hash = hash_r5
41184 +       },
41185 +       [TEA_HASH_ID] = {
41186 +               .h = {
41187 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
41188 +                       .id = TEA_HASH_ID,
41189 +                       .pops = &hash_plugin_ops,
41190 +                       .label = "tea",
41191 +                       .desc = "tea hash",
41192 +                       .linkage = {NULL, NULL}
41193 +               },
41194 +               .hash = hash_tea
41195 +       },
41196 +       [FNV1_HASH_ID] = {
41197 +               .h = {
41198 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
41199 +                       .id = FNV1_HASH_ID,
41200 +                       .pops = &hash_plugin_ops,
41201 +                       .label = "fnv1",
41202 +                       .desc = "fnv1 hash",
41203 +                       .linkage = {NULL, NULL}
41204 +               },
41205 +               .hash = hash_fnv1
41206 +       },
41207 +       [DEGENERATE_HASH_ID] = {
41208 +               .h = {
41209 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
41210 +                       .id = DEGENERATE_HASH_ID,
41211 +                       .pops = &hash_plugin_ops,
41212 +                       .label = "degenerate hash",
41213 +                       .desc = "Degenerate hash: only for testing",
41214 +                       .linkage = {NULL, NULL}
41215 +               },
41216 +               .hash = hash_deg
41217 +       }
41218 +};
41219 +
41220 +/* Make Linus happy.
41221 +   Local variables:
41222 +   c-indentation-style: "K&R"
41223 +   mode-name: "LC"
41224 +   c-basic-offset: 8
41225 +   tab-width: 8
41226 +   fill-column: 120
41227 +   End:
41228 +*/
41229 diff -puN /dev/null fs/reiser4/plugin/inode_ops.c
41230 --- /dev/null
41231 +++ a/fs/reiser4/plugin/inode_ops.c
41232 @@ -0,0 +1,906 @@
41233 +/*
41234 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
41235 + */
41236 +
41237 +/*
41238 + * this file contains typical implementations for most of methods of struct
41239 + * inode_operations
41240 + */
41241 +
41242 +#include "../inode.h"
41243 +#include "../safe_link.h"
41244 +
41245 +#include <linux/quotaops.h>
41246 +#include <linux/namei.h>
41247 +
41248 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
41249 +                     reiser4_object_create_data *data);
41250 +
41251 +/**
41252 + * reiser4_create_common - create of inode operations
41253 + * @parent: inode of parent directory
41254 + * @dentry: dentry of new object to create
41255 + * @mode: the permissions to use
41256 + * @nameidata: not used by this implementation
41257 + *
41258 + * Common vfs create method: builds a regular file with child_create_plugin()
41259 + * of @parent, or inode_create_plugin() if that is NULL. Fails with -EIO when
41260 + * the chosen plugin is not in the regular file group.
41261 + */
41262 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
41263 +                         int mode, struct nameidata *nameidata)
41264 +{
41265 +       reiser4_object_create_data data;
41266 +       file_plugin *fplug;
41267 +
41268 +       memset(&data, 0, sizeof data);
41269 +       data.mode = S_IFREG | mode;
41270 +       fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
41271 +       if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
41272 +               warning("vpf-1900", "'%s' is not a regular file plugin.",
41273 +                       fplug->h.label);
41274 +               return RETERR(-EIO);
41275 +       }
41276 +       data.id = fplug->h.id;
41277 +       return create_vfs_object(parent, dentry, &data);
41278 +}
41279 +
41280 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
41281 +void check_light_weight(struct inode *inode, struct inode *parent);
41282 +
41283 +/**
41284 + * reiser4_lookup_common - lookup of inode operations
41285 + * @parent: inode of directory to lookup into
41286 + * @dentry: name to look for
41287 + * @nameidata: not used by this implementation
41288 + *
41289 + * Common vfs lookup method. Returns d_splice_alias() result on success, NULL
41290 + * if the name is not found (-ENOENT), an error pointer on other failures.
41291 + */
41292 +struct dentry *reiser4_lookup_common(struct inode *parent,
41293 +                                    struct dentry *dentry,
41294 +                                    struct nameidata *nameidata)
41295 +{
41296 +       reiser4_context *ctx;
41297 +       int result;
41298 +       struct dentry *new;
41299 +       struct inode *inode;
41300 +       reiser4_dir_entry_desc entry;
41301 +
41302 +       ctx = reiser4_init_context(parent->i_sb);
41303 +       if (IS_ERR(ctx))
41304 +               return (struct dentry *)ctx;
41305 +
41306 +       /* set up operations on dentry. */
41307 +       dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
41308 +
41309 +       result = reiser4_lookup_name(parent, dentry, &entry.key);
41310 +       if (result) {
41311 +               context_set_commit_async(ctx);
41312 +               reiser4_exit_context(ctx);
41313 +               if (result == -ENOENT) {
41314 +                       /* object not found: hash @dentry without an inode,
41315 +                        * unless @parent is a dead directory */
41316 +                       if (!IS_DEADDIR(parent))
41317 +                               d_add(dentry, NULL);
41318 +                       return NULL;
41319 +               }
41320 +               return ERR_PTR(result);
41321 +       }
41322 +
41323 +       inode = reiser4_iget(parent->i_sb, &entry.key, 0);
41324 +       if (IS_ERR(inode)) {
41325 +               context_set_commit_async(ctx);
41326 +               reiser4_exit_context(ctx);
41327 +               return ERR_PTR(PTR_ERR(inode));
41328 +       }
41329 +
41330 +       /* success */
41331 +       check_light_weight(inode, parent);
41332 +       new = d_splice_alias(inode, dentry);
41333 +       reiser4_iget_complete(inode);
41334 +
41335 +       /* prevent balance_dirty_pages() from being called: we don't want to
41336 +        * do this under directory i_mutex. */
41337 +       context_set_commit_async(ctx);
41338 +       reiser4_exit_context(ctx);
41339 +       return new;
41340 +}
41340 +
41341 +static reiser4_block_nr common_estimate_link(struct inode *parent,
41342 +                                            struct inode *object);
41343 +int reiser4_update_dir(struct inode *);
41344 +
41345 +/**
41346 + * reiser4_link_common - link of inode operations
41347 + * @existing: dentry of object which is to get new name
41348 + * @parent: directory where new name is to be created
41349 + * @newname: new name
41350 + *
41351 + * Common implementation of vfs's link method of struct inode_operations.
41352 + * Returns 0 on success, an error code (e.g. -ENOSPC, -E_REPEAT) otherwise.
41353 + */
41354 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
41355 +                       struct dentry *newname)
41356 +{
41357 +       reiser4_context *ctx;
41358 +       int result;
41359 +       struct inode *object;
41360 +       dir_plugin *parent_dplug;
41361 +       reiser4_dir_entry_desc entry;
41362 +       reiser4_object_create_data data;
41363 +       reiser4_block_nr reserve;
41364 +
41365 +       ctx = reiser4_init_context(parent->i_sb);
41366 +       if (IS_ERR(ctx))
41367 +               return PTR_ERR(ctx);
41368 +
41369 +       assert("nikita-1431", existing != NULL);
41370 +       assert("nikita-1432", parent != NULL);
41371 +       assert("nikita-1433", newname != NULL);
41372 +
41373 +       object = existing->d_inode;
41374 +       assert("nikita-1434", object != NULL);
41375 +
41376 +       /* check for race with create_object() */
41377 +       if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
41378 +               context_set_commit_async(ctx);
41379 +               reiser4_exit_context(ctx);
41380 +               return RETERR(-E_REPEAT);
41381 +       }
41382 +
41383 +       parent_dplug = inode_dir_plugin(parent);
41384 +
41385 +       memset(&entry, 0, sizeof entry);
41386 +       entry.obj = object;
41387 +       memset(&data, 0, sizeof data);  /* no stale stack bytes in @data */
41388 +       data.mode = object->i_mode;
41389 +       data.id = inode_file_plugin(object)->h.id;
41390 +
41391 +       reserve = common_estimate_link(parent, existing->d_inode);
41392 +       if ((__s64) reserve < 0) {
41393 +               context_set_commit_async(ctx);
41394 +               reiser4_exit_context(ctx);
41395 +               return reserve;
41396 +       }
41397 +
41398 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
41399 +               context_set_commit_async(ctx);
41400 +               reiser4_exit_context(ctx);
41401 +               return RETERR(-ENOSPC);
41402 +       }
41403 +
41404 +       /*
41405 +        * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
41406 +        * means that link(2) can race against unlink(2) or rename(2), and
41407 +        * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
41408 +        *
41409 +        * For such inode we have to undo special processing done in
41410 +        * reiser4_unlink() viz. creation of safe-link.
41411 +        */
41412 +       if (unlikely(object->i_nlink == 0)) {
41413 +               result = safe_link_del(reiser4_tree_by_inode(object),
41414 +                                      get_inode_oid(object), SAFE_UNLINK);
41415 +               if (result != 0) {
41416 +                       context_set_commit_async(ctx);
41417 +                       reiser4_exit_context(ctx);
41418 +                       return result;
41419 +               }
41420 +       }
41421 +
41422 +       /* increment nlink of @existing and update its stat data */
41423 +       result = reiser4_add_nlink(object, parent, 1);
41424 +       if (result == 0) {
41425 +               /* add entry to the parent */
41426 +               result =
41427 +                   parent_dplug->add_entry(parent, newname, &data, &entry);
41428 +               if (result != 0) {
41429 +                       /* failed to add entry to the parent, decrement nlink
41430 +                          of @existing */
41431 +                       reiser4_del_nlink(object, parent, 1);
41432 +                       /*
41433 +                        * now, if that failed, we have a file with too big
41434 +                        * nlink---space leak, much better than directory
41435 +                        * entry pointing to nowhere
41436 +                        */
41437 +               }
41438 +       }
41439 +       if (result == 0) {
41440 +               /* Upon successful completion, link() shall mark for update
41441 +                * the st_ctime field of the file. Also, the st_ctime and
41442 +                * st_mtime fields of the directory that contains the new
41443 +                * entry shall be marked for update. --SUS */
41444 +               result = reiser4_update_dir(parent);
41445 +       }
41446 +       if (result == 0) {
41447 +               /* take the inode reference only if @newname gets the inode */
41448 +               atomic_inc(&object->i_count);
41449 +               d_instantiate(newname, existing->d_inode);
41450 +       }
41451 +
41452 +       context_set_commit_async(ctx);
41453 +       reiser4_exit_context(ctx);
41454 +       return result;
41455 +}
41456 +
41457 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
41458 +
41459 +/**
41460 + * reiser4_unlink_common - unlink of inode operations
41461 + * @parent: inode of directory to remove name from
41462 + * @victim: name to be removed
41463 + *
41464 + * Common implementation of vfs's unlink method of struct inode_operations.
41465 + * Returns 0 once the directory entry has been removed, an error otherwise.
41466 + */
41467 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
41468 +{
41469 +       reiser4_context *ctx;
41470 +       int result;
41471 +       struct inode *object;
41472 +       file_plugin *fplug;
41473 +
41474 +       ctx = reiser4_init_context(parent->i_sb);
41475 +       if (IS_ERR(ctx))
41476 +               return PTR_ERR(ctx);
41477 +
41478 +       object = victim->d_inode;
41479 +       fplug = inode_file_plugin(object);
41480 +       assert("nikita-2882", fplug->detach != NULL);
41481 +
41482 +       result = unlink_check_and_grab(parent, victim);
41483 +       if (result != 0) {
41484 +               context_set_commit_async(ctx);
41485 +               reiser4_exit_context(ctx);
41486 +               return result;
41487 +       }
41488 +
41489 +       result = fplug->detach(object, parent);
41490 +       if (result == 0) {
41491 +               dir_plugin *parent_dplug;
41492 +               reiser4_dir_entry_desc entry;
41493 +
41494 +               parent_dplug = inode_dir_plugin(parent);
41495 +               memset(&entry, 0, sizeof entry);
41496 +
41497 +               /* first, delete directory entry */
41498 +               result = parent_dplug->rem_entry(parent, victim, &entry);
41499 +               if (result == 0) {
41500 +                       /*
41501 +                        * if name was removed successfully, we _have_ to
41502 +                        * return 0 from this function, because upper level
41503 +                        * callers (vfs_{rmdir,unlink}) expect this.
41504 +                        *
41505 +                        * now that directory entry is removed, update
41506 +                        * stat-data
41507 +                        */
41508 +                       reiser4_del_nlink(object, parent, 1);
41509 +                       /*
41510 +                        * Upon successful completion, unlink() shall mark for
41511 +                        * update the st_ctime and st_mtime fields of the
41512 +                        * parent directory. Also, if the file's link count is
41513 +                        * not 0, the st_ctime field of the file shall be
41514 +                        * marked for update. --SUS
41515 +                        */
41516 +                       reiser4_update_dir(parent);
41517 +                       /* add safe-link for this file */
41518 +                       if (object->i_nlink == 0)
41519 +                               safe_link_add(object, SAFE_UNLINK);
41520 +               }
41521 +       }
41522 +
41523 +       if (unlikely(result != 0)) {
41524 +               if (result != -ENOMEM)
41525 +                       warning("nikita-3398", "Cannot unlink %llu (%i)",
41526 +                               (unsigned long long)get_inode_oid(object),
41527 +                               result);
41528 +               /* if operation failed commit pending inode modifications to
41529 +                * the stat-data */
41530 +               reiser4_update_sd(object);
41531 +               reiser4_update_sd(parent);
41532 +       }
41533 +
41534 +       reiser4_release_reserved(object->i_sb);
41535 +
41536 +       /* @object's i_ctime was updated by ->rem_link() method. */
41537 +
41538 +       /* @victim can be already removed from the disk by this time. Inode is
41539 +          then marked so that iput() wouldn't try to remove stat data. But
41540 +          inode itself is still there.
41541 +        */
41542 +
41543 +       /*
41544 +        * we cannot release directory semaphore here, because name has
41545 +        * already been deleted, but dentry (@victim) still exists.  Prevent
41546 +        * balance_dirty_pages() from being called on exiting this context: we
41547 +        * don't want to do this under directory i_mutex.
41548 +        */
41549 +       context_set_commit_async(ctx);
41550 +       reiser4_exit_context(ctx);
41551 +       return result;
41552 +}
41553 +
41554 +/**
41555 + * reiser4_symlink_common - symlink of inode operations
41556 + * @parent: inode of parent directory
41557 + * @dentry: dentry of object to be created
41558 + * @linkname: string symlink is to contain
41559 + *
41560 + * Common implementation of vfs's symlink method of struct inode_operations.
41561 + * Creates object of mode S_IFLNK | S_IRWXUGO using file plugin
41562 + * SYMLINK_FILE_PLUGIN_ID; returns the result of create_vfs_object().
41563 + */
41564 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
41565 +                          const char *linkname)
41566 +{
41567 +       reiser4_object_create_data data;
41568 +
41569 +       memset(&data, 0, sizeof data);
41570 +       data.name = linkname;
41571 +       data.id = SYMLINK_FILE_PLUGIN_ID;
41572 +       data.mode = S_IFLNK | S_IRWXUGO;
41573 +       return create_vfs_object(parent, dentry, &data);
41574 +}
41575 +
41576 +/**
41577 + * reiser4_mkdir_common - mkdir of inode operations
41578 + * @parent: inode of parent directory
41579 + * @dentry: dentry of object to be created
41580 + * @mode: the permissions to use
41581 + *
41582 + * Common implementation of vfs's mkdir method of struct inode_operations.
41583 + * Creates object of mode S_IFDIR | @mode using file plugin
41584 + * DIRECTORY_FILE_PLUGIN_ID; returns the result of create_vfs_object().
41585 + */
41586 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
41587 +{
41588 +       reiser4_object_create_data data;
41589 +
41590 +       memset(&data, 0, sizeof data);
41591 +       data.mode = S_IFDIR | mode;
41592 +       data.id = DIRECTORY_FILE_PLUGIN_ID;
41593 +       return create_vfs_object(parent, dentry, &data);
41594 +}
41595 +
41596 +/**
41597 + * reiser4_mknod_common - mknod of inode operations
41598 + * @parent: inode of parent directory
41599 + * @dentry: dentry of object to be created
41600 + * @mode: the permissions to use and file type
41601 + * @rdev: minor and major of new device file
41602 + *
41603 + * Common implementation of vfs's mknod method of struct inode_operations.
41604 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID, passing @mode and
41605 + * @rdev on unchanged; returns the result of create_vfs_object().
41606 + */
41607 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
41608 +                        int mode, dev_t rdev)
41609 +{
41610 +       reiser4_object_create_data data;
41611 +
41612 +       memset(&data, 0, sizeof data);
41613 +       data.mode = mode;
41614 +       data.rdev = rdev;
41615 +       data.id = SPECIAL_FILE_PLUGIN_ID;
41616 +       return create_vfs_object(parent, dentry, &data);
41617 +}
41618 +
41619 +/*
41620 + * implementation of vfs's rename method of struct inode_operations for typical
41621 + * directory is in inode_ops_rename.c
41622 + */
41623 +
41624 +/**
41625 + * reiser4_follow_link_common - follow_link of inode operations
41626 + * @dentry: dentry of symlink
41627 + * @nd: nameidata handed to nd_set_link() together with inode's i_private
41628 + *
41629 + * Common vfs follow_link method. Assumes that inode's i_private points to the
41630 + * content of symbolic link; returns ERR_PTR(-EINVAL) if i_private is NULL or
41631 + * REISER4_GENERIC_PTR_USED is not set on the inode, NULL otherwise.
41632 + */
41633 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
41634 +{
41635 +       assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
41636 +
41637 +       if (!dentry->d_inode->i_private
41638 +           || !reiser4_inode_get_flag(dentry->d_inode,
41639 +                                      REISER4_GENERIC_PTR_USED))
41640 +               return ERR_PTR(RETERR(-EINVAL));
41641 +       nd_set_link(nd, dentry->d_inode->i_private);
41642 +       return NULL;
41643 +}
41644 +
41645 +/**
41646 + * reiser4_permission_common - permission of inode operations
41647 + * @inode: inode to check permissions for
41648 + * @mask: mode bits to check permissions for
41649 + *
41650 + * Passes @inode and @mask to generic_permission() with a NULL third
41651 + * argument and returns its result.
41652 + */
41653 +int reiser4_permission_common(struct inode *inode, int mask)
41654 +{
41655 +       return generic_permission(inode, mask, NULL);
41656 +}
41657 +
41658 +static int setattr_reserve(reiser4_tree *);
41659 +
41660 +/* common implementation of vfs's setattr method of struct inode_operations.
41661 +   Must not be called with ATTR_SIZE set (see the assertion below); returns 0
41662 +   or an error code. */
41663 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
41664 +{
41665 +       reiser4_context *ctx;
41666 +       struct inode *inode;
41667 +       int result;
41668 +
41669 +       inode = dentry->d_inode;
41670 +       result = inode_change_ok(inode, attr);
41671 +       if (result)
41672 +               return result;
41673 +
41674 +       ctx = reiser4_init_context(inode->i_sb);
41675 +       if (IS_ERR(ctx))
41676 +               return PTR_ERR(ctx);
41677 +
41678 +       assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
41679 +
41680 +       /*
41681 +        * grab disk space and call standard inode_setattr().
41682 +        */
41683 +       result = setattr_reserve(reiser4_tree_by_inode(inode));
41684 +       if (!result) {
41685 +               if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
41686 +                   || (attr->ia_valid & ATTR_GID
41687 +                       && attr->ia_gid != inode->i_gid)) {
41688 +                       result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
41689 +                       if (result) {
41690 +                               context_set_commit_async(ctx);
41691 +                               reiser4_exit_context(ctx);
41692 +                               return result;
41693 +                       }
41694 +               }
41695 +               result = inode_setattr(inode, attr);
41696 +               if (!result)
41697 +                       reiser4_update_sd(inode);
41698 +       }
41699 +
41700 +       context_set_commit_async(ctx);
41701 +       reiser4_exit_context(ctx);
41702 +       return result;
41703 +}
41704 +
41705 +/* common implementation of vfs's getattr method of struct inode_operations:
41706 +   fills @stat from the in-memory inode of @dentry and always returns 0
41707 +*/
41708 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
41709 +                          struct dentry *dentry, struct kstat *stat)
41710 +{
41711 +       struct inode *obj;
41712 +
41713 +       assert("nikita-2298", dentry != NULL);
41714 +       assert("nikita-2299", stat != NULL);
41715 +       assert("nikita-2300", dentry->d_inode != NULL);
41716 +
41717 +       obj = dentry->d_inode;
41718 +
41719 +       stat->dev = obj->i_sb->s_dev;
41720 +       stat->ino = oid_to_uino(get_inode_oid(obj));
41721 +       stat->mode = obj->i_mode;
41722 +       /* don't confuse userland with huge nlink. This is not entirely
41723 +        * correct, because nlink_t is not necessarily 16 bit signed. */
41724 +       stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
41725 +       stat->uid = obj->i_uid;
41726 +       stat->gid = obj->i_gid;
41727 +       stat->rdev = obj->i_rdev;
41728 +       stat->atime = obj->i_atime;
41729 +       stat->mtime = obj->i_mtime;
41730 +       stat->ctime = obj->i_ctime;
41731 +       stat->size = obj->i_size;
41732 +       stat->blocks =
41733 +           (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
41734 +       /* "preferred" blocksize for efficient file system I/O */
41735 +       stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
41736 +
41737 +       return 0;
41738 +}
41739 +
41740 +/* Estimate the maximum number of nodes which might be allocated or changed on
41741 +   typical new object creation. Typical creation consists of calling create
41742 +   method of file plugin, adding directory entry to parent and updating parent
41743 +   directory's stat data. The sum also covers removal of that entry on failure.
41744 +*/
41745 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent,
41746 +                                                  /* parent object */
41747 +                                                  struct inode *object
41748 +                                                  /* object */)
41749 +{
41750 +       assert("vpf-309", parent != NULL);
41751 +       assert("vpf-307", object != NULL);
41752 +
41753 +       return
41754 +           /* object creation estimation */
41755 +           inode_file_plugin(object)->estimate.create(object) +
41756 +           /* stat data of parent directory estimation */
41757 +           inode_file_plugin(parent)->estimate.update(parent) +
41758 +           /* adding entry estimation */
41759 +           inode_dir_plugin(parent)->estimate.add_entry(parent) +
41760 +           /* to undo in the case of failure */
41761 +           inode_dir_plugin(parent)->estimate.rem_entry(parent);
41762 +}
41763 +
41764 +/* Create child in directory.
41765 +
41766 +   . get object's plugin
41767 +   . get fresh inode
41768 +   . initialize inode
41769 +   . add object's stat-data
41770 +   . initialize object's directory
41771 +   . add entry to the parent
41772 +   . instantiate dentry
41773 +
41774 +*/
41775 +static int do_create_vfs_child(reiser4_object_create_data * data,/* parameters
41776 +                                                                   of new
41777 +                                                                   object */
41778 +                              struct inode **retobj)
41779 +{
41780 +       int result;
41781 +
41782 +       struct dentry *dentry;  /* parent object */
41783 +       struct inode *parent;   /* new name */
41784 +
41785 +       dir_plugin *par_dir;    /* directory plugin on the parent */
41786 +       dir_plugin *obj_dir;    /* directory plugin on the new object */
41787 +       file_plugin *obj_plug;  /* object plugin on the new object */
41788 +       struct inode *object;   /* new object */
41789 +       reiser4_block_nr reserve;
41790 +
41791 +       reiser4_dir_entry_desc entry;   /* new directory entry */
41792 +
41793 +       assert("nikita-1420", data != NULL);
41794 +       parent = data->parent;
41795 +       dentry = data->dentry;
41796 +
41797 +       assert("nikita-1418", parent != NULL);
41798 +       assert("nikita-1419", dentry != NULL);
41799 +
41800 +       /* check, that name is acceptable for parent */
41801 +       par_dir = inode_dir_plugin(parent);
41802 +       if (par_dir->is_name_acceptable &&
41803 +           !par_dir->is_name_acceptable(parent,
41804 +                                        dentry->d_name.name,
41805 +                                        (int)dentry->d_name.len))
41806 +               return RETERR(-ENAMETOOLONG);
41807 +
41808 +       result = 0;
41809 +       obj_plug = file_plugin_by_id((int)data->id);
41810 +       if (obj_plug == NULL) {
41811 +               warning("nikita-430", "Cannot find plugin %i", data->id);
41812 +               return RETERR(-ENOENT);
41813 +       }
41814 +       object = new_inode(parent->i_sb);
41815 +       if (object == NULL)
41816 +               return RETERR(-ENOMEM);
41817 +       /* we'll update i_nlink below */
41818 +       object->i_nlink = 0;
41819 +       /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
41820 +        * to simplify error handling: if some error occurs before i_ino is
41821 +        * initialized with oid, i_ino should already be set to some
41822 +        * distinguished value. */
41823 +       object->i_ino = 0;
41824 +
41825 +       /* So that on error iput will be called. */
41826 +       *retobj = object;
41827 +
41828 +       if (DQUOT_ALLOC_INODE(object)) {
41829 +               DQUOT_DROP(object);
41830 +               object->i_flags |= S_NOQUOTA;
41831 +               return RETERR(-EDQUOT);
41832 +       }
41833 +
41834 +       memset(&entry, 0, sizeof entry);
41835 +       entry.obj = object;
41836 +
41837 +       set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
41838 +                  file_plugin_to_plugin(obj_plug));
41839 +       result = obj_plug->set_plug_in_inode(object, parent, data);
41840 +       if (result) {
41841 +               warning("nikita-431", "Cannot install plugin %i on %llx",
41842 +                       data->id, (unsigned long long)get_inode_oid(object));
41843 +               DQUOT_FREE_INODE(object);
41844 +               object->i_flags |= S_NOQUOTA;
41845 +               return result;
41846 +       }
41847 +
41848 +       /* reget plugin after installation */
41849 +       obj_plug = inode_file_plugin(object);
41850 +
41851 +       if (obj_plug->create_object == NULL) {
41852 +               DQUOT_FREE_INODE(object);
41853 +               object->i_flags |= S_NOQUOTA;
41854 +               return RETERR(-EPERM);
41855 +       }
41856 +
41857 +       /* if any of hash, tail, sd or permission plugins for newly created
41858 +          object are not set yet set them here inheriting them from parent
41859 +          directory
41860 +        */
41861 +       assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
41862 +       result = obj_plug->adjust_to_parent(object,
41863 +                                           parent,
41864 +                                           object->i_sb->s_root->d_inode);
41865 +       if (result == 0)
41866 +               result = finish_pset(object);
41867 +       if (result != 0) {
41868 +               warning("nikita-432", "Cannot inherit from %llx to %llx",
41869 +                       (unsigned long long)get_inode_oid(parent),
41870 +                       (unsigned long long)get_inode_oid(object));
41871 +               DQUOT_FREE_INODE(object);
41872 +               object->i_flags |= S_NOQUOTA;
41873 +               return result;
41874 +       }
41875 +
41876 +       /* setup inode and file-operations for this inode */
41877 +       setup_inode_ops(object, data);
41878 +
41879 +       /* call file plugin's method to initialize plugin specific part of
41880 +        * inode */
41881 +       if (obj_plug->init_inode_data)
41882 +               obj_plug->init_inode_data(object, data, 1/*create */);
41883 +
41884 +       /* obtain directory plugin (if any) for new object. */
41885 +       obj_dir = inode_dir_plugin(object);
41886 +       if (obj_dir != NULL && obj_dir->init == NULL) {
41887 +               DQUOT_FREE_INODE(object);
41888 +               object->i_flags |= S_NOQUOTA;
41889 +               return RETERR(-EPERM);
41890 +       }
41891 +
41892 +       reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
41893 +
41894 +       reserve = estimate_create_vfs_object(parent, object);
41895 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
41896 +               DQUOT_FREE_INODE(object);
41897 +               object->i_flags |= S_NOQUOTA;
41898 +               return RETERR(-ENOSPC);
41899 +       }
41900 +
41901 +       /* mark inode `immutable'. We disable changes to the file being
41902 +          created until valid directory entry for it is inserted. Otherwise,
41903 +          if file were expanded and insertion of directory entry fails, we
41904 +          have to remove file, but we only allotted enough space in
41905 +          transaction to remove _empty_ file. 3.x code used to remove stat
41906 +          data in different transaction thus possibly leaking disk space on
41907 +          crash. This all only matters if it's possible to access file
41908 +          without name, for example, by inode number
41909 +        */
41910 +       reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
41911 +
41912 +       /* create empty object, this includes allocation of new objectid. For
41913 +          directories this implies creation of dot and dotdot  */
41914 +       assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
41915 +
41916 +       /* mark inode as `loaded'. From this point onward
41917 +          reiser4_delete_inode() will try to remove its stat-data. */
41918 +       reiser4_inode_set_flag(object, REISER4_LOADED);
41919 +
41920 +       result = obj_plug->create_object(object, parent, data);
41921 +       if (result != 0) {
41922 +               reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
41923 +               if (result != -ENAMETOOLONG && result != -ENOMEM)
41924 +                       warning("nikita-2219",
41925 +                               "Failed to create sd for %llu",
41926 +                               (unsigned long long)get_inode_oid(object));
41927 +               DQUOT_FREE_INODE(object);
41928 +               object->i_flags |= S_NOQUOTA;
41929 +               return result;
41930 +       }
41931 +
41932 +       if (obj_dir != NULL)
41933 +               result = obj_dir->init(object, parent, data);
41934 +       if (result == 0) {
41935 +               assert("nikita-434", !reiser4_inode_get_flag(object,
41936 +                                                            REISER4_NO_SD));
41937 +               /* insert inode into VFS hash table */
41938 +               insert_inode_hash(object);
41939 +               /* create entry */
41940 +               result = par_dir->add_entry(parent, dentry, data, &entry);
41941 +               if (result == 0) {
41942 +                       result = reiser4_add_nlink(object, parent, 0);
41943 +                       /* If O_CREAT is set and the file did not previously
41944 +                          exist, upon successful completion, open() shall
41945 +                          mark for update the st_atime, st_ctime, and
41946 +                          st_mtime fields of the file and the st_ctime and
41947 +                          st_mtime fields of the parent directory. --SUS
41948 +                        */
41949 +                       /* @object times are already updated by
41950 +                          reiser4_add_nlink() */
41951 +                       if (result == 0)
41952 +                               reiser4_update_dir(parent);
41953 +                       if (result != 0)
41954 +                               /* cleanup failure to add nlink */
41955 +                               par_dir->rem_entry(parent, dentry, &entry);
41956 +               }
41957 +               if (result != 0)
41958 +                       /* cleanup failure to add entry */
41959 +                       obj_plug->detach(object, parent);
41960 +       } else if (result != -ENOMEM)
41961 +               warning("nikita-2219", "Failed to initialize dir for %llu: %i",
41962 +                       (unsigned long long)get_inode_oid(object), result);
41963 +
41964 +       /*
41965 +        * update stat-data, committing all pending modifications to the inode
41966 +        * fields.
41967 +        */
41968 +       reiser4_update_sd(object);
41969 +       if (result != 0) {
41970 +               DQUOT_FREE_INODE(object);
41971 +               object->i_flags |= S_NOQUOTA;
41972 +               /* if everything was ok (result == 0), parent stat-data is
41973 +                * already updated above (reiser4_update_dir()) */
41974 +               reiser4_update_sd(parent);
41975 +               /* failure to create entry, remove object */
41976 +               obj_plug->delete_object(object);
41977 +       }
41978 +
41979 +       /* file has name now, clear immutable flag */
41980 +       reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
41981 +
41982 +       /* on error, iput() will call ->delete_inode(). We should keep track
41983 +          of the existence of stat-data for this inode and avoid attempt to
41984 +          remove it in reiser4_delete_inode(). This is accomplished through
41985 +          REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
41986 +        */
41987 +       return result;
41988 +}
41989 +
41990 +/* Helper for the common implementations of reiser4_mkdir, reiser4_create,
41991 +   reiser4_mknod and reiser4_symlink. Returns 0 and instantiates @dentry on
41992 +   success; on error marks any allocated child inode bad and drops it. */
41993 +static int
41994 +create_vfs_object(struct inode *parent,
41995 +                 struct dentry *dentry, reiser4_object_create_data * data)
41996 +{
41997 +       reiser4_context *ctx;
41998 +       int result;
41999 +       struct inode *child;
42000 +
42001 +       ctx = reiser4_init_context(parent->i_sb);
42002 +       if (IS_ERR(ctx))
42003 +               return PTR_ERR(ctx);
42004 +       context_set_commit_async(ctx);
42005 +
42006 +       data->parent = parent;
42007 +       data->dentry = dentry;
42008 +       child = NULL; /* set by do_create_vfs_child() once an inode exists */
42009 +       result = do_create_vfs_child(data, &child);
42010 +       if (unlikely(result != 0)) {
42011 +               if (child != NULL) {
42012 +                       reiser4_make_bad_inode(child);
42013 +                       iput(child);
42014 +               }
42015 +       } else
42016 +               d_instantiate(dentry, child);
42017 +
42018 +       reiser4_exit_context(ctx);
42019 +       return result;
42020 +}
42021 +
42022 +/**
42023 + * Helper for link_common. Returns the amount of disk space to reserve for
42024 + * adding a link from @parent to @object: the sum of the estimates below.
42025 + */
42026 +static reiser4_block_nr common_estimate_link(struct inode *parent /* parent
42027 +                                                                  * directory
42028 +                                                                  */,
42029 +                                            struct inode *object /* object to
42030 +                                                                  * which new
42031 +                                                                  * link is
42032 +                                                                  * being
42033 +                                                                  * created */)
42034 +{
42035 +       reiser4_block_nr res = 0;
42036 +       file_plugin *fplug;
42037 +       dir_plugin *dplug;
42038 +
42039 +       assert("vpf-317", object != NULL);
42040 +       assert("vpf-318", parent != NULL);
42041 +
42042 +       fplug = inode_file_plugin(object);
42043 +       dplug = inode_dir_plugin(parent);
42044 +       /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice
42045 +        * instead of multiplying by 2? */
42046 +       /* reiser4_add_nlink(object) */
42047 +       res += fplug->estimate.update(object);
42048 +       /* add_entry(parent) */
42049 +       res += dplug->estimate.add_entry(parent);
42050 +       /* reiser4_del_nlink(object) -- presumably for undo on failure */
42051 +       res += fplug->estimate.update(object);
42052 +       /* reiser4_update_dir(parent) */
42053 +       res += inode_file_plugin(parent)->estimate.update(parent);
42054 +       /* safe-link: removal of one item */
42055 +       res += estimate_one_item_removal(reiser4_tree_by_inode(object));
42056 +
42057 +       return res;
42058 +}
42059 +
42060 +/* Estimate disk space necessary to remove a link between @parent and
42061 +   @object. Returns the sum of the per-step estimates listed in the body.
42062 +*/
42063 +static reiser4_block_nr estimate_unlink(struct inode *parent /* parent
42064 +                                                             * directory */,
42065 +                                       struct inode *object /* object whose
42066 +                                                             * link is being
42067 +                                                             * removed
42068 +                                                             */)
42069 +{
42070 +       reiser4_block_nr res = 0;
42071 +       file_plugin *fplug;
42072 +       dir_plugin *dplug;
42073 +
42074 +       assert("vpf-317", object != NULL);
42075 +       assert("vpf-318", parent != NULL);
42076 +
42077 +       fplug = inode_file_plugin(object);
42078 +       dplug = inode_dir_plugin(parent);
42079 +
42080 +       /* rem_entry(parent) */
42081 +       res += dplug->estimate.rem_entry(parent);
42082 +       /* reiser4_del_nlink(object) */
42083 +       res += fplug->estimate.update(object);
42084 +       /* reiser4_update_dir(parent) */
42085 +       res += inode_file_plugin(parent)->estimate.update(parent);
42086 +       /* fplug->unlink */
42087 +       res += fplug->estimate.unlink(object, parent);
42088 +       /* safe-link: insertion of one item */
42089 +       res += estimate_one_insert_item(reiser4_tree_by_inode(object));
42090 +
42091 +       return res;
42092 +}
42093 +
42094 +/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
42095 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
42096 +{
42097 +       file_plugin *fplug;
42098 +       struct inode *child;
42099 +       reiser4_block_nr reserve;
42100 +
42101 +       child = victim->d_inode;
42102 +       fplug = inode_file_plugin(child);
42103 +
42104 +       /* check for race with create_object() */
42105 +       if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
42106 +               return RETERR(-E_REPEAT);
42107 +       /* object being deleted should have stat data */
42108 +       assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
42109 +
42110 +       /* ask object plugin */
42111 +       if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
42112 +               return RETERR(-ENOTEMPTY);
42113 +
42114 +       /* estimate_unlink() yields a block count, not an error code: keep it
42115 +        * as reiser4_block_nr instead of narrowing it to int and testing for
42116 +        * a negative value. */
42117 +       reserve = estimate_unlink(parent, child);
42118 +
42119 +       return reiser4_grab_reserved(child->i_sb, reserve, BA_CAN_COMMIT);
42120 +}
42121 +
42122 +/* helper for reiser4_setattr_common: grab space for one insert into item */
42123 +static int setattr_reserve(reiser4_tree * tree)
42124 +{
42125 +       assert("vs-1096", is_grab_enabled(get_current_context()));
42126 +       return reiser4_grab_space(estimate_one_insert_into_item(tree),
42127 +                                 BA_CAN_COMMIT);
42128 +}
42129 +
42130 +/* helper function. Standards require that many file-system operations, on
42131 +   success, update ctime and mtime of the parent directory; do so for @dir. */
42132 +int reiser4_update_dir(struct inode *dir)
42133 +{
42134 +       assert("nikita-2525", dir != NULL);
42135 +
42136 +       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
42137 +       return reiser4_update_sd(dir); /* propagate stat-data update result */
42138 +}
42139 diff -puN /dev/null fs/reiser4/plugin/inode_ops_rename.c
42140 --- /dev/null
42141 +++ a/fs/reiser4/plugin/inode_ops_rename.c
42142 @@ -0,0 +1,925 @@
42143 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
42144 + * reiser4/README */
42145 +
42146 +#include "../inode.h"
42147 +#include "../safe_link.h"
42148 +
42149 +static const char *possible_leak = "Possible disk space leak.";
42150 +
42151 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
42152 +   Returns 0 once the entry is re-targeted, even if unlinking @from_inode fails.
42153 +   Helper function called from hashed_rename() */
42154 +static int replace_name(struct inode *to_inode,        /* inode where @from_coord is
42155 +                                                * to be re-targeted at */
42156 +                       struct inode *from_dir, /* directory where @from_coord
42157 +                                                * lives */
42158 +                       struct inode *from_inode, /* inode @from_coord
42159 +                                                  * originally points to */
42160 +                       coord_t *from_coord,    /* where directory entry is in
42161 +                                                * the tree */
42162 +                       lock_handle * from_lh/* lock handle on @from_coord */)
42163 +{
42164 +       item_plugin *from_item;
42165 +       int result;
42166 +       znode *node;
42167 +
42168 +       coord_clear_iplug(from_coord);
42169 +       node = from_coord->node;
42170 +       result = zload(node);
42171 +       if (result != 0)
42172 +               return result;
42173 +       from_item = item_plugin_by_coord(from_coord);
42174 +       if (plugin_of_group(item_plugin_by_coord(from_coord),
42175 +                           DIR_ENTRY_ITEM_TYPE)) {
42176 +               reiser4_key to_key;
42177 +
42178 +               build_sd_key(to_inode, &to_key);
42179 +
42180 +               /* everything is found and prepared to change directory entry
42181 +                  at @from_coord to point to @to_inode.
42182 +
42183 +                  @to_inode is just about to get new name, so bump its link
42184 +                  counter.
42185 +
42186 +                */
42187 +               result = reiser4_add_nlink(to_inode, from_dir, 0);
42188 +               if (result != 0) {
42189 +                       /* Don't issue warning: this may be plain -EMLINK */
42190 +                       zrelse(node);
42191 +                       return result;
42192 +               }
42193 +
42194 +               result =
42195 +                   from_item->s.dir.update_key(from_coord, &to_key, from_lh);
42196 +               if (result != 0) {
42197 +                       reiser4_del_nlink(to_inode, from_dir, 0);
42198 +                       zrelse(node);
42199 +                       return result;
42200 +               }
42201 +
42202 +               /* @from_inode just lost its name, he-he.
42203 +
42204 +                  If @from_inode was directory, it contained dotdot pointing
42205 +                  to @from_dir. @from_dir i_nlink will be decreased when
42206 +                  iput() will be called on @from_inode.
42207 +
42208 +                  If file-system is not ADG (hard-links are
42209 +                  supported on directories), iput(from_inode) will not remove
42210 +                  @from_inode, and thus above is incorrect, but hard-links on
42211 +                  directories are problematic in many other respects.
42212 +                */
42213 +               result = reiser4_del_nlink(from_inode, from_dir, 0);
42214 +               if (result != 0) {
42215 +                       warning("nikita-2330",
42216 +                               "Cannot remove link from source: %i. %s",
42217 +                               result, possible_leak);
42218 +               }
42219 +               /* Has to return success, because entry is already
42220 +                * modified. */
42221 +               result = 0;
42222 +
42223 +               /* NOTE-NIKITA consider calling plugin method instead of
42224 +                  accessing inode fields directly. */
42225 +               from_dir->i_mtime = CURRENT_TIME;
42226 +       } else {
42227 +               warning("nikita-2326", "Unexpected item type");
42228 +               result = RETERR(-EIO);
42229 +       }
42230 +       zrelse(node);
42231 +       return result;
42232 +}
42233 +
42234 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh.
42235 +   Returns 0 on success, or the error from link bump / entry insertion.
42236 +   Helper function used by hashed_rename(). */
42237 +static int add_name(struct inode *inode,       /* inode the new entry will
42238 +                                                * point to */
42239 +                   struct inode *dir,  /* directory where @coord lives */
42240 +                   struct dentry *name,        /* new name */
42241 +                   coord_t *coord,     /* where directory entry is in the tree
42242 +                                        */
42243 +                   lock_handle * lh,   /* lock handle on @coord */
42244 +                   int is_dir/* true, if @inode is directory */)
42245 +{
42246 +       int result;
42247 +       reiser4_dir_entry_desc entry;
42248 +
42249 +       assert("nikita-2333", lh->node == coord->node);
42250 +       assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
42251 +
42252 +       memset(&entry, 0, sizeof entry);
42253 +       entry.obj = inode;
42254 +       /* build key of directory entry description */
42255 +       inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
42256 +
42257 +       /* ext2 does this in different order: first inserts new entry,
42258 +          then increases directory nlink. We don't want do this,
42259 +          because reiser4_add_nlink() calls ->add_link() plugin
42260 +          method that can fail for whatever reason, leaving us with
42261 +          cleanup problems. So bump the link first and bail out on error. */
42262 +       /* @inode is getting new name */
42263 +       result = reiser4_add_nlink(inode, dir, 0);
42264 +       if (result != 0)
42265 +               return result;
42266 +       /* create @name in @dir pointing to @inode */
42267 +       result = WITH_COORD(coord,
42268 +                           inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
42269 +                                                                       coord,
42270 +                                                                       lh,
42271 +                                                                       name,
42272 +                                                                       &entry));
42273 +       if (result != 0) {
42274 +               int result2;
42275 +               result2 = reiser4_del_nlink(inode, dir, 0);
42276 +               if (result2 != 0) {
42277 +                       warning("nikita-2327",
42278 +                               "Cannot drop link on %lli %i. %s",
42279 +                               (unsigned long long)get_inode_oid(inode),
42280 +                               result2, possible_leak);
42281 +               }
42282 +       } else
42283 +               INODE_INC_FIELD(dir, i_size);
42284 +       return result;
42285 +}
42286 +
42287 +static reiser4_block_nr estimate_rename(struct inode *old_dir,  /* directory
42288 +                                                                * where @old is
42289 +                                                                * located */
42290 +                                       struct dentry *old_name,/* old name */
42291 +                                       struct inode *new_dir,  /* directory
42292 +                                                                * where @new is
42293 +                                                                * located */
42294 +                                       struct dentry *new_name /* new name */)
42295 +{
42296 +       reiser4_block_nr res1, res2;
42297 +       dir_plugin * p_parent_old, *p_parent_new;
42298 +       file_plugin * p_child_old, *p_child_new;
42299 +
42300 +       assert("vpf-311", old_dir != NULL);
42301 +       assert("vpf-312", new_dir != NULL);
42302 +       assert("vpf-313", old_name != NULL);
42303 +       assert("vpf-314", new_name != NULL);
42304 +
42305 +       p_parent_old = inode_dir_plugin(old_dir);
42306 +       p_parent_new = inode_dir_plugin(new_dir);
42307 +       p_child_old = inode_file_plugin(old_name->d_inode);
42308 +       if (new_name->d_inode)
42309 +               p_child_new = inode_file_plugin(new_name->d_inode);
42310 +       else
42311 +               p_child_new = NULL;
42312 +
42313 +       /* find_entry - can insert one leaf. */
42314 +       res1 = res2 = 1;
42315 +
42316 +       /* replace_name */
42317 +       {
42318 +               /* reiser4_add_nlink(p_child_old) and
42319 +                * reiser4_del_nlink(p_child_old) */
42320 +               res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
42321 +               /* update key */
42322 +               res1 += 1;
42323 +               /* reiser4_del_nlink(p_child_new) */
42324 +               if (p_child_new)
42325 +                       res1 += p_child_new->estimate.update(new_name->d_inode);
42326 +       }
42327 +
42328 +       /* else add_name */
42329 +       {
42330 +               /* reiser4_add_nlink(new_dir) and
42331 +                * reiser4_del_nlink(new_dir) */
42332 +               res2 +=
42333 +                   2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
42334 +               /* reiser4_add_nlink(p_child_old) */
42335 +               res2 += p_child_old->estimate.update(old_name->d_inode);
42336 +               /* add_entry(p_parent_new) */
42337 +               res2 += p_parent_new->estimate.add_entry(new_dir);
42338 +               /* reiser4_del_nlink(p_child_old) */
42339 +               res2 += p_child_old->estimate.update(old_name->d_inode);
42340 +       }
42341 +
42342 +       res1 = res1 < res2 ? res2 : res1; /* keep the larger alternative */
42343 +
42344 +       /* reiser4_write_sd(new_dir) */
42345 +       res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
42346 +
42347 +       /* reiser4_write_sd(p_child_new) */
42348 +       if (p_child_new)
42349 +               res1 += p_child_new->estimate.update(new_name->d_inode);
42350 +
42351 +       /* hashed_rem_entry(p_parent_old) */
42352 +       res1 += p_parent_old->estimate.rem_entry(old_dir);
42353 +
42354 +       /* reiser4_del_nlink(p_child_old) */
42355 +       res1 += p_child_old->estimate.update(old_name->d_inode);
42356 +
42357 +       /* replace_name */
42358 +       {
42359 +               /* reiser4_add_nlink(new_dir) */
42360 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
42361 +               /* update_key */
42362 +               res1 += 1;
42363 +               /* reiser4_del_nlink(new_dir) */
42364 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
42365 +               /* reiser4_del_nlink(old_dir) */
42366 +               res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
42367 +       }
42368 +
42369 +       /* reiser4_write_sd(old_dir) */
42370 +       res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
42371 +
42372 +       /* reiser4_write_sd(p_child_old) */
42373 +       res1 += p_child_old->estimate.update(old_name->d_inode);
42374 +
42375 +       return res1;
42376 +}
42377 +
42378 +static int hashed_rename_estimate_and_grab(struct inode *old_dir,  /* directory
42379 +                                                                   * where @old
42380 +                                                                   * is located
42381 +                                                                   */
42382 +                                          struct dentry *old_name,/* old name
42383 +                                                                   */
42384 +                                          struct inode *new_dir,  /* directory
42385 +                                                                   * where @new
42386 +                                                                   * is located
42387 +                                                                   */
42388 +                                          struct dentry *new_name /* new name
42389 +                                                                   */)
42390 +{
42391 +       reiser4_block_nr reserve;
42392 +
42393 +       reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
42394 +
42395 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
42396 +               return RETERR(-ENOSPC); /* any grab failure maps to ENOSPC */
42397 +
42398 +       return 0;
42399 +}
42400 +
42401 +/* check whether @old_inode may be renamed into @new_dir, replacing @new_inode
42402 + * if it is not NULL; e.g. rejects pseudo-files. Returns 0 or negative error. */
42403 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
42404 +                     struct inode *new_dir, struct inode *new_inode)
42405 +{
42406 +       file_plugin *fplug;
42407 +       dir_plugin *dplug;
42408 +
42409 +       assert("nikita-3370", old_inode != NULL);
42410 +
42411 +       dplug = inode_dir_plugin(new_dir);
42412 +       fplug = inode_file_plugin(old_inode);
42413 +
42414 +       if (dplug == NULL)
42415 +               return RETERR(-ENOTDIR); /* @new_dir has no dir plugin */
42416 +       else if (new_dir->i_op->create == NULL)
42417 +               return RETERR(-EPERM); /* @new_dir has no ->create */
42418 +       else if (!fplug->can_add_link(old_inode))
42419 +               return RETERR(-EMLINK);
42420 +       else if (new_inode != NULL) {
42421 +               fplug = inode_file_plugin(new_inode); /* now the victim's plugin */
42422 +               if (fplug->can_rem_link != NULL &&
42423 +                   !fplug->can_rem_link(new_inode))
42424 +                       return RETERR(-EBUSY);
42425 +       }
42426 +       return 0;
42427 +}
42428 +
42429 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle * ,
42430 +              znode_lock_mode, reiser4_dir_entry_desc *);
42431 +int reiser4_update_dir(struct inode *);
42432 +
42433 +/* this is common implementation of vfs's rename method of struct
42434 +   inode_operations
42435 +   See comments in the body.
42436 +
42437 +   It is arguable that this function can be made generic so, that it
42438 +   will be applicable to any kind of directory plugin that deals with
42439 +   directories composed out of directory entries. The only obstacle
42440 +   here is that we don't have any data-type to represent directory
42441 +   entry. This should be re-considered when more than one different
42442 +   directory plugin will be implemented.
42443 +*/
42444 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
42445 +                                                * is located */ ,
42446 +                         struct dentry *old_name /* old name */ ,
42447 +                         struct inode *new_dir /* directory where @new
42448 +                                                * is located */ ,
42449 +                         struct dentry *new_name/* new name */)
42450 +{
42451 +       /* From `The Open Group Base Specifications Issue 6'
42452 +
42453 +          If either the old or new argument names a symbolic link, rename()
42454 +          shall operate on the symbolic link itself, and shall not resolve
42455 +          the last component of the argument. If the old argument and the new
42456 +          argument resolve to the same existing file, rename() shall return
42457 +          successfully and perform no other action.
42458 +
42459 +          [this is done by VFS: vfs_rename()]
42460 +
42461 +          If the old argument points to the pathname of a file that is not a
42462 +          directory, the new argument shall not point to the pathname of a
42463 +          directory.
42464 +
42465 +          [checked by VFS: vfs_rename->may_delete()]
42466 +
42467 +          If the link named by the new argument exists, it shall
42468 +          be removed and old renamed to new. In this case, a link named new
42469 +          shall remain visible to other processes throughout the renaming
42470 +          operation and refer either to the file referred to by new or old
42471 +          before the operation began.
42472 +
42473 +          [we should assure this]
42474 +
42475 +          Write access permission is required for
42476 +          both the directory containing old and the directory containing new.
42477 +
42478 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
42479 +
42480 +          If the old argument points to the pathname of a directory, the new
42481 +          argument shall not point to the pathname of a file that is not a
42482 +          directory.
42483 +
42484 +          [checked by VFS: vfs_rename->may_delete()]
42485 +
42486 +          If the directory named by the new argument exists, it
42487 +          shall be removed and old renamed to new. In this case, a link named
42488 +          new shall exist throughout the renaming operation and shall refer
42489 +          either to the directory referred to by new or old before the
42490 +          operation began.
42491 +
42492 +          [we should assure this]
42493 +
42494 +          If new names an existing directory, it shall be
42495 +          required to be an empty directory.
42496 +
42497 +          [we should check this]
42498 +
42499 +          If the old argument points to a pathname of a symbolic link, the
42500 +          symbolic link shall be renamed. If the new argument points to a
42501 +          pathname of a symbolic link, the symbolic link shall be removed.
42502 +
42503 +          The new pathname shall not contain a path prefix that names
42504 +          old. Write access permission is required for the directory
42505 +          containing old and the directory containing new. If the old
42506 +          argument points to the pathname of a directory, write access
42507 +          permission may be required for the directory named by old, and, if
42508 +          it exists, the directory named by new.
42509 +
42510 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
42511 +
42512 +          If the link named by the new argument exists and the file's link
42513 +          count becomes 0 when it is removed and no process has the file
42514 +          open, the space occupied by the file shall be freed and the file
42515 +          shall no longer be accessible. If one or more processes have the
42516 +          file open when the last link is removed, the link shall be removed
42517 +          before rename() returns, but the removal of the file contents shall
42518 +          be postponed until all references to the file are closed.
42519 +
42520 +          [iput() handles this, but we can do this manually, a la
42521 +          reiser4_unlink()]
42522 +
42523 +          Upon successful completion, rename() shall mark for update the
42524 +          st_ctime and st_mtime fields of the parent directory of each file.
42525 +
42526 +          [N/A]
42527 +
42528 +        */
42529 +       reiser4_context *ctx;
42530 +       int result;
42531 +       int is_dir;             /* is @old_name directory */
42532 +
42533 +       struct inode *old_inode;
42534 +       struct inode *new_inode;
42535 +       coord_t *new_coord;
42536 +
42537 +       struct reiser4_dentry_fsdata *new_fsdata;
42538 +       dir_plugin *dplug;
42539 +       file_plugin *fplug;
42540 +
42541 +       reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
42542 +       lock_handle * new_lh, *dotdot_lh;
42543 +       struct dentry *dotdot_name;
42544 +       struct reiser4_dentry_fsdata *dataonstack;
42545 +
42546 +       ctx = reiser4_init_context(old_dir->i_sb);
42547 +       if (IS_ERR(ctx))
42548 +               return PTR_ERR(ctx);
42549 +
42550 +       old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
42551 +                           sizeof(*dotdot_name) + sizeof(*dataonstack),
42552 +                           reiser4_ctx_gfp_mask_get());
42553 +       if (!old_entry) {
42554 +               context_set_commit_async(ctx);
42555 +               reiser4_exit_context(ctx);
42556 +               return RETERR(-ENOMEM);
42557 +       }
42558 +
42559 +       new_entry = old_entry + 1;
42560 +       dotdot_entry = old_entry + 2;
42561 +       new_lh = (lock_handle *)(old_entry + 3);
42562 +       dotdot_lh = new_lh + 1;
42563 +       dotdot_name = (struct dentry *)(new_lh + 2);
42564 +       dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
42565 +
42566 +       assert("nikita-2318", old_dir != NULL);
42567 +       assert("nikita-2319", new_dir != NULL);
42568 +       assert("nikita-2320", old_name != NULL);
42569 +       assert("nikita-2321", new_name != NULL);
42570 +
42571 +       old_inode = old_name->d_inode;
42572 +       new_inode = new_name->d_inode;
42573 +
42574 +       dplug = inode_dir_plugin(old_dir);
42575 +       fplug = NULL;
42576 +
42577 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
42578 +       if (IS_ERR(new_fsdata)) {
42579 +               kfree(old_entry);
42580 +               context_set_commit_async(ctx);
42581 +               reiser4_exit_context(ctx);
42582 +               return PTR_ERR(new_fsdata);
42583 +       }
42584 +
42585 +       new_coord = &new_fsdata->dec.entry_coord;
42586 +       coord_clear_iplug(new_coord);
42587 +
42588 +       is_dir = S_ISDIR(old_inode->i_mode);
42589 +
42590 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
42591 +
42592 +       /* if target is existing directory and it's not empty---return error.
42593 +
42594 +          This check is done specifically, because is_dir_empty() requires
42595 +          tree traversal and have to be done before locks are taken.
42596 +        */
42597 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
42598 +               kfree(old_entry);
42599 +               context_set_commit_async(ctx);
42600 +               reiser4_exit_context(ctx);
42601 +               return RETERR(-ENOTEMPTY);
42602 +       }
42603 +
42604 +       result = can_rename(old_dir, old_inode, new_dir, new_inode);
42605 +       if (result != 0) {
42606 +               kfree(old_entry);
42607 +               context_set_commit_async(ctx);
42608 +               reiser4_exit_context(ctx);
42609 +               return result;
42610 +       }
42611 +
42612 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
42613 +                                                new_dir, new_name);
42614 +       if (result != 0) {
42615 +               kfree(old_entry);
42616 +               context_set_commit_async(ctx);
42617 +               reiser4_exit_context(ctx);
42618 +               return result;
42619 +       }
42620 +
42621 +       init_lh(new_lh);
42622 +
42623 +       /* find entry for @new_name */
42624 +       result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
42625 +                                   new_entry);
42626 +
42627 +       if (IS_CBKERR(result)) {
42628 +               done_lh(new_lh);
42629 +               kfree(old_entry);
42630 +               context_set_commit_async(ctx);
42631 +               reiser4_exit_context(ctx);
42632 +               return result;
42633 +       }
42634 +
42635 +       reiser4_seal_done(&new_fsdata->dec.entry_seal);
42636 +
42637 +       /* add or replace name for @old_inode as @new_name */
42638 +       if (new_inode != NULL) {
42639 +               /* target (@new_name) exists. */
42640 +               /* Not clear what to do with objects that are
42641 +                  both directories and files at the same time. */
42642 +               if (result == CBK_COORD_FOUND) {
42643 +                       result = replace_name(old_inode,
42644 +                                             new_dir,
42645 +                                             new_inode, new_coord, new_lh);
42646 +                       if (result == 0)
42647 +                               fplug = inode_file_plugin(new_inode);
42648 +               } else if (result == CBK_COORD_NOTFOUND) {
42649 +                       /* VFS told us that @new_name is bound to existing
42650 +                          inode, but we failed to find directory entry. */
42651 +                       warning("nikita-2324", "Target not found");
42652 +                       result = RETERR(-ENOENT);
42653 +               }
42654 +       } else {
42655 +               /* target (@new_name) doesn't exists. */
42656 +               if (result == CBK_COORD_NOTFOUND)
42657 +                       result = add_name(old_inode,
42658 +                                         new_dir,
42659 +                                         new_name, new_coord, new_lh, is_dir);
42660 +               else if (result == CBK_COORD_FOUND) {
42661 +                       /* VFS told us that @new_name is "negative" dentry,
42662 +                          but we found directory entry. */
42663 +                       warning("nikita-2331", "Target found unexpectedly");
42664 +                       result = RETERR(-EIO);
42665 +               }
42666 +       }
42667 +
42668 +       assert("nikita-3462", ergo(result == 0,
42669 +                                  old_inode->i_nlink >= 2 + !!is_dir));
42670 +
42671 +       /* We are done with all modifications to the @new_dir, release lock on
42672 +          node. */
42673 +       done_lh(new_lh);
42674 +
42675 +       if (fplug != NULL) {
42676 +               /* detach @new_inode from name-space */
42677 +               result = fplug->detach(new_inode, new_dir);
42678 +               if (result != 0)
42679 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
42680 +                               (unsigned long long)get_inode_oid(new_inode),
42681 +                               result, possible_leak);
42682 +       }
42683 +
42684 +       if (new_inode != NULL)
42685 +               reiser4_update_sd(new_inode);
42686 +
42687 +       if (result == 0) {
42688 +               old_entry->obj = old_inode;
42689 +
42690 +               dplug->build_entry_key(old_dir,
42691 +                                      &old_name->d_name, &old_entry->key);
42692 +
42693 +               /* At this stage new name was introduced for
42694 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
42695 +                  counters were updated.
42696 +
42697 +                  We want to remove @old_name now. If @old_inode wasn't
42698 +                  directory this is simple.
42699 +                */
42700 +               result = dplug->rem_entry(old_dir, old_name, old_entry);
42701 +               if (result != 0 && result != -ENOMEM) {
42702 +                       warning("nikita-2335",
42703 +                               "Cannot remove old name: %i", result);
42704 +               } else {
42705 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
42706 +                       if (result != 0 && result != -ENOMEM) {
42707 +                               warning("nikita-2337",
42708 +                                       "Cannot drop link on old: %i", result);
42709 +                       }
42710 +               }
42711 +
42712 +               if (result == 0 && is_dir) {
42713 +                       /* @old_inode is directory. We also have to update
42714 +                          dotdot entry. */
42715 +                       coord_t *dotdot_coord;
42716 +
42717 +                       memset(dataonstack, 0, sizeof dataonstack);
42718 +                       memset(dotdot_entry, 0, sizeof dotdot_entry);
42719 +                       dotdot_entry->obj = old_dir;
42720 +                       memset(dotdot_name, 0, sizeof dotdot_name);
42721 +                       dotdot_name->d_name.name = "..";
42722 +                       dotdot_name->d_name.len = 2;
42723 +                       /*
42724 +                        * allocate ->d_fsdata on the stack to avoid using
42725 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
42726 +                        * because dentry is private to the current thread.
42727 +                        */
42728 +                       dotdot_name->d_fsdata = dataonstack;
42729 +                       init_lh(dotdot_lh);
42730 +
42731 +                       dotdot_coord = &dataonstack->dec.entry_coord;
42732 +                       coord_clear_iplug(dotdot_coord);
42733 +
42734 +                       result = reiser4_find_entry(old_inode, dotdot_name,
42735 +                                                   dotdot_lh, ZNODE_WRITE_LOCK,
42736 +                                                   dotdot_entry);
42737 +                       if (result == 0) {
42738 +                               /* replace_name() decreases i_nlink on
42739 +                                * @old_dir */
42740 +                               result = replace_name(new_dir,
42741 +                                                     old_inode,
42742 +                                                     old_dir,
42743 +                                                     dotdot_coord, dotdot_lh);
42744 +                       } else
42745 +                               result = RETERR(-EIO);
42746 +                       done_lh(dotdot_lh);
42747 +               }
42748 +       }
42749 +       reiser4_update_dir(new_dir);
42750 +       reiser4_update_dir(old_dir);
42751 +       reiser4_update_sd(old_inode);
42752 +       if (result == 0) {
42753 +               file_plugin *fplug;
42754 +
42755 +               if (new_inode != NULL) {
42756 +                       /* add safe-link for target file (in case we removed
42757 +                        * last reference to the poor fellow */
42758 +                       fplug = inode_file_plugin(new_inode);
42759 +                       if (new_inode->i_nlink == 0)
42760 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
42761 +               }
42762 +       }
42763 +       kfree(old_entry);
42764 +       context_set_commit_async(ctx);
42765 +       reiser4_exit_context(ctx);
42766 +       return result;
42767 +}
42768 +
42769 +#if 0
42770 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
42771 +                                                * is located */ ,
42772 +                         struct dentry *old_name /* old name */ ,
42773 +                         struct inode *new_dir /* directory where @new
42774 +                                                * is located */ ,
42775 +                         struct dentry *new_name/* new name */)
42776 +{
42777 +       /* Rename @old_name (in @old_dir) to @new_name (in @new_dir); 0 on
42778 +          success. From `The Open Group Base Specifications Issue 6':
42779 +          If either the old or new argument names a symbolic link, rename()
42780 +          shall operate on the symbolic link itself, and shall not resolve
42781 +          the last component of the argument. If the old argument and the new
42782 +          argument resolve to the same existing file, rename() shall return
42783 +          successfully and perform no other action.
42784 +
42785 +          [this is done by VFS: vfs_rename()]
42786 +
42787 +          If the old argument points to the pathname of a file that is not a
42788 +          directory, the new argument shall not point to the pathname of a
42789 +          directory.
42790 +
42791 +          [checked by VFS: vfs_rename->may_delete()]
42792 +
42793 +          If the link named by the new argument exists, it shall
42794 +          be removed and old renamed to new. In this case, a link named new
42795 +          shall remain visible to other processes throughout the renaming
42796 +          operation and refer either to the file referred to by new or old
42797 +          before the operation began.
42798 +
42799 +          [we should assure this]
42800 +
42801 +          Write access permission is required for
42802 +          both the directory containing old and the directory containing new.
42803 +
42804 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
42805 +
42806 +          If the old argument points to the pathname of a directory, the new
42807 +          argument shall not point to the pathname of a file that is not a
42808 +          directory.
42809 +
42810 +          [checked by VFS: vfs_rename->may_delete()]
42811 +
42812 +          If the directory named by the new argument exists, it
42813 +          shall be removed and old renamed to new. In this case, a link named
42814 +          new shall exist throughout the renaming operation and shall refer
42815 +          either to the directory referred to by new or old before the
42816 +          operation began.
42817 +
42818 +          [we should assure this]
42819 +
42820 +          If new names an existing directory, it shall be
42821 +          required to be an empty directory.
42822 +
42823 +          [we should check this]
42824 +
42825 +          If the old argument points to a pathname of a symbolic link, the
42826 +          symbolic link shall be renamed. If the new argument points to a
42827 +          pathname of a symbolic link, the symbolic link shall be removed.
42828 +
42829 +          The new pathname shall not contain a path prefix that names
42830 +          old. Write access permission is required for the directory
42831 +          containing old and the directory containing new. If the old
42832 +          argument points to the pathname of a directory, write access
42833 +          permission may be required for the directory named by old, and, if
42834 +          it exists, the directory named by new.
42835 +
42836 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
42837 +
42838 +          If the link named by the new argument exists and the file's link
42839 +          count becomes 0 when it is removed and no process has the file
42840 +          open, the space occupied by the file shall be freed and the file
42841 +          shall no longer be accessible. If one or more processes have the
42842 +          file open when the last link is removed, the link shall be removed
42843 +          before rename() returns, but the removal of the file contents shall
42844 +          be postponed until all references to the file are closed.
42845 +
42846 +          [iput() handles this, but we can do this manually, a la
42847 +          reiser4_unlink()]
42848 +
42849 +          Upon successful completion, rename() shall mark for update the
42850 +          st_ctime and st_mtime fields of the parent directory of each file.
42851 +
42852 +          [N/A]
42853 +
42854 +        */
42855 +       reiser4_context *ctx;
42856 +       int result;
42857 +       int is_dir;             /* is @old_name directory */
42858 +       struct inode *old_inode;
42859 +       struct inode *new_inode;
42860 +       reiser4_dir_entry_desc old_entry;
42861 +       reiser4_dir_entry_desc new_entry;
42862 +       coord_t *new_coord;
42863 +       struct reiser4_dentry_fsdata *new_fsdata;
42864 +       lock_handle new_lh;
42865 +       dir_plugin *dplug;
42866 +       file_plugin *fplug;
42867 +
42868 +       ctx = reiser4_init_context(old_dir->i_sb);
42869 +       if (IS_ERR(ctx))
42870 +               return PTR_ERR(ctx);
42871 +
42872 +       assert("nikita-2318", old_dir != NULL);
42873 +       assert("nikita-2319", new_dir != NULL);
42874 +       assert("nikita-2320", old_name != NULL);
42875 +       assert("nikita-2321", new_name != NULL);
42876 +
42877 +       old_inode = old_name->d_inode;
42878 +       new_inode = new_name->d_inode;
42879 +
42880 +       dplug = inode_dir_plugin(old_dir);
42881 +       fplug = NULL;
42882 +
42883 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
42884 +       if (IS_ERR(new_fsdata)) {
42885 +               result = PTR_ERR(new_fsdata);
42886 +               goto exit;
42887 +       }
42888 +
42889 +       new_coord = &new_fsdata->dec.entry_coord;
42890 +       coord_clear_iplug(new_coord);
42891 +
42892 +       is_dir = S_ISDIR(old_inode->i_mode);
42893 +
42894 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
42895 +
42896 +       /* if target is existing directory and it's not empty---return error.
42897 +          Leave through exit: so that the reiser4 context is released.
42898 +          This check is done specifically, because is_dir_empty() requires
42899 +          tree traversal and has to be done before locks are taken.
42900 +        */
42901 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
42902 +               result = RETERR(-ENOTEMPTY);
42903 +       else
42904 +               result = can_rename(old_dir, old_inode, new_dir, new_inode);
42905 +       if (result != 0)
42906 +               goto exit;
42907 +
42908 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
42909 +                                                new_dir, new_name);
42910 +       if (result != 0)
42911 +               goto exit;
42912 +
42913 +       init_lh(&new_lh);
42914 +
42915 +       /* find entry for @new_name */
42916 +       result = reiser4_find_entry(new_dir, new_name, &new_lh,
42917 +                                   ZNODE_WRITE_LOCK, &new_entry);
42918 +
42919 +       if (IS_CBKERR(result)) {
42920 +               done_lh(&new_lh);
42921 +               goto exit;
42922 +       }
42923 +
42924 +       reiser4_seal_done(&new_fsdata->dec.entry_seal);
42925 +
42926 +       /* add or replace name for @old_inode as @new_name */
42927 +       if (new_inode != NULL) {
42928 +               /* target (@new_name) exists. */
42929 +               /* Not clear what to do with objects that are
42930 +                  both directories and files at the same time. */
42931 +               if (result == CBK_COORD_FOUND) {
42932 +                       result = replace_name(old_inode,
42933 +                                             new_dir,
42934 +                                             new_inode, new_coord, &new_lh);
42935 +                       if (result == 0)
42936 +                               fplug = inode_file_plugin(new_inode);
42937 +               } else if (result == CBK_COORD_NOTFOUND) {
42938 +                       /* VFS told us that @new_name is bound to existing
42939 +                          inode, but we failed to find directory entry. */
42940 +                       warning("nikita-2324", "Target not found");
42941 +                       result = RETERR(-ENOENT);
42942 +               }
42943 +       } else {
42944 +               /* target (@new_name) doesn't exist. */
42945 +               if (result == CBK_COORD_NOTFOUND)
42946 +                       result = add_name(old_inode,
42947 +                                         new_dir,
42948 +                                         new_name, new_coord, &new_lh, is_dir);
42949 +               else if (result == CBK_COORD_FOUND) {
42950 +                       /* VFS told us that @new_name is "negative" dentry,
42951 +                          but we found directory entry. */
42952 +                       warning("nikita-2331", "Target found unexpectedly");
42953 +                       result = RETERR(-EIO);
42954 +               }
42955 +       }
42956 +
42957 +       assert("nikita-3462", ergo(result == 0,
42958 +                                  old_inode->i_nlink >= 2 + !!is_dir));
42959 +
42960 +       /* We are done with all modifications to the @new_dir, release lock on
42961 +          node. */
42962 +       done_lh(&new_lh);
42963 +
42964 +       if (fplug != NULL) {
42965 +               /* detach @new_inode from name-space */
42966 +               result = fplug->detach(new_inode, new_dir);
42967 +               if (result != 0)
42968 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
42969 +                               (unsigned long long)get_inode_oid(new_inode),
42970 +                               result, possible_leak);
42971 +       }
42972 +
42973 +       if (new_inode != NULL)
42974 +               reiser4_update_sd(new_inode);
42975 +
42976 +       if (result == 0) {
42977 +               memset(&old_entry, 0, sizeof old_entry);
42978 +               old_entry.obj = old_inode;
42979 +
42980 +               dplug->build_entry_key(old_dir,
42981 +                                      &old_name->d_name, &old_entry.key);
42982 +
42983 +               /* At this stage new name was introduced for
42984 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
42985 +                  counters were updated.
42986 +
42987 +                  We want to remove @old_name now. If @old_inode wasn't
42988 +                  directory this is simple.
42989 +                */
42990 +               result = dplug->rem_entry(old_dir, old_name, &old_entry);
42991 +               /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
42992 +               if (result != 0 && result != -ENOMEM) {
42993 +                       warning("nikita-2335",
42994 +                               "Cannot remove old name: %i", result);
42995 +               } else {
42996 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
42997 +                       if (result != 0 && result != -ENOMEM) {
42998 +                               warning("nikita-2337",
42999 +                                       "Cannot drop link on old: %i", result);
43000 +                       }
43001 +               }
43002 +
43003 +               if (result == 0 && is_dir) {
43004 +                       /* @old_inode is directory. We also have to update
43005 +                          dotdot entry. */
43006 +                       coord_t *dotdot_coord;
43007 +                       lock_handle dotdot_lh;
43008 +                       struct dentry dotdot_name;
43009 +                       reiser4_dir_entry_desc dotdot_entry;
43010 +                       struct reiser4_dentry_fsdata dataonstack;
43011 +                       struct reiser4_dentry_fsdata *fsdata;
43012 +
43013 +                       memset(&dataonstack, 0, sizeof dataonstack);
43014 +                       memset(&dotdot_entry, 0, sizeof dotdot_entry);
43015 +                       dotdot_entry.obj = old_dir;
43016 +                       memset(&dotdot_name, 0, sizeof dotdot_name);
43017 +                       dotdot_name.d_name.name = "..";
43018 +                       dotdot_name.d_name.len = 2;
43019 +                       /*
43020 +                        * allocate ->d_fsdata on the stack to avoid using
43021 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
43022 +                        * because dentry is private to the current thread.
43023 +                        */
43024 +                       dotdot_name.d_fsdata = &dataonstack;
43025 +                       init_lh(&dotdot_lh);
43026 +
43027 +                       fsdata = &dataonstack;
43028 +                       dotdot_coord = &fsdata->dec.entry_coord;
43029 +                       coord_clear_iplug(dotdot_coord);
43030 +
43031 +                       result = reiser4_find_entry(old_inode,
43032 +                                                   &dotdot_name,
43033 +                                                   &dotdot_lh,
43034 +                                                   ZNODE_WRITE_LOCK,
43035 +                                                   &dotdot_entry);
43036 +                       if (result == 0) {
43037 +                               /* replace_name() decreases i_nlink on
43038 +                                * @old_dir */
43039 +                               result = replace_name(new_dir,
43040 +                                                     old_inode,
43041 +                                                     old_dir,
43042 +                                                     dotdot_coord, &dotdot_lh);
43043 +                       } else
43044 +                               result = RETERR(-EIO);
43045 +                       done_lh(&dotdot_lh);
43046 +               }
43047 +       }
43048 +       reiser4_update_dir(new_dir);
43049 +       reiser4_update_dir(old_dir);
43050 +       reiser4_update_sd(old_inode);
43051 +       if (result == 0) {
43052 +               file_plugin *fplug;
43053 +
43054 +               if (new_inode != NULL) {
43055 +                       /* add safe-link for target file (in case we removed
43056 +                        * last reference to the poor fellow) */
43057 +                       fplug = inode_file_plugin(new_inode);
43058 +                       if (new_inode->i_nlink == 0)
43059 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
43060 +               }
43061 +       }
43062 +exit:
43063 +       context_set_commit_async(ctx);
43064 +       reiser4_exit_context(ctx);
43065 +       return result;
43066 +}
43067 +#endif
43068 diff -puN /dev/null fs/reiser4/plugin/item/Makefile
43069 --- /dev/null
43070 +++ a/fs/reiser4/plugin/item/Makefile
43071 @@ -0,0 +1,18 @@
43072 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
43073 +
43074 +item_plugins-objs :=           \
43075 +       item.o                  \
43076 +       static_stat.o           \
43077 +       sde.o                   \
43078 +       cde.o                   \
43079 +       blackbox.o              \
43080 +       internal.o              \
43081 +       tail.o                  \
43082 +       ctail.o                 \
43083 +       extent.o                \
43084 +       extent_item_ops.o       \
43085 +       extent_file_ops.o       \
43086 +       extent_flush_ops.o
43087 +
43088 +
43089 +
43090 diff -puN /dev/null fs/reiser4/plugin/item/acl.h
43091 --- /dev/null
43092 +++ a/fs/reiser4/plugin/item/acl.h
43093 @@ -0,0 +1,66 @@
43094 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43095 +
43096 +/* Directory entry. */
43097 +
43098 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
43099 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
43100 +
43101 +#include "../../forward.h"
43102 +#include "../../dformat.h"
43103 +#include "../../kassign.h"
43104 +#include "../../key.h"
43105 +
43106 +#include <linux/fs.h>
43107 +#include <linux/dcache.h>      /* for struct dentry */
43108 +
43109 +typedef struct directory_entry_format {
43110 +       /* Key of the object's stat-data. It is not strictly necessary to
43111 +          store the whole key here: it is always a stat-data key, so the
43112 +          minor packing locality and offset could be omitted. That would
43113 +          rely on a particular key allocation scheme for stat-data, so,
43114 +          for extensibility's sake, the whole key can be stored here.
43115 +
43116 +          The key is stored as an array of bytes, because we don't want
43117 +          8-byte alignment of dir entries.
43118 +        */
43119 +       obj_key_id id;
43120 +       /* File name: a null-terminated string that follows the key id. */
43121 +       d8 name[0];
43122 +} directory_entry_format;
43123 +
43124 +void print_de(const char *prefix, coord_t * coord);
43125 +int extract_key_de(const coord_t * coord, reiser4_key * key);
43126 +int update_key_de(const coord_t * coord, const reiser4_key * key,
43127 +                 lock_handle * lh);
43128 +char *extract_name_de(const coord_t * coord, char *buf);
43129 +unsigned extract_file_type_de(const coord_t * coord);
43130 +int add_entry_de(struct inode *dir, coord_t * coord,
43131 +                lock_handle * lh, const struct dentry *name,
43132 +                reiser4_dir_entry_desc * entry);
43133 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
43134 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
43135 +int max_name_len_de(const struct inode *dir);
43136 +
43137 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
43138 +
43139 +char *extract_dent_name(const coord_t * coord,
43140 +                       directory_entry_format * dent, char *buf);
43141 +
43142 +#if REISER4_LARGE_KEY
43143 +#define DE_NAME_BUF_LEN (24)
43144 +#else
43145 +#define DE_NAME_BUF_LEN (16)
43146 +#endif
43147 +
43148 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
43149 +#endif
43150 +
43151 +/* Make Linus happy.
43152 +   Local variables:
43153 +   c-indentation-style: "K&R"
43154 +   mode-name: "LC"
43155 +   c-basic-offset: 8
43156 +   tab-width: 8
43157 +   fill-column: 120
43158 +   End:
43159 +*/
43160 diff -puN /dev/null fs/reiser4/plugin/item/blackbox.c
43161 --- /dev/null
43162 +++ a/fs/reiser4/plugin/item/blackbox.c
43163 @@ -0,0 +1,142 @@
43164 +/* Copyright 2003 by Hans Reiser, licensing governed by
43165 + * reiser4/README */
43166 +
43167 +/* Black box item implementation */
43168 +
43169 +#include "../../forward.h"
43170 +#include "../../debug.h"
43171 +#include "../../dformat.h"
43172 +#include "../../kassign.h"
43173 +#include "../../coord.h"
43174 +#include "../../tree.h"
43175 +#include "../../lock.h"
43176 +
43177 +#include "blackbox.h"
43178 +#include "item.h"
43179 +#include "../plugin.h"
43180 +
43181 +/* Store @length bytes of @data in @tree as a black box item under @key.
43182 +   Returns the result of insert_by_key(). */
43183 +int store_black_box(reiser4_tree * tree,
43184 +                   const reiser4_key * key, void *data, int length)
43185 +{
43186 +       lock_handle lock;
43187 +       coord_t where;
43188 +       reiser4_item_data item;
43189 +       int ret;
43190 +
43191 +       /* describe the new item: zeroed descriptor, then plugin and payload */
43192 +       memset(&item, 0, sizeof item);
43193 +       item.iplug = item_plugin_by_id(BLACK_BOX_ID);
43194 +       item.length = length;
43195 +       item.user = 0;
43196 +       item.data = data;
43197 +       init_lh(&lock);
43198 +       ret = insert_by_key(tree, key, &item, &where, &lock,
43199 +                           LEAF_LEVEL, CBK_UNIQUE);
43200 +
43201 +       /* on success the inserted item must have exactly the requested size */
43202 +       assert("nikita-3413",
43203 +              ergo(ret == 0,
43204 +                   WITH_COORD(&where,
43205 +                              item_length_by_coord(&where) == length)));
43206 +       done_lh(&lock);
43207 +       return ret;
43208 +}
43209 +/* Copy the black box item found by @key into @data (buffer of @length bytes); @key gets the found unit key. */
43210 +int
43211 +load_black_box(reiser4_tree * tree,
43212 +              reiser4_key * key, void *data, int length, int exact)
43213 +{
43214 +       int result;
43215 +       coord_t coord;
43216 +       lock_handle lh;
43217 +       /* @exact selects FIND_EXACT, otherwise FIND_MAX_NOT_MORE_THAN is used */
43218 +       init_lh(&lh);
43219 +       result = coord_by_key(tree, key,
43220 +                             &coord, &lh, ZNODE_READ_LOCK,
43221 +                             exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
43222 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
43223 +       /* NOTE: if the item exceeds @length and !@exact, nothing is copied and 0 is returned */
43224 +       if (result == 0) {
43225 +               int ilen;
43226 +
43227 +               result = zload(coord.node);
43228 +               if (result == 0) {
43229 +                       ilen = item_length_by_coord(&coord);
43230 +                       if (ilen <= length) {
43231 +                               memcpy(data, item_body_by_coord(&coord), ilen);
43232 +                               unit_key_by_coord(&coord, key);
43233 +                       } else if (exact) {
43234 +                               /*
43235 +                                * item is larger than buffer provided by the
43236 +                                * user. Only issue a warning if @exact is
43237 +                                * set. If @exact is false, we are iterating
43238 +                                * over all safe-links and here we are reaching
43239 +                                * the end of the iteration.
43240 +                                */
43241 +                               warning("nikita-3415",
43242 +                                       "Wrong black box length: %i > %i",
43243 +                                       ilen, length);
43244 +                               result = RETERR(-EIO);
43245 +                       }
43246 +                       zrelse(coord.node);
43247 +               }
43248 +       }
43249 +
43250 +       done_lh(&lh);
43251 +       return result;
43252 +
43253 +}
43254 +/* Overwrite the first @length bytes of the item found at @key with @data; -EIO if the item is shorter. */
43255 +int
43256 +update_black_box(reiser4_tree * tree,
43257 +                const reiser4_key * key, void *data, int length)
43258 +{
43259 +       int result;
43260 +       coord_t coord;
43261 +       lock_handle lh;
43262 +       /* FIND_EXACT lookup; a non-zero lookup result is returned unchanged */
43263 +       init_lh(&lh);
43264 +       result = coord_by_key(tree, key,
43265 +                             &coord, &lh, ZNODE_READ_LOCK,
43266 +                             FIND_EXACT,
43267 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
43268 +       if (result == 0) {
43269 +               int ilen;
43270 +               /* NOTE(review): the item body is written below under ZNODE_READ_LOCK -- confirm this is intended */
43271 +               result = zload(coord.node);
43272 +               if (result == 0) {
43273 +                       ilen = item_length_by_coord(&coord);
43274 +                       if (length <= ilen) {
43275 +                               memcpy(item_body_by_coord(&coord), data,
43276 +                                      length);
43277 +                       } else {
43278 +                               warning("nikita-3437",
43279 +                                       "Wrong black box length: %i < %i",
43280 +                                       ilen, length);
43281 +                               result = RETERR(-EIO);
43282 +                       }
43283 +                       zrelse(coord.node);
43284 +               }
43285 +       }
43286 +
43287 +       done_lh(&lh);
43288 +       return result;
43289 +
43290 +}
43291 +/* Cut the item stored under @key; @key is passed twice, presumably as both ends of the key range. */
43292 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
43293 +{
43294 +       return reiser4_cut_tree(tree, key, key, NULL, 1);
43295 +}
43296 +
43297 +/* Make Linus happy.
43298 +   Local variables:
43299 +   c-indentation-style: "K&R"
43300 +   mode-name: "LC"
43301 +   c-basic-offset: 8
43302 +   tab-width: 8
43303 +   fill-column: 120
43304 +   End:
43305 +*/
43306 diff -puN /dev/null fs/reiser4/plugin/item/blackbox.h
43307 --- /dev/null
43308 +++ a/fs/reiser4/plugin/item/blackbox.h
43309 @@ -0,0 +1,33 @@
43310 +/* Copyright 2003 by Hans Reiser, licensing governed by
43311 + * reiser4/README */
43312 +
43313 +/* "Black box" entry to contain fixed-width user supplied data */
43314 +
43315 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
43316 +#define __FS_REISER4_BLACK_BOX_H__
43317 +
43318 +#include "../../forward.h"
43319 +#include "../../dformat.h"
43320 +#include "../../kassign.h"
43321 +#include "../../key.h"
43322 +/* black box item operations; the definitions are in blackbox.c */
43323 +extern int store_black_box(reiser4_tree * tree,
43324 +                          const reiser4_key * key, void *data, int length);
43325 +extern int load_black_box(reiser4_tree * tree,
43326 +                         reiser4_key * key, void *data, int length, int exact);
43327 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
43328 +extern int update_black_box(reiser4_tree * tree,
43329 +                           const reiser4_key * key, void *data, int length);
43330 +
43331 +/* __FS_REISER4_BLACK_BOX_H__ */
43332 +#endif
43333 +
43334 +/* Make Linus happy.
43335 +   Local variables:
43336 +   c-indentation-style: "K&R"
43337 +   mode-name: "LC"
43338 +   c-basic-offset: 8
43339 +   tab-width: 8
43340 +   fill-column: 120
43341 +   End:
43342 +*/
43343 diff -puN /dev/null fs/reiser4/plugin/item/cde.c
43344 --- /dev/null
43345 +++ a/fs/reiser4/plugin/item/cde.c
43346 @@ -0,0 +1,1008 @@
43347 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43348 +
43349 +/* Directory entry implementation */
43350 +
43351 +/* DESCRIPTION:
43352 +
43353 +   This is "compound" directory item plugin implementation. This directory
43354 +   item type is compound (as opposed to the "simple directory item" in
43355 +   fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
43356 +   entries.
43357 +
43358 +   The reason behind this decision is disk space efficiency: all directory
43359 +   entries inside the same directory have identical fragment in their
43360 +   keys. This, of course, depends on key assignment policy. In our default key
43361 +   assignment policy, all directory entries have the same locality which is
43362 +   equal to the object id of their directory.
43363 +
43364 +   Composing directory item out of several directory entries for the same
43365 +   directory allows us to store said key fragment only once. That is, this is
43366 +   some ad hoc form of key compression (stem compression) that is implemented
43367 +   here, because general key compression is not supposed to be implemented in
43368 +   v4.0.
43369 +
43370 +   Another decision that was made regarding all directory item plugins, is
43371 +   that they will store entry keys unaligned. This is for the sake of disk
43372 +   space efficiency again.
43373 +
43374 +   It should be noted that storing keys unaligned increases CPU consumption,
43375 +   at least on some architectures.
43376 +
43377 +   Internal on-disk structure of the compound directory item is the following:
43378 +
43379 +        HEADER          cde_item_format.        Here number of entries is stored.
43380 +        ENTRY_HEADER_0  cde_unit_header.        Here part of entry key and
43381 +        ENTRY_HEADER_1                          offset of entry body are stored.
43382 +        ENTRY_HEADER_2                         (basically two last parts of key)
43383 +        ...
43384 +        ENTRY_HEADER_N
43385 +        ENTRY_BODY_0    directory_entry_format. Here part of stat data key and
43386 +        ENTRY_BODY_1                            NUL-terminated name are stored.
43387 +        ENTRY_BODY_2                           (part of stat data key in the
43388 +                                                sense that since all SDs have
43389 +                                                zero offset, this offset is not
43390 +                                                stored on disk).
43391 +        ...
43392 +        ENTRY_BODY_N
43393 +
43394 +   When it comes to the balancing, each directory entry in compound directory
43395 +   item is unit, that is, something that can be cut from one item and pasted
43396 +   into another item of the same type. Handling of unit cut and paste is major
43397 +   reason for the complexity of code below.
43398 +
43399 +*/
43400 +
43401 +#include "../../forward.h"
43402 +#include "../../debug.h"
43403 +#include "../../dformat.h"
43404 +#include "../../kassign.h"
43405 +#include "../../key.h"
43406 +#include "../../coord.h"
43407 +#include "sde.h"
43408 +#include "cde.h"
43409 +#include "item.h"
43410 +#include "../node/node.h"
43411 +#include "../plugin.h"
43412 +#include "../../znode.h"
43413 +#include "../../carry.h"
43414 +#include "../../tree.h"
43415 +#include "../../inode.h"
43416 +
43417 +#include <linux/fs.h>          /* for struct inode */
43418 +#include <linux/dcache.h>      /* for struct dentry */
43419 +#include <linux/quotaops.h>
43420 +/* CHECKME() currently expands to noop; the checking variant below is disabled by "#if 0" */
43421 +#if 0
43422 +#define CHECKME(coord)                                         \
43423 +({                                                             \
43424 +       const char *message;                                    \
43425 +       coord_t dup;                                            \
43426 +                                                               \
43427 +       coord_dup_nocheck(&dup, (coord));                       \
43428 +       dup.unit_pos = 0;                                       \
43429 +       assert("nikita-2871", cde_check(&dup, &message) == 0);  \
43430 +})
43431 +#else
43432 +#define CHECKME(coord) noop
43433 +#endif
43434 +
43435 +/* return body of compound directory item at @coord, typed as cde_item_format */
43436 +static inline cde_item_format *formatted_at(const coord_t * coord)
43437 +{
43438 +       assert("nikita-1282", coord != NULL);
43439 +       return item_body_by_coord(coord);
43440 +}
43441 +
43442 +/* return header of the @idx-th entry in item at @coord; @idx is not range-checked here */
43443 +static inline cde_unit_header *header_at(const coord_t *
43444 +                                        coord /* coord of item */ ,
43445 +                                        int idx /* index of unit */ )
43446 +{
43447 +       assert("nikita-1283", coord != NULL);
43448 +       return &formatted_at(coord)->entry[idx];
43449 +}
43450 +
43451 +/* return number of units in compound directory item at @coord (le16 num_of_entries field) */
43452 +static int units(const coord_t * coord /* coord of item */ )
43453 +{
43454 +       return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
43455 +}
43456 +
43457 +/* return offset of the body of @idx-th entry in @coord; @idx == units() gives the item length */
43458 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
43459 +                             int idx /* index of unit */ )
43460 +{
43461 +       if (idx < units(coord))
43462 +               return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
43463 +       else if (idx == units(coord))
43464 +               return item_length_by_coord(coord);
43465 +       else
43466 +               impossible("nikita-1308", "Wrong idx");
43467 +       return 0;       /* reached only after the impossible() branch */
43468 +}
43469 +
43470 +/* set offset of the body of @idx-th entry in @coord (stored as little-endian 16 bits) */
43471 +static void set_offset(const coord_t * coord /* coord of item */ ,
43472 +                      int idx /* index of unit */ ,
43473 +                      unsigned int offset /* new offset */ )
43474 +{
43475 +       put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
43476 +}
43477 +/* add @delta to the stored offset of the @idx-th entry body in @coord (16-bit arithmetic) */
43478 +static void adj_offset(const coord_t * coord /* coord of item */ ,
43479 +                      int idx /* index of unit */ ,
43480 +                      int delta /* offset change */ )
43481 +{
43482 +       d16 *doffset;
43483 +       __u16 offset;
43484 +
43485 +       doffset = &header_at(coord, idx)->offset;
43486 +       offset = le16_to_cpu(get_unaligned(doffset));
43487 +       offset += delta;
43488 +       put_unaligned(cpu_to_le16((__u16) offset), doffset);
43489 +}
43490 +
43491 +/* return pointer to @offset-th byte from the beginning of the item body at @coord */
43492 +static char *address(const coord_t * coord /* coord of item */ ,
43493 +                    int offset)
43494 +{
43495 +       return ((char *)item_body_by_coord(coord)) + offset;
43496 +}
43497 +
43498 +/* return pointer to the body of @idx-th entry in @coord, located via offset_of() */
43499 +static directory_entry_format *entry_at(const coord_t * coord  /* coord of
43500 +                                                                * item */ ,
43501 +                                       int idx /* index of unit */ )
43502 +{
43503 +       return (directory_entry_format *) address(coord,
43504 +                                                 (int)offset_of(coord, idx));
43505 +}
43506 +
43507 +/* return number of unit referenced by @coord, i.e. coord->unit_pos */
43508 +static int idx_of(const coord_t * coord /* coord of item */ )
43509 +{
43510 +       assert("nikita-1285", coord != NULL);
43511 +       return coord->unit_pos;
43512 +}
43513 +
43514 +/* return index of first entry in @coord not less than @entry_key, or -ENOENT; last comparison goes to @last */
43515 +static int find(const coord_t * coord /* coord of item */ ,
43516 +               const reiser4_key * entry_key /* key to look for */ ,
43517 +               cmp_t * last /* result of last comparison */ )
43518 +{
43519 +       int entries;
43520 +
43521 +       int left;
43522 +       int right;
43523 +
43524 +       cde_unit_header *header;
43525 +
43526 +       assert("nikita-1295", coord != NULL);
43527 +       assert("nikita-1296", entry_key != NULL);
43528 +       assert("nikita-1297", last != NULL);
43529 +       /* bisect while the range is at least REISER4_SEQ_SEARCH_BREAK wide, then scan linearly from @left */
43530 +       entries = units(coord);
43531 +       left = 0;
43532 +       right = entries - 1;
43533 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
43534 +               int median;
43535 +
43536 +               median = (left + right) >> 1;
43537 +
43538 +               header = header_at(coord, median);
43539 +               *last = de_id_key_cmp(&header->hash, entry_key);
43540 +               switch (*last) {
43541 +               case LESS_THAN:
43542 +                       left = median;
43543 +                       break;
43544 +               case GREATER_THAN:
43545 +                       right = median;
43546 +                       break;
43547 +               case EQUAL_TO:{ /* step back to the first of the equal entries; *last stays EQUAL_TO */
43548 +                               do {
43549 +                                       median--;
43550 +                                       header--;
43551 +                               } while (median >= 0 &&
43552 +                                        de_id_key_cmp(&header->hash,
43553 +                                                      entry_key) == EQUAL_TO);
43554 +                               return median + 1;
43555 +                       }
43556 +               }
43557 +       }
43558 +       header = header_at(coord, left);
43559 +       for (; left < entries; ++left, ++header) {
43560 +               prefetch(header + 1);
43561 +               *last = de_id_key_cmp(&header->hash, entry_key);
43562 +               if (*last != LESS_THAN)
43563 +                       break;
43564 +       }
43565 +       if (left < entries)
43566 +               return left;
43567 +       else
43568 +               return RETERR(-ENOENT);
43569 +
43570 +}
43571 +
43572 +/* expand @coord so as to accommodate insertion of @no new entries starting
43573 +   from @pos, with total bodies size @size. */
43574 +static int expand_item(const coord_t * coord /* coord of item */ ,
43575 +                      int pos /* unit position */ , int no     /* number of new
43576 +                                                                * units*/ ,
43577 +                      int size /* total size of new units' data */ ,
43578 +                      unsigned int data_size   /* free space already reserved
43579 +                                                * in the item for insertion */ )
43580 +{
43581 +       int entries;
43582 +       cde_unit_header *header;
43583 +       char *dent;
43584 +       int i;
43585 +       /* NOTE(review): expand() passes (item length - pasted length) as @size and the new body size as @data_size */
43586 +       assert("nikita-1310", coord != NULL);
43587 +       assert("nikita-1311", pos >= 0);
43588 +       assert("nikita-1312", no > 0);
43589 +       assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
43590 +       assert("nikita-1343",
43591 +              item_length_by_coord(coord) >=
43592 +              (int)(size + data_size + no * sizeof *header));
43593 +
43594 +       entries = units(coord);
43595 +
43596 +       if (pos == entries)
43597 +               dent = address(coord, size);
43598 +       else
43599 +               dent = (char *)entry_at(coord, pos);
43600 +       /* place where new header will be in */
43601 +       header = header_at(coord, pos);
43602 +       /* free space for new entry headers */
43603 +       memmove(header + no, header,
43604 +               (unsigned)(address(coord, size) - (char *)header));
43605 +       /* if adding to the end initialise first new header */
43606 +       if (pos == entries) {
43607 +               set_offset(coord, pos, (unsigned)size);
43608 +       }
43609 +
43610 +       /* adjust entry pointer and size */
43611 +       dent = dent + no * sizeof *header;
43612 +       size += no * sizeof *header;
43613 +       /* free space for new entries */
43614 +       memmove(dent + data_size, dent,
43615 +               (unsigned)(address(coord, size) - dent));
43616 +
43617 +       /* increase counter */
43618 +       entries += no;
43619 +       put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
43620 +
43621 +       /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
43622 +          bytes.  */
43623 +       for (i = 0; i <= pos; ++i)
43624 +               adj_offset(coord, i, no * sizeof *header);
43625 +       /* [ pos + no ... +\infty ) entries were shifted by ( no *
43626 +          sizeof *header + data_size ) bytes */
43627 +       for (i = pos + no; i < entries; ++i)
43628 +               adj_offset(coord, i, no * sizeof *header + data_size);
43629 +       return 0;       /* no failure path: always 0 */
43630 +}
43631 +
43632 +/* make room in item at @coord for new @entry; the chosen unit position is returned in *@pos; always returns 0 */
43633 +static int expand(const coord_t * coord /* coord of item */ ,
43634 +                 struct cde_entry * entry /* entry to insert */ ,
43635 +                 int len /* length of @entry data */ ,
43636 +                 int *pos /* position to insert */ ,
43637 +                 reiser4_dir_entry_desc * dir_entry    /* parameters for new
43638 +                                                        * entry */ )
43639 +{
43640 +       cmp_t cmp_res;
43641 +       int datasize;
43642 +       /* a negative find() result means no entry is >= the new key: append after the last unit */
43643 +       *pos = find(coord, &dir_entry->key, &cmp_res);
43644 +       if (*pos < 0)
43645 +               *pos = units(coord);
43646 +       /* body is the fixed part plus, for long names only, the name and one more byte */
43647 +       datasize = sizeof(directory_entry_format);
43648 +       if (is_longname(entry->name->name, entry->name->len))
43649 +               datasize += entry->name->len + 1;
43650 +       /* item_length - @len: presumably the size the item had before this paste; result of expand_item() is ignored */
43651 +       expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
43652 +                   datasize);
43653 +       return 0;
43654 +}
43655 +
43656 +/* paste body of @entry into item: fills header and body of the unit at @pos; always returns 0 */
43657 +static int paste_entry(const coord_t * coord /* coord of item */ ,
43658 +                      struct cde_entry * entry /* new entry */ ,
43659 +                      int pos /* position to insert */ ,
43660 +                      reiser4_dir_entry_desc * dir_entry       /* parameters for
43661 +                                                                * new entry */ )
43662 +{
43663 +       cde_unit_header *header;
43664 +       directory_entry_format *dent;
43665 +       const char *name;
43666 +       int len;
43667 +
43668 +       header = header_at(coord, pos);
43669 +       dent = entry_at(coord, pos);
43670 +
43671 +       build_de_id_by_key(&dir_entry->key, &header->hash);
43672 +       build_inode_key_id(entry->obj, &dent->id);
43673 +       /* Only long names (see is_longname()) are stored in the entry body.
43674 +          expand() reserves entry->name->len + 1 bytes for the name, so
43675 +          exactly len bytes are copied with memcpy() and the terminating
43676 +          NUL is stored explicitly below.
43677 +          An unbounded strcpy() was used here before; it could write past
43678 +          the reserved space whenever the name turned out to be longer
43679 +          than len, and it had to scan the name for its terminator. */
43680 +       name = entry->name->name;
43681 +       len = entry->name->len;
43682 +       if (is_longname(name, len)) {
43683 +               memcpy(dent->name, name, len);
43684 +               put_unaligned(0, &dent->name[len]);
43685 +       }
43686 +       return 0;
43687 +}
43688 +
43689 +/* estimate how much space is necessary in item to insert/paste set of entries
43690 +   described in @data. The estimate is returned and also stored in data->length. */
43691 +int estimate_cde(const coord_t * coord /* coord of item */ ,
43692 +                const reiser4_item_data * data /* parameters for new item */ )
43693 +{
43694 +       struct cde_entry_data *e;
43695 +       int result;
43696 +       int i;
43697 +
43698 +       e = (struct cde_entry_data *) data->data;
43699 +
43700 +       assert("nikita-1288", e != NULL);
43701 +       assert("nikita-1289", e->num_of_entries >= 0);
43702 +       /* a new item (@coord == NULL) additionally needs the item header */
43703 +       if (coord == NULL)
43704 +               /* insert */
43705 +               result = sizeof(cde_item_format);
43706 +       else
43707 +               /* paste */
43708 +               result = 0;
43709 +       /* fixed part per entry; long names add their length plus one byte in the loop below */
43710 +       result += e->num_of_entries *
43711 +           (sizeof(cde_unit_header) + sizeof(directory_entry_format));
43712 +       for (i = 0; i < e->num_of_entries; ++i) {
43713 +               const char *name;
43714 +               int len;
43715 +
43716 +               name = e->entry[i].name->name;
43717 +               len = e->entry[i].name->len;
43718 +               assert("nikita-2054", strlen(name) == len);
43719 +               if (is_longname(name, len))
43720 +                       result += len + 1;
43721 +       }
43722 +       ((reiser4_item_data *) data)->length = result;  /* NB: stores through the const-qualified @data */
43723 +       return result;
43724 +}
43725 +
43726 +/* ->nr_units() method for this item plugin: the entry count obtained from units() */
43727 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
43728 +{
43729 +       return units(coord);
43730 +}
43731 +
43732 +/* ->unit_key() method for this item plugin: fills @key for the unit at @coord and returns @key */
43733 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
43734 +                         reiser4_key * key /* resulting key */ )
43735 +{
43736 +       assert("nikita-1452", coord != NULL);
43737 +       assert("nikita-1345", idx_of(coord) < units(coord));
43738 +       assert("nikita-1346", key != NULL);
43739 +       /* take the item key, then rebuild @key from its directory id and the unit's hash */
43740 +       item_key_by_coord(coord, key);
43741 +       extract_key_from_de_id(extract_dir_id_from_key(key),
43742 +                              &header_at(coord, idx_of(coord))->hash, key);
43743 +       return key;
43744 +}
43745 +
43746 +/* mergeable_cde(): implementation of ->mergeable() item method.
43747 +
43748 +   Two directory items are mergeable iff they are from the same
43749 +   directory. That simple.
43750 +
43751 +*/
43752 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
43753 +                 const coord_t * p2 /* coord of second item */ )
43754 +{
43755 +       reiser4_key k1;
43756 +       reiser4_key k2;
43757 +
43758 +       assert("nikita-1339", p1 != NULL);
43759 +       assert("nikita-1340", p2 != NULL);
43760 +       /* same item plugin and equal directory ids in the two item keys */
43761 +       return
43762 +           (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
43763 +           (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
43764 +            extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
43765 +
43766 +}
43767 +
43768 +/* ->max_key_inside() method: item key with ordering, fulloid and offset taken from reiser4_max_key() */
43769 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
43770 +                               reiser4_key * result /* resulting key */ )
43771 +{
43772 +       assert("nikita-1342", coord != NULL);
43773 +
43774 +       item_key_by_coord(coord, result);
43775 +       set_key_ordering(result, get_key_ordering(reiser4_max_key()));
43776 +       set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
43777 +       set_key_offset(result, get_key_offset(reiser4_max_key()));
43778 +       return result;
43779 +}
43780 +
43781 +/* @data describes what is to be put into tree; true iff @coord's item has plugin @data->iplug and the directory id of @key */
43782 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
43783 +                       const reiser4_key * key /* key to check */ ,
43784 +                       const reiser4_item_data * data  /* parameters of new
43785 +                                                        * item/unit being
43786 +                                                        * created */ )
43787 +{
43788 +       reiser4_key item_key;
43789 +
43790 +       /* FIXME-VS: do not rely on anything but iplug field of @data. Only
43791 +          data->iplug is initialized */
43792 +       assert("vs-457", data && data->iplug);
43793 +/*     assert( "vs-553", data -> user == 0 );*/
43794 +       item_key_by_coord(coord, &item_key);
43795 +
43796 +       return (item_plugin_by_coord(coord) == data->iplug) &&
43797 +           (extract_dir_id_from_key(&item_key) ==
43798 +            extract_dir_id_from_key(key));
43799 +}
43800 +
43801 +#if REISER4_DEBUG
43802 +/* reiser4_check_cde(): ->check() method for compressed directory items
43803 +   returns 0 if all checks pass, -1 with *@error set to a message otherwise
43804 +   used for debugging, every item should have here the most complete
43805 +   possible check of the consistency of the item that the inventor can
43806 +   construct
43807 +*/
43808 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
43809 +                     const char **error /* where to store error message */)
43810 +{
43811 +       int i;
43812 +       int result;
43813 +       char *item_start;
43814 +       char *item_end;
43815 +       reiser4_key key;
43816 +
43817 +       coord_t c;
43818 +
43819 +       assert("nikita-1357", coord != NULL);
43820 +       assert("nikita-1358", error != NULL);
43821 +       /* presumably ergo(a, b) means "a implies b": an item at non-zero item_pos must have the dot key -- TODO confirm */
43822 +       if (!ergo(coord->item_pos != 0,
43823 +                 is_dot_key(item_key_by_coord(coord, &key)))) {
43824 +               *error = "CDE doesn't start with dot";
43825 +               return -1;
43826 +       }
43827 +       item_start = item_body_by_coord(coord);
43828 +       item_end = item_start + item_length_by_coord(coord);
43829 +       /* NOTE(review): @c is initialised by coord_dup() but not used afterwards in this function */
43830 +       coord_dup(&c, coord);
43831 +       result = 0;
43832 +       for (i = 0; i < units(coord); ++i) {
43833 +               directory_entry_format *entry;
43834 +               /* each header must end before the space needed for units() fixed-size entry bodies */
43835 +               if ((char *)(header_at(coord, i) + 1) >
43836 +                   item_end - units(coord) * sizeof *entry) {
43837 +                       *error = "CDE header is out of bounds";
43838 +                       result = -1;
43839 +                       break;
43840 +               }
43841 +               entry = entry_at(coord, i);
43842 +               if ((char *)entry < item_start + sizeof(cde_item_format)) {
43843 +                       *error = "CDE header is too low";
43844 +                       result = -1;
43845 +                       break;
43846 +               }
43847 +               if ((char *)(entry + 1) > item_end) {
43848 +                       *error = "CDE header is too high";
43849 +                       result = -1;
43850 +                       break;
43851 +               }
43852 +       }
43853 +
43854 +       return result;
43855 +}
43856 +#endif
43857 +
43858 +/* ->init() method for this item plugin: sets the entry count of the item at @coord to zero */
43859 +int init_cde(coord_t * coord /* coord of item */ ,
43860 +            coord_t * from UNUSED_ARG, reiser4_item_data * data        /* structure used for insertion */
43861 +            UNUSED_ARG)
43862 +{
43863 +       put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
43864 +       return 0;
43865 +}
43866 +
43867 +/* ->lookup() method for this item plugin: sets @coord's unit_pos and between for @key within the item */
43868 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
43869 +                        lookup_bias bias /* search bias */ ,
43870 +                        coord_t * coord /* coord of item to lookup in */ )
43871 +{
43872 +       cmp_t last_comp;
43873 +       int pos;
43874 +
43875 +       reiser4_key utmost_key;
43876 +
43877 +       assert("nikita-1293", coord != NULL);
43878 +       assert("nikita-1294", key != NULL);
43879 +
43880 +       CHECKME(coord);
43881 +       /* item key greater than @key: report "not found", positioned before the first unit */
43882 +       if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
43883 +               coord->unit_pos = 0;
43884 +               coord->between = BEFORE_UNIT;
43885 +               return CBK_COORD_NOTFOUND;
43886 +       }
43887 +       pos = find(coord, key, &last_comp);
43888 +       if (pos >= 0) {
43889 +               coord->unit_pos = (int)pos;
43890 +               switch (last_comp) {
43891 +               case EQUAL_TO:
43892 +                       coord->between = AT_UNIT;
43893 +                       return CBK_COORD_FOUND;
43894 +               case GREATER_THAN:
43895 +                       coord->between = BEFORE_UNIT;
43896 +                       return RETERR(-ENOENT);
43897 +               case LESS_THAN:
43898 +               default:
43899 +                       impossible("nikita-1298", "Broken find");
43900 +                       return RETERR(-EIO);
43901 +               }
43902 +       } else {        /* find() failed: no unit is >= @key, position after the last unit */
43903 +               coord->unit_pos = units(coord) - 1;
43904 +               coord->between = AFTER_UNIT;
43905 +               return (bias ==
43906 +                       FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
43907 +                   CBK_COORD_NOTFOUND;
43908 +       }
43909 +}
43910 +
43911 +/* ->paste() method for this item plugin: expand()s the item and paste_entry()s each entry described by @data */
43912 +int paste_cde(coord_t * coord /* coord of item */ ,
43913 +             reiser4_item_data * data  /* parameters of new unit being
43914 +                                        * inserted */ ,
43915 +             carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
43916 +{
43917 +       struct cde_entry_data *e;
43918 +       int result;
43919 +       int i;
43920 +
43921 +       CHECKME(coord);
43922 +       e = (struct cde_entry_data *) data->data;
43923 +       /* entries are pasted one by one; the first non-zero result ends the loop and is returned */
43924 +       result = 0;
43925 +       for (i = 0; i < e->num_of_entries; ++i) {
43926 +               int pos;
43927 +               int phantom_size;
43928 +               /* size of the pasted data, without the item header if the item has no units yet */
43929 +               phantom_size = data->length;
43930 +               if (units(coord) == 0)
43931 +                       phantom_size -= sizeof(cde_item_format);
43932 +
43933 +               result =
43934 +                   expand(coord, e->entry + i, phantom_size, &pos, data->arg);
43935 +               if (result != 0)
43936 +                       break;
43937 +               result = paste_entry(coord, e->entry + i, pos, data->arg);
43938 +               if (result != 0)
43939 +                       break;
43940 +       }
43941 +       CHECKME(coord);
43942 +       return result;
43943 +}
43944 +
43945 +/* amount of space occupied by entries [ 0 ... @idx ], both headers and
43946 +   bodies, plus the item header (cde_item_format). */
43947 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
43948 +                             int idx /* index of unit */ )
43949 +{
43950 +       assert("nikita-1299", coord != NULL);
43951 +       assert("nikita-1300", idx < (int)units(coord));
43952 +
43953 +       return sizeof(cde_item_format) +
43954 +           (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
43955 +                                                           idx + 1) -
43956 +           offset_of(coord, 0);
43957 +}
43958 +
43959 +/* returns how many units (at most @want) of the item at @coord can be
43960 +   shifted into @target node, which has @free_space bytes of free space.
43961 +   If pend == SHIFT_LEFT the first units of the item are tried, if
43962 +   pend == SHIFT_RIGHT - the last ones. When @target is NULL, size of
43963 +   cde_item_format is counted in as well. Total size of those units is
43964 +   returned via @size (0 if nothing can be shifted) */
43965 +int can_shift_cde(unsigned free_space /* free space in item */ ,
43966 +                 coord_t * coord /* coord of source item */ ,
43967 +                 znode * target /* target node */ ,
43968 +                 shift_direction pend /* shift direction */ ,
43969 +                 unsigned *size /* resulting number of shifted bytes */ ,
43970 +                 unsigned want /* maximal number of units to shift */ )
43971 +{
43972 +       int shift;
43973 +
43974 +       CHECKME(coord);
43975 +       if (want == 0) {
43976 +               *size = 0;
43977 +               return 0;
43978 +       }
43979 +
43980 +       /* left shift: start index is clamped so part_size() gets idx < units */
43981 +       if (pend == SHIFT_LEFT) {
43982 +               for (shift = min((int)want - 1, units(coord) - 1); shift >= 0;
43983 +                    --shift) {
43984 +                       *size = part_size(coord, shift);
43985 +                       if (target != NULL)
43986 +                               *size -= sizeof(cde_item_format);
43987 +                       if (*size <= free_space)
43988 +                               break;
43989 +               }
43990 +               shift = shift + 1;
43991 +       } else {
43992 +               int total_size;
43993 +
43994 +               assert("nikita-1301", pend == SHIFT_RIGHT);
43995 +
43996 +               total_size = item_length_by_coord(coord);
43997 +               for (shift = units(coord) - want - 1; shift < units(coord) - 1;
43998 +                    ++shift) {
43999 +                       *size = total_size - part_size(coord, shift);
44000 +                       if (target == NULL)
44001 +                               *size += sizeof(cde_item_format);
44002 +                       if (*size <= free_space)
44003 +                               break;
44004 +               }
44005 +               shift = units(coord) - shift - 1;
44006 +       }
44007 +       if (shift == 0)
44008 +               *size = 0;
44009 +       CHECKME(coord);
44010 +       return shift;
44011 +}
44012 +
44013 +/* ->copy_units() method for this item plugin. */
44014 +void copy_units_cde(coord_t * target /* coord of target item */ ,
44015 +                   coord_t * source /* coord of source item */ ,
44016 +                   unsigned from /* starting unit */ ,
44017 +                   unsigned count /* how many units to copy */ ,
44018 +                   shift_direction where_is_free_space /* shift direction */ ,
44019 +                   unsigned free_space /* free space in item */ )
44020 +{
44021 +       char *header_from;
44022 +       char *header_to;
44023 +
44024 +       char *entry_from;
44025 +       char *entry_to;
44026 +
44027 +       int pos_in_target;
44028 +       int data_size;
44029 +       int data_delta;
44030 +       int i;
44031 +
44032 +       assert("nikita-1303", target != NULL);
44033 +       assert("nikita-1304", source != NULL);
44034 +       assert("nikita-1305", (int)from < units(source));
44035 +       assert("nikita-1307", (int)(from + count) <= units(source));
44036 +
44037 +       if (where_is_free_space == SHIFT_LEFT) {
44038 +               assert("nikita-1453", from == 0);
44039 +               pos_in_target = units(target);
44040 +       } else {
44041 +               assert("nikita-1309", (int)(from + count) == units(source));
44042 +               pos_in_target = 0;
44043 +               memmove(item_body_by_coord(target),
44044 +                       (char *)item_body_by_coord(target) + free_space,
44045 +                       item_length_by_coord(target) - free_space);
44046 +       }
44047 +
44048 +       CHECKME(target);
44049 +       CHECKME(source);
44050 +
44051 +       /* expand @target */
44052 +       data_size =
44053 +           offset_of(source, (int)(from + count)) - offset_of(source,
44054 +                                                              (int)from);
44055 +
44056 +       if (units(target) == 0)
44057 +               free_space -= sizeof(cde_item_format);
44058 +
44059 +       expand_item(target, pos_in_target, (int)count,
44060 +                   (int)(item_length_by_coord(target) - free_space),
44061 +                   (unsigned)data_size);
44062 +
44063 +       /* copy @count units of @source, starting at @from, into @target */
44064 +       data_delta =
44065 +           offset_of(target, pos_in_target) - offset_of(source, (int)from);
44066 +
44067 +       /* copy entries */
44068 +       entry_from = (char *)entry_at(source, (int)from);
44069 +       entry_to = (char *)entry_at(source, (int)(from + count));
44070 +       memmove(entry_at(target, pos_in_target), entry_from,
44071 +               (unsigned)(entry_to - entry_from));
44072 +
44073 +       /* copy headers */
44074 +       header_from = (char *)header_at(source, (int)from);
44075 +       header_to = (char *)header_at(source, (int)(from + count));
44076 +       memmove(header_at(target, pos_in_target), header_from,
44077 +               (unsigned)(header_to - header_from));
44078 +
44079 +       /* update offsets */
44080 +       for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
44081 +               adj_offset(target, i, data_delta);
44082 +       CHECKME(target);
44083 +       CHECKME(source);
44084 +}
44085 +
44086 +/* ->cut_units() method for this item plugin. */
44087 +int cut_units_cde(coord_t * coord /* coord of item */ ,
44088 +                 pos_in_node_t from /* start unit pos */ ,
44089 +                 pos_in_node_t to /* stop unit pos */ ,
44090 +                 struct carry_cut_data *cdata UNUSED_ARG,
44091 +                 reiser4_key * smallest_removed, reiser4_key * new_first)
44092 +{
44093 +       char *header_from;
44094 +       char *header_to;
44095 +
44096 +       char *entry_from;
44097 +       char *entry_to;
44098 +
44099 +       int size;
44100 +       int entry_delta;
44101 +       int header_delta;
44102 +       int i;
44103 +
44104 +       unsigned count;
44105 +
44106 +       CHECKME(coord);
44107 +
44108 +       count = to - from + 1;
44109 +
44110 +       assert("nikita-1454", coord != NULL);
44111 +       assert("nikita-1455", (int)(from + count) <= units(coord));
44112 +
44113 +       if (smallest_removed)
44114 +               unit_key_by_coord(coord, smallest_removed);
44115 +
44116 +       if (new_first) {
44117 +               coord_t next;
44118 +
44119 +               /* not everything is cut from item head */
44120 +               assert("vs-1527", from == 0);
44121 +               assert("vs-1528", to < units(coord) - 1);
44122 +
44123 +               coord_dup(&next, coord);
44124 +               next.unit_pos++;
44125 +               unit_key_by_coord(&next, new_first);
44126 +       }
44127 +
44128 +       size = item_length_by_coord(coord);
44129 +       if (count == (unsigned)units(coord)) {
44130 +               return size;
44131 +       }
44132 +
44133 +       header_from = (char *)header_at(coord, (int)from);
44134 +       header_to = (char *)header_at(coord, (int)(from + count));
44135 +
44136 +       entry_from = (char *)entry_at(coord, (int)from);
44137 +       entry_to = (char *)entry_at(coord, (int)(from + count));
44138 +
44139 +       /* move headers */
44140 +       memmove(header_from, header_to,
44141 +               (unsigned)(address(coord, size) - header_to));
44142 +
44143 +       header_delta = header_to - header_from;
44144 +
44145 +       entry_from -= header_delta;
44146 +       entry_to -= header_delta;
44147 +       size -= header_delta;
44148 +
44149 +       /* move entries */
44150 +       memmove(entry_from, entry_to,
44151 +               (unsigned)(address(coord, size) - entry_to));
44152 +
44153 +       entry_delta = entry_to - entry_from;
44154 +       size -= entry_delta;
44155 +
44156 +       /* update offsets */
44157 +
44158 +       for (i = 0; i < (int)from; ++i)
44159 +               adj_offset(coord, i, -header_delta);
44160 +
44161 +       for (i = from; i < units(coord) - (int)count; ++i)
44162 +               adj_offset(coord, i, -header_delta - entry_delta);
44163 +
44164 +       put_unaligned(cpu_to_le16((__u16) units(coord) - count),
44165 +                     &formatted_at(coord)->num_of_entries);
44166 +
44167 +       if (from == 0) {
44168 +               /* entries were removed from head - move the rest to the right */
44169 +               memmove((char *)item_body_by_coord(coord) +
44170 +                       header_delta + entry_delta, item_body_by_coord(coord),
44171 +                       (unsigned)size);
44172 +               if (REISER4_DEBUG)
44173 +                       memset(item_body_by_coord(coord), 0,
44174 +                              (unsigned)header_delta + entry_delta);
44175 +       } else {
44176 +               /* freed space is already at the end of item */
44177 +               if (REISER4_DEBUG)
44178 +                       memset((char *)item_body_by_coord(coord) + size, 0,
44179 +                              (unsigned)header_delta + entry_delta);
44180 +       }
44181 +
44182 +       return header_delta + entry_delta;
44183 +}
44184 +
44185 +int kill_units_cde(coord_t * coord /* coord of item */ ,
44186 +                  pos_in_node_t from /* start unit pos */ ,
44187 +                  pos_in_node_t to /* stop unit pos */ ,
44188 +                  struct carry_kill_data *kdata UNUSED_ARG,
44189 +                  reiser4_key * smallest_removed, reiser4_key * new_first)
44190 +{
44191 +       return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
44192 +}
44193 +
44194 +/* ->s.dir.extract_key() method for this item plugin. */
44195 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
44196 +                   reiser4_key * key /* resulting key */ )
44197 +{
44198 +       directory_entry_format *dent;
44199 +
44200 +       assert("nikita-1155", coord != NULL);
44201 +       assert("nikita-1156", key != NULL);
44202 +
44203 +       dent = entry_at(coord, idx_of(coord));
44204 +       return extract_key_from_id(&dent->id, key);
44205 +}
44206 +
44207 +int
44208 +update_key_cde(const coord_t * coord, const reiser4_key * key,
44209 +              lock_handle * lh UNUSED_ARG)
44210 +{
44211 +       directory_entry_format *dent;
44212 +       obj_key_id obj_id;
44213 +       int result;
44214 +
44215 +       assert("nikita-2344", coord != NULL);
44216 +       assert("nikita-2345", key != NULL);
44217 +       /* replace object id of the current entry with the one built from @key */
44218 +       dent = entry_at(coord, idx_of(coord));
44219 +       result = build_obj_key_id(key, &obj_id);
44220 +       if (result == 0) {
44221 +               dent->id = obj_id;
44222 +               znode_make_dirty(coord->node);
44223 +       }
44224 +       return result;  /* entry is left untouched if the id was not built */
44225 +}
44226 +
44227 +/* ->s.dir.extract_name() method for this item plugin. */
44228 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
44229 +{
44230 +       directory_entry_format *dent;
44231 +
44232 +       assert("nikita-1157", coord != NULL);
44233 +
44234 +       dent = entry_at(coord, idx_of(coord));
44235 +       return extract_dent_name(coord, dent, buf);
44236 +}
44237 +
44238 +static int cde_bytes(int pasting, const reiser4_item_data * data)
44239 +{
44240 +       int result;
44241 +
44242 +       result = data->length;
44243 +       if (!pasting)
44244 +               result -= sizeof(cde_item_format);
44245 +       return result;
44246 +}
44247 +
44248 +/* ->s.dir.add_entry() method for this item plugin */
44249 +int add_entry_cde(struct inode *dir /* directory object */ ,
44250 +                 coord_t * coord /* coord of item */ ,
44251 +                 lock_handle * lh /* lock handle for insertion */ ,
44252 +                 const struct dentry *name /* name to insert */ ,
44253 +                 reiser4_dir_entry_desc * dir_entry    /* parameters of new
44254 +                                                        * directory entry */ )
44255 +{
44256 +       reiser4_item_data data;
44257 +       struct cde_entry entry;
44258 +       struct cde_entry_data edata;
44259 +       int result;
44260 +
44261 +       assert("nikita-1656", coord->node == lh->node);
44262 +       assert("nikita-1657", znode_is_write_locked(coord->node));
44263 +
44264 +       edata.num_of_entries = 1;
44265 +       edata.entry = &entry;
44266 +
44267 +       entry.dir = dir;
44268 +       entry.obj = dir_entry->obj;
44269 +       entry.name = &name->d_name;
44270 +
44271 +       data.data = (char *)&edata;
44272 +       data.user = 0;          /* &edata is not user space */
44273 +       data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
44274 +       data.arg = dir_entry;
44275 +       assert("nikita-1302", data.iplug != NULL);
44276 +
44277 +       result = is_dot_key(&dir_entry->key);
44278 +       data.length = estimate_cde(result ? coord : NULL, &data);
44279 +
44280 +       /* NOTE-NIKITA quota plugin? NOTE(review): not freed if insert fails? */
44281 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
44282 +               return RETERR(-EDQUOT);
44283 +
44284 +       if (result)
44285 +               result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
44286 +       else
44287 +               result = reiser4_resize_item(coord, &data, &dir_entry->key,
44288 +                                            lh, 0);
44289 +       return result;
44290 +}
44291 +
44292 +/* ->s.dir.rem_entry() method for this item plugin */
44293 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
44294 +                 const struct qstr *name, coord_t * coord /* coord of item */ ,
44295 +                 lock_handle * lh UNUSED_ARG   /* lock handle for
44296 +                                                * removal */ ,
44297 +                 reiser4_dir_entry_desc * entry UNUSED_ARG     /* parameters of
44298 +                                                                * directory entry
44299 +                                                                * being removed */ )
44300 +{
44301 +       coord_t shadow;
44302 +       int result;
44303 +       int length;
44304 +       ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
44305 +
44306 +       assert("nikita-2870", strlen(name->name) == name->len);
44307 +       assert("nikita-2869",
44308 +              !strcmp(name->name, extract_name_cde(coord, buf)));
44309 +
44310 +       length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
44311 +       if (is_longname(name->name, name->len))
44312 +               length += name->len + 1;
44313 +
44314 +       if (inode_get_bytes(dir) < length) {
44315 +               warning("nikita-2628", "Dir is broke: %llu: %llu",
44316 +                       (unsigned long long)get_inode_oid(dir),
44317 +                       (unsigned long long)inode_get_bytes(dir));
44318 +
44319 +               return RETERR(-EIO);
44320 +       }
44321 +
44322 +       /* cut_node() is supposed to take pointers to _different_
44323 +          coords, because it will modify them without respect to
44324 +          possible aliasing. To work around this, create temporary copy
44325 +          of @coord.
44326 +        */
44327 +       coord_dup(&shadow, coord);
44328 +       result =
44329 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
44330 +       if (result == 0) {
44331 +               /* NOTE-NIKITA quota plugin? */
44332 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
44333 +       }
44334 +       return result;
44335 +}
44336 +
44337 +/* ->s.dir.max_name_len() method for this item plugin */
44338 +int max_name_len_cde(const struct inode *dir /* directory */ )
44339 +{
44340 +       return
44341 +               reiser4_tree_by_inode(dir)->nplug->max_item_size() -
44342 +               sizeof(directory_entry_format) - sizeof(cde_item_format) -
44343 +               sizeof(cde_unit_header) - 2;
44344 +}
44345 +
44346 +/* Make Linus happy.
44347 +   Local variables:
44348 +   c-indentation-style: "K&R"
44349 +   mode-name: "LC"
44350 +   c-basic-offset: 8
44351 +   tab-width: 8
44352 +   fill-column: 120
44353 +   End:
44354 +*/
44355 diff -puN /dev/null fs/reiser4/plugin/item/cde.h
44356 --- /dev/null
44357 +++ a/fs/reiser4/plugin/item/cde.h
44358 @@ -0,0 +1,87 @@
44359 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44360 +
44361 +/* Compound directory item. See cde.c for description. */
44362 +
44363 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
44364 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
44365 +
44366 +#include "../../forward.h"
44367 +#include "../../kassign.h"
44368 +#include "../../dformat.h"
44369 +
44370 +#include <linux/fs.h>          /* for struct inode */
44371 +#include <linux/dcache.h>      /* for struct dentry, etc  */
44372 +
44373 +typedef struct cde_unit_header {
44374 +       de_id hash;
44375 +       d16 offset;
44376 +} cde_unit_header;
44377 +
44378 +typedef struct cde_item_format {
44379 +       d16 num_of_entries;
44380 +       cde_unit_header entry[0];
44381 +} cde_item_format;
44382 +
44383 +struct cde_entry {
44384 +       const struct inode *dir;
44385 +       const struct inode *obj;
44386 +       const struct qstr *name;
44387 +};
44388 +
44389 +struct cde_entry_data {
44390 +       int num_of_entries;
44391 +       struct cde_entry *entry;
44392 +};
44393 +
44394 +/* plugin->item.b.* */
44395 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
44396 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
44397 +                       const reiser4_item_data *);
44398 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
44399 +pos_in_node_t nr_units_cde(const coord_t * coord);
44400 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
44401 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
44402 +void print_cde(const char *prefix, coord_t * coord);
44403 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
44404 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
44405 +                        coord_t * coord);
44406 +int paste_cde(coord_t * coord, reiser4_item_data * data,
44407 +             carry_plugin_info * info UNUSED_ARG);
44408 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
44409 +                 shift_direction pend, unsigned *size, unsigned want);
44410 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
44411 +                   unsigned count, shift_direction where_is_free_space,
44412 +                   unsigned free_space);
44413 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44414 +                 struct carry_cut_data *, reiser4_key * smallest_removed,
44415 +                 reiser4_key * new_first);
44416 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44417 +                  struct carry_kill_data *, reiser4_key * smallest_removed,
44418 +                  reiser4_key * new_first);
44419 +void print_cde(const char *prefix, coord_t * coord);
44420 +int reiser4_check_cde(const coord_t * coord, const char **error);
44421 +
44422 +/* plugin->u.item.s.dir.* */
44423 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
44424 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
44425 +                  lock_handle * lh);
44426 +char *extract_name_cde(const coord_t * coord, char *buf);
44427 +int add_entry_cde(struct inode *dir, coord_t * coord,
44428 +                 lock_handle * lh, const struct dentry *name,
44429 +                 reiser4_dir_entry_desc * entry);
44430 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
44431 +                 lock_handle * lh, reiser4_dir_entry_desc * entry);
44432 +int max_name_len_cde(const struct inode *dir);
44433 +
44434 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
44435 +#endif
44436 +
44437 +/* Make Linus happy.
44438 +   Local variables:
44439 +   c-indentation-style: "K&R"
44440 +   mode-name: "LC"
44441 +   c-basic-offset: 8
44442 +   tab-width: 8
44443 +   fill-column: 120
44444 +   End:
44445 +*/
44446 diff -puN /dev/null fs/reiser4/plugin/item/ctail.c
44447 --- /dev/null
44448 +++ a/fs/reiser4/plugin/item/ctail.c
44449 @@ -0,0 +1,1613 @@
44450 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44451 +
44452 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
44453 +
44454 +/* DESCRIPTION:
44455 +
44456 +Each cryptcompress object is stored on disk as a set of clusters sliced
44457 +into ctails.
44458 +
44459 +Internal on-disk structure:
44460 +
44461 +        HEADER   (1)  Disk cluster shift is stored here
44462 +       BODY
44463 +*/
44464 +
44465 +#include "../../forward.h"
44466 +#include "../../debug.h"
44467 +#include "../../dformat.h"
44468 +#include "../../kassign.h"
44469 +#include "../../key.h"
44470 +#include "../../coord.h"
44471 +#include "item.h"
44472 +#include "../node/node.h"
44473 +#include "../plugin.h"
44474 +#include "../object.h"
44475 +#include "../../znode.h"
44476 +#include "../../carry.h"
44477 +#include "../../tree.h"
44478 +#include "../../inode.h"
44479 +#include "../../super.h"
44480 +#include "../../context.h"
44481 +#include "../../page_cache.h"
44482 +#include "../cluster.h"
44483 +#include "../../flush.h"
44484 +#include "../../tree_walk.h"
44485 +
44486 +#include <linux/pagevec.h>
44487 +#include <linux/swap.h>
44488 +#include <linux/fs.h>
44489 +
44490 +/* return body of ctail item at @coord */
44491 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
44492 +{
44493 +       assert("edward-60", coord != NULL);
44494 +       return item_body_by_coord(coord);
44495 +}
44496 +
44497 +static int cluster_shift_by_coord(const coord_t * coord)
44498 +{
44499 +       return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
44500 +}
44501 +
44502 +static inline void dclust_set_extension_shift(hint_t * hint)
44503 +{
44504 +       assert("edward-1270",
44505 +              item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
44506 +       hint->ext_coord.extension.ctail.shift =
44507 +           cluster_shift_by_coord(&hint->ext_coord.coord);
44508 +}
44509 +
44510 +static loff_t off_by_coord(const coord_t * coord)
44511 +{
44512 +       reiser4_key key;
44513 +       return get_key_offset(item_key_by_coord(coord, &key));
44514 +}
44515 +
44516 +int coord_is_unprepped_ctail(const coord_t * coord)
44517 +{
44518 +       assert("edward-1233", coord != NULL);
44519 +       assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
44520 +       assert("edward-1235",
44521 +              ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
44522 +                   nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
44523 +
44524 +       return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
44525 +}
44526 +
44527 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
44528 +{
44529 +       int shift;
44530 +
44531 +       if (inode != NULL) {
44532 +               shift = inode_cluster_shift(inode);
44533 +               assert("edward-1236",
44534 +                      ergo(!coord_is_unprepped_ctail(coord),
44535 +                           shift == cluster_shift_by_coord(coord)));
44536 +       } else {
44537 +               assert("edward-1237", !coord_is_unprepped_ctail(coord));
44538 +               shift = cluster_shift_by_coord(coord);
44539 +       }
44540 +       return off_by_coord(coord) >> shift;
44541 +}
44542 +
44543 +static int disk_cluster_size(const coord_t * coord)
44544 +{
44545 +       assert("edward-1156",
44546 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44547 +       /* calculation of disk cluster size
44548 +          is meaningless if ctail is unprepped */
44549 +       assert("edward-1238", !coord_is_unprepped_ctail(coord));
44550 +
44551 +       return 1 << cluster_shift_by_coord(coord);
44552 +}
44553 +
44554 +/* true if ctail is unprepped or offset of @key is disk cluster aligned */
44555 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
44556 +{
44557 +       assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
44558 +
44559 +       return coord_is_unprepped_ctail(coord) ||
44560 +           ((get_key_offset(key) &
44561 +             ((loff_t) disk_cluster_size(coord) - 1)) == 0);
44562 +}
44563 +
44564 +static char *first_unit(coord_t * coord)
44565 +{
44566 +       /* units follow the item header; (char *) keeps the arithmetic byte-wise */
44567 +       return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
44568 +}
44569 +
44570 +/* plugin->u.item.b.max_key_inside :
44571 +   tail_max_key_inside */
44572 +
44573 +/* plugin->u.item.b.can_contain_key */
44574 +int
44575 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
44576 +                     const reiser4_item_data * data)
44577 +{
44578 +       reiser4_key item_key;
44579 +
44580 +       if (item_plugin_by_coord(coord) != data->iplug)
44581 +               return 0;
44582 +
44583 +       item_key_by_coord(coord, &item_key);
44584 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
44585 +           get_key_objectid(key) != get_key_objectid(&item_key))
44586 +               return 0;
44587 +       if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
44588 +           get_key_offset(key))
44589 +               return 0;
44590 +       if (is_disk_cluster_key(key, coord))
44591 +               return 0;
44592 +       return 1;
44593 +}
44594 +
44595 +/* plugin->u.item.b.mergeable */
44596 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
44597 +{
44598 +       reiser4_key key1, key2;
44599 +
44600 +       assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
44601 +       assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
44602 +                                           UNIX_FILE_METADATA_ITEM_TYPE));
44603 +
44604 +       if (item_id_by_coord(p2) != CTAIL_ID) {
44605 +               /* second item is of another type */
44606 +               return 0;
44607 +       }
44608 +
44609 +       item_key_by_coord(p1, &key1);
44610 +       item_key_by_coord(p2, &key2);
44611 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
44612 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
44613 +           get_key_type(&key1) != get_key_type(&key2)) {
44614 +               /* items of different objects */
44615 +               return 0;
44616 +       }
44617 +       if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
44618 +               /*  not adjacent items */
44619 +               return 0;
44620 +       if (is_disk_cluster_key(&key2, p2))
44621 +               return 0;
44622 +       return 1;
44623 +}
44624 +
44625 +/* plugin->u.item.b.nr_units: item length minus the cluster_shift field */
44626 +pos_in_node_t nr_units_ctail(const coord_t * coord)
44627 +{
44628 +       return (item_length_by_coord(coord) -
44629 +               sizeof(ctail_formatted_at(coord)->cluster_shift));
44630 +}
44631 +
44632 +/* plugin->u.item.b.estimate:
44633 +   estimate how much space is needed to insert/paste @data->length bytes
44634 +   into ctail at @coord */
44635 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
44636 +                  const reiser4_item_data *
44637 +                  data /* parameters for new item */ )
44638 +{
44639 +       if (coord == NULL)
44640 +               /* insert */
44641 +               return (sizeof(ctail_item_format) + data->length);
44642 +       else
44643 +               /* paste */
44644 +               return data->length;
44645 +}
44646 +
44647 +/* ->init() method for this item plugin. */
44648 +int init_ctail(coord_t * to /* coord of item */ ,
44649 +              coord_t * from /* old_item */ ,
44650 +              reiser4_item_data * data /* structure used for insertion */ )
44651 +{
44652 +       int cluster_shift;      /* cpu value to convert */
44653 +
44654 +       if (data) {
44655 +               assert("edward-463", data->length > sizeof(ctail_item_format));
44656 +               cluster_shift = *((int *)(data->arg));
44657 +               data->length -= sizeof(ctail_item_format);
44658 +       } else {
44659 +               assert("edward-464", from != NULL);
44660 +               assert("edward-855", ctail_ok(from));
44661 +               cluster_shift = (int)(cluster_shift_by_coord(from));
44662 +       }
44663 +       put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
44664 +       assert("edward-856", ctail_ok(to));
44665 +       return 0;
44666 +}
44667 +
44668 +/* plugin->u.item.b.lookup:
44669 +   NULL: We are looking for item keys only */
44670 +
44671 +#if REISER4_DEBUG
44672 +int ctail_ok(const coord_t * coord)
44673 +{
44674 +       return coord_is_unprepped_ctail(coord) ||
44675 +           cluster_shift_ok(cluster_shift_by_coord(coord));
44676 +}
44677 +
44678 +/* plugin->u.item.b.check */
44679 +int check_ctail(const coord_t * coord, const char **error)
44680 +{
44681 +       if (!ctail_ok(coord)) {
44682 +               if (error)
44683 +                       *error = "bad cluster shift in ctail";
44684 +               return 1;
44685 +       }
44686 +       return 0;
44687 +}
44688 +#endif
44689 +
44690 +/* plugin->u.item.b.paste */
44691 +int
44692 +paste_ctail(coord_t * coord, reiser4_item_data * data,
44693 +           carry_plugin_info * info UNUSED_ARG)
44694 +{
44695 +       unsigned old_nr_units;
44696 +
44697 +       assert("edward-268", data->data != NULL);
44698 +       /* copy only from kernel space */
44699 +       assert("edward-66", data->user == 0);
44700 +
44701 +       old_nr_units =
44702 +           item_length_by_coord(coord) - sizeof(ctail_item_format) -
44703 +           data->length;
44704 +
44705 +       /* ctail items never get pasted in the middle */
44706 +
44707 +       if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
44708 +
44709 +               /* paste at the beginning when create new item */
44710 +               assert("edward-450",
44711 +                      item_length_by_coord(coord) ==
44712 +                      data->length + sizeof(ctail_item_format));
44713 +               assert("edward-451", old_nr_units == 0);
44714 +       } else if (coord->unit_pos == old_nr_units - 1
44715 +                  && coord->between == AFTER_UNIT) {
44716 +
44717 +               /* paste at the end */
44718 +               coord->unit_pos++;
44719 +       } else
44720 +               impossible("edward-453", "bad paste position");
44721 +
44722 +       memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
44723 +
44724 +       assert("edward-857", ctail_ok(coord));
44725 +
44726 +       return 0;
44727 +}
44728 +
44729 +/* plugin->u.item.b.fast_paste */
44730 +
44731 +/* plugin->u.item.b.can_shift
44732 +   number of units is returned via return value, number of bytes via @size. For
44733 +   ctail items they coincide */
44734 +int
44735 +can_shift_ctail(unsigned free_space, coord_t * source,
44736 +               znode * target, shift_direction direction UNUSED_ARG,
44737 +               unsigned *size /* number of bytes */ , unsigned want)
44738 +{
44739 +       /* make sure that we do not want to shift more than we have */
44740 +       assert("edward-68", want > 0 && want <= nr_units_ctail(source));
44741 +
44742 +       *size = min(want, free_space);
44743 +
44744 +       if (!target) {
44745 +               /* new item will be created */
44746 +               if (*size <= sizeof(ctail_item_format)) {
44747 +                       *size = 0;
44748 +                       return 0;
44749 +               }
44750 +               return *size - sizeof(ctail_item_format);
44751 +       }
44752 +       return *size;
44753 +}
44754 +
44755 +/* plugin->u.item.b.copy_units
44756 +   cooperates with ->can_shift(): copies @count units of @source into the already expanded @target */
44757 +void
44758 +copy_units_ctail(coord_t * target, coord_t * source,
44759 +                unsigned from, unsigned count /* units */ ,
44760 +                shift_direction where_is_free_space,
44761 +                unsigned free_space /* bytes */ )
44762 +{
44763 +       /* make sure that item @target is expanded already */
44764 +       assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
44765 +       assert("edward-70", free_space == count || free_space == count + 1);
44766 +
44767 +       assert("edward-858", ctail_ok(source));
44768 +
44769 +       if (where_is_free_space == SHIFT_LEFT) {
44770 +               /* append item @target with @count first bytes of @source
44771 +                  (the last @count units of @target are overwritten): this restriction came from ordinary tails */
44772 +               assert("edward-71", from == 0);
44773 +               assert("edward-860", ctail_ok(target));
44774 +
44775 +               memcpy(first_unit(target) + nr_units_ctail(target) - count,
44776 +                      first_unit(source), count);
44777 +       } else {
44778 +               /* target item is moved to right already; the last @count units of @source go to its beginning */
44779 +               reiser4_key key;
44780 +
44781 +               assert("edward-72", nr_units_ctail(source) == from + count);
44782 +
44783 +               if (free_space == count) {
44784 +                       init_ctail(target, source, NULL);
44785 +               } else {
44786 +                       /* new item has been created */
44787 +                       assert("edward-862", ctail_ok(target));
44788 +               }
44789 +               memcpy(first_unit(target), first_unit(source) + from, count);
44790 +
44791 +               assert("edward-863", ctail_ok(target));
44792 +
44793 +               /* new units are inserted before first unit in an item,
44794 +                  therefore, we have to update item key: key of @source advanced by @from */
44795 +               item_key_by_coord(source, &key);
44796 +               set_key_offset(&key, get_key_offset(&key) + from);
44797 +
44798 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
44799 +                                                                  NULL /*info */);
44800 +       }
44801 +}
44802 +
44803 +/* plugin->u.item.b.create_hook: marks the node of @coord as convertible; @arg is not used, always returns 0 */
44804 +int create_hook_ctail(const coord_t * coord, void *arg)
44805 +{
44806 +       assert("edward-864", znode_is_loaded(coord->node));
44807 +
44808 +       znode_set_convertible(coord->node);
44809 +       return 0;
44810 +}
44811 +
44812 +/* plugin->u.item.b.kill_hook: acts only if @kdata carries an inode and @from == 0; @count is not used, always returns 0 */
44813 +int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
44814 +                   pos_in_node_t count, carry_kill_data * kdata)
44815 +{
44816 +       struct inode *inode;
44817 +
44818 +       assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
44819 +       assert("edward-291", znode_is_write_locked(coord->node));
44820 +
44821 +       inode = kdata->inode;
44822 +       if (inode) {
44823 +               reiser4_key key;
44824 +               struct cryptcompress_info * info;
44825 +               cloff_t index;
44826 +
44827 +               item_key_by_coord(coord, &key);
44828 +               info = cryptcompress_inode_data(inode);
44829 +               index = off_to_clust(get_key_offset(&key), inode); /* cluster index of the item key offset */
44830 +
44831 +               if (from == 0) {
44832 +                       info->trunc_index = index;
44833 +                       if (is_disk_cluster_key(&key, coord)) {
44834 +                               /*
44835 +                                * first item of disk cluster is to be killed: truncate its page cluster and subtract one cluster size from the inode bytes
44836 +                                */
44837 +                               truncate_complete_page_cluster(
44838 +                                       inode, index, kdata->params.truncate);
44839 +                               inode_sub_bytes(inode,
44840 +                                               inode_cluster_size(inode));
44841 +                       }
44842 +               }
44843 +       }
44844 +       return 0;
44845 +}
44846 +
44847 +/* for shift_hook_ctail(),
44848 +   return 1 for an unprepped ctail; otherwise non-zero iff the jnode found for the item (by objectid and page index of its offset) has JNODE_DIRTY set
44849 +*/
44850 +static int ctail_convertible(const coord_t * coord)
44851 +{
44852 +       int result;
44853 +       reiser4_key key;
44854 +       jnode *child = NULL;
44855 +
44856 +       assert("edward-477", coord != NULL);
44857 +       assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
44858 +
44859 +       if (coord_is_unprepped_ctail(coord))
44860 +               /* unprepped ctail should be converted */
44861 +               return 1;
44862 +
44863 +       item_key_by_coord(coord, &key);
44864 +       child = jlookup(current_tree,
44865 +                       get_key_objectid(&key),
44866 +                       off_to_pg(off_by_coord(coord)));
44867 +       if (!child)
44868 +               return 0;
44869 +       result = JF_ISSET(child, JNODE_DIRTY);
44870 +       jput(child); /* pairs with the successful jlookup() above */
44871 +       return result;
44872 +}
44873 +
44874 +/* FIXME-EDWARD */
44875 +/* plugin->u.item.b.shift_hook: carries the "convertible" mark from @old_node over to the node the item was shifted to; always returns 0 */
44876 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
44877 +                    unsigned from UNUSED_ARG /* start unit */ ,
44878 +                    unsigned count UNUSED_ARG /* stop unit */ ,
44879 +                    znode * old_node /* old parent */ )
44880 +{
44881 +       assert("edward-479", item != NULL);
44882 +       assert("edward-480", item->node != old_node);
44883 +
44884 +       if (!znode_convertible(old_node) || znode_convertible(item->node))
44885 +               return 0; /* nothing to carry over, or the new node is marked already */
44886 +       if (ctail_convertible(item))
44887 +               znode_set_convertible(item->node);
44888 +       return 0;
44889 +}
44890 +/* Common body of ->cut_units() (@cut != 0) and ->kill_units() (@cut == 0, @p is the carry_kill_data): removes units @from..@to and returns the number of bytes freed */
44891 +static int
44892 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44893 +                       int cut, void *p, reiser4_key * smallest_removed,
44894 +                       reiser4_key * new_first)
44895 +{
44896 +       pos_in_node_t count;    /* number of units to cut */
44897 +       char *item;
44898 +
44899 +       count = to - from + 1;
44900 +       item = item_body_by_coord(coord);
44901 +
44902 +       assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
44903 +
44904 +       if (smallest_removed) {
44905 +               /* store smallest key removed */
44906 +               item_key_by_coord(coord, smallest_removed);
44907 +               set_key_offset(smallest_removed,
44908 +                              get_key_offset(smallest_removed) + from);
44909 +       }
44910 +
44911 +       if (new_first) {
44912 +               assert("vs-1531", from == 0);
44913 +
44914 +               item_key_by_coord(coord, new_first);
44915 +               set_key_offset(new_first,
44916 +                              get_key_offset(new_first) + from + count);
44917 +       }
44918 +
44919 +       if (!cut)
44920 +               kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
44921 +
44922 +       if (from == 0) {
44923 +               if (count != nr_units_ctail(coord)) {
44924 +                       /* part of item is removed, so move free space at the beginning
44925 +                          of the item and update item key; memmove() because the old and new header places may overlap */
44926 +                       reiser4_key key;
44927 +                       memmove(item + to + 1, item, sizeof(ctail_item_format));
44928 +                       item_key_by_coord(coord, &key);
44929 +                       set_key_offset(&key, get_key_offset(&key) + count);
44930 +                       node_plugin_by_node(coord->node)->update_item_key(coord,
44931 +                                                                         &key,
44932 +                                                                         NULL);
44933 +               } else {
44934 +                       /* cut_units should not be called to cut everything */
44935 +                       assert("vs-1532", ergo(cut, 0));
44936 +                       /* whole item is cut, so more than amount of space occupied
44937 +                          by units got freed */
44938 +                       count += sizeof(ctail_item_format);
44939 +               }
44940 +               if (REISER4_DEBUG)
44941 +                       memset(item, 0, count);
44942 +       } else if (REISER4_DEBUG)
44943 +               memset(item + sizeof(ctail_item_format) + from, 0, count);
44944 +       return count;
44945 +}
44946 +
44947 +/* plugin->u.item.b.cut_units: cut_or_kill_ctail_units() in "cut" mode; @cdata is not used */
44948 +int
44949 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
44950 +               carry_cut_data * cdata, reiser4_key * smallest_removed,
44951 +               reiser4_key * new_first)
44952 +{
44953 +       return cut_or_kill_ctail_units(item, from, to, 1, NULL,
44954 +                                      smallest_removed, new_first);
44955 +}
44956 +
44957 +/* plugin->u.item.b.kill_units: cut_or_kill_ctail_units() in "kill" mode, so kill_hook_ctail() is run with @kdata */
44958 +int
44959 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
44960 +                struct carry_kill_data *kdata, reiser4_key * smallest_removed,
44961 +                reiser4_key * new_first)
44962 +{
44963 +       return cut_or_kill_ctail_units(item, from, to, 0, kdata,
44964 +                                      smallest_removed, new_first);
44965 +}
44966 +
44967 +/* plugin->u.item.s.file.read: copies the whole body of the ctail at the hinted coord into @f->data (if set) and advances the flow; always returns 0 */
44968 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
44969 +{
44970 +       uf_coord_t *uf_coord;
44971 +       coord_t *coord;
44972 +
44973 +       uf_coord = &hint->ext_coord;
44974 +       coord = &uf_coord->coord;
44975 +       assert("edward-127", f->user == 0);
44976 +       assert("edward-129", coord && coord->node);
44977 +       assert("edward-130", coord_is_existing_unit(coord));
44978 +       assert("edward-132", znode_is_loaded(coord->node));
44979 +
44980 +       /* start read only from the beginning of ctail */
44981 +       assert("edward-133", coord->unit_pos == 0);
44982 +       /* read only whole ctails */
44983 +       assert("edward-135", nr_units_ctail(coord) <= f->length);
44984 +
44985 +       assert("edward-136", reiser4_schedulable());
44986 +       assert("edward-886", ctail_ok(coord));
44987 +
44988 +       if (f->data)
44989 +               memcpy(f->data, (char *)first_unit(coord),
44990 +                      (size_t) nr_units_ctail(coord));
44991 +
44992 +       dclust_set_extension_shift(hint);
44993 +       mark_page_accessed(znode_page(coord->node));
44994 +       move_flow_forward(f, nr_units_ctail(coord)); /* the flow is advanced even when nothing was copied (f->data == NULL) */
44995 +
44996 +       return 0;
44997 +}
44998 +
44999 +/**
45000 + * Prepare transform stream with plain text for page @page taking into account synchronization issues:
45001 + * @page is unlocked while the disk cluster is looked up and is locked again on every return path.
45002 + * Returns 0 (also when @page was found uptodate), AOP_TRUNCATED_PAGE if @page no longer belongs to @inode's mapping, or an error. */
45003 +static int ctail_read_disk_cluster(struct cluster_handle * clust,
45004 +                                  struct inode * inode, struct page * page,
45005 +                                  znode_lock_mode mode)
45006 +{
45007 +       int result;
45008 +
45009 +       assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
45010 +       assert("edward-671", clust->hint != NULL);
45011 +       assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
45012 +       assert("edward-672", cryptcompress_inode_ok(inode));
45013 +       assert("edward-1527", PageLocked(page));
45014 +
45015 +       unlock_page(page);
45016 +
45017 +       /* set input stream */
45018 +       result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
45019 +       if (result) {
45020 +               lock_page(page);
45021 +               return result;
45022 +       }
45023 +       result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
45024 +       lock_page(page);
45025 +       if (result)
45026 +               return result;
45027 +       /*
45028 +        * at this point we have locked position in the tree
45029 +        */
45030 +       assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
45031 +
45032 +       if (page->mapping != inode->i_mapping) {
45033 +               /* page was truncated */
45034 +               reiser4_unset_hint(clust->hint);
45035 +               reset_cluster_params(clust);
45036 +               return AOP_TRUNCATED_PAGE;
45037 +       }
45038 +       if (PageUptodate(page)) {
45039 +               /* disk cluster can be obsolete, don't use it! */
45040 +               reiser4_unset_hint(clust->hint);
45041 +               reset_cluster_params(clust);
45042 +               return 0;
45043 +       }
45044 +       if (clust->dstat == FAKE_DISK_CLUSTER ||
45045 +           clust->dstat == UNPR_DISK_CLUSTER ||
45046 +           clust->dstat == TRNC_DISK_CLUSTER) {
45047 +               /*
45048 +                * this information about disk cluster will be valid
45049 +                * as long as we keep the position in the tree locked
45050 +                */
45051 +               tfm_cluster_set_uptodate(&clust->tc);
45052 +               return 0;
45053 +       }
45054 +       /* now prepare output stream.. */
45055 +       result = grab_coa(&clust->tc, inode_compression_plugin(inode));
45056 +       if (result)
45057 +               return result;
45058 +       /* ..and fill this with plain text */
45059 +       result = reiser4_inflate_cluster(clust, inode);
45060 +       if (result)
45061 +               return result;
45062 +       /*
45063 +        * The stream is ready! It won't be obsolete as
45064 +        * long as we keep last disk cluster item locked.
45065 +        */
45066 +       tfm_cluster_set_uptodate(&clust->tc);
45067 +       return 0;
45068 +}
45069 +
45070 +/*
45071 + * fill one locked page with plain text, reading the disk cluster first if @clust->tc is not uptodate.
45072 + * Returns 0, AOP_TRUNCATED_PAGE, or an error of ctail_read_disk_cluster(); the page is locked on return. */
45073 +int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
45074 +                     struct page *page, znode_lock_mode mode)
45075 +{
45076 +       int ret;
45077 +       unsigned cloff;
45078 +       char *data;
45079 +       size_t to_page;
45080 +       struct tfm_cluster * tc = &clust->tc;
45081 +
45082 +       assert("edward-212", PageLocked(page));
45083 +
45084 +       if (unlikely(page->mapping != inode->i_mapping))
45085 +               return AOP_TRUNCATED_PAGE;
45086 +       if (PageUptodate(page))
45087 +               goto exit;
45088 +       to_page = pbytes(page_index(page), inode);
45089 +       if (to_page == 0) {
45090 +               zero_user(page, 0, PAGE_CACHE_SIZE);
45091 +               SetPageUptodate(page);
45092 +               goto exit;
45093 +       }
45094 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
45095 +               clust->index = pg_to_clust(page->index, inode);
45096 +
45097 +               /* this will unlock/lock the page */
45098 +               ret = ctail_read_disk_cluster(clust, inode, page, mode);
45099 +
45100 +               assert("edward-212", PageLocked(page));
45101 +               if (ret)
45102 +                       return ret;
45103 +
45104 +               /* refresh bytes: the page was unlocked for a while, so the value computed above may be stale */
45105 +               to_page = pbytes(page_index(page), inode);
45106 +               if (to_page == 0) {
45107 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
45108 +                       SetPageUptodate(page);
45109 +                       goto exit;
45110 +               }
45111 +       }
45112 +       if (PageUptodate(page))
45113 +               /* somebody else fill it already */
45114 +               goto exit;
45115 +
45116 +       assert("edward-119", tfm_cluster_is_uptodate(tc));
45117 +       assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
45118 +
45119 +       switch (clust->dstat) {
45120 +       case UNPR_DISK_CLUSTER:
45121 +               BUG_ON(1); /* this state is treated as a bug here; no break follows */
45122 +       case TRNC_DISK_CLUSTER:
45123 +               /*
45124 +                * Race with truncate!
45125 +                * We resolve it in favour of the last one (the only way,
45126 +                 * as in this case plain text is unrecoverable); falls through to zero the page
45127 +                */
45128 +       case FAKE_DISK_CLUSTER:
45129 +               /* fill the page by zeroes */
45130 +               zero_user(page, 0, PAGE_CACHE_SIZE);
45131 +               SetPageUptodate(page);
45132 +               break;
45133 +       case PREP_DISK_CLUSTER:
45134 +               /* fill page by transformed stream with plain text */
45135 +               assert("edward-1058", !PageUptodate(page));
45136 +               assert("edward-120", tc->len <= inode_cluster_size(inode));
45137 +
45138 +               /* page index in this logical cluster */
45139 +               cloff = pg_to_off_to_cloff(page->index, inode);
45140 +
45141 +               data = kmap(page);
45142 +               memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
45143 +               memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page); /* zero the rest of the page */
45144 +               flush_dcache_page(page);
45145 +               kunmap(page);
45146 +               SetPageUptodate(page);
45147 +               break;
45148 +       default:
45149 +               impossible("edward-1169", "bad disk cluster state");
45150 +       }
45151 +      exit:
45152 +       return 0;
45153 +}
45154 +
45155 +/* plugin->u.item.s.file.readpage: @vp is the cluster handle; a temporary hint is allocated for the read and @page is unlocked on every return path */
45156 +int readpage_ctail(void *vp, struct page *page)
45157 +{
45158 +       int result;
45159 +       hint_t * hint;
45160 +       struct cluster_handle * clust = vp;
45161 +
45162 +       assert("edward-114", clust != NULL);
45163 +       assert("edward-115", PageLocked(page));
45164 +       assert("edward-116", !PageUptodate(page));
45165 +       assert("edward-118", page->mapping && page->mapping->host);
45166 +       assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
45167 +
45168 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
45169 +       if (hint == NULL) {
45170 +               unlock_page(page);
45171 +               return RETERR(-ENOMEM);
45172 +       }
45173 +       clust->hint = hint;
45174 +       result = load_file_hint(clust->file, hint);
45175 +       if (result) {
45176 +               kfree(hint);
45177 +               unlock_page(page);
45178 +               return result;
45179 +       }
45180 +       assert("vs-25", hint->ext_coord.lh == &hint->lh);
45181 +
45182 +       result = do_readpage_ctail(page->mapping->host, clust, page,
45183 +                                  ZNODE_READ_LOCK);
45184 +       assert("edward-213", PageLocked(page));
45185 +       assert("edward-1163", ergo(!result, PageUptodate(page)));
45186 +
45187 +       unlock_page(page);
45188 +       done_lh(&hint->lh);
45189 +       hint->ext_coord.valid = 0;
45190 +       save_file_hint(clust->file, hint);
45191 +       kfree(hint); /* NOTE(review): clust->hint still points at the freed hint after this; confirm callers do not use it */
45192 +       tfm_cluster_clr_uptodate(&clust->tc);
45193 +
45194 +       return result;
45195 +}
45196 +
45197 +/* Helper function for ->readpages(): prepares the page cluster of @clust, fills its pages one by one (stops at the first error) and releases the cluster */
45198 +static int ctail_read_page_cluster(struct cluster_handle * clust,
45199 +                                  struct inode *inode)
45200 +{
45201 +       int i;
45202 +       int result;
45203 +       assert("edward-779", clust != NULL);
45204 +       assert("edward-1059", clust->win == NULL);
45205 +       assert("edward-780", inode != NULL);
45206 +
45207 +       result = prepare_page_cluster(inode, clust, READ_OP);
45208 +       if (result)
45209 +               return result;
45210 +
45211 +       assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
45212 +
45213 +       for (i = 0; i < clust->nr_pages; i++) {
45214 +               struct page *page = clust->pages[i];
45215 +               lock_page(page);
45216 +               result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
45217 +               unlock_page(page);
45218 +               if (result)
45219 +                       break;
45220 +       }
45221 +       tfm_cluster_clr_uptodate(&clust->tc); /* done on both success and failure */
45222 +       put_page_cluster(clust, inode, READ_OP);
45223 +       return result;
45224 +}
45225 +
45226 +/* filler for read_cache_pages(): @data is the cluster handle; @page is unlocked here on every path */
45227 +static int ctail_readpages_filler(void * data, struct page * page)
45228 +{
45229 +       int ret = 0;
45230 +       struct cluster_handle * clust = data;
45231 +       struct inode * inode = clust->file->f_dentry->d_inode;
45232 +
45233 +       assert("edward-1525", page->mapping == inode->i_mapping);
45234 +
45235 +       if (PageUptodate(page)) {
45236 +               unlock_page(page);
45237 +               return 0;
45238 +       }
45239 +       if (pbytes(page_index(page), inode) == 0) {
45240 +               zero_user(page, 0, PAGE_CACHE_SIZE);
45241 +               SetPageUptodate(page);
45242 +               unlock_page(page);
45243 +               return 0;
45244 +       }
45245 +       move_cluster_forward(clust, inode, page->index);
45246 +       unlock_page(page);
45247 +       /*
45248 +        * read the whole page cluster (ctail_read_page_cluster() locks its pages itself)
45249 +        */
45250 +       ret = ctail_read_page_cluster(clust, inode);
45251 +
45252 +       assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
45253 +       return ret;
45254 +}
45255 +
45256 +/*
45257 + * We populate a bit more than upper readahead suggests:
45258 + * with each nominated page we read the whole page cluster
45259 + * this page belongs to. Returns 0 or the first error met.
45260 + */
45261 +int readpages_ctail(struct file *file, struct address_space *mapping,
45262 +                   struct list_head *pages)
45263 +{
45264 +       int ret = 0;
45265 +       hint_t *hint;
45266 +       struct cluster_handle clust;
45267 +       struct inode *inode = mapping->host;
45268 +
45269 +       assert("edward-1521", inode == file->f_dentry->d_inode);
45270 +
45271 +       cluster_init_read(&clust, NULL);
45272 +       clust.file = file;
45273 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
45274 +       if (hint == NULL) {
45275 +               warning("vs-28", "failed to allocate hint");
45276 +               ret = RETERR(-ENOMEM);
45277 +               goto exit1;
45278 +       }
45279 +       clust.hint = hint;
45280 +       ret = load_file_hint(clust.file, hint);
45281 +       if (ret) {
45282 +               warning("edward-1522", "failed to load hint");
45283 +               goto exit2;
45284 +       }
45285 +       assert("vs-26", hint->ext_coord.lh == &hint->lh);
45286 +       ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
45287 +       if (ret) {
45288 +               warning("edward-1523", "failed to alloc pgset");
45289 +               goto exit3;
45290 +       }
45291 +       ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
45292 +
45293 +       assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
45294 + exit3:
45295 +       done_lh(&hint->lh);
45296 +       save_file_hint(file, hint);
45297 +       hint->ext_coord.valid = 0; /* NOTE(review): readpage_ctail() clears this before save_file_hint(); confirm the order here is intended */
45298 + exit2:
45299 +       kfree(hint);
45300 + exit1:
45301 +       put_cluster_handle(&clust);
45302 +       return ret;
45303 +}
45304 +
45305 +/*
45306 +   plugin->u.item.s.file.append_key
45307 +   key of the first item of the next disk cluster: the item key with offset (clust_by_coord() + 1) << cluster shift; fills and returns @key
45308 +*/
45309 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
45310 +{
45311 +       assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
45312 +       assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
45313 +
45314 +       item_key_by_coord(coord, key);
45315 +       set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
45316 +                      << cluster_shift_by_coord(coord));
45317 +       return key;
45318 +}
45319 +/* Insert, at the hinted coord, a ctail item whose body is UCTAIL_NR_UNITS zero bytes; its key offset is the start of cluster @clust->index */
45320 +static int insert_unprepped_ctail(struct cluster_handle * clust,
45321 +                                 struct inode *inode)
45322 +{
45323 +       int result;
45324 +       char buf[UCTAIL_NR_UNITS];
45325 +       reiser4_item_data data;
45326 +       reiser4_key key;
45327 +       int shift = (int)UCTAIL_SHIFT;
45328 +
45329 +       memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
45330 +       result = key_by_inode_cryptcompress(inode,
45331 +                                           clust_to_off(clust->index, inode),
45332 +                                           &key);
45333 +       if (result)
45334 +               return result;
45335 +       data.user = 0;
45336 +       data.iplug = item_plugin_by_id(CTAIL_ID);
45337 +       data.arg = &shift; /* UCTAIL_SHIFT is handed over via ->arg */
45338 +       data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
45339 +       data.data = buf;
45340 +
45341 +       result = insert_by_coord(&clust->hint->ext_coord.coord,
45342 +                                &data, &key, clust->hint->ext_coord.lh, 0);
45343 +       return result;
45344 +}
45345 +/* Insert flow @f at @coord as ctail items by posting a COP_INSERT_FLOW carry operation; @lh is tracked across the carry. Returns the carry result or an error */
45346 +static int
45347 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
45348 +                         int cluster_shift)
45349 +{
45350 +       int result;
45351 +       carry_pool *pool;
45352 +       carry_level *lowest_level;
45353 +       reiser4_item_data *data;
45354 +       carry_op *op;
45355 +
45356 +       pool =
45357 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
45358 +                           sizeof(*data));
45359 +       if (IS_ERR(pool))
45360 +               return PTR_ERR(pool);
45361 +       lowest_level = (carry_level *) (pool + 1); /* one allocation: pool, then 3 carry levels, then the item data */
45362 +       init_carry_level(lowest_level, pool);
45363 +       data = (reiser4_item_data *) (lowest_level + 3);
45364 +
45365 +       assert("edward-466", coord->between == AFTER_ITEM
45366 +              || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
45367 +              || coord->between == EMPTY_NODE
45368 +              || coord->between == BEFORE_UNIT);
45369 +
45370 +       if (coord->between == AFTER_UNIT) {
45371 +               coord->unit_pos = 0;
45372 +               coord->between = AFTER_ITEM;
45373 +       }
45374 +       op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
45375 +                               0 /* operate directly on coord -> node */);
45376 +       if (IS_ERR(op) || (op == NULL)) {
45377 +               done_carry_pool(pool);
45378 +               return RETERR(op ? PTR_ERR(op) : -EIO); /* a NULL op is reported as -EIO */
45379 +       }
45380 +       data->user = 0;
45381 +       data->iplug = item_plugin_by_id(CTAIL_ID);
45382 +       data->arg = &cluster_shift;
45383 +
45384 +       data->length = 0;
45385 +       data->data = NULL;
45386 +
45387 +       op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
45388 +       op->u.insert_flow.insert_point = coord;
45389 +       op->u.insert_flow.flow = f;
45390 +       op->u.insert_flow.data = data;
45391 +       op->u.insert_flow.new_nodes = 0;
45392 +
45393 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
45394 +       lowest_level->tracked = lh;
45395 +
45396 +       result = reiser4_carry(lowest_level, NULL);
45397 +       done_carry_pool(pool);
45398 +
45399 +       return result;
45400 +}
45401 +
45402 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion: inserts flow @f after the item at @coord, working on private copies of @coord and @lh */
45403 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
45404 +                                             lock_handle * lh, flow_t * f,
45405 +                                             int cluster_shift)
45406 +{
45407 +       int ret;
45408 +       coord_t pos;
45409 +       lock_handle lock;
45410 +
45411 +       assert("edward-484",
45412 +              coord->between == AT_UNIT || coord->between == AFTER_ITEM);
45413 +       assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
45414 +
45415 +       coord_dup(&pos, coord);
45416 +       pos.unit_pos = 0;
45417 +       pos.between = AFTER_ITEM;
45418 +
45419 +       init_lh(&lock);
45420 +       copy_lh(&lock, lh);
45421 +
45422 +       ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift);
45423 +       done_lh(&lock); /* drop only the copy; @lh must stay write locked (checked below) */
45424 +       assert("edward-1347", znode_is_write_locked(lh->node));
45425 +       assert("edward-1228", !ret);
45426 +       return ret;
45427 +}
45428 +
45429 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion: overwrites the item body with at most nr_units_ctail() bytes of @f; always returns 0 */
45430 +static int overwrite_ctail(coord_t * coord, flow_t * f)
45431 +{
45432 +       unsigned count;
45433 +
45434 +       assert("edward-269", f->user == 0);
45435 +       assert("edward-270", f->data != NULL);
45436 +       assert("edward-271", f->length > 0);
45437 +       assert("edward-272", coord_is_existing_unit(coord));
45438 +       assert("edward-273", coord->unit_pos == 0);
45439 +       assert("edward-274", znode_is_write_locked(coord->node));
45440 +       assert("edward-275", reiser4_schedulable());
45441 +       assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
45442 +       assert("edward-1243", ctail_ok(coord));
45443 +
45444 +       count = nr_units_ctail(coord);
45445 +
45446 +       if (count > f->length)
45447 +               count = f->length; /* the flow is shorter than the item: overwrite only its head */
45448 +       memcpy(first_unit(coord), f->data, count);
45449 +       move_flow_forward(f, count);
45450 +       coord->unit_pos += count; /* points just past the overwritten units */
45451 +       return 0;
45452 +}
45453 +
45454 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
45455 +   cut ctail (part or whole) from @coord->unit_pos up to the last unit; returns 0 if unit_pos is already past the last unit */
45456 +static int cut_ctail(coord_t * coord)
45457 +{
45458 +       coord_t stop;
45459 +
45460 +       assert("edward-435", coord->between == AT_UNIT &&
45461 +              coord->item_pos < coord_num_items(coord) &&
45462 +              coord->unit_pos <= coord_num_units(coord));
45463 +
45464 +       if (coord->unit_pos == coord_num_units(coord))
45465 +               /* nothing to cut */
45466 +               return 0;
45467 +       coord_dup(&stop, coord);
45468 +       stop.unit_pos = coord_last_unit_pos(coord);
45469 +
45470 +       return cut_node_content(coord, &stop, NULL, NULL, NULL);
45471 +}
45472 +/* Insert an unprepped ctail item for the disk cluster of @clust under a write lock; returns the lookup error or the result of the insertion */
45473 +int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
45474 +                                  struct inode * inode)
45475 +{
45476 +       int result;
45477 +       assert("edward-1244", inode != NULL);
45478 +       assert("edward-1245", clust->hint != NULL);
45479 +       assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
45480 +       assert("edward-1247", clust->reserved == 1);
45481 +
45482 +       result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
45483 +       if (cbk_errored(result))
45484 +               return result;
45485 +       assert("edward-1249", result == CBK_COORD_NOTFOUND);
45486 +       assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
45487 +
45488 +       assert("edward-1295",
45489 +              clust->hint->ext_coord.lh->node ==
45490 +              clust->hint->ext_coord.coord.node);
45491 +
45492 +       coord_set_between_clusters(&clust->hint->ext_coord.coord);
45493 +
45494 +       result = insert_unprepped_ctail(clust, inode);
45495 +       all_grabbed2free(); /* called whether or not the insertion succeeded */
45496 +
45497 +       assert("edward-1251", !result);
45498 +       assert("edward-1252", cryptcompress_inode_ok(inode));
45499 +       assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
45500 +       assert("edward-1254",
45501 +              reiser4_clustered_blocks(reiser4_get_current_sb()));
45502 +       assert("edward-1255",
45503 +              znode_convertible(clust->hint->ext_coord.coord.node));
45504 +
45505 +       return result;
45506 +}
45507 +/* Convert the item at @pos->coord according to @mode, taking data from the flow kept in the item convert data of @pos */
45508 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
45509 +{
45510 +       int result = 0;
45511 +       struct convert_item_info * info;
45512 +
45513 +       assert("edward-468", pos != NULL);
45514 +       assert("edward-469", pos->sq != NULL);
45515 +       assert("edward-845", item_convert_data(pos) != NULL);
45516 +
45517 +       info = item_convert_data(pos);
45518 +       assert("edward-679", info->flow.data != NULL);
45519 +
45520 +       switch (mode) {
45521 +       case CRC_APPEND_ITEM:
45522 +               assert("edward-1229", info->flow.length != 0);
45523 +               assert("edward-1256",
45524 +                      cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
45525 +               result =
45526 +                   insert_cryptcompress_flow_in_place(&pos->coord,
45527 +                                                      &pos->lock,
45528 +                                                      &info->flow,
45529 +                                                      info->cluster_shift);
45530 +               break;
45531 +       case CRC_OVERWRITE_ITEM:
45532 +               assert("edward-1230", info->flow.length != 0);
45533 +               overwrite_ctail(&pos->coord, &info->flow);
45534 +               if (info->flow.length != 0)
45535 +                       break; /* flow not exhausted; otherwise fall through and cut the rest of the item */
45536 +       case CRC_CUT_ITEM:
45537 +               assert("edward-1231", info->flow.length == 0);
45538 +               result = cut_ctail(&pos->coord);
45539 +               break;
45540 +       default:
45541 +               result = RETERR(-EIO);
45542 +               impossible("edward-244", "bad convert mode");
45543 +       }
45544 +       return result;
45545 +}
45546 +
45547 +/* plugin->u.item.f.scan
45548 +   Called for a cluster page jnode (scan->node) whose parent znode is write
45549 +   locked. Does nothing unless scanning left; otherwise makes the parent
45550 +   dirty and convertible. Returns 0, or -EAGAIN if the parent is not yet
45551 +   convertible and the jnode is not dirty. */
45552 +int scan_ctail(flush_scan * scan)
45553 +{
45554 +       int result = 0;
45555 +
45556 +       assert("edward-227", scan->node != NULL);
45557 +       assert("edward-228", jnode_is_cluster_page(scan->node));
45558 +       assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
45559 +
45560 +       if (!reiser4_scanning_left(scan))
45561 +               return result;
45562 +       if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
45563 +               znode_make_dirty(scan->parent_lock.node);
45564 +
45565 +       if (!znode_convertible(scan->parent_lock.node)) {
45566 +               /* Only a dirty cluster page makes its parent convertible;
45567 +                  a clean one is reported as already processed. */
45568 +               if (JF_ISSET(scan->node, JNODE_DIRTY))
45569 +                       znode_set_convertible(scan->parent_lock.node);
45570 +               else {
45571 +                       warning("edward-681",
45572 +                               "cluster page is already processed");
45573 +                       return -EAGAIN;
45574 +               }
45575 +       }
45576 +       return result;
45577 +}
45578 +
45579 +/* Returns true, with the child left in pos->child, if the leftmost child of the item is to be attached */
45580 +static int should_attach_convert_idata(flush_pos_t * pos)
45581 +{
45582 +       int result;
45583 +       assert("edward-431", pos != NULL);
45584 +       assert("edward-432", pos->child == NULL);
45585 +       assert("edward-619", znode_is_write_locked(pos->coord.node));
45586 +       assert("edward-470",
45587 +              item_plugin_by_coord(&pos->coord) ==
45588 +              item_plugin_by_id(CTAIL_ID));
45589 +
45590 +       /* check for leftmost child */
45591 +       utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
45592 +
45593 +       if (!pos->child)
45594 +               return 0;
45595 +       spin_lock_jnode(pos->child);
45596 +       result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
45597 +                 pos->child->atom == ZJNODE(pos->coord.node)->atom);
45598 +       spin_unlock_jnode(pos->child);
45599 +       if (!result && pos->child) {
45600 +               /* the child is not dirty or is in another atom than the node: drop it */
45601 +               jput(pos->child);
45602 +               pos->child = NULL;
45603 +       }
45604 +       return result;
45605 +}
45606 +
45607 +/**
45608 + * Collect everything the convert phase needs from @inode into @idata
45609 + * (the cluster shift) and reset the item states, as the in-memory inode
45610 + * can be evicted before the disk update completes. Always returns 0.
45611 + */
45612 +static int init_convert_data_ctail(struct convert_item_info * idata,
45613 +                                  struct inode *inode)
45614 +{
45615 +       assert("edward-813", idata != NULL);
45616 +       assert("edward-814", inode != NULL);
45617 +
45618 +       idata->cluster_shift = inode_cluster_shift(inode);
45619 +       idata->d_cur = DC_FIRST_ITEM;
45620 +       idata->d_next = DC_INVALID_STATE;
45621 +
45622 +       return 0;
45623 +}
45624 +/* Allocate (without initializing) the item convert data of @sq; returns 0 or -ENOMEM */
45625 +static int alloc_item_convert_data(struct convert_info * sq)
45626 +{
45627 +       assert("edward-816", sq != NULL);
45628 +       assert("edward-817", sq->itm == NULL);
45629 +
45630 +       sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
45631 +       if (sq->itm == NULL)
45632 +               return RETERR(-ENOMEM);
45633 +       return 0;
45634 +}
45635 +/* Free the item convert data of @sq and reset sq->itm to NULL */
45636 +static void free_item_convert_data(struct convert_info * sq)
45637 +{
45638 +       assert("edward-818", sq != NULL);
45639 +       assert("edward-819", sq->itm != NULL);
45640 +       assert("edward-820", sq->iplug != NULL);
45641 +
45642 +       kfree(sq->itm);
45643 +       sq->itm = NULL;
45644 +       return;
45645 +}
45646 +/* Allocate zeroed convert info for @pos and set up its cluster handle; returns 0 or -ENOMEM */
45647 +static int alloc_convert_data(flush_pos_t * pos)
45648 +{
45649 +       assert("edward-821", pos != NULL);
45650 +       assert("edward-822", pos->sq == NULL);
45651 +
45652 +       pos->sq = kzalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
45653 +       if (!pos->sq)
45654 +               return RETERR(-ENOMEM);
45655 +
45656 +       cluster_init_write(&pos->sq->clust, NULL);
45657 +       return 0;
45658 +}
45659 +/* Release what hangs off pos->sq (item data, cluster handle), free it and reset pos->sq to NULL */
45660 +void free_convert_data(flush_pos_t * pos)
45661 +{
45662 +       struct convert_info *sq;
45663 +
45664 +       assert("edward-823", pos != NULL);
45665 +       assert("edward-824", pos->sq != NULL);
45666 +
45667 +       sq = pos->sq;
45668 +       if (sq->itm)
45669 +               free_item_convert_data(sq);
45670 +       put_cluster_handle(&sq->clust);
45671 +       kfree(pos->sq);
45672 +       pos->sq = NULL;
45673 +       return;
45674 +}
45675 +/* Zero the already allocated item convert data of @pos and initialize it from @inode */
45676 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
45677 +{
45678 +       struct convert_info *sq;
45679 +
45680 +       assert("edward-825", pos != NULL);
45681 +       assert("edward-826", pos->sq != NULL);
45682 +       assert("edward-827", item_convert_data(pos) != NULL);
45683 +       assert("edward-828", inode != NULL);
45684 +
45685 +       sq = pos->sq;
45686 +
45687 +       memset(sq->itm, 0, sizeof(*sq->itm));
45688 +
45689 +       /* iplug->init_convert_data() */
45690 +       return init_convert_data_ctail(sq->itm, inode);
45691 +}
45692 +
45693 +/* Create and attach disk cluster info used by the 'convert' phase of flush
45694 +   squalloc(). The reference on pos->child is dropped on every return path. */
45695 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
45696 +{
45697 +       int ret = 0;
45698 +       struct convert_item_info *info;
45699 +       struct cluster_handle *clust;
45700 +       file_plugin *fplug = inode_file_plugin(inode);
45701 +       compression_plugin *cplug = inode_compression_plugin(inode);
45702 +
45703 +       assert("edward-248", pos != NULL);
45704 +       assert("edward-249", pos->child != NULL);
45705 +       assert("edward-251", inode != NULL);
45706 +       assert("edward-682", cryptcompress_inode_ok(inode));
45707 +       assert("edward-252",
45708 +              fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
45709 +       assert("edward-473",
45710 +              item_plugin_by_coord(&pos->coord) ==
45711 +              item_plugin_by_id(CTAIL_ID));
45712 +
45713 +       if (!pos->sq) {
45714 +               ret = alloc_convert_data(pos);
45715 +               if (ret) {
45716 +                       jput(pos->child);       /* nothing else to undo yet */
45717 +                       return ret;
45718 +               }
45719 +       }
45720 +       clust = &pos->sq->clust;
45721 +       ret = grab_coa(&clust->tc, cplug);
45722 +       if (ret)
45723 +               goto err;
45724 +       ret = set_cluster_by_page(clust, jnode_page(pos->child),
45725 +                                 MAX_CLUSTER_NRPAGES);
45726 +       if (ret)
45727 +               goto err;
45728 +
45729 +       assert("edward-829", pos->sq != NULL);
45730 +       assert("edward-250", item_convert_data(pos) == NULL);
45731 +
45732 +       pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
45733 +       ret = alloc_item_convert_data(pos->sq);
45734 +       if (ret)
45735 +               goto err;
45736 +       ret = init_item_convert_data(pos, inode);
45737 +       if (ret)
45738 +               goto err;
45739 +       info = item_convert_data(pos);
45740 +
45741 +       ret = checkout_logical_cluster(clust, pos->child, inode);
45742 +       if (ret)
45743 +               goto err;
45744 +
45745 +       reiser4_deflate_cluster(clust, inode);
45746 +       inc_item_convert_count(pos);
45747 +
45748 +       /* prepare flow for insertion */
45749 +       fplug->flow_by_inode(inode,
45750 +                            (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
45751 +                            0 /* kernel space */ ,
45752 +                            clust->tc.len,
45753 +                            clust_to_off(clust->index, inode),
45754 +                            WRITE_OP, &info->flow);
45755 +       jput(pos->child);
45756 +       return 0;
45757 +      err:
45758 +       jput(pos->child);
45759 +       free_convert_data(pos);
45760 +       return ret;
45761 +}
45762 +
45763 +/* clear up disk cluster info: free the item convert data of @sq (its flow is expected to be empty) */
45764 +static void detach_convert_idata(struct convert_info * sq)
45765 +{
45766 +       struct convert_item_info *info;
45767 +
45768 +       assert("edward-253", sq != NULL);
45769 +       assert("edward-840", sq->itm != NULL);
45770 +
45771 +       info = sq->itm;
45772 +       assert("edward-1212", info->flow.length == 0);
45773 +
45774 +       free_item_convert_data(sq);
45775 +       return;
45776 +}
45777 +
45778 +/* plugin->u.item.f.utmost_child */
45779 +
45780 +/* This function sets *child to the leftmost child of a first cluster item,
45781 +   if the child exists, and to NULL in other cases. Always returns 0.
45782 +   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
45783 +
45784 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
45785 +{
45786 +       reiser4_key key;
45787 +
45788 +       assert("edward-257", coord != NULL);
45789 +       assert("edward-258", child != NULL);
45790 +       assert("edward-259", side == LEFT_SIDE);
45791 +       assert("edward-260",
45792 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
45793 +
45794 +       /* read the item key only after @coord has been checked */
45795 +       item_key_by_coord(coord, &key);
45796 +
45797 +       if (!is_disk_cluster_key(&key, coord))
45798 +               *child = NULL;
45799 +       else
45800 +               *child = jlookup(current_tree,
45801 +                                get_key_objectid(&key),
45802 +                                off_to_pg(get_key_offset(&key)));
45803 +       return 0;
45804 +}
45805 +
45806 +/* Returns true if @p2 is the next item to @p1
45807 +   in the _same_ disk cluster.
45808 +   Disk cluster is a set of items. If ->clustered() != NULL,
45809 +   with each item the whole disk cluster should be read/modified
45810 +*/
45811 +
45812 +/* Compute d_next for the item at pos->coord. It stays DC_AFTER_CLUSTER
45813 + * unless d_cur is not DC_AFTER_CLUSTER, the item is the last one in its
45814 + * node, and the first item of the nearest non-empty right neighbor is in
45815 + * the same disk cluster; then it becomes DC_CHAINED_ITEM. Right neighbors
45816 + * may be not in the slum because of races, so empty ones and the one with
45817 + * the chained item are made dirty and convertible. Returns 0 or an error.
45818 + */
45819 +static int next_item_dc_stat(flush_pos_t * pos)
45820 +{
45821 +       int ret = 0;
45822 +       int stop = 0;
45823 +       znode *cur;
45824 +       coord_t coord;
45825 +       lock_handle lh;
45826 +       lock_handle right_lock;
45827 +
45828 +       assert("edward-1232", !node_is_empty(pos->coord.node));
45829 +       assert("edward-1014",
45830 +              pos->coord.item_pos < coord_num_items(&pos->coord));
45831 +       assert("edward-1015", chaining_data_present(pos));
45832 +       assert("edward-1017",
45833 +              item_convert_data(pos)->d_next == DC_INVALID_STATE);
45834 +
45835 +       item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
45836 +
45837 +       if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
45838 +               return ret;
45839 +       if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
45840 +               return ret;
45841 +
45842 +       /* Check next slum item.
45843 +        * Note, that it can not be killed by concurrent truncate,
45844 +        * as the last one will want the lock held by us.
45845 +        */
45846 +       init_lh(&right_lock);
45847 +       cur = pos->coord.node;
45848 +
45849 +       while (!stop) {
45850 +               init_lh(&lh);
45851 +               ret = reiser4_get_right_neighbor(&lh,
45852 +                                                cur,
45853 +                                                ZNODE_WRITE_LOCK,
45854 +                                                GN_CAN_USE_UPPER_LEVELS);
45855 +               if (ret)
45856 +                       break;
45857 +               ret = zload(lh.node);
45858 +               if (ret) {
45859 +                       done_lh(&lh);
45860 +                       break;
45861 +               }
45862 +               coord_init_before_first_item(&coord, lh.node);
45863 +
45864 +               if (node_is_empty(lh.node)) {
45865 +                       znode_make_dirty(lh.node);
45866 +                       znode_set_convertible(lh.node);
45867 +                       stop = 0;       /* empty node: keep going right */
45868 +               } else if (same_disk_cluster(&pos->coord, &coord)) {
45869 +
45870 +                       item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
45871 +
45872 +                       if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
45873 +                               /*
45874 +                                  warning("edward-1024",
45875 +                                  "next slum item mergeable, "
45876 +                                  "but znode %p isn't dirty\n",
45877 +                                  lh.node);
45878 +                                */
45879 +                               znode_make_dirty(lh.node);
45880 +                       }
45881 +                       if (!znode_convertible(lh.node)) {
45882 +                               /*
45883 +                                  warning("edward-1272",
45884 +                                  "next slum item mergeable, "
45885 +                                  "but znode %p isn't convertible\n",
45886 +                                  lh.node);
45887 +                                */
45888 +                               znode_set_convertible(lh.node);
45889 +                       }
45890 +                       stop = 1;
45891 +               } else
45892 +                       stop = 1;
45893 +               zrelse(lh.node);
45894 +               done_lh(&right_lock);
45895 +               copy_lh(&right_lock, &lh);
45896 +               done_lh(&lh);
45897 +               cur = right_lock.node;
45898 +       }
45899 +       done_lh(&right_lock);
45900 +       /* running out of right neighbors is not an error */
45901 +       if (ret == -E_NO_NEIGHBOR)
45902 +               ret = 0;
45903 +       return ret;
45904 +}
45905 +/* Choose the write @mode from d_cur and the remaining flow; returns 1 (leaving @mode unset) if nothing is left to do */
45906 +static int
45907 +assign_convert_mode(struct convert_item_info * idata,
45908 +                   cryptcompress_write_mode_t * mode)
45909 +{
45910 +       int result = 0;
45911 +
45912 +       assert("edward-1025", idata != NULL);
45913 +
45914 +       if (idata->flow.length) {
45915 +               /* append or overwrite */
45916 +               switch (idata->d_cur) {
45917 +               case DC_FIRST_ITEM:
45918 +               case DC_CHAINED_ITEM:
45919 +                       *mode = CRC_OVERWRITE_ITEM;
45920 +                       break;
45921 +               case DC_AFTER_CLUSTER:
45922 +                       *mode = CRC_APPEND_ITEM;
45923 +                       break;
45924 +               default:
45925 +                       impossible("edward-1018", "wrong current item state");
45926 +               }
45927 +       } else {
45928 +               /* cut or invalidate */
45929 +               switch (idata->d_cur) {
45930 +               case DC_FIRST_ITEM:
45931 +               case DC_CHAINED_ITEM:
45932 +                       *mode = CRC_CUT_ITEM;
45933 +                       break;
45934 +               case DC_AFTER_CLUSTER:
45935 +                       result = 1;     /* flow is empty and the cluster is behind us */
45936 +                       break;
45937 +               default:
45938 +                       impossible("edward-1019", "wrong current item state");
45939 +               }
45940 +       }
45941 +       return result;
45942 +}
45943 +
45944 +/* plugin->u.item.f.convert */
45945 +/* write ctail in guessed mode: attach convert info if none is present, then append, overwrite or cut */
45946 +int convert_ctail(flush_pos_t * pos)
45947 +{
45948 +       int result;
45949 +       int nr_items;
45950 +       cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
45951 +
45952 +       assert("edward-1020", pos != NULL);
45953 +       assert("edward-1213", coord_num_items(&pos->coord) != 0);
45954 +       assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
45955 +       assert("edward-1258", ctail_ok(&pos->coord));
45956 +       assert("edward-261", pos->coord.node != NULL);
45957 +
45958 +       nr_items = coord_num_items(&pos->coord);
45959 +       if (!chaining_data_present(pos)) {
45960 +               if (should_attach_convert_idata(pos)) {
45961 +                       /* attach convert item info */
45962 +                       struct inode *inode;
45963 +
45964 +                       assert("edward-264", pos->child != NULL);
45965 +                       assert("edward-265", jnode_page(pos->child) != NULL);
45966 +                       assert("edward-266",
45967 +                              jnode_page(pos->child)->mapping != NULL);
45968 +
45969 +                       inode = jnode_page(pos->child)->mapping->host;
45970 +
45971 +                       assert("edward-267", inode != NULL);
45972 +
45973 +                       /* attach item convert info by child and put the last one */
45974 +                       result = attach_convert_idata(pos, inode);
45975 +                       pos->child = NULL;
45976 +                       if (result == -E_REPEAT) {
45977 +                               /* jnode became clean, or there is no dirty
45978 +                                  pages (nothing to update in disk cluster) */
45979 +                               warning("edward-1021",
45980 +                                       "convert_ctail: nothing to attach");
45981 +                               return 0;
45982 +                       }
45983 +                       if (result != 0)
45984 +                               return result;
45985 +               } else
45986 +                       /* unconvertible */
45987 +                       return 0;
45988 +       } else {
45989 +               /* use old convert info */
45990 +
45991 +               struct convert_item_info *idata;
45992 +
45993 +               idata = item_convert_data(pos);
45994 +
45995 +               result = assign_convert_mode(idata, &mode);
45996 +               if (result) {
45997 +                       /* disk cluster is over,
45998 +                          nothing to update anymore */
45999 +                       detach_convert_idata(pos->sq);
46000 +                       return 0;
46001 +               }
46002 +       }
46003 +
46004 +       assert("edward-433", chaining_data_present(pos));
46005 +       assert("edward-1022",
46006 +              pos->coord.item_pos < coord_num_items(&pos->coord));
46007 +
46008 +       /* check if next item is of current disk cluster */
46009 +       result = next_item_dc_stat(pos);
46010 +       if (result) {
46011 +               detach_convert_idata(pos->sq);
46012 +               return result;
46013 +       }
46014 +       result = do_convert_ctail(pos, mode);
46015 +       if (result) {
46016 +               detach_convert_idata(pos->sq);
46017 +               return result;
46018 +       }
46019 +       switch (mode) {
46020 +       case CRC_CUT_ITEM:
46021 +               assert("edward-1214", item_convert_data(pos)->flow.length == 0);
46022 +               assert("edward-1215",
46023 +                      coord_num_items(&pos->coord) == nr_items ||
46024 +                      coord_num_items(&pos->coord) == nr_items - 1);
46025 +               if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
46026 +                       break;
46027 +               if (coord_num_items(&pos->coord) != nr_items) {
46028 +                       /* the item was killed, no more chained items */
46029 +                       detach_convert_idata(pos->sq);
46030 +                       if (!node_is_empty(pos->coord.node))
46031 +                               /* make sure the next item will be scanned */
46032 +                               coord_init_before_item(&pos->coord);
46033 +                       break;
46034 +               }
46035 +       case CRC_APPEND_ITEM:   /* CRC_CUT_ITEM falls through to here when the item was not killed */
46036 +               assert("edward-434", item_convert_data(pos)->flow.length == 0);
46037 +               detach_convert_idata(pos->sq);
46038 +               break;
46039 +       case CRC_OVERWRITE_ITEM:
46040 +               if (coord_is_unprepped_ctail(&pos->coord)) {
46041 +                       /* convert unprepped ctail to prepped one */
46042 +                       assert("edward-1259",
46043 +                              cluster_shift_ok(item_convert_data(pos)->
46044 +                                               cluster_shift));
46045 +                       put_unaligned((d8)item_convert_data(pos)->cluster_shift,
46046 +                                     &ctail_formatted_at(&pos->coord)->
46047 +                                     cluster_shift);
46048 +               }
46049 +               break;
46050 +       }
46051 +       return result;
46052 +}
46053 +
46054 +/* Make Linus happy.
46055 +   Local variables:
46056 +   c-indentation-style: "K&R"
46057 +   mode-name: "LC"
46058 +   c-basic-offset: 8
46059 +   tab-width: 8
46060 +   fill-column: 120
46061 +   End:
46062 +*/
46063 diff -puN /dev/null fs/reiser4/plugin/item/ctail.h
46064 --- /dev/null
46065 +++ a/fs/reiser4/plugin/item/ctail.h
46066 @@ -0,0 +1,102 @@
46067 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46068 +
46069 +/* Ctail items are fragments (or bodies) of a special type that provide
46070 +   optimal storage of encrypted and/or compressed files. */
46071 +
46072 +
46073 +#if !defined( __FS_REISER4_CTAIL_H__ )
46074 +#define __FS_REISER4_CTAIL_H__
46075 +
46076 +/* Disk format of ctail item */
46077 +typedef struct ctail_item_format {
46078 +       /* packed shift;
46079 +          if its value is different from UCTAIL_SHIFT (see below), then
46080 +          size of disk cluster is calculated as (1 << cluster_shift) */
46081 +       d8 cluster_shift;
46082 +       /* ctail body */
46083 +       d8 body[0];
46084 +} __attribute__ ((packed)) ctail_item_format;
46085 +
46086 +/* "Unprepped" disk cluster is represented by a single ctail item
46087 +   with the following "magic" attributes: */
46088 +/* "magic" cluster_shift */
46089 +#define UCTAIL_SHIFT 0xff
46090 +/* How many units unprepped ctail item has */
46091 +#define UCTAIL_NR_UNITS 1
46092 +
46093 +/* The following is a set of various item states in a disk cluster.
46094 +   A disk cluster is a set of items whose keys belong to the interval
46095 +   [dc_key , dc_key + disk_cluster_size - 1] */
46096 +typedef enum {
46097 +       DC_INVALID_STATE = 0,   /* state is not determined yet */
46098 +       DC_FIRST_ITEM = 1,      /* initial state of the current item */
46099 +       DC_CHAINED_ITEM = 2,    /* item is in the same disk cluster as the previous one */
46100 +       DC_AFTER_CLUSTER = 3    /* presumably: position is past the items of the disk cluster */
46101 +} dc_item_stat;
46102 +
46103 +/* ctail-specific extension.
46104 +   In particular this describes parameters of disk cluster an item belongs to */
46105 +struct ctail_coord_extension {
46106 +       int shift; /* this contains cluster_shift extracted from
46107 +                     ctail_item_format (above), or UCTAIL_SHIFT
46108 +                     (the last one is the "magic" of unprepped disk clusters)*/
46109 +       int dsize; /* size of a prepped disk cluster */
46110 +       int ncount; /* count of nodes occupied by a disk cluster */
46111 +};
46112 +
46113 +struct cut_list;
46114 +
46115 +/* plugin->item.b.* */
46116 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
46117 +                         const reiser4_item_data *);
46118 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
46119 +pos_in_node_t nr_units_ctail(const coord_t * coord);
46120 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
46121 +void print_ctail(const char *prefix, coord_t * coord);
46122 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
46123 +
46124 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
46125 +               carry_plugin_info * info UNUSED_ARG);
46126 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
46127 +int can_shift_ctail(unsigned free_space, coord_t * coord,
46128 +                   znode * target, shift_direction pend, unsigned *size,
46129 +                   unsigned want);
46130 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
46131 +                     unsigned count, shift_direction where_is_free_space,
46132 +                     unsigned free_space);
46133 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46134 +                   carry_cut_data *, reiser4_key * smallest_removed,
46135 +                   reiser4_key * new_first);
46136 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46137 +                    carry_kill_data *, reiser4_key * smallest_removed,
46138 +                    reiser4_key * new_first);
46139 +int ctail_ok(const coord_t * coord);
46140 +int check_ctail(const coord_t * coord, const char **error);
46141 +
46142 +/* plugin->u.item.s.* */
46143 +int read_ctail(struct file *, flow_t *, hint_t *);
46144 +int readpage_ctail(void *, struct page *);
46145 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
46146 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
46147 +int create_hook_ctail(const coord_t * coord, void *arg);
46148 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
46149 +                   carry_kill_data *);
46150 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
46151 +
46152 +/* plugin->u.item.f */
46153 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
46154 +int scan_ctail(flush_scan *);
46155 +int convert_ctail(flush_pos_t *);
46156 +size_t inode_scaled_cluster_size(struct inode *);
46157 +
46158 +#endif                         /* __FS_REISER4_CTAIL_H__ */
46159 +
46160 +/* Make Linus happy.
46161 +   Local variables:
46162 +   c-indentation-style: "K&R"
46163 +   mode-name: "LC"
46164 +   c-basic-offset: 8
46165 +   tab-width: 8
46166 +   fill-column: 120
46167 +   End:
46168 +*/
46169 diff -puN /dev/null fs/reiser4/plugin/item/extent.c
46170 --- /dev/null
46171 +++ a/fs/reiser4/plugin/item/extent.c
46172 @@ -0,0 +1,197 @@
46173 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46174 +
46175 +#include "item.h"
46176 +#include "../../key.h"
46177 +#include "../../super.h"
46178 +#include "../../carry.h"
46179 +#include "../../inode.h"
46180 +#include "../../page_cache.h"
46181 +#include "../../flush.h"
46182 +#include "../object.h"
46183 +
46184 +/* Fill @data so that it describes @nr_extents extent units at @ext_unit for putting into the tree; returns @data */
46185 +/* Audited by: green(2002.06.13) */
46186 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
46187 +                                  int nr_extents)
46188 +{
46189 +       data->data = ext_unit;
46190 +       /* data->data is kernel space */
46191 +       data->user = 0;
46192 +       data->length = sizeof(reiser4_extent) * nr_extents;
46193 +       data->arg = NULL;
46194 +       data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
46195 +       return data;
46196 +}
46197 +
46198 +/* how many bytes are addressed by the first @nr extents of the extent item at @coord */
46199 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
46200 +{
46201 +       pos_in_node_t i;
46202 +       reiser4_block_nr blocks;
46203 +       reiser4_extent *ext;
46204 +
46205 +       ext = item_body_by_coord(coord);
46206 +       assert("vs-263", nr <= nr_units_extent(coord));
46207 +       /* sum the widths of the first @nr units */
46208 +       blocks = 0;
46209 +       for (i = 0; i < nr; i++, ext++) {
46210 +               blocks += extent_get_width(ext);
46211 +       }
46212 +
46213 +       return blocks * current_blocksize;      /* blocks to bytes */
46214 +}
46215 +/* Classify @ext by its start field: 0 is a hole, 1 is unallocated, anything else is allocated */
46216 +extent_state state_of_extent(reiser4_extent * ext)
46217 +{
46218 +       /* compare the full-width start: truncating it to int would misreport
46219 +          block numbers whose low-order bits happen to equal 0 or 1 */
46220 +       reiser4_block_nr start = extent_get_start(ext);
46221 +
46222 +       if (start == 0)
46223 +               return HOLE_EXTENT;
46224 +       if (start == 1)
46225 +               return UNALLOCATED_EXTENT;
46226 +       return ALLOCATED_EXTENT;
46227 +}
46228 +/* Returns true if the extent at @item is in UNALLOCATED_EXTENT state */
46229 +int extent_is_unallocated(const coord_t * item)
46230 +{
46231 +       assert("jmacd-5133", item_is_extent(item));
46232 +
46233 +       return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
46234 +}
46235 +
46236 +/* set start and width of the extent unit @ext to @start and @width */
46237 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
46238 +                       reiser4_block_nr width)
46239 +{
46240 +       extent_set_start(ext, start);
46241 +       extent_set_width(ext, width);
46242 +}
46243 +
46244 +/**
46245 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
46246 + * @un_extent: coordinate of extent to be overwritten
46247 + * @lh: need better comment
46248 + * @key: need better comment
46249 + * @exts_to_add: data prepared for insertion into tree
46250 + * @replace: need better comment
46251 + * @flags: need better comment
46252 + * @return_insert_position: need better comment
46253 + *
46254 + * Overwrites one extent, pastes 1 or 2 more ones after overwritten one.  If
46255 + * @return_inserted_position is 1 - @un_extent and @lh are returned set to
46256 + * first of newly inserted units, if it is 0 - @un_extent and @lh are returned
46257 + * set to extent which was overwritten.
46258 + */
46259 +int reiser4_replace_extent(struct replace_handle *h,
46260 +                          int return_inserted_position)
46261 +{
46262 +       int result;
46263 +       znode *orig_znode;
46264 +       /* debug builds keep a copy of the replaced unit in h->orig_ext */
46265 +
46266 +       assert("vs-990", coord_is_existing_unit(h->coord));
46267 +       assert("vs-1375", znode_is_write_locked(h->coord->node));
46268 +       assert("vs-1426", extent_get_width(&h->overwrite) != 0);
46269 +       assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
46270 +       assert("vs-1427", ergo(h->nr_new_extents == 2,
46271 +                              extent_get_width(&h->new_extents[1]) != 0));
46272 +
46273 +       /* compose item data for pasting h->nr_new_extents new units */
46274 +       init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
46275 +
46276 +       coord_dup(&h->coord_after, h->coord);   /* copy to be tracked by @watch */
46277 +       init_lh(&h->lh_after);
46278 +       copy_lh(&h->lh_after, h->lh);
46279 +       reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
46280 +       reiser4_tap_monitor(&h->watch);
46281 +
46282 +       ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
46283 +       orig_znode = h->coord->node;
46284 +
46285 +#if REISER4_DEBUG
46286 +       /* paste_key must be the unit key advanced by the bytes of @overwrite */
46287 +       unit_key_by_coord(h->coord, &h->tmp);
46288 +       set_key_offset(&h->tmp,
46289 +                      get_key_offset(&h->tmp) +
46290 +                      extent_get_width(&h->overwrite) * current_blocksize);
46291 +       assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
46292 +#endif
46293 +
46294 +       /* set insert point after unit to be replaced */
46295 +       h->coord->between = AFTER_UNIT;
46296 +
46297 +       result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
46298 +                                 &h->paste_key, &h->item, h->flags);
46299 +       if (!result) {
46300 +               /* now overwrite the unit after which new units were inserted.
46301 +                  It may be in another node now; @watch tracked its position */
46302 +               reiser4_extent *ext;
46303 +               znode *node;
46304 +
46305 +               node = h->coord_after.node;
46306 +               if (node != orig_znode) {
46307 +                       coord_clear_iplug(&h->coord_after);
46308 +                       result = zload(node);
46309 +               }
46310 +
46311 +               if (likely(!result)) {
46312 +                       ext = extent_by_coord(&h->coord_after);
46313 +
46314 +                       assert("vs-987", znode_is_loaded(node));
46315 +                       assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
46316 +
46317 +                       /* overwrite extent unit */
46318 +                       memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
46319 +                       znode_make_dirty(node);
46320 +
46321 +                       if (node != orig_znode)
46322 +                               zrelse(node);   /* pairs with zload() above */
46323 +
46324 +                       if (return_inserted_position == 0) {
46325 +                               /* coord and lh are to be set to overwritten
46326 +                                  extent */
46327 +                               assert("vs-1662",
46328 +                                      WITH_DATA(node, !memcmp(&h->overwrite,
46329 +                                                              extent_by_coord(
46330 +                                                                      &h->coord_after),
46331 +                                                              sizeof(reiser4_extent))));
46332 +
46333 +                               *h->coord = h->coord_after;
46334 +                               done_lh(h->lh);
46335 +                               copy_lh(h->lh, &h->lh_after);
46336 +                       } else {
46337 +                               /* h->coord and h->lh are to be set to first of
46338 +                                  inserted units */
46339 +                               assert("vs-1663",
46340 +                                      WITH_DATA(h->coord->node,
46341 +                                                !memcmp(&h->new_extents[0],
46342 +                                                        extent_by_coord(h->coord),
46343 +                                                        sizeof(reiser4_extent))));
46344 +                               assert("vs-1664", h->lh->node == h->coord->node);
46345 +                       }
46346 +               }
46347 +       }
46348 +       reiser4_tap_done(&h->watch);
46349 +
46350 +       return result;
46351 +}
46352 +
46353 +lock_handle *znode_lh(znode *node)      /* first handle on @node's lock owners list */
46354 +{
46355 +       assert("vs-1371", znode_is_write_locked(node));
46356 +       assert("vs-1372", znode_is_wlocked_once(node));
46357 +       return list_entry(node->lock.owners.next, lock_handle, owners_link);
46358 +}
46359 +
46360 +/*
46361 + * Local variables:
46362 + * c-indentation-style: "K&R"
46363 + * mode-name: "LC"
46364 + * c-basic-offset: 8
46365 + * tab-width: 8
46366 + * fill-column: 79
46367 + * scroll-step: 1
46368 + * End:
46369 + */
46370 diff -puN /dev/null fs/reiser4/plugin/item/extent.h
46371 --- /dev/null
46372 +++ a/fs/reiser4/plugin/item/extent.h
46373 @@ -0,0 +1,231 @@
46374 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46375 +
46376 +#ifndef __REISER4_EXTENT_H__
46377 +#define __REISER4_EXTENT_H__
46378 +
46379 +/* on-disk extent unit; fields are 64-bit little-endian, use accessors below */
46380 +typedef struct {
46381 +       reiser4_dblock_nr start;        /* start block (values 0 and 1 are special) */
46382 +       reiser4_dblock_nr width;        /* number of blocks */
46383 +} reiser4_extent;
46384 +
46385 +struct extent_stat {    /* NOTE(review): looks like per-state unit/block counters - confirm */
46386 +       int unallocated_units;
46387 +       int unallocated_blocks;
46388 +       int allocated_units;
46389 +       int allocated_blocks;
46390 +       int hole_units;
46391 +       int hole_blocks;
46392 +};
46393 +
46394 +/* each extent unit of an extent item is in one of three states: a hole, an
46395 +   unallocated extent or an allocated extent */
46396 +typedef enum {
46397 +       HOLE_EXTENT,
46398 +       UNALLOCATED_EXTENT,
46399 +       ALLOCATED_EXTENT
46400 +} extent_state;
46401 +
46402 +#define HOLE_EXTENT_START 0    /* start value used when composing hole extents */
46403 +#define UNALLOCATED_EXTENT_START 1     /* start value used for unallocated extents */
46404 +#define UNALLOCATED_EXTENT_START2 2    /* NOTE(review): use not visible here - confirm */
46405 +
46406 +struct extent_coord_extension {
46407 +       reiser4_block_nr pos_in_unit;   /* position within current unit, < width */
46408 +       reiser4_block_nr width; /* width of current unit */
46409 +       pos_in_node_t nr_units; /* number of units in the item */
46410 +       int ext_offset;         /* offset from the beginning of zdata() */
46411 +       unsigned long expected_page;
46412 +#if REISER4_DEBUG
46413 +       reiser4_extent extent;  /* copy of current unit, compared in debug checks */
46414 +#endif
46415 +};
46416 +
46417 +/* inline accessors to get/set fields of on-disk extent (stored little-endian) */
46418 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
46419 +{
46420 +       return le64_to_cpu(ext->start);
46421 +}
46422 +
46423 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
46424 +{
46425 +       return le64_to_cpu(ext->width); /* on-disk value converted to cpu order */
46426 +}
46427 +
46428 +extern __u64 reiser4_current_block_count(void);
46429 +
46430 +static inline void
46431 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
46432 +{
46433 +       cassert(sizeof(ext->start) == 8);       /* field must be 64 bits wide */
46434 +       assert("nikita-2510",   /* values 0 and 1 are exempt from the range check */
46435 +              ergo(start > 1, start < reiser4_current_block_count()));
46436 +       put_unaligned(cpu_to_le64(start), &ext->start); /* @ext may be unaligned */
46437 +}
46438 +
46439 +static inline void
46440 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
46441 +{
46442 +       cassert(sizeof(ext->width) == 8);       /* field must be 64 bits wide */
46443 +       assert("", width > 0);  /* zero-width extents are not allowed */
46444 +       put_unaligned(cpu_to_le64(width), &ext->width);
46445 +       assert("nikita-2511",   /* extent with start > 1 must end within the block count */
46446 +              ergo(extent_get_start(ext) > 1,
46447 +                   extent_get_start(ext) + width <=
46448 +                   reiser4_current_block_count()));
46449 +}
46450 +
46451 +#define extent_item(coord)     /* item body viewed as array of extent units */ \
46452 +({                                                             \
46453 +       assert("nikita-3143", item_is_extent(coord));           \
46454 +       ((reiser4_extent *)item_body_by_coord (coord));         \
46455 +})
46456 +
46457 +#define extent_by_coord(coord) /* extent unit number (coord)->unit_pos of the item */ \
46458 +({                                                             \
46459 +       assert("nikita-3144", item_is_extent(coord));           \
46460 +       (extent_item (coord) + (coord)->unit_pos);              \
46461 +})
46462 +
46463 +#define width_by_coord(coord)  /* width of the extent unit @coord is set to */ \
46464 +({                                                             \
46465 +       assert("nikita-3145", item_is_extent(coord));           \
46466 +       extent_get_width (extent_by_coord(coord));              \
46467 +})
46468 +
46469 +struct carry_cut_data;
46470 +struct carry_kill_data;
46471 +
46472 +/* plugin->u.item.b.* */
46473 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
46474 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
46475 +                          const reiser4_item_data *);
46476 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
46477 +pos_in_node_t nr_units_extent(const coord_t *);
46478 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
46479 +void init_coord_extent(coord_t *);
46480 +int init_extent(coord_t *, reiser4_item_data *);
46481 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
46482 +int can_shift_extent(unsigned free_space,
46483 +                    coord_t * source, znode * target, shift_direction,
46484 +                    unsigned *size, unsigned want);
46485 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
46486 +                      unsigned count, shift_direction where_is_free_space,
46487 +                      unsigned free_space);
46488 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
46489 +                    struct carry_kill_data *);
46490 +int create_hook_extent(const coord_t * coord, void *arg);
46491 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46492 +                    struct carry_cut_data *, reiser4_key * smallest_removed,
46493 +                    reiser4_key * new_first);
46494 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46495 +                     struct carry_kill_data *, reiser4_key * smallest_removed,
46496 +                     reiser4_key * new_first);
46497 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
46498 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
46499 +void print_extent(const char *, coord_t *);
46500 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
46501 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
46502 +                                  reiser4_block_nr * block);
46503 +void item_stat_extent(const coord_t * coord, void *vp);
46504 +int reiser4_check_extent(const coord_t * coord, const char **error);
46505 +
46506 +/* plugin->u.item.s.file.* */
46507 +ssize_t reiser4_write_extent(struct file *, struct inode * inode,
46508 +                            const char __user *, size_t, loff_t *);
46509 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
46510 +int reiser4_readpage_extent(void *, struct page *);
46511 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
46512 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
46513 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
46514 +int get_block_address_extent(const coord_t *, sector_t block,
46515 +                            sector_t * result);
46516 +
46517 +/* these are used in flush.c
46518 +   FIXME-VS: should they be somewhere in item_plugin? */
46519 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
46520 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
46521 +                            reiser4_key * stop_key);
46522 +
46523 +int extent_is_unallocated(const coord_t * item);       /* True if this extent is unallocated (i.e., not a hole, not allocated). */
46524 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
46525 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
46526 +
46527 +/* plugin->u.item.f. */
46528 +int reiser4_scan_extent(flush_scan * scan);
46529 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
46530 +
46531 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
46532 +                                  int nr_extents);
46533 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
46534 +extent_state state_of_extent(reiser4_extent * ext);
46535 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
46536 +                       reiser4_block_nr width);
46537 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
46538 +                         int *plugged_hole);
46539 +
46540 +#include "../../coord.h"
46541 +#include "../../lock.h"
46542 +#include "../../tap.h"
46543 +
46544 +struct replace_handle {
46545 +       /* these are to be set before calling reiser4_replace_extent */
46546 +       coord_t *coord;         /* existing unit to be replaced; node is write locked */
46547 +       lock_handle *lh;
46548 +       reiser4_key key;
46549 +       reiser4_key *pkey;
46550 +       reiser4_extent overwrite;       /* copied over the unit at @coord */
46551 +       reiser4_extent new_extents[2];  /* units inserted after the replaced unit */
46552 +       int nr_new_extents;     /* how many of @new_extents are inserted */
46553 +       unsigned flags;         /* passed to insert_into_item() */
46554 +
46555 +       /* these are used by reiser4_replace_extent */
46556 +       reiser4_item_data item;
46557 +       coord_t coord_after;    /* copy of @coord tracked by @watch */
46558 +       lock_handle lh_after;   /* copy of @lh tracked by @watch */
46559 +       tap_t watch;
46560 +       reiser4_key paste_key;  /* only read by reiser4_replace_extent: caller sets it */
46561 +#if REISER4_DEBUG
46562 +       reiser4_extent orig_ext;        /* unit contents before replacement */
46563 +       reiser4_key tmp;
46564 +#endif
46565 +};
46566 +
46567 +/* this structure is kmalloced before calling make_extent to avoid excessive
46568 +   stack consumption on plug_hole->reiser4_replace_extent */
46569 +struct make_extent_handle {
46570 +       uf_coord_t *uf_coord;
46571 +       reiser4_block_nr blocknr;
46572 +       int created;
46573 +       struct inode *inode;
46574 +       union {
46575 +               struct {
46576 +               } append;       /* empty: no append-specific fields */
46577 +               struct replace_handle replace;  /* NOTE(review): plug_hole() keeps its replace_handle on stack - confirm use */
46578 +       } u;
46579 +};
46580 +
46581 +int reiser4_replace_extent(struct replace_handle *,
46582 +                          int return_inserted_position);
46583 +lock_handle *znode_lh(znode *);
46584 +
46585 +/* the reiser4 repacker support */
46586 +struct repacker_cursor;
46587 +extern int process_extent_backward_for_repacking(tap_t *,
46588 +                                                struct repacker_cursor *);
46589 +extern int mark_extent_for_repacking(tap_t *, int);
46590 +
46591 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
46592 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
46593 +
46594 +/* __REISER4_EXTENT_H__ */
46595 +#endif
46596 +/*
46597 +   Local variables:
46598 +   c-indentation-style: "K&R"
46599 +   mode-name: "LC"
46600 +   c-basic-offset: 8
46601 +   tab-width: 8
46602 +   fill-column: 120
46603 +   End:
46604 +*/
46605 diff -puN /dev/null fs/reiser4/plugin/item/extent_file_ops.c
46606 --- /dev/null
46607 +++ a/fs/reiser4/plugin/item/extent_file_ops.c
46608 @@ -0,0 +1,1450 @@
46609 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46610 +
46611 +#include "item.h"
46612 +#include "../../inode.h"
46613 +#include "../../page_cache.h"
46614 +#include "../object.h"
46615 +
46616 +#include <linux/quotaops.h>
46617 +#include <linux/swap.h>
46618 +
46619 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
46620 +{
46621 +       reiser4_extent *ext;
46622 +
46623 +       ext = (reiser4_extent *) (zdata(node) + offset);        /* @offset is relative to zdata() */
46624 +       return ext;
46625 +}
46626 +
46627 +/**
46628 + * check_uf_coord - verify coord extension
46629 + * @uf_coord: coord with extent extension to verify
46630 + * @key: if not NULL, key that @uf_coord must correspond to
46631 + *
46632 + * Debug only (no-op unless REISER4_DEBUG): asserts that the cached extension
46633 + * fields match the extent unit @uf_coord is set to and, if given, @key.
46634 + */
46635 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
46636 +{
46637 +#if REISER4_DEBUG
46638 +       const coord_t *coord;
46639 +       const struct extent_coord_extension *ext_coord;
46640 +       reiser4_extent *ext;
46641 +
46642 +       coord = &uf_coord->coord;
46643 +       ext_coord = &uf_coord->extension.extent;
46644 +       ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
46645 +
46646 +       assert("",
46647 +              WITH_DATA(coord->node,
46648 +                        (uf_coord->valid == 1 &&
46649 +                         coord_is_iplug_set(coord) &&
46650 +                         item_is_extent(coord) &&
46651 +                         ext_coord->nr_units == nr_units_extent(coord) &&
46652 +                         ext == extent_by_coord(coord) &&
46653 +                         ext_coord->width == extent_get_width(ext) &&
46654 +                         coord->unit_pos < ext_coord->nr_units &&
46655 +                         ext_coord->pos_in_unit < ext_coord->width &&
46656 +                         memcmp(ext, &ext_coord->extent,
46657 +                                sizeof(reiser4_extent)) == 0)));
46658 +       if (key) {      /* @key must equal unit key advanced by pos_in_unit pages */
46659 +               reiser4_key coord_key;
46660 +
46661 +               unit_key_by_coord(&uf_coord->coord, &coord_key);
46662 +               set_key_offset(&coord_key,
46663 +                              get_key_offset(&coord_key) +
46664 +                              (uf_coord->extension.extent.
46665 +                               pos_in_unit << PAGE_CACHE_SHIFT));
46666 +               assert("", keyeq(key, &coord_key));
46667 +       }
46668 +#endif
46669 +}
46670 +
46671 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
46672 +{
46673 +       check_uf_coord(uf_coord, NULL); /* debug-only consistency check */
46674 +
46675 +       return ext_by_offset(uf_coord->coord.node,      /* unit at cached ext_offset */
46676 +                            uf_coord->extension.extent.ext_offset);
46677 +}
46678 +
46679 +#if REISER4_DEBUG
46680 +
46681 +/**
46682 + * offset_is_in_unit - check whether an offset falls into an extent unit
46683 + * @coord: coord set to an extent unit
46684 + * @off: offset to check, in the same units as key offsets
46685 + *
46686 + */
46687 +/* return 1 if offset @off is inside of extent unit pointed to by @coord, 0
46688 +   otherwise. Nothing is modified; pos_in_unit is not set here */
46689 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
46690 +{
46691 +       reiser4_key unit_key;
46692 +       __u64 unit_off;
46693 +       reiser4_extent *ext;
46694 +
46695 +       ext = extent_by_coord(coord);
46696 +
46697 +       unit_key_extent(coord, &unit_key);
46698 +       unit_off = get_key_offset(&unit_key);
46699 +       if (off < unit_off)
46700 +               return 0;
46701 +       if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
46702 +               return 0;
46703 +       return 1;
46704 +}
46705 +
46706 +static int
46707 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
46708 +{
46709 +       reiser4_key item_key;
46710 +
46711 +       assert("vs-771", coord_is_existing_unit(coord));
46712 +       assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));     /* @key < append key */
46713 +       assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));     /* @key >= item key */
46714 +
46715 +       return offset_is_in_unit(coord, get_key_offset(key));   /* 1 if @key's offset is in the unit */
46716 +}
46717 +
46718 +#endif
46719 +
46720 +/**
46721 + * can_append - check whether @key is the append key of an extent item
46722 + * @key: key to check
46723 + * @coord: coord set to the extent item
46724 + *
46725 + * Returns result of keyeq() of @key and the append key of the item at @coord
46726 + */
46727 +static int can_append(const reiser4_key *key, const coord_t *coord)
46728 +{
46729 +       reiser4_key append_key;
46730 +
46731 +       return keyeq(key, append_key_extent(coord, &append_key));
46732 +}
46733 +
46734 +/**
46735 + * append_hole - extend the last extent item of a file with a hole
46736 + * @coord: coord set to the extent item, on twig level
46737 + * @lh: lock handle passed to insert_into_item()
46738 + * @key: key the hole must reach; must not be less than the item's append key
46739 + * Returns 0 if a trailing hole was widened, else result of insert_into_item()
46740 + */
46741 +static int append_hole(coord_t *coord, lock_handle *lh,
46742 +                      const reiser4_key *key)
46743 +{
46744 +       reiser4_key append_key;
46745 +       reiser4_block_nr hole_width;
46746 +       reiser4_extent *ext, new_ext;
46747 +       reiser4_item_data idata;
46748 +
46749 +       /* last item of file may have to be appended with hole */
46750 +       assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
46751 +       assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
46752 +
46753 +       /* key of first byte which is not addressed by this extent */
46754 +       append_key_extent(coord, &append_key);
46755 +
46756 +       assert("", keyle(&append_key, key));
46757 +
46758 +       /*
46759 +        * extent item has to be appended with hole. Calculate length of that
46760 +        * hole in blocks, rounding the byte distance up to whole blocks
46761 +        */
46762 +       hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
46763 +                      current_blocksize - 1) >> current_blocksize_bits);
46764 +       assert("vs-954", hole_width > 0);
46765 +
46766 +       /* set coord after last unit */
46767 +       coord_init_after_item_end(coord);
46768 +
46769 +       /* get last extent in the item */
46770 +       ext = extent_by_coord(coord);
46771 +       if (state_of_extent(ext) == HOLE_EXTENT) {
46772 +               /*
46773 +                * last extent of a file is hole extent. Widen that extent by
46774 +                * @hole_width blocks. Note that we do not worry about
46775 +                * overflowing - extent width is 64 bits
46776 +                */
46777 +               reiser4_set_extent(ext, HOLE_EXTENT_START,
46778 +                                  extent_get_width(ext) + hole_width);
46779 +               znode_make_dirty(coord->node);
46780 +               return 0;
46781 +       }
46782 +
46783 +       /* append last item of the file with hole extent unit */
46784 +       assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
46785 +                         state_of_extent(ext) == UNALLOCATED_EXTENT));
46786 +
46787 +       reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
46788 +       init_new_extent(&idata, &new_ext, 1);
46789 +       return insert_into_item(coord, lh, &append_key, &idata, 0);
46790 +}
46791 +
46792 +/**
46793 + * check_jnodes - debug check that a twig node covers a range of pages
46794 + * @twig: longterm locked twig node
46795 + * @key: key of the first page of the range
46796 + * @count: number of pages in the range; no-op unless REISER4_DEBUG is set
46797 + */
46798 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
46799 +{
46800 +#if REISER4_DEBUG
46801 +       coord_t c;
46802 +       reiser4_key node_key, jnode_key;
46803 +
46804 +       jnode_key = *key;
46805 +
46806 +       assert("", twig != NULL);
46807 +       assert("", znode_get_level(twig) == TWIG_LEVEL);
46808 +       assert("", znode_is_write_locked(twig));
46809 +
46810 +       zload(twig);    /* NOTE(review): return value is ignored */
46811 +       /* get the smallest key in twig node */
46812 +       coord_init_first_unit(&c, twig);
46813 +       unit_key_by_coord(&c, &node_key);
46814 +       assert("", keyle(&node_key, &jnode_key));
46815 +
46816 +       coord_init_last_unit(&c, twig); /* upper bound: last unit key or its append key */
46817 +       unit_key_by_coord(&c, &node_key);
46818 +       if (item_plugin_by_coord(&c)->s.file.append_key)
46819 +               item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
46820 +       set_key_offset(&jnode_key,      /* key of the last byte of the range */
46821 +                      get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
46822 +       assert("", keylt(&jnode_key, &node_key));
46823 +       zrelse(twig);
46824 +#endif
46825 +}
46826 +
46827 +/**
46828 + * append_last_extent - append last file item
46829 + * @uf_coord: coord to start insertion from
46830 + * @key: key to append at
46831 + * @jnodes: array of jnodes
46832 + * @count: number of jnodes in the array
46833 + *
46834 + * Appends an unallocated unit of width @count to the last extent item, or only
46835 + * a hole if @key is not its append key. Returns @count, 0 or an error code.
46836 + */
46837 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
46838 +                             jnode **jnodes, int count)
46839 +{
46840 +       int result;
46841 +       reiser4_extent new_ext;
46842 +       reiser4_item_data idata;
46843 +       coord_t *coord;
46844 +       struct extent_coord_extension *ext_coord;
46845 +       reiser4_extent *ext;
46846 +       reiser4_block_nr block;
46847 +       jnode *node;
46848 +       int i;
46849 +
46850 +       coord = &uf_coord->coord;
46851 +       ext_coord = &uf_coord->extension.extent;
46852 +       ext = ext_by_ext_coord(uf_coord);
46853 +
46854 +       /* check correctness of position in the item */
46855 +       assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
46856 +       assert("vs-1311", coord->between == AFTER_UNIT);
46857 +       assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
46858 +
46859 +       if (!can_append(key, coord)) {
46860 +               /* hole extent has to be inserted; nothing else is done */
46861 +               result = append_hole(coord, uf_coord->lh, key);
46862 +               uf_coord->valid = 0;
46863 +               return result;
46864 +       }
46865 +
46866 +       if (count == 0)
46867 +               return 0;
46868 +
46869 +       assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
46870 +
46871 +       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
46872 +                                          count);
46873 +       BUG_ON(result != 0);    /* NOTE(review): quota allocation failure is fatal here */
46874 +
46875 +       switch (state_of_extent(ext)) {
46876 +       case UNALLOCATED_EXTENT:
46877 +               /*
46878 +                * last extent unit of the file is unallocated one. Increase
46879 +                * its width by @count
46880 +                */
46881 +               reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
46882 +                                  extent_get_width(ext) + count);
46883 +               znode_make_dirty(coord->node);
46884 +
46885 +               /* update coord extension */
46886 +               ext_coord->width += count;
46887 +               ON_DEBUG(extent_set_width
46888 +                        (&uf_coord->extension.extent.extent,
46889 +                         ext_coord->width));
46890 +               break;
46891 +
46892 +       case HOLE_EXTENT:
46893 +       case ALLOCATED_EXTENT:
46894 +               /*
46895 +                * last extent unit of the file is either hole or allocated
46896 +                * one. Append one unallocated extent of width @count
46897 +                */
46898 +               reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
46899 +               init_new_extent(&idata, &new_ext, 1);
46900 +               result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
46901 +               uf_coord->valid = 0;
46902 +               if (result)
46903 +                       return result;
46904 +               break;
46905 +
46906 +       default:
46907 +               return RETERR(-EIO);
46908 +       }
46909 +
46910 +       /*
46911 +        * make sure that we hold long term locked twig node containing all
46912 +        * jnodes we are about to capture
46913 +        */
46914 +       check_jnodes(uf_coord->lh->node, key, count);
46915 +
46916 +       /*
46917 +        * assign fake block numbers to all jnodes. FIXME: make sure whether
46918 +        * twig node containing inserted extent item is locked
46919 +        */
46920 +       block = fake_blocknr_unformatted(count);
46921 +       for (i = 0; i < count; i ++, block ++) {
46922 +               node = jnodes[i];
46923 +               spin_lock_jnode(node);
46924 +               JF_SET(node, JNODE_CREATED);
46925 +               jnode_set_block(node, &block);
46926 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
46927 +               BUG_ON(result != 0);
46928 +               jnode_make_dirty_locked(node);
46929 +               spin_unlock_jnode(node);
46930 +       }
46931 +       return count;
46932 +}
46933 +
46934 +/**
46935 + * insert_first_hole - insert hole extent into tree
46936 + * @coord: coord set between items, where the new item is inserted
46937 + * @lh: lock handle passed to insert_extent_by_coord()
46938 + * @key: key the hole must reach; the item key is @key with offset set to 0
46939 + *
46940 + * Returns result of insert_extent_by_coord().
46941 + */
46942 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
46943 +                            const reiser4_key *key)
46944 +{
46945 +       reiser4_extent new_ext;
46946 +       reiser4_item_data idata;
46947 +       reiser4_key item_key;
46948 +       reiser4_block_nr hole_width;
46949 +
46950 +       /* @coord must be set for inserting of new item */
46951 +       assert("vs-711", coord_is_between_items(coord));
46952 +
46953 +       item_key = *key;
46954 +       set_key_offset(&item_key, 0ull);
46955 +
46956 +       hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
46957 +                     current_blocksize_bits);
46958 +       assert("vs-710", hole_width > 0);
46959 +
46960 +       /* compose body of hole extent and insert item into tree */
46961 +       reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
46962 +       init_new_extent(&idata, &new_ext, 1);
46963 +       return insert_extent_by_coord(coord, &idata, &item_key, lh);
46964 +}
46965 +
46966 +
46967 +/**
46968 + * insert_first_extent - insert first file item
46969 + * @uf_coord: coord to start insertion from
46970 + * @key: key to insert at
46971 + * @jnodes: array of jnodes
46972 + * @count: number of jnodes in the array
46973 + * @inode: inode of file
46974 + *
46975 + * There are no items of file @inode in the tree yet. If @key's offset is not 0,
46976 + * only a hole extent is inserted and its result is returned. Otherwise insert
46977 + * an unallocated extent of width @count, assign fake block numbers to @jnodes
46978 + * and capture them. Returns number of jnodes or error code.
46979 + */
46980 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
46981 +                              jnode **jnodes, int count,
46982 +                              struct inode *inode)
46983 +{
46984 +       int result;
46985 +       int i;
46986 +       reiser4_extent new_ext;
46987 +       reiser4_item_data idata;
46988 +       reiser4_block_nr block;
46989 +       struct unix_file_info *uf_info;
46990 +       jnode *node;
46991 +
46992 +       /* first extent insertion starts at leaf level */
46993 +       assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
46994 +       assert("vs-711", coord_is_between_items(&uf_coord->coord));
46995 +
46996 +       if (get_key_offset(key) != 0) { /* not writing at offset 0: insert hole only */
46997 +               result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
46998 +               uf_coord->valid = 0;
46999 +               uf_info = unix_file_inode_data(inode);
47000 +
47001 +               /*
47002 +                * first item insertion is only possible when writing to empty
47003 +                * file or performing tail conversion
47004 +                */
47005 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
47006 +                           (reiser4_inode_get_flag(inode,
47007 +                                                   REISER4_PART_MIXED) &&
47008 +                            reiser4_inode_get_flag(inode,
47009 +                                                   REISER4_PART_IN_CONV))));
47010 +               /* if file was empty - update its state */
47011 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
47012 +                       uf_info->container = UF_CONTAINER_EXTENTS;
47013 +               return result;
47014 +       }
47015 +
47016 +       if (count == 0)
47017 +               return 0;
47018 +
47019 +       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
47020 +       BUG_ON(result != 0);    /* NOTE(review): quota allocation failure is fatal here */
47021 +
47022 +       /*
47023 +        * prepare for tree modification: compose body of item and item data
47024 +        * structure needed for insertion
47025 +        */
47026 +       reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
47027 +       init_new_extent(&idata, &new_ext, 1);
47028 +
47029 +       /* insert extent item into the tree */
47030 +       result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
47031 +                                       uf_coord->lh);
47032 +       if (result)
47033 +               return result;
47034 +
47035 +       /*
47036 +        * make sure that we hold long term locked twig node containing all
47037 +        * jnodes we are about to capture
47038 +        */
47039 +       check_jnodes(uf_coord->lh->node, key, count);
47040 +       /*
47041 +        * assign fake block numbers to all jnodes, capture and mark them dirty
47042 +        */
47043 +       block = fake_blocknr_unformatted(count);
47044 +       for (i = 0; i < count; i ++, block ++) {
47045 +               node = jnodes[i];
47046 +               spin_lock_jnode(node);
47047 +               JF_SET(node, JNODE_CREATED);
47048 +               jnode_set_block(node, &block);
47049 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47050 +               BUG_ON(result != 0);
47051 +               jnode_make_dirty_locked(node);
47052 +               spin_unlock_jnode(node);
47053 +       }
47054 +
47055 +       /*
47056 +        * invalidate coordinate, research must be performed to continue
47057 +        * because write will continue on twig level
47058 +        */
47059 +       uf_coord->valid = 0;
47060 +       return count;
47061 +}
47062 +
47063 +/**
47064 + * plug_hole - replace one block of a hole extent with an unallocated extent
47065 + * @uf_coord: extent coordinate of the block to plug
47066 + * @key: key of that block; passed with @uf_coord to check_uf_coord()
47067 + * @how: output; set to 1..6 to tell which of the cases below was taken
47068 + * Returns 0, or the result of reiser4_replace_extent() in cases 3, 5 and 6.
47069 + *
47070 + * Creates an unallocated extent of width 1 within a hole. In worst case two
47071 + * additional extents can be created.
47072 + */
47073 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
47074 +{
47075 +       struct replace_handle rh;
47076 +       reiser4_extent *ext;
47077 +       reiser4_block_nr width, pos_in_unit;
47078 +       coord_t *coord;
47079 +       struct extent_coord_extension *ext_coord;
47080 +       int return_inserted_position;
47081 +
47082 +       check_uf_coord(uf_coord, key);
47083 +
47084 +       rh.coord = coord_by_uf_coord(uf_coord);
47085 +       rh.lh = uf_coord->lh;
47086 +       rh.flags = 0;
47087 +
47088 +       coord = coord_by_uf_coord(uf_coord);
47089 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
47090 +       ext = ext_by_ext_coord(uf_coord);
47091 +
47092 +       width = ext_coord->width;
47093 +       pos_in_unit = ext_coord->pos_in_unit;
47094 +
47095 +       *how = 0;
47096 +       if (width == 1) {
47097 +               reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
47098 +               znode_make_dirty(coord->node);
47099 +               /* update uf_coord */
47100 +               ON_DEBUG(ext_coord->extent = *ext);
47101 +               *how = 1;
47102 +               return 0;
47103 +       } else if (pos_in_unit == 0) {
47104 +               /* we deal with the first block of the hole extent */
47105 +               if (coord->unit_pos) {
47106 +                       /* there is an extent to the left */
47107 +                       if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
47108 +                               /*
47109 +                                * left neighboring unit is an unallocated
47110 +                                * extent. Increase its width and decrease
47111 +                                * width of hole
47112 +                                */
47113 +                               extent_set_width(ext - 1,
47114 +                                                extent_get_width(ext - 1) + 1);
47115 +                               extent_set_width(ext, width - 1);
47116 +                               znode_make_dirty(coord->node);
47117 +
47118 +                               /* update coord extension */
47119 +                               coord->unit_pos--;
47120 +                               ext_coord->width = extent_get_width(ext - 1);
47121 +                               ext_coord->pos_in_unit = ext_coord->width - 1;
47122 +                               ext_coord->ext_offset -= sizeof(reiser4_extent);
47123 +                               ON_DEBUG(ext_coord->extent =
47124 +                                        *extent_by_coord(coord));
47125 +                               *how = 2;
47126 +                               return 0;
47127 +                       }
47128 +               }
47129 +               /* extent for replace */
47130 +               reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
47131 +               /* extent to be inserted */
47132 +               reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
47133 +                                  width - 1);
47134 +               rh.nr_new_extents = 1;
47135 +
47136 +               /* have reiser4_replace_extent to return with @coord and
47137 +                  @uf_coord->lh set to unit which was replaced */
47138 +               return_inserted_position = 0;
47139 +               *how = 3;
47140 +       } else if (pos_in_unit == width - 1) {
47141 +               /* we deal with the last block of the hole extent */
47142 +               if (coord->unit_pos < nr_units_extent(coord) - 1) {
47143 +                       /* there is an extent unit to the right */
47144 +                       if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
47145 +                               /*
47146 +                                * right neighboring unit is an unallocated
47147 +                                * extent. Increase its width and decrease
47148 +                                * width of hole
47149 +                                */
47150 +                               extent_set_width(ext + 1,
47151 +                                                extent_get_width(ext + 1) + 1);
47152 +                               extent_set_width(ext, width - 1);
47153 +                               znode_make_dirty(coord->node);
47154 +
47155 +                               /* update coord extension */
47156 +                               coord->unit_pos++;
47157 +                               ext_coord->width = extent_get_width(ext + 1);
47158 +                               ext_coord->pos_in_unit = 0;
47159 +                               ext_coord->ext_offset += sizeof(reiser4_extent);
47160 +                               ON_DEBUG(ext_coord->extent =
47161 +                                        *extent_by_coord(coord));
47162 +                               *how = 4;
47163 +                               return 0;
47164 +                       }
47165 +               }
47166 +               /* extent for replace */
47167 +               reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
47168 +               /* extent to be inserted */
47169 +               reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47170 +                                  1);
47171 +               rh.nr_new_extents = 1;
47172 +
47173 +               /* have reiser4_replace_extent to return with @coord and
47174 +                  @uf_coord->lh set to unit which was inserted */
47175 +               return_inserted_position = 1;
47176 +               *how = 5;
47177 +       } else {
47178 +               /* extent for replace: hole of the blocks before the plugged one */
47179 +               reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
47180 +                                  pos_in_unit);
47181 +               /* extents to be inserted: the plugged block, then the rest */
47182 +               reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
47183 +                                  1);
47184 +               reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
47185 +                                  width - pos_in_unit - 1);
47186 +               rh.nr_new_extents = 2;
47187 +
47188 +               /* have reiser4_replace_extent to return with @coord and
47189 +                  @uf_coord->lh set to first of units which were inserted */
47190 +               return_inserted_position = 1;
47191 +               *how = 6;
47192 +       }
47193 +       unit_key_by_coord(coord, &rh.paste_key);
47194 +       set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
47195 +                      extent_get_width(&rh.overwrite) * current_blocksize);
47196 +
47197 +       uf_coord->valid = 0;
47198 +       return reiser4_replace_extent(&rh, return_inserted_position);
47199 +}
47200 +
47201 +/**
47202 + * overwrite_one_block - assign a block number to a jnode covered by an extent
47203 + * @uf_coord: extent coordinate of the block; must be AT_UNIT
47204 + * @key: key of the block, passed on to plug_hole()
47205 + * @node: jnode which gets the block number
47206 + * @hole_plugged: if not NULL, set to 1 when a hole extent was plugged
47207 + * Hole extent: call DQUOT_ALLOC_BLOCK_NODIRTY(), plug the hole, assign a fake
47208 + * block number, set JNODE_CREATED. Allocated extent: assign extent start plus
47209 + * position in unit. Returns 0, plug_hole() error, or RETERR(-EIO) otherwise.
47210 + */
47211 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
47212 +                              jnode *node, int *hole_plugged)
47213 +{
47214 +       int result;
47215 +       struct extent_coord_extension *ext_coord;
47216 +       reiser4_extent *ext;
47217 +       reiser4_block_nr block;
47218 +       int how;
47219 +
47220 +       assert("vs-1312", uf_coord->coord.between == AT_UNIT);
47221 +
47222 +       result = 0;
47223 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
47224 +       ext = ext_by_ext_coord(uf_coord);
47225 +       assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
47226 +
47227 +       switch (state_of_extent(ext)) {
47228 +       case ALLOCATED_EXTENT:
47229 +               block = extent_get_start(ext) + ext_coord->pos_in_unit;
47230 +               break;
47231 +
47232 +       case HOLE_EXTENT:
47233 +               result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
47234 +               BUG_ON(result != 0);
47235 +               result = plug_hole(uf_coord, key, &how);
47236 +               if (result)
47237 +                       return result;
47238 +               block = fake_blocknr_unformatted(1);
47239 +               if (hole_plugged)
47240 +                       *hole_plugged = 1;
47241 +               JF_SET(node, JNODE_CREATED);
47242 +               break;
47243 +
47244 +       default:
47245 +               return RETERR(-EIO);
47246 +       }
47247 +
47248 +       jnode_set_block(node, &block);
47249 +       return 0;
47250 +}
47251 +
47252 +/**
47253 + * move_coord - move coordinate forward
47254 + * @uf_coord: extent coordinate to advance; ->valid is cleared at end of item
47255 + *
47256 + * Move coordinate one data block pointer forward. Return 0 if it was moved,
47257 + * 1 if coord is set to the last one already or is invalid.
47258 + */
47259 +static int move_coord(uf_coord_t *uf_coord)
47260 +{
47261 +       struct extent_coord_extension *ext_coord;
47262 +
47263 +       if (uf_coord->valid == 0)
47264 +               return 1;
47265 +       ext_coord = &uf_coord->extension.extent;
47266 +       ext_coord->pos_in_unit ++;
47267 +       if (ext_coord->pos_in_unit < ext_coord->width)
47268 +               /* coordinate moved within the unit */
47269 +               return 0;
47270 +
47271 +       /* end of unit is reached. Try to move to next unit */
47272 +       ext_coord->pos_in_unit = 0;
47273 +       uf_coord->coord.unit_pos ++;
47274 +       if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
47275 +               /* coordinate moved to next unit: refresh offset and width */
47276 +               ext_coord->ext_offset += sizeof(reiser4_extent);
47277 +               ext_coord->width =
47278 +                       extent_get_width(ext_by_offset
47279 +                                        (uf_coord->coord.node,
47280 +                                         ext_coord->ext_offset));
47281 +               ON_DEBUG(ext_coord->extent =
47282 +                        *ext_by_offset(uf_coord->coord.node,
47283 +                                       ext_coord->ext_offset));
47284 +               return 0;
47285 +       }
47286 +       /* end of item is reached */
47287 +       uf_coord->valid = 0;
47288 +       return 1;
47289 +}
47290 +
47291 +/**
47292 + * overwrite_extent - give block numbers to jnodes of an extent and dirty them
47293 + * @uf_coord: coordinate of jnodes[0]; @key is its key, @count the array length
47294 + * @plugged_hole: handed to overwrite_one_block() for every block-less jnode
47295 + * Returns number of handled jnodes, or an overwrite_one_block() error.
47296 + */
47297 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
47298 +                           jnode **jnodes, int count, int *plugged_hole)
47299 +{
47300 +       int result;
47301 +       reiser4_key k;
47302 +       int i;
47303 +       jnode *node;
47304 +
47305 +       k = *key;
47306 +       for (i = 0; i < count; i ++) {
47307 +               node = jnodes[i];
47308 +               if (*jnode_get_block(node) == 0) {
47309 +                       result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
47310 +                       if (result)
47311 +                               return result;
47312 +               }
47313 +               /*
47314 +                * make sure that we hold long term locked twig node containing
47315 +                * all jnodes we are about to capture
47316 +                */
47317 +               check_jnodes(uf_coord->lh->node, &k, 1);
47318 +               /*
47319 +                * capture the jnode and mark it dirty; its block number was
47320 +                * set above if it was 0
47321 +                */
47322 +               spin_lock_jnode(node);
47323 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
47324 +               BUG_ON(result != 0);
47325 +               jnode_make_dirty_locked(node);
47326 +               spin_unlock_jnode(node);
47327 +
47328 +               if (uf_coord->valid == 0)
47329 +                       return i + 1;
47330 +
47331 +               check_uf_coord(uf_coord, &k);
47332 +
47333 +               if (move_coord(uf_coord)) {
47334 +                       /*
47335 +                        * failed to move to the next node pointer. Either end
47336 +                        * of file or end of twig node is reached. In the latter
47337 +                        * case we might go to the right neighbor.
47338 +                        */
47339 +                       uf_coord->valid = 0;
47340 +                       return i + 1;
47341 +               }
47342 +               set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
47343 +       }
47344 +
47345 +       return count;
47346 +}
47347 +
47348 +/**
47349 + * reiser4_update_extent - update the extent item for one jnode at @pos
47350 + * @inode: inode used to build the key and to look the file item up
47351 + * @node: the single jnode to append, overwrite or insert as first extent
47352 + * @pos: file offset the lookup key is built from
47353 + * @plugged_hole: handed to overwrite_extent() in the overwrite case
47354 + * Returns 0 if the helper handled one jnode, otherwise the error it returned.
47355 + */
47356 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
47357 +                 int *plugged_hole)
47358 +{
47359 +       int result;
47360 +       znode *loaded;
47361 +       uf_coord_t uf_coord;
47362 +       coord_t *coord;
47363 +       lock_handle lh;
47364 +       reiser4_key key;
47365 +
47366 +       assert("", reiser4_lock_counters()->d_refs == 0);
47367 +
47368 +       key_by_inode_and_offset_common(inode, pos, &key);
47369 +
47370 +       init_uf_coord(&uf_coord, &lh);
47371 +       coord = &uf_coord.coord;
47372 +       result = find_file_item_nohint(coord, &lh, &key,
47373 +                                      ZNODE_WRITE_LOCK, inode);
47374 +       if (IS_CBKERR(result)) {
47375 +               assert("", reiser4_lock_counters()->d_refs == 0);
47376 +               return result;
47377 +       }
47378 +
47379 +       result = zload(coord->node);
47380 +       BUG_ON(result != 0);
47381 +       loaded = coord->node;
47382 +
47383 +       if (coord->between == AFTER_UNIT) {
47384 +               /*
47385 +                * append existing extent item with unallocated extent of width
47386 +                * 1 (a single jnode is passed)
47387 +                */
47388 +               init_coord_extension_extent(&uf_coord,
47389 +                                           get_key_offset(&key));
47390 +               result = append_last_extent(&uf_coord, &key,
47391 +                                           &node, 1);
47392 +       } else if (coord->between == AT_UNIT) {
47393 +               /*
47394 +                * overwrite
47395 +                * not optimal yet. Will be optimized if new write will show
47396 +                * performance win.
47397 +                */
47398 +               init_coord_extension_extent(&uf_coord,
47399 +                                           get_key_offset(&key));
47400 +               result = overwrite_extent(&uf_coord, &key,
47401 +                                         &node, 1, plugged_hole);
47402 +       } else {
47403 +               /*
47404 +                * there are no items of this file in the tree yet. Create
47405 +                * first item of the file inserting one unallocated extent of
47406 +                * width 1 (a single jnode is passed)
47407 +                */
47408 +               result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
47409 +       }
47410 +       assert("", result == 1 || result < 0);
47411 +       zrelse(loaded);
47412 +       done_lh(&lh);
47413 +       assert("", reiser4_lock_counters()->d_refs == 0);
47414 +       return (result == 1) ? 0 : result;
47415 +}
47416 +
47417 +/**
47418 + * update_extents - update extent items for a run of jnodes, using file hint
47419 + * @file: file whose hint is loaded and saved; @inode: used for key and lookup
47420 + * @jnodes: jnodes to handle; advanced by the count each pass reports
47421 + * @count: number of jnodes; 0 is the expanding truncate case
47422 + * @pos: offset the key is built from, used only when @count is 0
47423 + * Returns the result of the last pass (handled count) or a negative error.
47424 + */
47425 +static int update_extents(struct file *file, struct inode *inode,
47426 +                         jnode **jnodes, int count, loff_t pos)
47427 +{
47428 +       struct hint hint;
47429 +       reiser4_key key;
47430 +       int result;
47431 +       znode *loaded;
47432 +
47433 +       result = load_file_hint(file, &hint);
47434 +       BUG_ON(result != 0);
47435 +
47436 +       if (count != 0)
47437 +               /*
47438 +                * count == 0 is special case: expanding truncate
47439 +                */
47440 +               pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
47441 +       key_by_inode_and_offset_common(inode, pos, &key);
47442 +
47443 +       assert("", reiser4_lock_counters()->d_refs == 0);
47444 +
47445 +       do {
47446 +               result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
47447 +               if (IS_CBKERR(result)) {
47448 +                       assert("", reiser4_lock_counters()->d_refs == 0);
47449 +                       return result;
47450 +               }
47451 +
47452 +               result = zload(hint.ext_coord.coord.node);
47453 +               BUG_ON(result != 0);
47454 +               loaded = hint.ext_coord.coord.node;
47455 +
47456 +               if (hint.ext_coord.coord.between == AFTER_UNIT) {
47457 +                       /*
47458 +                        * append existing extent item with unallocated extent
47459 +                        * of width nr_jnodes
47460 +                        */
47461 +                       if (hint.ext_coord.valid == 0)
47462 +                               /* NOTE: get statistics on this */
47463 +                               init_coord_extension_extent(&hint.ext_coord,
47464 +                                                           get_key_offset(&key));
47465 +                       result = append_last_extent(&hint.ext_coord, &key,
47466 +                                                   jnodes, count);
47467 +               } else if (hint.ext_coord.coord.between == AT_UNIT) {
47468 +                       /*
47469 +                        * overwrite
47470 +                        * not optimal yet. Will be optimized if new write will
47471 +                        * show performance win.
47472 +                        */
47473 +                       if (hint.ext_coord.valid == 0)
47474 +                               /* NOTE: get statistics on this */
47475 +                               init_coord_extension_extent(&hint.ext_coord,
47476 +                                                           get_key_offset(&key));
47477 +                       result = overwrite_extent(&hint.ext_coord, &key,
47478 +                                                 jnodes, count, NULL);
47479 +               } else {
47480 +                       /*
47481 +                        * there are no items of this file in the tree
47482 +                        * yet. Create first item of the file inserting one
47483 +                        * unallocated extent of width nr_jnodes
47484 +                        */
47485 +                       result = insert_first_extent(&hint.ext_coord, &key,
47486 +                                                    jnodes, count, inode);
47487 +               }
47488 +               zrelse(loaded);
47489 +               if (result < 0) {
47490 +                       done_lh(hint.ext_coord.lh);
47491 +                       break;
47492 +               }
47493 +
47494 +               jnodes += result;
47495 +               count -= result;
47496 +               set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
47497 +
47498 +               /* seal and unlock znode if coord is still valid, else drop hint */
47499 +               if (hint.ext_coord.valid)
47500 +                       reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
47501 +               else
47502 +                       reiser4_unset_hint(&hint);
47503 +
47504 +       } while (count > 0);
47505 +
47506 +       save_file_hint(file, &hint);
47507 +       assert("", reiser4_lock_counters()->d_refs == 0);
47508 +       return result;
47509 +}
47510 +
47511 +/**
47512 + * write_extent_reserve_space - reserve space for extent write operation
47513 + * @inode: inode whose tree the estimates are computed for
47514 + *
47515 + * Estimates and reserves space which may be required for writing
47516 + * WRITE_GRANULARITY pages of file. Returns reiser4_grab_space() result.
47517 + */
47518 +static int write_extent_reserve_space(struct inode *inode)
47519 +{
47520 +       __u64 count;
47521 +       reiser4_tree *tree;
47522 +
47523 +       /*
47524 +        * to write WRITE_GRANULARITY pages to a file by extents we have to
47525 +        * reserve disk space for:
47526 +
47527 +        * 1. find_file_item may have to insert empty node to the tree (empty
47528 +        * leaf node between two extent items). This requires 1 block and
47529 +        * number of blocks which are necessary to perform insertion of an
47530 +        * internal item into twig level.
47531 +
47532 +        * 2. for each of written pages there might be needed 1 block and
47533 +        * number of blocks which might be necessary to perform insertion of or
47534 +        * paste to an extent item.
47535 +
47536 +        * 3. stat data update
47537 +        */
47538 +       tree = reiser4_tree_by_inode(inode);
47539 +       count = estimate_one_insert_item(tree) +
47540 +               WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
47541 +               estimate_one_insert_item(tree);
47542 +       grab_space_enable();
47543 +       return reiser4_grab_space(count, 0 /* flags */);
47544 +}
47545 +
47546 +/*
47547 + * filemap_copy_from_user no longer exists in generic code, because it
47548 + * is deadlocky (copying from user while holding the page lock is bad).
47549 + * As a temporary fix for reiser4, define it here. Returns bytes copied.
47550 + */
47551 +static inline size_t
47552 +filemap_copy_from_user(struct page *page, unsigned long offset,
47553 +                       const char __user *buf, unsigned bytes)
47554 +{
47555 +       char *kaddr;
47556 +       int left;
47557 +
47558 +       kaddr = kmap_atomic(page, KM_USER0);
47559 +       left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
47560 +       kunmap_atomic(kaddr, KM_USER0);
47561 +
47562 +       if (left != 0) {
47563 +               /* atomic copy was incomplete: redo it the slow way */
47564 +               kaddr = kmap(page);
47565 +               left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
47566 +               kunmap(page);
47567 +       }
47568 +       return bytes - left;
47569 +}
47570 +
47571 +/**
47572 + * reiser4_write_extent - write method of extent item plugin
47573 + * @file: file to write to
47574 + * @buf: address of user-space buffer
47575 + * @count: number of bytes to write; 0 is the expanding truncate case
47576 + * @pos: position in file to write to; only read here, not advanced
47577 + * Uses @inode for pages and i_size. Returns bytes copied, else 0 or an error.
47578 + */
47579 +ssize_t reiser4_write_extent(struct file *file, struct inode * inode,
47580 +                            const char __user *buf, size_t count, loff_t *pos)
47581 +{
47582 +       int have_to_update_extent;
47583 +       int nr_pages, nr_dirty;
47584 +       struct page *page;
47585 +       jnode *jnodes[WRITE_GRANULARITY + 1];
47586 +       unsigned long index;
47587 +       unsigned long end;
47588 +       int i;
47589 +       int to_page, page_off;
47590 +       size_t left, written;
47591 +       int result = 0;
47592 +
47593 +       if (write_extent_reserve_space(inode))
47594 +               return RETERR(-ENOSPC);
47595 +
47596 +       if (count == 0) {
47597 +               /* truncate case; update_extents() result is not checked */
47598 +               update_extents(file, inode, jnodes, 0, *pos);
47599 +               return 0;
47600 +       }
47601 +
47602 +       BUG_ON(get_current_context()->trans->atom != NULL);
47603 +
47604 +       left = count;
47605 +       index = *pos >> PAGE_CACHE_SHIFT;
47606 +       /* calculate number of pages which are to be written */
47607 +       end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
47608 +       nr_pages = end - index + 1;
47609 +       nr_dirty = 0;
47610 +       assert("", nr_pages <= WRITE_GRANULARITY + 1);
47611 +
47612 +       /* get pages and jnodes; on failure only the first i are released */
47613 +       for (i = 0; i < nr_pages; i ++) {
47614 +               page = find_or_create_page(inode->i_mapping, index + i,
47615 +                                          reiser4_ctx_gfp_mask_get());
47616 +               if (page == NULL) {
47617 +                       nr_pages = i;
47618 +                       result = RETERR(-ENOMEM);
47619 +                       goto out;
47620 +               }
47621 +
47622 +               jnodes[i] = jnode_of_page(page);
47623 +               if (IS_ERR(jnodes[i])) {
47624 +                       unlock_page(page);
47625 +                       page_cache_release(page);
47626 +                       nr_pages = i;
47627 +                       result = RETERR(-ENOMEM);
47628 +                       goto out;
47629 +               }
47630 +               /* prevent jnode and page from disconnecting */
47631 +               JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
47632 +               unlock_page(page);
47633 +       }
47634 +
47635 +       BUG_ON(get_current_context()->trans->atom != NULL);
47636 +
47637 +       have_to_update_extent = 0;
47638 +
47639 +       page_off = (*pos & (PAGE_CACHE_SIZE - 1));
47640 +       for (i = 0; i < nr_pages; i ++) {
47641 +               to_page = PAGE_CACHE_SIZE - page_off;
47642 +               if (to_page > left)
47643 +                       to_page = left;
47644 +               page = jnode_page(jnodes[i]);
47645 +               if (page_offset(page) < inode->i_size &&
47646 +                   !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
47647 +                       /*
47648 +                        * the above is not optimal for partial write to last
47649 +                        * page of file when file size is not at boundary of
47650 +                        * page
47651 +                        */
47652 +                       lock_page(page);
47653 +                       if (!PageUptodate(page)) {
47654 +                               result = readpage_unix_file(NULL, page);
47655 +                               BUG_ON(result != 0);
47656 +                               /* wait for read completion */
47657 +                               lock_page(page);
47658 +                               BUG_ON(!PageUptodate(page));
47659 +                       } else
47660 +                               result = 0;
47661 +                       unlock_page(page);
47662 +               }
47663 +
47664 +               BUG_ON(get_current_context()->trans->atom != NULL);
47665 +               fault_in_pages_readable(buf, to_page);
47666 +               BUG_ON(get_current_context()->trans->atom != NULL);
47667 +
47668 +               lock_page(page);
47669 +               if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
47670 +                       simple_prepare_write(file, page, page_off,
47671 +                                            page_off + to_page);
47672 +
47673 +               written = filemap_copy_from_user(page, page_off, buf, to_page);
47674 +               if (unlikely(written != to_page)) {
47675 +                       unlock_page(page);
47676 +                       result = RETERR(-EFAULT);
47677 +                       break;
47678 +               }
47679 +
47680 +               flush_dcache_page(page);
47681 +               reiser4_set_page_dirty_internal(page);
47682 +               unlock_page(page);
47683 +               nr_dirty++;
47684 +
47685 +               mark_page_accessed(page);
47686 +               SetPageUptodate(page);
47687 +
47688 +               if (jnodes[i]->blocknr == 0)
47689 +                       have_to_update_extent ++;
47690 +
47691 +               page_off = 0;
47692 +               buf += to_page;
47693 +               left -= to_page;
47694 +               BUG_ON(get_current_context()->trans->atom != NULL);
47695 +       }
47696 +
47697 +       if (have_to_update_extent) {
47698 +               update_extents(file, inode, jnodes, nr_dirty, *pos);
47699 +       } else {
47700 +               for (i = 0; i < nr_dirty; i ++) {
47701 +                       int ret;
47702 +                       spin_lock_jnode(jnodes[i]);
47703 +                       ret = reiser4_try_capture(jnodes[i],
47704 +                                                    ZNODE_WRITE_LOCK, 0);
47705 +                       BUG_ON(ret != 0);
47706 +                       jnode_make_dirty_locked(jnodes[i]);
47707 +                       spin_unlock_jnode(jnodes[i]);
47708 +               }
47709 +       }
47710 +out:
47711 +       for (i = 0; i < nr_pages; i ++) {
47712 +               page_cache_release(jnode_page(jnodes[i]));
47713 +               JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
47714 +               jput(jnodes[i]);
47715 +       }
47716 +
47717 +       /* the only errors handled so far are ENOMEM and
47718 +          EFAULT on copy_from_user */
47719 +
47720 +       return (count - left) ? (count - left) : result;
47721 +}
47722 +
47723 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
47724 +                              struct page *page)
47725 +{
47726 +       jnode *j;
47727 +       struct address_space *mapping;
47728 +       unsigned long index;
47729 +       oid_t oid;
47730 +       reiser4_block_nr block;
47731 +
47732 +       mapping = page->mapping;
47733 +       oid = get_inode_oid(mapping->host);
47734 +       index = page->index;
47735 +
47736 +       switch (state_of_extent(ext)) {
47737 +       case HOLE_EXTENT:
47738 +               /*
47739 +                * a hole page may still have a jnode, if the page was eflushed
47740 +                * previously. With no jnode the page is just zero-filled.
47741 +                */
47742 +               j = jfind(mapping, index);
47743 +               if (j == NULL) {
47744 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
47745 +                       SetPageUptodate(page);
47746 +                       unlock_page(page);
47747 +                       return 0;
47748 +               }
47749 +               spin_lock_jnode(j);
47750 +               if (!jnode_page(j)) {
47751 +                       jnode_attach_page(j, page);
47752 +               } else {
47753 +                       BUG_ON(jnode_page(j) != page);
47754 +                       assert("vs-1504", jnode_page(j) == page);
47755 +               }
47756 +               block = *jnode_get_io_block(j);
47757 +               spin_unlock_jnode(j);
47758 +               if (block == 0) {
47759 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
47760 +                       SetPageUptodate(page);
47761 +                       unlock_page(page);
47762 +                       jput(j);
47763 +                       return 0;
47764 +               }
47765 +               break;
47766 +
47767 +       case ALLOCATED_EXTENT:
47768 +               j = jnode_of_page(page);
47769 +               if (IS_ERR(j))
47770 +                       return PTR_ERR(j);
47771 +               if (*jnode_get_block(j) == 0) {
47772 +                       reiser4_block_nr blocknr;
47773 +
47774 +                       blocknr = extent_get_start(ext) + pos;
47775 +                       jnode_set_block(j, &blocknr);
47776 +               } else
47777 +                       assert("vs-1403",
47778 +                              j->blocknr == extent_get_start(ext) + pos);
47779 +               break;
47780 +
47781 +       case UNALLOCATED_EXTENT:
47782 +               j = jfind(mapping, index);
47783 +               assert("nikita-2688", j);
47784 +               assert("vs-1426", jnode_page(j) == NULL);
47785 +
47786 +               spin_lock_jnode(j);
47787 +               jnode_attach_page(j, page);
47788 +               spin_unlock_jnode(j);
47789 +               break;
47790 +
47791 +       default:
47792 +               warning("vs-957", "wrong extent\n");
47793 +               return RETERR(-EIO);
47794 +       }
47795 +
47796 +       BUG_ON(j == 0);
47797 +       reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
47798 +       jput(j);
47799 +       return 0;
47800 +}
47801 +
47802 +/* plugin->u.item.s.file.read for extent items: copies @flow, page by page, from the page cache to user space; returns 0 or -errno */
47803 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
47804 +{
47805 +       int result;
47806 +       struct page *page;
47807 +       unsigned long cur_page, next_page;
47808 +       unsigned long page_off, count;
47809 +       struct address_space *mapping;
47810 +       loff_t file_off;
47811 +       uf_coord_t *uf_coord;
47812 +       coord_t *coord;
47813 +       struct extent_coord_extension *ext_coord;
47814 +       unsigned long nr_pages;
47815 +       char *kaddr;
47816 +
47817 +       assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
47818 +       assert("vs-572", flow->user == 1);
47819 +       assert("vs-1351", flow->length > 0);
47820 +
47821 +       uf_coord = &hint->ext_coord;
47822 +
47823 +       check_uf_coord(uf_coord, NULL);
47824 +       assert("vs-33", uf_coord->lh == &hint->lh);
47825 +
47826 +       coord = &uf_coord->coord;
47827 +       assert("vs-1119", znode_is_rlocked(coord->node));
47828 +       assert("vs-1120", znode_is_loaded(coord->node));
47829 +       assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
47830 +
47831 +       mapping = file->f_dentry->d_inode->i_mapping;
47832 +       ext_coord = &uf_coord->extension.extent;
47833 +
47834 +       /* offset in a file to start read from */
47835 +       file_off = get_key_offset(&flow->key);
47836 +       /* offset within the page to start read from */
47837 +       page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
47838 +       /* bytes which can be read from the page which contains file_off */
47839 +       count = PAGE_CACHE_SIZE - page_off;
47840 +
47841 +       /* index of page containing offset read is to start from */
47842 +       cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
47843 +       next_page = cur_page;
47844 +       /* number of pages flow spans over */
47845 +       nr_pages =
47846 +           ((file_off + flow->length + PAGE_CACHE_SIZE -
47847 +             1) >> PAGE_CACHE_SHIFT) - cur_page;
47848 +
47849 +       /* we start having twig node read locked. However, we do not want to
47850 +          keep that lock all the time readahead works. So, set a seal and
47851 +          release twig node. */
47852 +       reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
47853 +       /* &hint->lh is done-ed */
47854 +
47855 +       do {
47856 +               reiser4_txn_restart_current();
47857 +               page = read_mapping_page(mapping, cur_page, file);
47858 +               if (IS_ERR(page))
47859 +                       return PTR_ERR(page);
47860 +               lock_page(page);
47861 +               if (!PageUptodate(page)) {
47862 +                       unlock_page(page);
47863 +                       page_cache_release(page);
47864 +                       warning("jmacd-97178", "extent_read: page is not up to date");
47865 +                       return RETERR(-EIO);
47866 +               }
47867 +               mark_page_accessed(page);
47868 +               unlock_page(page);
47869 +
47870 +               /* If users can be writing to this page using arbitrary virtual
47871 +                  addresses, take care about potential aliasing before reading
47872 +                  the page on the kernel side.
47873 +                */
47874 +               if (mapping_writably_mapped(mapping))
47875 +                       flush_dcache_page(page);
47876 +
47877 +               assert("nikita-3034", reiser4_schedulable());
47878 +
47879 +               /* number of bytes which are to be read from the page */
47880 +               if (count > flow->length)
47881 +                       count = flow->length;
47882 +
47883 +               result = fault_in_pages_writeable(flow->data, count);
47884 +               if (result) {
47885 +                       page_cache_release(page);
47886 +                       return RETERR(-EFAULT);
47887 +               }
47888 +
47889 +               kaddr = kmap_atomic(page, KM_USER0);
47890 +               result = __copy_to_user_inatomic(flow->data,
47891 +                                              kaddr + page_off, count);
47892 +               kunmap_atomic(kaddr, KM_USER0);
47893 +               if (result != 0) {
47894 +                       kaddr = kmap(page);
47895 +                       result = __copy_to_user(flow->data, kaddr + page_off, count);
47896 +                       kunmap(page);
47897 +               }
47898 +               /* drop the page reference before bailing out, so it is not leaked */
47899 +               page_cache_release(page);
47900 +               if (unlikely(result))
47901 +                       return RETERR(-EFAULT);
47902 +
47903 +               /* increase key (flow->key), update user area pointer (flow->data) */
47904 +               move_flow_forward(flow, count);
47905 +
47906 +               page_off = 0;
47907 +               cur_page ++;
47908 +               count = PAGE_CACHE_SIZE;
47909 +               nr_pages--;
47910 +       } while (flow->length);
47911 +
47912 +       return 0;
47913 +}
47914 +
47915 +/*
47916 +   plugin->s.file.readpage for extent items
47917 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
47918 +   or
47919 +   filemap_fault->reiser4_readpage->readpage_unix_file->readpage_extent
47920 +
47921 +   At the beginning (asserted below): coord->node is read locked and zloaded, page is locked and not
47922 +   up to date, coord is set to existing unit inside of extent item (it is not necessary that coord matches to page->index)
47923 +*/
47924 +int reiser4_readpage_extent(void *vp, struct page *page)
47925 +{
47926 +       uf_coord_t *uf_coord = vp;
47927 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
47928 +       ON_DEBUG(reiser4_key key);
47929 +
47930 +       assert("vs-1040", PageLocked(page));
47931 +       assert("vs-1050", !PageUptodate(page));
47932 +       assert("vs-1039", page->mapping && page->mapping->host);
47933 +
47934 +       assert("vs-1044", znode_is_loaded(coord->node));
47935 +       assert("vs-758", item_is_extent(coord));
47936 +       assert("vs-1046", coord_is_existing_unit(coord));
47937 +       assert("vs-1045", znode_is_rlocked(coord->node));
47938 +       assert("vs-1047",
47939 +              page->mapping->host->i_ino ==
47940 +              get_key_objectid(item_key_by_coord(coord, &key)));
47941 +       check_uf_coord(uf_coord, NULL);
47942 +
47943 +       return reiser4_do_readpage_extent(
47944 +               ext_by_ext_coord(uf_coord),
47945 +               uf_coord->extension.extent.pos_in_unit, page);
47946 +}
47947 +
47948 +/**
47949 + * get_block_address_extent - translate a file block into a disk block via an extent unit
47950 + * @coord: coord of the extent unit which is expected to cover @block
47951 + * @block: block offset within the file, in units of current_blocksize
47952 + * @result: receives the disk block number, or 0 if the unit is not an allocated extent
47953 + *
47954 + * Returns 0, or -EINVAL if @coord is not set to an existing unit.
47955 + */
47956 +int get_block_address_extent(const coord_t *coord, sector_t block,
47957 +                            sector_t *result)
47958 +{
47959 +       reiser4_extent *ext;
47960 +
47961 +       if (!coord_is_existing_unit(coord))
47962 +               return RETERR(-EINVAL);
47963 +
47964 +       ext = extent_by_coord(coord);
47965 +
47966 +       if (state_of_extent(ext) != ALLOCATED_EXTENT)
47967 +               /* FIXME: bad things may happen if it is unallocated extent */
47968 +               *result = 0;
47969 +       else {
47970 +               reiser4_key key;
47971 +
47972 +               unit_key_by_coord(coord, &key);
47973 +               assert("vs-1645",
47974 +                      block >= get_key_offset(&key) >> current_blocksize_bits);
47975 +               assert("vs-1646",
47976 +                      block <
47977 +                      (get_key_offset(&key) >> current_blocksize_bits) +
47978 +                      extent_get_width(ext));
47979 +               *result =
47980 +                   extent_get_start(ext) + (block -
47981 +                                            (get_key_offset(&key) >>
47982 +                                             current_blocksize_bits));
47983 +       }
47984 +       return 0;
47985 +}
47986 +
47987 +/*
47988 +  plugin->u.item.s.file.append_key
47989 +  stores in @key the key of the first byte following the last byte addressed by this extent item; returns @key
47990 +*/
47991 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
47992 +{
47993 +       item_key_by_coord(coord, key);
47994 +       set_key_offset(key,
47995 +                      get_key_offset(key) + reiser4_extent_size(coord,
47996 +                                                                nr_units_extent
47997 +                                                                (coord)));
47998 +
47999 +       assert("vs-610", get_key_offset(key)
48000 +              && (get_key_offset(key) & (current_blocksize - 1)) == 0);
48001 +       return key;
48002 +}
48003 +
48004 +/* plugin->u.item.s.file.init_coord_extension: fill uf_coord->extension.extent for the unit found when looking up file offset @lookuped */
48005 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
48006 +{
48007 +       coord_t *coord;
48008 +       struct extent_coord_extension *ext_coord;
48009 +       reiser4_key key;
48010 +       loff_t offset;
48011 +
48012 +       assert("vs-1295", uf_coord->valid == 0);
48013 +
48014 +       coord = &uf_coord->coord;
48015 +       assert("vs-1288", coord_is_iplug_set(coord));
48016 +       assert("vs-1327", znode_is_loaded(coord->node));
48017 +
48018 +       if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
48019 +               return; /* uf_coord->valid is not set in this case */
48020 +
48021 +       ext_coord = &uf_coord->extension.extent;
48022 +       ext_coord->nr_units = nr_units_extent(coord);
48023 +       ext_coord->ext_offset =
48024 +           (char *)extent_by_coord(coord) - zdata(coord->node);
48025 +       ext_coord->width = extent_get_width(extent_by_coord(coord));
48026 +       ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
48027 +       uf_coord->valid = 1;
48028 +
48029 +       /* pos_in_unit is the only uninitialized field in extended coord */
48030 +       if (coord->between == AFTER_UNIT) {
48031 +               assert("vs-1330",
48032 +                      coord->unit_pos == nr_units_extent(coord) - 1);
48033 +
48034 +               ext_coord->pos_in_unit = ext_coord->width - 1;
48035 +       } else {
48036 +               /* AT_UNIT: position of the block which contains @lookuped */
48037 +               unit_key_by_coord(coord, &key);
48038 +               offset = get_key_offset(&key);
48039 +
48040 +               assert("vs-1328", offset <= lookuped);
48041 +               assert("vs-1329",
48042 +                      lookuped <
48043 +                      offset + ext_coord->width * current_blocksize);
48044 +               ext_coord->pos_in_unit =
48045 +                   ((lookuped - offset) >> current_blocksize_bits);
48046 +       }
48047 +}
48048 +
48049 +/*
48050 + * Local variables:
48051 + * c-indentation-style: "K&R"
48052 + * mode-name: "LC"
48053 + * c-basic-offset: 8
48054 + * tab-width: 8
48055 + * fill-column: 79
48056 + * scroll-step: 1
48057 + * End:
48058 + */
48059 diff -puN /dev/null fs/reiser4/plugin/item/extent_flush_ops.c
48060 --- /dev/null
48061 +++ a/fs/reiser4/plugin/item/extent_flush_ops.c
48062 @@ -0,0 +1,1028 @@
48063 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48064 +
48065 +#include "item.h"
48066 +#include "../../tree.h"
48067 +#include "../../jnode.h"
48068 +#include "../../super.h"
48069 +#include "../../flush.h"
48070 +#include "../../carry.h"
48071 +#include "../object.h"
48072 +
48073 +#include <linux/pagemap.h>
48074 +
48075 +static reiser4_block_nr extent_unit_start(const coord_t * item);
48076 +
48077 +/* Pick the outermost extent unit of the item @coord is set to: the first unit
48078 +   for LEFT_SIDE, the last one otherwise. *@pos_in_unit receives the outermost
48079 +   block position inside that unit (0 on the left, width - 1 on the right). */
48080 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
48081 +                                        reiser4_block_nr * pos_in_unit)
48082 +{
48083 +       reiser4_extent *unit;
48084 +
48085 +       if (side != LEFT_SIDE) {
48086 +               /* rightmost unit, rightmost block of it */
48087 +               assert("vs-363", side == RIGHT_SIDE);
48088 +               unit = extent_item(coord) + coord_last_unit_pos(coord);
48089 +               *pos_in_unit = extent_get_width(unit) - 1;
48090 +               return unit;
48091 +       }
48092 +
48093 +       /* leftmost unit starts the item, leftmost block is at position 0 */
48094 +       unit = extent_item(coord);
48095 +       *pos_in_unit = 0;
48096 +       return unit;
48097 +}
48098 +
48099 +/* item_plugin->f.utmost_child */
48100 +/* @coord is set to extent item. Store in *@childp the jlookup() result for the first or last
48101 +   (depending on @side) page addressed by the item, or NULL if that extent is a hole. Returns 0 */
48102 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
48103 +{
48104 +       reiser4_extent *ext;
48105 +       reiser4_block_nr pos_in_unit;
48106 +
48107 +       ext = extent_utmost_ext(coord, side, &pos_in_unit);
48108 +
48109 +       switch (state_of_extent(ext)) {
48110 +       case HOLE_EXTENT:
48111 +               *childp = NULL;
48112 +               return 0;
48113 +       case ALLOCATED_EXTENT:
48114 +       case UNALLOCATED_EXTENT:
48115 +               break;
48116 +       default:
48117 +               /* this should never happen */
48118 +               assert("vs-1417", 0);
48119 +       }
48120 +
48121 +       {
48122 +               reiser4_key key;
48123 +               reiser4_tree *tree;
48124 +               unsigned long index;
48125 +
48126 +               if (side == LEFT_SIDE) {
48127 +                       /* get key of first byte addressed by the extent */
48128 +                       item_key_by_coord(coord, &key);
48129 +               } else {
48130 +                       /* get key of the byte which follows the last byte addressed by the extent */
48131 +                       append_key_extent(coord, &key);
48132 +               }
48133 +
48134 +               assert("vs-544",
48135 +                      (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
48136 +               /* index of first or last (depending on @side) page addressed
48137 +                  by the extent */
48138 +               index =
48139 +                   (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
48140 +               if (side == RIGHT_SIDE)
48141 +                       index--;
48142 +
48143 +               tree = coord->node->zjnode.tree;
48144 +               *childp = jlookup(tree, get_key_objectid(&key), index);
48145 +       }
48146 +
48147 +       return 0;
48148 +}
48149 +
48150 +/* item_plugin->f.utmost_child_real_block */
48151 +/* Store in *@block the first (or, for RIGHT_SIDE, the last) disk block of the unit at @coord; 0 if it is not allocated. Returns 0. */
48152 +int
48153 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
48154 +                              reiser4_block_nr * block)
48155 +{
48156 +       reiser4_extent *ext;
48157 +
48158 +       ext = extent_by_coord(coord);
48159 +
48160 +       switch (state_of_extent(ext)) {
48161 +       case ALLOCATED_EXTENT:
48162 +               *block = extent_get_start(ext);
48163 +               if (side == RIGHT_SIDE)
48164 +                       *block += extent_get_width(ext) - 1;
48165 +               break;
48166 +       case HOLE_EXTENT:
48167 +       case UNALLOCATED_EXTENT:
48168 +               *block = 0;
48169 +               break;
48170 +       default:
48171 +               /* this should never happen; note that *@block is not written here */
48172 +               assert("vs-1418", 0);
48173 +       }
48174 +
48175 +       return 0;
48176 +}
48177 +
48178 +/* item_plugin->f.scan */
48179 +/* Performs scanning (leftward or rightward, as set in @scan) starting from an unformatted node and its
48180 +   parent coordinate.  This scan continues, advancing the parent coordinate, until either it encounters a
48181 +   formatted child or it finishes scanning this node.
48182 +
48183 +   If unallocated, the entire extent must be dirty and in the same atom.  (Actually, I'm
48184 +   not sure this last property (same atom) is enforced, but it should be the case since
48185 +   one atom must write the parent and the others must read the parent, thus fusing?).  In
48186 +   any case, the code below asserts this case for unallocated extents.  Unallocated
48187 +   extents are thus optimized because we can skip to the endpoint when scanning.
48188 +
48189 +   It returns control to the caller, which is expected to handle these terminating conditions,
48190 +   e.g., by loading the next twig.
48191 +*/
48192 +int reiser4_scan_extent(flush_scan * scan)
48193 +{
48194 +       coord_t coord;
48195 +       jnode *neighbor;
48196 +       unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
48197 +       reiser4_block_nr unit_start;
48198 +       __u64 oid;
48199 +       reiser4_key key;
48200 +       int ret = 0, allocated, incr;
48201 +       reiser4_tree *tree;
48202 +
48203 +       if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
48204 +               scan->stop = 1;
48205 +               return 0;       /* Race with truncate, this node is already
48206 +                                * truncated. */
48207 +       }
48208 +
48209 +       coord_dup(&coord, &scan->parent_coord);
48210 +
48211 +       assert("jmacd-1404", !reiser4_scan_finished(scan));
48212 +       assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
48213 +       assert("jmacd-1406", jnode_is_unformatted(scan->node));
48214 +
48215 +       /* The scan_index variable corresponds to the current page index of the
48216 +          unformatted block scan position. */
48217 +       scan_index = index_jnode(scan->node);
48218 +
48219 +       assert("jmacd-7889", item_is_extent(&coord));
48220 +
48221 +      repeat:
48222 +       /* objectid of file */
48223 +       oid = get_key_objectid(item_key_by_coord(&coord, &key));
48224 +
48225 +       allocated = !extent_is_unallocated(&coord);
48226 +       /* Get the values of this extent unit: */
48227 +       unit_index = extent_unit_index(&coord);
48228 +       unit_width = extent_unit_width(&coord);
48229 +       unit_start = extent_unit_start(&coord);
48230 +
48231 +       assert("jmacd-7187", unit_width > 0);
48232 +       assert("jmacd-7188", scan_index >= unit_index);
48233 +       assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
48234 +
48235 +       /* Depending on the scan direction, we set the last index to visit in this unit (scan_max),
48236 +          the number of nodes passed if the scan goes the entire way (scan_dist) and the step (incr).
48237 +          NOTE(review): rightward scan_dist is scan_max - unit_index, i.e. unit_width - 1 even when
48238 +          scan_index is in the middle of the unit; scan_max - scan_index may be meant -- confirm. */
48239 +       if (reiser4_scanning_left(scan)) {
48240 +               scan_max = unit_index;
48241 +               scan_dist = scan_index - unit_index;
48242 +               incr = -1;
48243 +       } else {
48244 +               scan_max = unit_index + unit_width - 1;
48245 +               scan_dist = scan_max - unit_index;
48246 +               incr = +1;
48247 +       }
48248 +
48249 +       tree = coord.node->zjnode.tree;
48250 +
48251 +       /* If the extent is allocated we have to check each of its blocks.  If the extent
48252 +          is unallocated we can skip to the scan_max. */
48253 +       if (allocated) {
48254 +               do {
48255 +                       neighbor = jlookup(tree, oid, scan_index);
48256 +                       if (neighbor == NULL)
48257 +                               goto stop_same_parent;
48258 +
48259 +                       if (scan->node != neighbor
48260 +                           && !reiser4_scan_goto(scan, neighbor)) {
48261 +                               /* @neighbor was jput() by reiser4_scan_goto */
48262 +                               goto stop_same_parent;
48263 +                       }
48264 +
48265 +                       ret = scan_set_current(scan, neighbor, 1, &coord);
48266 +                       if (ret != 0) {
48267 +                               goto exit;
48268 +                       }
48269 +
48270 +                       /* reference to @neighbor is stored in @scan, no need
48271 +                          to jput(). */
48272 +                       scan_index += incr;
48273 +
48274 +               } while (incr + scan_max != scan_index);
48275 +
48276 +       } else {
48277 +               /* Optimized case for unallocated extents, skip to the end. */
48278 +               neighbor = jlookup(tree, oid, scan_max /*index */ );
48279 +               if (neighbor == NULL) {
48280 +                       /* Race with truncate */
48281 +                       scan->stop = 1;
48282 +                       ret = 0;
48283 +                       goto exit;
48284 +               }
48285 +
48286 +               assert("zam-1043",
48287 +                      reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
48288 +
48289 +               ret = scan_set_current(scan, neighbor, scan_dist, &coord);
48290 +               if (ret != 0) {
48291 +                       goto exit;
48292 +               }
48293 +       }
48294 +
48295 +       if (coord_sideof_unit(&coord, scan->direction) == 0
48296 +           && item_is_extent(&coord)) {
48297 +               /* Continue as long as there are more extent units. */
48298 +
48299 +               scan_index =
48300 +                   extent_unit_index(&coord) +
48301 +                   (reiser4_scanning_left(scan) ?
48302 +                    extent_unit_width(&coord) - 1 : 0);
48303 +               goto repeat;
48304 +       }
48305 +
48306 +       if (0) {
48307 +             stop_same_parent:
48308 +
48309 +               /* If we are scanning left and we stop in the middle of an allocated
48310 +                  extent, we know the preceder immediately.. */
48311 +               /* middle of extent is (scan_index - unit_index) != 0. */
48312 +               if (reiser4_scanning_left(scan) &&
48313 +                   (scan_index - unit_index) != 0) {
48314 +                       /* FIXME(B): Someone should step-through and verify that this preceder
48315 +                          calculation is indeed correct. */
48316 +                       /* @unit_start is starting block (number) of extent
48317 +                          unit. Flush stopped at the @scan_index block from
48318 +                          the beginning of the file, which is (scan_index -
48319 +                          unit_index) block within extent.
48320 +                        */
48321 +                       if (unit_start) {
48322 +                               /* skip preceder update when we are at hole */
48323 +                               scan->preceder_blk =
48324 +                                   unit_start + scan_index - unit_index;
48325 +                               check_preceder(scan->preceder_blk);
48326 +                       }
48327 +               }
48328 +
48329 +               /* In this case, we leave coord set to the parent of scan->node. */
48330 +               scan->stop = 1;
48331 +
48332 +       } else {
48333 +               /* In this case, we are still scanning, coord is set to the next item which is
48334 +                  either off-the-end of the node or not an extent. */
48335 +               assert("jmacd-8912", scan->stop == 0);
48336 +               assert("jmacd-7812",
48337 +                      (coord_is_after_sideof_unit(&coord, scan->direction)
48338 +                       || !item_is_extent(&coord)));
48339 +       }
48340 +
48341 +       ret = 0;
48342 +      exit:
48343 +       return ret;
48344 +}
48345 +
48346 +/* ask block allocator for @wanted_count blocks using hint @preceder; the start and number of blocks obtained are stored in *@first_allocated and *@allocated */
48347 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
48348 +                                  reiser4_block_nr wanted_count,
48349 +                                  reiser4_block_nr *first_allocated,
48350 +                                  reiser4_block_nr *allocated,
48351 +                                  block_stage_t block_stage)
48352 +{
48353 +       *allocated = wanted_count;
48354 +       preceder->max_dist = 0; /* scan whole disk, if needed */
48355 +
48356 +       /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
48357 +       preceder->block_stage = block_stage;
48358 +
48359 +       /* FIXME: we do not handle errors here now */
48360 +       check_me("vs-420",
48361 +                reiser4_alloc_blocks(preceder, first_allocated, allocated,
48362 +                                     BA_PERMANENT) == 0);
48363 +       /* update flush_pos's preceder to last allocated block number */
48364 +       preceder->blk = *first_allocated + *allocated - 1;
48365 +}
48366 +
48367 +/* At flush time one unallocated extent may have to be replaced with a set of allocated extents. In this case
48368 +   insert_into_item will be called, which may have to add new nodes into tree. Space for that is taken from the
48369 +   inviolable reserve (5%). Returns the current context's grabbed_blocks as it was before the grab. */
48370 +static reiser4_block_nr reserve_replace(void)
48371 +{
48372 +       /* snapshot taken before grabbing, so that the caller can undo the grab */
48373 +       reiser4_block_nr before = get_current_context()->grabbed_blocks;
48374 +       reiser4_block_nr estimate = estimate_one_insert_into_item(current_tree);
48375 +
48376 +       check_me("vpf-340", !reiser4_grab_space_force(estimate, BA_RESERVED));
48377 +       return before;
48378 +}
48379 +
48380 +/* move everything the current context has grabbed on top of @grabbed from grabbed to free */
48381 +static void free_replace_reserved(reiser4_block_nr grabbed)
48382 +{
48383 +       reiser4_context *cur = get_current_context();
48384 +       reiser4_block_nr extra = cur->grabbed_blocks - grabbed;
48385 +
48386 +       grabbed2free(cur, get_super_private(cur->super), extra);
48387 +}
48388 +
48389 +/* Block offset within the file (unit key offset >> current_blocksize_bits) of the first block addressed by unit @item */
48390 +__u64 extent_unit_index(const coord_t * item)
48391 +{
48392 +       reiser4_key key;
48393 +
48394 +       assert("vs-648", coord_is_existing_unit(item));
48395 +       unit_key_by_coord(item, &key);
48396 +       return get_key_offset(&key) >> current_blocksize_bits;
48397 +}
48398 +
48399 +/* Width of the extent unit @item is set to, as returned by width_by_coord().
48400 +   AUDIT shouldn't return value be of reiser4_block_nr type? Josh's answer: who knows?  Is a "number of blocks" the same type as "block offset"? */
48401 +__u64 extent_unit_width(const coord_t * item)
48402 +{
48403 +       assert("vs-649", coord_is_existing_unit(item));
48404 +       return width_by_coord(item);
48405 +}
48406 +
48407 +/* Start field of the extent unit @item is set to (the starting block number when the extent is allocated) */
48408 +static reiser4_block_nr extent_unit_start(const coord_t * item)
48409 +{
48410 +       return extent_get_start(extent_by_coord(item));
48411 +}
48412 +
48413 +/**
48414 + * split_allocated_extent - split an allocated extent unit in two
48415 + * @coord: coord of the allocated extent unit to split
48416 + * @pos_in_unit: block position within the unit at which the second extent starts
48417 + *
48418 + * replace allocated extent with two allocated extents; returns -ENOMEM or the result of reiser4_replace_extent()
48419 + */
48420 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
48421 +{
48422 +       int result;
48423 +       struct replace_handle *h;
48424 +       reiser4_extent *ext;
48425 +       reiser4_block_nr grabbed;
48426 +
48427 +       ext = extent_by_coord(coord);
48428 +       assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
48429 +       assert("vs-1411", extent_get_width(ext) > pos_in_unit);
48430 +
48431 +       h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
48432 +       if (h == NULL)
48433 +               return RETERR(-ENOMEM);
48434 +       h->coord = coord;
48435 +       h->lh = znode_lh(coord->node);
48436 +       h->pkey = &h->key;
48437 +       unit_key_by_coord(coord, h->pkey);
48438 +       set_key_offset(h->pkey,
48439 +                      (get_key_offset(h->pkey) +
48440 +                       pos_in_unit * current_blocksize));
48441 +       reiser4_set_extent(&h->overwrite, extent_get_start(ext),
48442 +                          pos_in_unit);
48443 +       reiser4_set_extent(&h->new_extents[0],
48444 +                          extent_get_start(ext) + pos_in_unit,
48445 +                          extent_get_width(ext) - pos_in_unit);
48446 +       h->nr_new_extents = 1;
48447 +       h->flags = COPI_DONT_SHIFT_LEFT;
48448 +       h->paste_key = h->key;
48449 +
48450 +       /* reserve space for extent unit paste, @grabbed is what had been grabbed before */
48451 +       grabbed = reserve_replace();
48452 +       result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
48453 +                                               extent */);
48454 +       /* restore reserved: give back what was grabbed on top of @grabbed */
48455 +       free_replace_reserved(grabbed);
48456 +       kfree(h);
48457 +       return result;
48458 +}
48459 +
48460 +/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
48461 +   one). Return 1 if it succeeded (@coord is then moved one unit back), 0 - otherwise (nothing is changed) */
48462 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
48463 +                      reiser4_extent *replace)
48464 +{
48465 +       assert("vs-1415", extent_by_coord(coord) == ext);
48466 +
48467 +       if (coord->unit_pos == 0
48468 +           || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
48469 +               /* left neighbor of @ext either does not exist or is not allocated extent */
48470 +               return 0;
48471 +       if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
48472 +           extent_get_start(replace))
48473 +               return 0;
48474 +
48475 +       /* we can glue, widen previous unit */
48476 +       extent_set_width(ext - 1,
48477 +                        extent_get_width(ext - 1) + extent_get_width(replace));
48478 +
48479 +       if (extent_get_width(ext) != extent_get_width(replace)) {
48480 +               /* make current extent narrower */
48481 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
48482 +                       extent_set_start(ext,
48483 +                                        extent_get_start(ext) +
48484 +                                        extent_get_width(replace));
48485 +               extent_set_width(ext,
48486 +                                extent_get_width(ext) -
48487 +                                extent_get_width(replace));
48488 +       } else {
48489 +               /* current extent completely glued with its left neighbor, remove it */
48490 +               coord_t from, to;
48491 +
48492 +               coord_dup(&from, coord);
48493 +               from.unit_pos = nr_units_extent(coord) - 1;
48494 +               coord_dup(&to, &from);
48495 +
48496 +               /* currently cut from extent can cut either from the beginning or from the end. Move place which got
48497 +                  freed after unit removal to end of item */
48498 +               memmove(ext, ext + 1,
48499 +                       (from.unit_pos -
48500 +                        coord->unit_pos) * sizeof(reiser4_extent));
48501 +               /* wipe part of item which is going to be cut, so that node_check will not be confused */
48502 +               cut_node_content(&from, &to, NULL, NULL, NULL);
48503 +       }
48504 +       znode_make_dirty(coord->node);
48505 +       /* move coord back */
48506 +       coord->unit_pos--;
48507 +       return 1;
48508 +}
48509 +
48510 +/**
48511 + * conv_extent - replace extent with 2 ones
48512 + * @coord: coordinate of extent to be replaced
48513 + * @replace: extent to overwrite the one @coord is set to; it must not be wider than that one
48514 + *
48515 + * First tries to glue @replace to the left neighbor. Otherwise overwrites extent @coord is set to and pastes one
48516 + * extent unit after overwritten one if @replace is shorter than initial extent. Returns 0 or an error code.
48517 + */
48518 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
48519 +{
48520 +       int result;
48521 +       struct replace_handle *h;
48522 +       reiser4_extent *ext;
48523 +       reiser4_block_nr start, width, new_width;
48524 +       reiser4_block_nr grabbed;
48525 +       extent_state state;
48526 +
48527 +       ext = extent_by_coord(coord);
48528 +       state = state_of_extent(ext);
48529 +       start = extent_get_start(ext);
48530 +       width = extent_get_width(ext);
48531 +       new_width = extent_get_width(replace);
48532 +
48533 +       assert("vs-1458", (state == UNALLOCATED_EXTENT ||
48534 +                          state == ALLOCATED_EXTENT));
48535 +       assert("vs-1459", width >= new_width);
48536 +
48537 +       if (try_to_merge_with_left(coord, ext, replace)) {
48538 +               /* merged @replace with left neighbor. Current unit is either
48539 +                  removed or narrowed */
48540 +               return 0;
48541 +       }
48542 +
48543 +       if (width == new_width) {
48544 +               /* replace current extent with @replace */
48545 +               *ext = *replace;
48546 +               znode_make_dirty(coord->node);
48547 +               return 0;
48548 +       }
48549 +
48550 +       h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
48551 +       if (h == NULL)
48552 +               return RETERR(-ENOMEM);
48553 +       h->coord = coord;
48554 +       h->lh = znode_lh(coord->node);
48555 +       h->pkey = &h->key;
48556 +       unit_key_by_coord(coord, h->pkey);
48557 +       set_key_offset(h->pkey,
48558 +                      (get_key_offset(h->pkey) + new_width * current_blocksize));
48559 +       h->overwrite = *replace;
48560 +
48561 +       /* replace @ext with @replace and padding extent which keeps the state of @ext */
48562 +       reiser4_set_extent(&h->new_extents[0],
48563 +                          (state == ALLOCATED_EXTENT) ?
48564 +                          (start + new_width) :
48565 +                          UNALLOCATED_EXTENT_START,
48566 +                          width - new_width);
48567 +       h->nr_new_extents = 1;
48568 +       h->flags = COPI_DONT_SHIFT_LEFT;
48569 +       h->paste_key = h->key;
48570 +
48571 +       /* reserve space for extent unit paste, @grabbed is what had been grabbed before */
48572 +       grabbed = reserve_replace();
48573 +       result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
48574 +                                               extent */);
48575 +
48576 +       /* restore reserved */
48577 +       free_replace_reserved(grabbed);
48578 +       kfree(h);
48579 +       return result;
48580 +}
48581 +
48582 +/**
48583 + * assign_real_blocknrs - give consecutive block numbers to a run of jnodes
48584 + * @flush_pos: flush position; its flush queue (->fq) is used to find the atom
48585 + * @oid: objectid of the file the jnodes belong to
48586 + * @index: index of the first jnode of the range
48587 + * @count: number of jnodes to assign block numbers to
48588 + * @first: start of allocated block range
48589 + *
48590 + * Assigns block numbers @first, @first + 1, ... to @count jnodes, the first of
48591 + * which has index @index. Jnodes are looked up with jlookup().
48592 + */
48593 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
48594 +                                unsigned long index, reiser4_block_nr count,
48595 +                                reiser4_block_nr first)
48596 +{
48597 +       unsigned long i;
48598 +       reiser4_tree *tree;
48599 +       txn_atom *atom;
48600 +       int nr; /* NOTE(review): incremented below but never read */
48601 +
48602 +       atom = atom_locked_by_fq(flush_pos->fq);
48603 +       assert("vs-1468", atom);
48604 +       BUG_ON(atom == NULL);
48605 +
48606 +       nr = 0;
48607 +       tree = current_tree;
48608 +       for (i = 0; i < count; ++i, ++index) {
48609 +               jnode *node;
48610 +
48611 +               node = jlookup(tree, oid, index);
48612 +               assert("", node != NULL);
48613 +               BUG_ON(node == NULL);
48614 +
48615 +               spin_lock_jnode(node);
48616 +               assert("", !jnode_is_flushprepped(node));
48617 +               assert("vs-1475", node->atom == atom);
48618 +               assert("vs-1476", atomic_read(&node->x_count) > 0);
48619 +
48620 +               JF_CLR(node, JNODE_FLUSH_RESERVED);
48621 +               jnode_set_block(node, &first);
48622 +               unformatted_make_reloc(node, flush_pos->fq);
48623 +               ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
48624 +                                    FQ_LIST, 0));
48625 +               spin_unlock_jnode(node);
48626 +               first++;
48627 +
48628 +               atomic_dec(&node->x_count); /* presumably the jlookup() reference - confirm */
48629 +               nr ++;
48630 +       }
48631 +
48632 +       spin_unlock_atom(atom); /* presumably locked by atom_locked_by_fq() - confirm */
48633 +       return;
48634 +}
48635 +
48636 +/**
48637 + * make_node_ovrwr - assign node to overwrite set
48638 + * @jnodes: list head accumulating nodes destined for the overwrite set
48639 + * @node: jnode to belong to overwrite set
48640 + *
48641 + * Under the jnode spin lock, sets the JNODE_OVRWR state bit and moves @node's
48642 + * capture_link to the tail of @jnodes, which accumulates nodes before they get
48643 + * to the overwrite set list of the atom. @node must be neither RELOC nor OVRWR.
48644 + */
48645 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
48646 +{
48647 +       spin_lock_jnode(node);
48648 +
48649 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
48650 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
48651 +
48652 +       JF_SET(node, JNODE_OVRWR);
48653 +       list_move_tail(&node->capture_link, jnodes);
48654 +       ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
48655 +
48656 +       spin_unlock_jnode(node);
48657 +}
48658 +
48659 +/**
48660 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
48661 + * @flush_pos: flush position; iteration starts at its ->pos_in_unit
48662 + * @oid: objectid of file jnodes belong to
48663 + * @index: index of the first jnode to look up
48664 + * @width: extent width
48665 + *
48666 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
48667 + * overwrite set, starting from the one with index @index. At the first node
48668 + * which is not found, is flushprepped or belongs to another atom, iteration
48669 + * stops and the flush position's state is set to POS_INVALID.
48670 + */
48671 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
48672 +                                 unsigned long index, reiser4_block_nr width)
48673 +{
48674 +       unsigned long i;
48675 +       reiser4_tree *tree;
48676 +       jnode *node;
48677 +       txn_atom *atom;
48678 +       LIST_HEAD(jnodes);
48679 +
48680 +       tree = current_tree;
48681 +
48682 +       atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
48683 +       assert("vs-1478", atom);
48684 +
48685 +       for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
48686 +               node = jlookup(tree, oid, index);
48687 +               if (!node) {
48688 +                       flush_pos->state = POS_INVALID;
48689 +                       break;
48690 +               }
48691 +               if (jnode_check_flushprepped(node)) {
48692 +                       flush_pos->state = POS_INVALID;
48693 +                       atomic_dec(&node->x_count);
48694 +                       break;
48695 +               }
48696 +               if (node->atom != atom) {
48697 +                       flush_pos->state = POS_INVALID;
48698 +                       atomic_dec(&node->x_count);
48699 +                       break;
48700 +               }
48701 +               make_node_ovrwr(&jnodes, node);
48702 +               atomic_dec(&node->x_count);
48703 +       }
48704 +
48705 +       list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
48706 +       spin_unlock_atom(atom);
48707 +}
48708 +
48709 +/**
48710 + * allocated_extent_slum_size - count jnodes of the slum starting at @index
48711 + * @flush_pos: flush position; its flush queue is used to find the atom
48712 + * @oid: objectid of the file the jnodes belong to
48713 + * @index: index of the first jnode to check
48714 + * @count: maximal number of jnodes to check
48715 + *
48716 + * Returns number of leading jnodes found, not flushprepped and in that atom.
48717 + */
48718 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
48719 +                                     unsigned long index, unsigned long count)
48720 +{
48721 +       unsigned long i;
48722 +       reiser4_tree *tree;
48723 +       txn_atom *atom;
48724 +       int nr;
48725 +
48726 +       atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
48727 +       assert("vs-1468", atom);
48728 +
48729 +       nr = 0;
48730 +       tree = current_tree;
48731 +       for (i = 0; i < count; ++i, ++index) {
48732 +               jnode *node;
48733 +
48734 +               node = jlookup(tree, oid, index);
48735 +               if (!node)
48736 +                       break;
48737 +
48738 +               if (jnode_check_flushprepped(node)) {
48739 +                       atomic_dec(&node->x_count);
48740 +                       break;
48741 +               }
48742 +
48743 +               if (node->atom != atom) {
48744 +                       /*
48745 +                        * this is possible on overwrite: extent_write may
48746 +                        * capture several unformatted nodes without capturing
48747 +                        * any formatted nodes.
48748 +                        */
48749 +                       atomic_dec(&node->x_count);
48750 +                       break;
48751 +               }
48752 +
48753 +               assert("vs-1476", atomic_read(&node->x_count) > 1);
48754 +               atomic_dec(&node->x_count);
48755 +               nr ++;
48756 +       }
48757 +
48758 +       spin_unlock_atom(atom);
48759 +       return nr;
48760 +}
48761 +
48762 +/**
48763 + * reiser4_alloc_extent - prepare the slum of one extent unit for flushing
48764 + * @flush_pos: flush position; ->coord is set to the extent unit to process
48765 + *
48766 + * Returns 0, the result of split_allocated_extent(), or a conv_extent() error.
48767 + * This is called by handle_pos_on_twig to process the extent unit
48768 + * flush_pos->coord is set to. It prepares a sequence of not flushprepped nodes
48769 + * (slum) for flushing, assuming the slum starts at flush_pos->pos_in_unit
48770 + * within the extent. The slum goes to the relocate set if the extent is
48771 + * unallocated or flush_pos->leaf_relocate is set, to the overwrite set otherwise
48772 + */
48773 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
48774 +{
48775 +       coord_t *coord;
48776 +       reiser4_extent *ext;
48777 +       reiser4_extent replace_ext;
48778 +       oid_t oid;
48779 +       reiser4_block_nr protected;
48780 +       reiser4_block_nr start;
48781 +       __u64 index;
48782 +       __u64 width;
48783 +       extent_state state;
48784 +       int result;
48785 +       reiser4_block_nr first_allocated;
48786 +       __u64 allocated;
48787 +       reiser4_key key;
48788 +       block_stage_t block_stage;
48789 +
48790 +       assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
48791 +       assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
48792 +              && item_is_extent(&flush_pos->coord));
48793 +
48794 +       coord = &flush_pos->coord;
48795 +
48796 +       ext = extent_by_coord(coord);
48797 +       state = state_of_extent(ext);
48798 +       if (state == HOLE_EXTENT) {
48799 +               flush_pos->state = POS_INVALID;
48800 +               return 0;
48801 +       }
48802 +
48803 +       item_key_by_coord(coord, &key);
48804 +       oid = get_key_objectid(&key);
48805 +       index = extent_unit_index(coord) + flush_pos->pos_in_unit;
48806 +       start = extent_get_start(ext);
48807 +       width = extent_get_width(ext);
48808 +
48809 +       assert("vs-1457", width > flush_pos->pos_in_unit);
48810 +
48811 +       if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
48812 +               /* relocate */
48813 +               if (flush_pos->pos_in_unit) {
48814 +                       /* split extent unit into two and return to the caller */
48815 +                       result =
48816 +                           split_allocated_extent(coord,
48817 +                                                  flush_pos->pos_in_unit);
48818 +                       flush_pos->pos_in_unit = 0;
48819 +                       return result;
48820 +               }
48821 +
48822 +               /* limit number of nodes to allocate */
48823 +               if (flush_pos->nr_to_write < width)
48824 +                       width = flush_pos->nr_to_write;
48825 +
48826 +               if (state == ALLOCATED_EXTENT) {
48827 +                       /*
48828 +                        * all protected nodes are not flushprepped, therefore
48829 +                        * they are counted as flush_reserved
48830 +                        */
48831 +                       block_stage = BLOCK_FLUSH_RESERVED;
48832 +                       protected = allocated_extent_slum_size(flush_pos, oid,
48833 +                                                              index, width);
48834 +                       if (protected == 0) {
48835 +                               flush_pos->state = POS_INVALID;
48836 +                               flush_pos->pos_in_unit = 0;
48837 +                               return 0;
48838 +                       }
48839 +               } else {
48840 +                       block_stage = BLOCK_UNALLOCATED;
48841 +                       protected = width;
48842 +               }
48843 +
48844 +               /*
48845 +                * look at previous unit if possible. If it is allocated, make
48846 +                * preceder more precise
48847 +                */
48848 +               if (coord->unit_pos &&
48849 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
48850 +                       reiser4_pos_hint(flush_pos)->blk =
48851 +                               extent_get_start(ext - 1) +
48852 +                               extent_get_width(ext - 1);
48853 +
48854 +               /* allocate new block numbers for protected nodes */
48855 +               extent_allocate_blocks(reiser4_pos_hint(flush_pos),
48856 +                                      protected,
48857 +                                      &first_allocated, &allocated,
48858 +                                      block_stage);
48859 +
48860 +               if (state == ALLOCATED_EXTENT)
48861 +                       /*
48862 +                        * on relocating - free nodes which are going to be
48863 +                        * relocated
48864 +                        */
48865 +                       reiser4_dealloc_blocks(&start, &allocated,
48866 +                                              BLOCK_ALLOCATED, BA_DEFER);
48867 +
48868 +               /* assign new block numbers to protected nodes */
48869 +               assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
48870 +
48871 +               /* prepare extent which will replace current one */
48872 +               reiser4_set_extent(&replace_ext, first_allocated, allocated);
48873 +
48874 +               /* adjust extent item; NOTE(review): -ENOMEM is silently ignored */
48875 +               result = conv_extent(coord, &replace_ext);
48876 +               if (result != 0 && result != -ENOMEM) {
48877 +                       warning("vs-1461",
48878 +                               "Failed to allocate extent. Should not happen\n");
48879 +                       return result;
48880 +               }
48881 +
48882 +               /*
48883 +                * break flush: we prepared for flushing as many blocks as we
48884 +                * were asked for
48885 +                */
48886 +               if (flush_pos->nr_to_write == allocated)
48887 +                       flush_pos->state = POS_INVALID;
48888 +       } else {
48889 +               /* overwrite */
48890 +               mark_jnodes_overwrite(flush_pos, oid, index, width);
48891 +       }
48892 +       flush_pos->pos_in_unit = 0;
48893 +       return 0;
48894 +}
48895 +
48896 +/* Tells whether a unit with @key needs an item of its own: returns 0 when
48897 +   @coord is in an extent item whose append key equals @key, 1 otherwise */
48898 +static int must_insert(const coord_t *coord, const reiser4_key *key)
48899 +{
48900 +       reiser4_key append_key;
48901 +
48902 +       if (item_id_by_coord(coord) != EXTENT_POINTER_ID)
48903 +               return 1;
48904 +       return keyeq(append_key_extent(coord, &append_key), key) ? 0 : 1;
48905 +}
48906 +
48907 +/* copy extent @copy_ext to the end of @node: insert a new item after the last one, append a unit to the last item,
48908 +   or widen the last unit of the last item. Returns 0 or -E_NODE_FULL (see assertion "vs-438") */
48909 +static int put_unit_to_end(znode *node, const reiser4_key *key,
48910 +                          reiser4_extent *copy_ext)
48911 +{
48912 +       int result;
48913 +       coord_t coord;
48914 +       cop_insert_flag flags;
48915 +       reiser4_extent *last_ext;
48916 +       reiser4_item_data data;
48917 +
48918 +       /* set coord after last unit in an item */
48919 +       coord_init_last_unit(&coord, node);
48920 +       coord.between = AFTER_UNIT;
48921 +
48922 +       flags =
48923 +           COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
48924 +       if (must_insert(&coord, key)) {
48925 +               result =
48926 +                   insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
48927 +                                   key, NULL /*lh */ , flags);
48928 +
48929 +       } else {
48930 +               /* try to glue with last unit: needs non-zero state and adjacent blocks */
48931 +               last_ext = extent_by_coord(&coord);
48932 +               if (state_of_extent(last_ext) &&
48933 +                   extent_get_start(last_ext) + extent_get_width(last_ext) ==
48934 +                   extent_get_start(copy_ext)) {
48935 +                       /* widen last unit of node */
48936 +                       extent_set_width(last_ext,
48937 +                                        extent_get_width(last_ext) +
48938 +                                        extent_get_width(copy_ext));
48939 +                       znode_make_dirty(node);
48940 +                       return 0;
48941 +               }
48942 +
48943 +               /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
48944 +               result =
48945 +                   insert_into_item(&coord, NULL /*lh */ , key,
48946 +                                    init_new_extent(&data, copy_ext, 1),
48947 +                                    flags);
48948 +       }
48949 +
48950 +       assert("vs-438", result == 0 || result == -E_NODE_FULL);
48951 +       return result;
48952 +}
48953 +
48954 +/* @coord is set to leftmost extent unit of an item: put it, relocated or as is, to the end of @left. On SQUEEZE_CONTINUE *@stop_key is the key right after the blocks processed */
48955 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
48956 +                              flush_pos_t *flush_pos,
48957 +                              reiser4_key *stop_key)
48958 +{
48959 +       reiser4_extent *ext;
48960 +       __u64 index;
48961 +       __u64 width;
48962 +       reiser4_block_nr start;
48963 +       extent_state state;
48964 +       oid_t oid;
48965 +       reiser4_block_nr first_allocated;
48966 +       __u64 allocated;
48967 +       __u64 protected;
48968 +       reiser4_extent copy_extent;
48969 +       reiser4_key key;
48970 +       int result;
48971 +       block_stage_t block_stage;
48972 +
48973 +       assert("vs-1457", flush_pos->pos_in_unit == 0);
48974 +       assert("vs-1467", coord_is_leftmost_unit(coord));
48975 +       assert("vs-1467", item_is_extent(coord));
48976 +
48977 +       ext = extent_by_coord(coord);
48978 +       index = extent_unit_index(coord);
48979 +       start = extent_get_start(ext);
48980 +       width = extent_get_width(ext);
48981 +       state = state_of_extent(ext);
48982 +       unit_key_by_coord(coord, &key);
48983 +       oid = get_key_objectid(&key);
48984 +
48985 +       if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
48986 +           (state == UNALLOCATED_EXTENT)) {
48987 +               /* relocate */
48988 +               if (state == ALLOCATED_EXTENT) {
48989 +                       /* all protected nodes are not flushprepped, therefore
48990 +                        * they are counted as flush_reserved */
48991 +                       block_stage = BLOCK_FLUSH_RESERVED;
48992 +                       protected = allocated_extent_slum_size(flush_pos, oid,
48993 +                                                              index, width);
48994 +                       if (protected == 0) {
48995 +                               flush_pos->state = POS_INVALID;
48996 +                               flush_pos->pos_in_unit = 0;
48997 +                               return 0; /* NOTE(review): bare 0, not a SQUEEZE_* name */
48998 +                       }
48999 +               } else {
49000 +                       block_stage = BLOCK_UNALLOCATED;
49001 +                       protected = width;
49002 +               }
49003 +
49004 +               /*
49005 +                * look at previous unit if possible. If it is allocated, make
49006 +                * preceder more precise
49007 +                */
49008 +               if (coord->unit_pos &&
49009 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
49010 +                       reiser4_pos_hint(flush_pos)->blk =
49011 +                               extent_get_start(ext - 1) +
49012 +                               extent_get_width(ext - 1);
49013 +
49014 +               /* allocate new block numbers for protected nodes */
49015 +               extent_allocate_blocks(reiser4_pos_hint(flush_pos),
49016 +                                      protected,
49017 +                                      &first_allocated, &allocated,
49018 +                                      block_stage);
49019 +
49020 +               /* prepare extent which will be copied to left */
49021 +               reiser4_set_extent(&copy_extent, first_allocated, allocated);
49022 +
49023 +               result = put_unit_to_end(left, &key, &copy_extent);
49024 +               if (result == -E_NODE_FULL) {
49025 +                       int target_block_stage;
49026 +
49027 +                       /* free blocks which were just allocated */
49028 +                       target_block_stage =
49029 +                           (state ==
49030 +                            ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
49031 +                           BLOCK_UNALLOCATED;
49032 +                       reiser4_dealloc_blocks(&first_allocated, &allocated,
49033 +                                              target_block_stage,
49034 +                                              BA_PERMANENT);
49035 +
49036 +                       /* rewind the preceder. */
49037 +                       flush_pos->preceder.blk = first_allocated;
49038 +                       check_preceder(flush_pos->preceder.blk);
49039 +
49040 +                       return SQUEEZE_TARGET_FULL;
49041 +               }
49042 +
49043 +               if (state == ALLOCATED_EXTENT) {
49044 +                       /* free nodes which were relocated */
49045 +                       reiser4_dealloc_blocks(&start, &allocated,
49046 +                                              BLOCK_ALLOCATED, BA_DEFER);
49047 +               }
49048 +
49049 +               /* assign new block numbers to protected nodes */
49050 +               assign_real_blocknrs(flush_pos, oid, index, allocated,
49051 +                                    first_allocated);
49052 +
49053 +               set_key_offset(&key,
49054 +                              get_key_offset(&key) +
49055 +                              (allocated << current_blocksize_bits));
49056 +       } else {
49057 +               /*
49058 +                * overwrite: try to copy unit as it is to left neighbor and
49059 +                * make all first not flushprepped nodes overwrite nodes
49060 +                */
49061 +               reiser4_set_extent(&copy_extent, start, width);
49062 +               result = put_unit_to_end(left, &key, &copy_extent);
49063 +               if (result == -E_NODE_FULL)
49064 +                       return SQUEEZE_TARGET_FULL;
49065 +
49066 +               if (state != HOLE_EXTENT)
49067 +                       mark_jnodes_overwrite(flush_pos, oid, index, width);
49068 +               set_key_offset(&key,
49069 +                              get_key_offset(&key) +
49070 +                              (width << current_blocksize_bits));
49071 +       }
49072 +       *stop_key = key;
49073 +       return SQUEEZE_CONTINUE;
49074 +}
49075 +
49076 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
49077 +{
49078 +       return key_by_inode_and_offset_common(inode, off, key); /* extent items just use the common key builder */
49079 +}
49080 +
49081 +/*
49082 + * Local variables:
49083 + * c-indentation-style: "K&R"
49084 + * mode-name: "LC"
49085 + * c-basic-offset: 8
49086 + * tab-width: 8
49087 + * fill-column: 79
49088 + * scroll-step: 1
49089 + * End:
49090 + */
49091 diff -puN /dev/null fs/reiser4/plugin/item/extent_item_ops.c
49092 --- /dev/null
49093 +++ a/fs/reiser4/plugin/item/extent_item_ops.c
49094 @@ -0,0 +1,889 @@
49095 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49096 +
49097 +#include "item.h"
49098 +#include "../../inode.h"
49099 +#include "../../tree_walk.h"   /* check_sibling_list() */
49100 +#include "../../page_cache.h"
49101 +#include "../../carry.h"
49102 +
49103 +#include <linux/quotaops.h>
49104 +
49105 +/* item_plugin->b.max_key_inside: fills @key with the item's key whose offset is replaced by that of reiser4_max_key(); returns @key */
49106 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
49107 +{
49108 +       item_key_by_coord(coord, key);
49109 +       set_key_offset(key, get_key_offset(reiser4_max_key()));
49110 +       return key;
49111 +}
49112 +
49113 +/* item_plugin->b.can_contain_key
49114 +   Returns 1 if @data is of the same item plugin as the item at @coord and
49115 +   @key agrees with the item's key in locality, objectid and ordering;
49116 +   returns 0 otherwise */
49117 +int
49118 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
49119 +                      const reiser4_item_data * data)
49120 +{
49121 +       reiser4_key ikey;
49122 +
49123 +       if (item_plugin_by_coord(coord) != data->iplug)
49124 +               return 0;
49125 +
49126 +       item_key_by_coord(coord, &ikey);
49127 +
49128 +       return get_key_locality(key) == get_key_locality(&ikey) &&
49129 +           get_key_objectid(key) == get_key_objectid(&ikey) &&
49130 +           get_key_ordering(key) == get_key_ordering(&ikey);
49131 +}
49132 +
49133 +/* item_plugin->b.mergeable
49134 +   @p1 is expected to be an extent item; returns 1 if @p2 can be merged to it */
49135 +/* Audited by: green(2002.06.13) */
49136 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
49137 +{
49138 +       reiser4_key left, right;
49139 +
49140 +       assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
49141 +       /* FIXME-VS: Which is it? Assert or return 0 */
49142 +       if (item_id_by_coord(p2) != EXTENT_POINTER_ID)
49143 +               return 0;
49144 +
49145 +       item_key_by_coord(p1, &left);
49146 +       item_key_by_coord(p2, &right);
49147 +       /* locality, objectid, ordering and type of both keys must match */
49148 +       if (get_key_locality(&left) != get_key_locality(&right) ||
49149 +           get_key_objectid(&left) != get_key_objectid(&right) ||
49150 +           get_key_ordering(&left) != get_key_ordering(&right) ||
49151 +           get_key_type(&left) != get_key_type(&right))
49152 +               return 0;
49153 +
49154 +       /* and @p2 has to start exactly where all units of @p1 end */
49155 +       return get_key_offset(&left) +
49156 +           reiser4_extent_size(p1, nr_units_extent(p1)) ==
49157 +           get_key_offset(&right);
49158 +}
49159 +
49160 +/* item_plugin->b.nr_units: number of reiser4_extent units in the item, i.e. item length / sizeof(reiser4_extent) */
49161 +pos_in_node_t nr_units_extent(const coord_t * coord)
49162 +{
49163 +       /* length of extent item has to be multiple of extent size */
49164 +       assert("vs-1424",
49165 +              (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
49166 +       return item_length_by_coord(coord) / sizeof(reiser4_extent);
49167 +}
49168 +
49169 +/* item_plugin->b.lookup: set @coord at the unit which covers offset of @key, or after the last unit if none does */
49170 +lookup_result
49171 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
49172 +             coord_t * coord)
49173 +{                              /* znode and item_pos are
49174 +                                  set to an extent item to
49175 +                                  look through */
49176 +       reiser4_key item_key;
49177 +       reiser4_block_nr lookuped, offset;
49178 +       unsigned i, nr_units;
49179 +       reiser4_extent *ext;
49180 +       unsigned blocksize; /* NOTE(review): assigned below but never read */
49181 +       unsigned char blocksize_bits;
49182 +
49183 +       item_key_by_coord(coord, &item_key);
49184 +       offset = get_key_offset(&item_key);
49185 +
49186 +       /* key we are looking for must be greater than key of item @coord */
49187 +       assert("vs-414", keygt(key, &item_key));
49188 +
49189 +       assert("umka-99945",
49190 +              !keygt(key, max_key_inside_extent(coord, &item_key)));
49191 +
49192 +       ext = extent_item(coord);
49193 +       assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
49194 +
49195 +       blocksize = current_blocksize;
49196 +       blocksize_bits = current_blocksize_bits;
49197 +
49198 +       /* offset we are looking for */
49199 +       lookuped = get_key_offset(key);
49200 +
49201 +       nr_units = nr_units_extent(coord);
49202 +       /* go through the units, accumulating in @offset the end offset of each */
49203 +       for (i = 0; i < nr_units; i++, ext++) {
49204 +               offset += (extent_get_width(ext) << blocksize_bits);
49205 +               if (offset > lookuped) {
49206 +                       /* desired byte is somewhere in this extent */
49207 +                       coord->unit_pos = i;
49208 +                       coord->between = AT_UNIT;
49209 +                       return CBK_COORD_FOUND;
49210 +               }
49211 +       }
49212 +
49213 +       /* no unit covers the offset: set coord after last unit (still "found") */
49214 +       coord->unit_pos = nr_units - 1;
49215 +       coord->between = AFTER_UNIT;
49216 +       return CBK_COORD_FOUND;
49217 +}
49218 +
49219 +/* item_plugin->b.paste
49220 +   The item @coord is set to has already been grown by @data->length bytes of
49221 +   free space. @data->data holds kernel-space extent units to be pasted at
49222 +   position @coord->unit_pos; they must fit into that free space. @coord must
49223 +   be set between units (or at unit 0 of an empty item). Returns 0.
49224 +*/
49225 +int
49226 +paste_extent(coord_t * coord, reiser4_item_data * data,
49227 +            carry_plugin_info * info UNUSED_ARG)
49228 +{
49229 +       unsigned old_nr_units;
49230 +       reiser4_extent *ext;
49231 +       int item_length;
49232 +
49233 +       ext = extent_item(coord);
49234 +       item_length = item_length_by_coord(coord);
49235 +       old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
49236 +
49237 +       /* this is also used to copy extent into newly created item, so
49238 +          old_nr_units could be 0 */
49239 +       assert("vs-260", item_length >= data->length);
49240 +
49241 +       /* make sure that coord is set properly */
49242 +       assert("vs-35",
49243 +              ((!coord_is_existing_unit(coord))
49244 +               || (!old_nr_units && !coord->unit_pos)));
49245 +
49246 +       /* first unit to be moved */
49247 +       switch (coord->between) {
49248 +       case AFTER_UNIT:
49249 +               coord->unit_pos++; /* fall through */
49250 +       case BEFORE_UNIT:
49251 +               coord->between = AT_UNIT;
49252 +               break;
49253 +       case AT_UNIT:
49254 +               assert("vs-331", !old_nr_units && !coord->unit_pos);
49255 +               break;
49256 +       default:
49257 +               impossible("vs-330", "coord is set improperly");
49258 +       }
49259 +
49260 +       /* prepare space for new units: move old units at and after unit_pos up */
49261 +       memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
49262 +               ext + coord->unit_pos,
49263 +               (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
49264 +
49265 +       /* copy new data from kernel space */
49266 +       assert("vs-556", data->user == 0);
49267 +       memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
49268 +
49269 +       /* after paste @coord is set to first of pasted units */
49270 +       assert("vs-332", coord_is_existing_unit(coord));
49271 +       assert("vs-333",
49272 +              !memcmp(data->data, extent_by_coord(coord),
49273 +                      (unsigned)data->length));
49274 +       return 0;
49275 +}
49276 +
49277 +/* item_plugin->b.can_shift: returns how many whole units of @source fit into
49278 +   @free_space, at most @want; *@size is set to their length in bytes */
49279 +int
49280 +can_shift_extent(unsigned free_space, coord_t * source,
49281 +                znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
49282 +                unsigned *size, unsigned want)
49283 +{
49284 +       *size = item_length_by_coord(source);
49285 +       if (*size > free_space)
49286 +               /* never split a unit of extent item */
49287 +               *size = free_space - free_space % sizeof(reiser4_extent);
49288 +
49289 +       /* we can shift *size bytes, calculate how many do we want to shift */
49290 +       if (*size > want * sizeof(reiser4_extent))
49291 +               *size = want * sizeof(reiser4_extent);
49292 +
49293 +       if (*size % sizeof(reiser4_extent) != 0)
49294 +               impossible("vs-119", "Wrong extent size: %u %zu", *size,
49295 +                          sizeof(reiser4_extent));
49296 +       return *size / sizeof(reiser4_extent);
49297 +}
49298 +
49299 +/* item_plugin->b.copy_units: copy @count units (@free_space bytes) from item @source into item @target */
49300 +void
49301 +copy_units_extent(coord_t * target, coord_t * source,
49302 +                 unsigned from, unsigned count,
49303 +                 shift_direction where_is_free_space, unsigned free_space)
49304 +{
49305 +       char *from_ext, *to_ext;
49306 +
49307 +       assert("vs-217", free_space == count * sizeof(reiser4_extent));
49308 +
49309 +       from_ext = item_body_by_coord(source);
49310 +       to_ext = item_body_by_coord(target);
49311 +
49312 +       if (where_is_free_space == SHIFT_LEFT) {
49313 +               assert("vs-215", from == 0);
49314 +
49315 +               /* At this moment, item length was already updated in the item
49316 +                  header by shifting code, hence nr_units_extent() will
49317 +                  return "new" number of units---one we obtain after copying
49318 +                  units. The copy therefore lands in the last @count unit slots.
49319 +                */
49320 +               to_ext +=
49321 +                   (nr_units_extent(target) - count) * sizeof(reiser4_extent);
49322 +       } else {
49323 +               reiser4_key key;
49324 +               coord_t coord;
49325 +
49326 +               assert("vs-216",
49327 +                      from + count == coord_last_unit_pos(source) + 1);
49328 +
49329 +               from_ext += item_length_by_coord(source) - free_space;  /* last @free_space bytes of @source */
49330 +
49331 +               /* new units are inserted before first unit in an item,
49332 +                  therefore, we have to update item key */
49333 +               coord = *source;
49334 +               coord.unit_pos = from;
49335 +               unit_key_extent(&coord, &key);
49336 +
49337 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
49338 +                                                                  NULL /*info */);
49339 +       }
49340 +
49341 +       memcpy(to_ext, from_ext, free_space);
49342 +}
49343 +
49344 +/* item_plugin->b.create_hook: set right delimiting key of a leaf to the key of item at @coord and break its right sibling link.
49345 +   @arg is NULL (nothing is done) or a coord_t whose node, or that node's left neighbor, is the leaf to update. Always returns 0 */
49346 +int create_hook_extent(const coord_t * coord, void *arg)
49347 +{
49348 +       coord_t *child_coord;
49349 +       znode *node;
49350 +       reiser4_key key;
49351 +       reiser4_tree *tree;
49352 +
49353 +       if (!arg)
49354 +               return 0;
49355 +
49356 +       child_coord = arg;
49357 +       tree = znode_get_tree(coord->node);
49358 +
49359 +       assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
49360 +
49361 +       write_lock_tree(tree);
49362 +       write_lock_dk(tree);
49363 +       /* find the node for which right delimiting key has to be updated:
49364 +          either child_coord's node or its left neighbor */
49365 +       if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
49366 +               assert("vs-411", znode_is_left_connected(child_coord->node));
49367 +               node = child_coord->node->left;
49368 +       } else {
49369 +               assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
49370 +               node = child_coord->node;
49371 +               assert("nikita-3314", node != NULL);
49372 +       }
49373 +
49374 +       if (node != NULL) {
49375 +               znode_set_rd_key(node, item_key_by_coord(coord, &key));
49376 +
49377 +               assert("nikita-3282", check_sibling_list(node));
49378 +               /* break sibling links in both directions */
49379 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
49380 +                       ON_DEBUG(node->right->left_version =
49381 +                                atomic_inc_return(&delim_key_version);
49382 +                                node->right_version =
49383 +                                atomic_inc_return(&delim_key_version););
49384 +
49385 +                       node->right->left = NULL;
49386 +                       node->right = NULL;
49387 +               }
49388 +       }
49389 +       write_unlock_dk(tree);
49390 +       write_unlock_tree(tree);
49391 +       return 0;
49392 +}
49393 +
49394 +#define ITEM_TAIL_KILLED 0     /* kill_hook_extent result: tail of item is removed */
49395 +#define ITEM_HEAD_KILLED 1     /* kill_hook_extent result: head of item is removed */
49396 +#define ITEM_KILLED 2          /* kill_hook_extent result: item is removed completely */
49397 +
49398 +/* item_plugin->b.kill_hook
49399 +   this is called when @count units starting from @from-th one are going to be removed;
49400 +   invalidates pages and frees blocks of the killed range, returns one of ITEM_*KILLED above */
49401 +int
49402 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
49403 +                struct carry_kill_data *kdata)
49404 +{
49405 +       reiser4_extent *ext;
49406 +       reiser4_block_nr start, length;
49407 +       const reiser4_key *pfrom_key, *pto_key;
49408 +       struct inode *inode;
49409 +       reiser4_tree *tree;
49410 +       pgoff_t from_off, to_off, offset, skip;
49411 +       int retval;
49412 +
49413 +       /* these are located in memory kmalloc-ed by kill_node_content */
49414 +       reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
49415 +       coord_t *dup, *next;
49416 +
49417 +       assert("zam-811", znode_is_write_locked(coord->node));
49418 +       assert("nikita-3315", kdata != NULL);
49419 +       assert("vs-34", kdata->buf != NULL);
49420 +
49421 +       /* map structures to kdata->buf: five keys followed by two coords */
49422 +       min_item_key = (reiser4_key *) (kdata->buf);
49423 +       max_item_key = min_item_key + 1;
49424 +       from_key = max_item_key + 1;
49425 +       to_key = from_key + 1;
49426 +       key = to_key + 1;
49427 +       dup = (coord_t *) (key + 1);
49428 +       next = dup + 1;
49429 +
49430 +       item_key_by_coord(coord, min_item_key);
49431 +       max_item_key_by_coord(coord, max_item_key);
49432 +
49433 +       if (kdata->params.from_key) {
49434 +               pfrom_key = kdata->params.from_key;
49435 +               pto_key = kdata->params.to_key;
49436 +       } else {
49437 +               assert("vs-1549", from == coord->unit_pos);
49438 +               unit_key_by_coord(coord, from_key);
49439 +               pfrom_key = from_key;
49440 +
49441 +               coord_dup(dup, coord);
49442 +               dup->unit_pos = from + count - 1;
49443 +               max_unit_key_by_coord(dup, to_key);
49444 +               pto_key = to_key;
49445 +       }
49446 +
49447 +       if (!keylt(pto_key, max_item_key)) {
49448 +               if (!keygt(pfrom_key, min_item_key)) {
49449 +                       znode *left, *right;
49450 +
49451 +                       /* item is to be removed completely */
49452 +                       assert("nikita-3316", kdata->left != NULL
49453 +                              && kdata->right != NULL);
49454 +
49455 +                       left = kdata->left->node;
49456 +                       right = kdata->right->node;
49457 +
49458 +                       tree = current_tree;
49459 +                       /* we have to do two things:
49460 +                        *
49461 +                        *     1. link left and right formatted neighbors of
49462 +                        *        extent being removed, and
49463 +                        *
49464 +                        *     2. update their delimiting keys.
49465 +                        *
49466 +                        * atomicity of these operations is protected by
49467 +                        * taking dk-lock and tree-lock.
49468 +                        */
49469 +                       /* if neighbors of item being removed are znodes -
49470 +                        * link them */
49471 +                       write_lock_tree(tree);
49472 +                       write_lock_dk(tree);
49473 +                       link_left_and_right(left, right);
49474 +                       if (left) {
49475 +                               /* update right delimiting key of left
49476 +                                * neighbor of extent item */
49477 +                               /* @next and @key are not on the stack:
49478 +                                  they point into kdata->buf */
49479 +
49480 +                               coord_dup(next, coord);
49481 +
49482 +                               if (coord_next_item(next))
49483 +                                       *key = *znode_get_rd_key(coord->node);
49484 +                               else
49485 +                                       item_key_by_coord(next, key);
49486 +                               znode_set_rd_key(left, key);
49487 +                       }
49488 +                       write_unlock_dk(tree);
49489 +                       write_unlock_tree(tree);
49490 +
49491 +                       from_off =
49492 +                           get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
49493 +                       to_off =
49494 +                           (get_key_offset(max_item_key) +
49495 +                            1) >> PAGE_CACHE_SHIFT;
49496 +                       retval = ITEM_KILLED;
49497 +               } else {
49498 +                       /* tail of item is to be removed */
49499 +                       from_off =
49500 +                           (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
49501 +                            1) >> PAGE_CACHE_SHIFT;
49502 +                       to_off =
49503 +                           (get_key_offset(max_item_key) +
49504 +                            1) >> PAGE_CACHE_SHIFT;
49505 +                       retval = ITEM_TAIL_KILLED;
49506 +               }
49507 +       } else {
49508 +               /* head of item is to be removed */
49509 +               assert("vs-1571", keyeq(pfrom_key, min_item_key));
49510 +               assert("vs-1572",
49511 +                      (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
49512 +                      0);
49513 +               assert("vs-1573",
49514 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
49515 +                                                        1)) == 0);
49516 +
49517 +               if (kdata->left->node) {
49518 +                       /* update right delimiting key of left neighbor of extent item */
49519 +                       /* @key points into kdata->buf */
49520 +
49521 +                       *key = *pto_key;
49522 +                       set_key_offset(key, get_key_offset(pto_key) + 1);
49523 +
49524 +                       write_lock_dk(current_tree);
49525 +                       znode_set_rd_key(kdata->left->node, key);
49526 +                       write_unlock_dk(current_tree);
49527 +               }
49528 +
49529 +               from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
49530 +               to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
49531 +               retval = ITEM_HEAD_KILLED;
49532 +       }
49533 +
49534 +       inode = kdata->inode;
49535 +       assert("vs-1545", inode != NULL);
49536 +       if (inode != NULL)
49537 +               /* take care of pages and jnodes corresponding to part of item being killed */
49538 +               reiser4_invalidate_pages(inode->i_mapping, from_off,
49539 +                                        to_off - from_off,
49540 +                                        kdata->params.truncate);
49541 +
49542 +       ext = extent_item(coord) + from;
49543 +       offset =
49544 +           (get_key_offset(min_item_key) +
49545 +            reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
49546 +
49547 +       assert("vs-1551", from_off >= offset);
49548 +       assert("vs-1552", from_off - offset <= extent_get_width(ext));
49549 +       skip = from_off - offset;       /* leading part of unit @from that is kept */
49550 +       offset = from_off;
49551 +
49552 +       while (offset < to_off) {
49553 +               length = extent_get_width(ext) - skip;
49554 +               if (state_of_extent(ext) == HOLE_EXTENT) {
49555 +                       skip = 0;
49556 +                       offset += length;
49557 +                       ext++;
49558 +                       continue;
49559 +               }
49560 +
49561 +               if (offset + length > to_off) {
49562 +                       length = to_off - offset;
49563 +               }
49564 +
49565 +               DQUOT_FREE_BLOCK_NODIRTY(inode, length);
49566 +
49567 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
49568 +                       /* some jnodes corresponding to this unallocated extent */
49569 +                       fake_allocated2free(length, 0 /* unformatted */ );
49570 +
49571 +                       skip = 0;
49572 +                       offset += length;
49573 +                       ext++;
49574 +                       continue;
49575 +               }
49576 +
49577 +               assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
49578 +
49579 +               if (length != 0) {
49580 +                       start = extent_get_start(ext) + skip;
49581 +
49582 +                       /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
49583 +                          immediately */
49584 +                       reiser4_dealloc_blocks(&start, &length,
49585 +                                              0 /* not used */ ,
49586 +                                              BA_DEFER
49587 +                                              /* unformatted with defer */ );
49588 +               }
49589 +               skip = 0;
49590 +               offset += length;
49591 +               ext++;
49592 +       }
49593 +       return retval;
49594 +}
49595 +
49596 +/* item_plugin->b.kill_units: kill units @from..@to of item at @coord; returns size in bytes of units removed completely */
49597 +int
49598 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49599 +                 struct carry_kill_data *kdata, reiser4_key * smallest_removed,
49600 +                 reiser4_key * new_first)
49601 +{
49602 +       reiser4_extent *ext;
49603 +       reiser4_key item_key;
49604 +       pos_in_node_t count;
49605 +       reiser4_key from_key, to_key;
49606 +       const reiser4_key *pfrom_key, *pto_key;
49607 +       loff_t off;
49608 +       int result;
49609 +
49610 +       assert("vs-1541",
49611 +              ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
49612 +               || (kdata->params.from_key != NULL
49613 +                   && kdata->params.to_key != NULL)));
49614 +
49615 +       if (kdata->params.from_key) {
49616 +               pfrom_key = kdata->params.from_key;
49617 +               pto_key = kdata->params.to_key;
49618 +       } else {
49619 +               coord_t dup;
49620 +
49621 +               /* calculate key range of kill */
49622 +               assert("vs-1549", from == coord->unit_pos);
49623 +               unit_key_by_coord(coord, &from_key);
49624 +               pfrom_key = &from_key;
49625 +
49626 +               coord_dup(&dup, coord);
49627 +               dup.unit_pos = to;
49628 +               max_unit_key_by_coord(&dup, &to_key);
49629 +               pto_key = &to_key;
49630 +       }
49631 +
49632 +       item_key_by_coord(coord, &item_key);
49633 +
49634 +#if REISER4_DEBUG
49635 +       {
49636 +               reiser4_key max_item_key;
49637 +
49638 +               max_item_key_by_coord(coord, &max_item_key);
49639 +
49640 +               if (new_first) {
49641 +                       /* head of item is to be cut */
49642 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
49643 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
49644 +               } else {
49645 +                       /* tail of item is to be cut */
49646 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
49647 +                       assert("vs-1543", !keylt(pto_key, &max_item_key));
49648 +               }
49649 +       }
49650 +#endif
49651 +
49652 +       if (smallest_removed)
49653 +               *smallest_removed = *pfrom_key;
49654 +
49655 +       if (new_first) {
49656 +               /* item head is cut. Item key will change. This new key is calculated here */
49657 +               assert("vs-1556",
49658 +                      (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
49659 +                      (PAGE_CACHE_SIZE - 1));
49660 +               *new_first = *pto_key;
49661 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
49662 +       }
49663 +
49664 +       count = to - from + 1;
49665 +       result = kill_hook_extent(coord, from, count, kdata);
49666 +       if (result == ITEM_TAIL_KILLED) {
49667 +               assert("vs-1553",
49668 +                      get_key_offset(pfrom_key) >=
49669 +                      get_key_offset(&item_key) +
49670 +                      reiser4_extent_size(coord, from));
49671 +               off =
49672 +                   get_key_offset(pfrom_key) -
49673 +                       (get_key_offset(&item_key) +
49674 +                        reiser4_extent_size(coord, from));
49675 +               if (off) {
49676 +                       /* unit @from is to be cut partially. Its width decreases; the unit itself stays */
49677 +                       ext = extent_item(coord) + from;
49678 +                       extent_set_width(ext,
49679 +                                        (off + PAGE_CACHE_SIZE -
49680 +                                         1) >> PAGE_CACHE_SHIFT);
49681 +                       count--;
49682 +               }
49683 +       } else {
49684 +               __u64 max_to_offset;
49685 +               __u64 rest;
49686 +
49687 +               assert("vs-1575", result == ITEM_HEAD_KILLED);
49688 +               assert("", from == 0);
49689 +               assert("",
49690 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
49691 +                                                        1)) == 0);
49692 +               assert("",
49693 +                      get_key_offset(pto_key) + 1 >
49694 +                      get_key_offset(&item_key) +
49695 +                      reiser4_extent_size(coord, to));
49696 +               max_to_offset =
49697 +                   get_key_offset(&item_key) +
49698 +                       reiser4_extent_size(coord, to + 1) - 1;
49699 +               assert("", get_key_offset(pto_key) <= max_to_offset);
49700 +
49701 +               rest =
49702 +                   (max_to_offset -
49703 +                    get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
49704 +               if (rest) {
49705 +                       /* unit @to is to be cut partially: @rest pages at its end are kept */
49706 +                       ext = extent_item(coord) + to;
49707 +
49708 +                       assert("", extent_get_width(ext) > rest);
49709 +
49710 +                       if (state_of_extent(ext) == ALLOCATED_EXTENT)
49711 +                               extent_set_start(ext,
49712 +                                                extent_get_start(ext) +
49713 +                                                (extent_get_width(ext) -
49714 +                                                 rest));
49715 +
49716 +                       extent_set_width(ext, rest);
49717 +                       count--;
49718 +               }
49719 +       }
49720 +       return count * sizeof(reiser4_extent);
49721 +}
49722 +
49723 +/* item_plugin->b.cut_units: cut units @from..@to of item at @coord; returns size in bytes of units removed completely.
49724 +   this is too similar to kill_units_extent */
49725 +int
49726 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
49727 +                struct carry_cut_data *cdata, reiser4_key * smallest_removed,
49728 +                reiser4_key * new_first)
49729 +{
49730 +       reiser4_extent *ext;
49731 +       reiser4_key item_key;
49732 +       pos_in_node_t count;
49733 +       reiser4_key from_key, to_key;
49734 +       const reiser4_key *pfrom_key, *pto_key;
49735 +       loff_t off;
49736 +
49737 +       assert("vs-1541",
49738 +              ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
49739 +               || (cdata->params.from_key != NULL
49740 +                   && cdata->params.to_key != NULL)));
49741 +
49742 +       if (cdata->params.from_key) {
49743 +               pfrom_key = cdata->params.from_key;
49744 +               pto_key = cdata->params.to_key;
49745 +       } else {
49746 +               coord_t dup;
49747 +
49748 +               /* calculate key range of cut */
49749 +               coord_dup(&dup, coord);
49750 +               dup.unit_pos = from;
49751 +               unit_key_by_coord(&dup, &from_key);
49752 +
49753 +               dup.unit_pos = to;
49754 +               max_unit_key_by_coord(&dup, &to_key);
49755 +
49756 +               pfrom_key = &from_key;
49757 +               pto_key = &to_key;
49758 +       }
49759 +
49760 +       assert("vs-1555",
49761 +              (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
49762 +       assert("vs-1556",
49763 +              (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
49764 +              (PAGE_CACHE_SIZE - 1));
49765 +
49766 +       item_key_by_coord(coord, &item_key);
49767 +
49768 +#if REISER4_DEBUG
49769 +       {
49770 +               reiser4_key max_item_key;
49771 +
49772 +               assert("vs-1584",
49773 +                      get_key_locality(pfrom_key) ==
49774 +                      get_key_locality(&item_key));
49775 +               assert("vs-1585",
49776 +                      get_key_type(pfrom_key) == get_key_type(&item_key));
49777 +               assert("vs-1586",
49778 +                      get_key_objectid(pfrom_key) ==
49779 +                      get_key_objectid(&item_key));
49780 +               assert("vs-1587",
49781 +                      get_key_ordering(pfrom_key) ==
49782 +                      get_key_ordering(&item_key));
49783 +
49784 +               max_item_key_by_coord(coord, &max_item_key);
49785 +
49786 +               if (new_first != NULL) {
49787 +                       /* head of item is to be cut */
49788 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
49789 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
49790 +               } else {
49791 +                       /* tail of item is to be cut */
49792 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
49793 +                       assert("vs-1543", keyeq(pto_key, &max_item_key));
49794 +               }
49795 +       }
49796 +#endif
49797 +
49798 +       if (smallest_removed)
49799 +               *smallest_removed = *pfrom_key;
49800 +
49801 +       if (new_first) {
49802 +               /* item head is cut. Item key will change. This new key is calculated here */
49803 +               *new_first = *pto_key;
49804 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
49805 +       }
49806 +
49807 +       count = to - from + 1;
49808 +
49809 +       assert("vs-1553",
49810 +              get_key_offset(pfrom_key) >=
49811 +              get_key_offset(&item_key) + reiser4_extent_size(coord, from));
49812 +       off =
49813 +           get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
49814 +                                        reiser4_extent_size(coord, from));
49815 +       if (off) {
49816 +               /* tail of unit @from is to be cut partially. Its width decreases; the unit itself stays */
49817 +               assert("vs-1582", new_first == NULL);
49818 +               ext = extent_item(coord) + from;
49819 +               extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
49820 +               count--;
49821 +       }
49822 +
49823 +       assert("vs-1554",
49824 +              get_key_offset(pto_key) <=
49825 +              get_key_offset(&item_key) +
49826 +              reiser4_extent_size(coord, to + 1) - 1);
49827 +       off =
49828 +               (get_key_offset(&item_key) +
49829 +                reiser4_extent_size(coord, to + 1) - 1) -
49830 +               get_key_offset(pto_key);
49831 +       if (off) {
49832 +               /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
49833 +                  (allocated extents only) and width decreased. */
49834 +               assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
49835 +               ext = extent_item(coord) + to;
49836 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
49837 +                       extent_set_start(ext,
49838 +                                        extent_get_start(ext) +
49839 +                                        (extent_get_width(ext) -
49840 +                                         (off >> PAGE_CACHE_SHIFT)));
49841 +
49842 +               extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
49843 +               count--;
49844 +       }
49845 +       return count * sizeof(reiser4_extent);
49846 +}
49847 +
49848 +/* item_plugin->b.unit_key: fill @key with item key whose offset is advanced by reiser4_extent_size(coord, unit_pos); returns @key */
49849 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
49850 +{
49851 +       assert("vs-300", coord_is_existing_unit(coord));
49852 +
49853 +       item_key_by_coord(coord, key);
49854 +       set_key_offset(key,
49855 +                      (get_key_offset(key) +
49856 +                       reiser4_extent_size(coord, coord->unit_pos)));
49857 +
49858 +       return key;
49859 +}
49860 +
49861 +/* item_plugin->b.max_unit_key: like unit_key_extent, but offset is that of the next unit minus 1; returns @key */
49862 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
49863 +{
49864 +       assert("vs-300", coord_is_existing_unit(coord));
49865 +
49866 +       item_key_by_coord(coord, key);
49867 +       set_key_offset(key,
49868 +                      (get_key_offset(key) +
49869 +                       reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
49870 +       return key;
49871 +}
49872 +
49873 +/* item_plugin->b.estimate
49874 +   item_plugin->b.item_data_by_flow */
49875 +
49876 +#if REISER4_DEBUG
49877 +
49878 +/* item_plugin->b.check
49879 +   used for debugging, every item should have here the most complete
49880 +   possible check of the consistency of the item that the inventor can
49881 +   construct. Returns 0 if no problem is found, otherwise -1 with *@error set
49882 +*/
49883 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
49884 +                        const char **error /* where to store error message */)
49885 +{
49886 +       reiser4_extent *ext, *first;
49887 +       unsigned i, j;
49888 +       reiser4_block_nr start, width, blk_cnt;
49889 +       unsigned num_units;
49890 +       reiser4_tree *tree;
49891 +       oid_t oid;
49892 +       reiser4_key key;
49893 +       coord_t scan;
49894 +
49895 +       assert("vs-933", REISER4_DEBUG);
49896 +
49897 +       if (znode_get_level(coord->node) != TWIG_LEVEL) {
49898 +               *error = "Extent on the wrong level";
49899 +               return -1;
49900 +       }
49901 +       if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
49902 +               *error = "Wrong item size";
49903 +               return -1;
49904 +       }
49905 +       ext = first = extent_item(coord);
49906 +       blk_cnt = reiser4_block_count(reiser4_get_current_sb());
49907 +       num_units = coord_num_units(coord);
49908 +       tree = znode_get_tree(coord->node);
49909 +       item_key_by_coord(coord, &key);
49910 +       oid = get_key_objectid(&key);
49911 +       coord_dup(&scan, coord);
49912 +
49913 +       for (i = 0; i < num_units; ++i, ++ext) {
49914 +               __u64 index;
49915 +
49916 +               scan.unit_pos = i;
49917 +               index = extent_unit_index(&scan);
49918 +
49919 +#if 0
49920 +               /* check that all jnodes are present for the unallocated
49921 +                * extent (disabled; the only user of @tree, @oid and @index) */
49922 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
49923 +                       for (j = 0; j < extent_get_width(ext); j++) {
49924 +                               jnode *node;
49925 +
49926 +                               node = jlookup(tree, oid, index + j);
49927 +                               if (node == NULL) {
49928 +                                       print_coord("scan", &scan, 0);
49929 +                                       *error = "Jnode missing";
49930 +                                       return -1;
49931 +                               }
49932 +                               jput(node);
49933 +                       }
49934 +               }
49935 +#endif
49936 +
49937 +               start = extent_get_start(ext);
49938 +               if (start < 2)
49939 +                       continue;
49940 +               /* extent is allocated one: it must lie within the block count of the filesystem */
49941 +               width = extent_get_width(ext);
49942 +               if (start >= blk_cnt) {
49943 +                       *error = "Start too large";
49944 +                       return -1;
49945 +               }
49946 +               if (start + width > blk_cnt) {
49947 +                       *error = "End too large";
49948 +                       return -1;
49949 +               }
49950 +               /* make sure that this extent does not overlap with other
49951 +                  allocated extents preceding it in this item */
49952 +               for (j = 0; j < i; j++) {
49953 +                       if (state_of_extent(first + j) != ALLOCATED_EXTENT)
49954 +                               continue;
49955 +                       if (!
49956 +                           ((extent_get_start(ext) >=
49957 +                             extent_get_start(first + j) +
49958 +                             extent_get_width(first + j))
49959 +                            || (extent_get_start(ext) +
49960 +                                extent_get_width(ext) <=
49961 +                                extent_get_start(first + j)))) {
49962 +                               *error = "Extent overlaps with others";
49963 +                               return -1;
49964 +                       }
49965 +               }
49966 +
49967 +       }
49968 +
49969 +       return 0;
49970 +}
49971 +
49972 +#endif                         /* REISER4_DEBUG */
49973 +
49974 +/*
49975 +   Local variables:
49976 +   c-indentation-style: "K&R"
49977 +   mode-name: "LC"
49978 +   c-basic-offset: 8
49979 +   tab-width: 8
49980 +   fill-column: 120
49981 +   scroll-step: 1
49982 +   End:
49983 +*/
49984 diff -puN /dev/null fs/reiser4/plugin/item/internal.c
49985 --- /dev/null
49986 +++ a/fs/reiser4/plugin/item/internal.c
49987 @@ -0,0 +1,404 @@
49988 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49989 +
49990 +/* Implementation of internal-item plugin methods. */
49991 +
49992 +#include "../../forward.h"
49993 +#include "../../debug.h"
49994 +#include "../../dformat.h"
49995 +#include "../../key.h"
49996 +#include "../../coord.h"
49997 +#include "internal.h"
49998 +#include "item.h"
49999 +#include "../node/node.h"
50000 +#include "../plugin.h"
50001 +#include "../../jnode.h"
50002 +#include "../../znode.h"
50003 +#include "../../tree_walk.h"
50004 +#include "../../tree_mod.h"
50005 +#include "../../tree.h"
50006 +#include "../../super.h"
50007 +#include "../../block_alloc.h"
50008 +
50009 +/* see internal.h for explanation */
50010 +
50011 +/* plugin->u.item.b.mergeable: always returns 0, both arguments are ignored */
50012 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
50013 +                      const coord_t * p2 UNUSED_ARG /* second item */ )
50014 +{
50015 +       /* internal items are not mergeable */
50016 +       return 0;
50017 +}
50018 +
50019 +/* ->lookup() method for internal items: compares unit key at @coord with @key; CBK_COORD_NOTFOUND only on GREATER_THAN */
50020 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
50021 +                             lookup_bias bias UNUSED_ARG /* lookup bias */ ,
50022 +                             coord_t * coord /* coord of item */ )
50023 +{
50024 +       reiser4_key ukey;
50025 +
50026 +       switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
50027 +       default:
50028 +               impossible("", "keycmp()?!");   /* falls through to LESS_THAN */
50029 +       case LESS_THAN:
50030 +               /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
50031 +                  item plugin can not be taken using coord set this way */
50032 +               assert("vs-681", coord->unit_pos == 0);
50033 +               coord->between = AFTER_UNIT;    /* falls through to EQUAL_TO */
50034 +       case EQUAL_TO:
50035 +               return CBK_COORD_FOUND;
50036 +       case GREATER_THAN:
50037 +               return CBK_COORD_NOTFOUND;
50038 +       }
50039 +}
50040 +
50041 +/* return body of internal item at @coord, cast to internal_item_layout */
50042 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
50043 +                                                                * item */ )
50044 +{
50045 +       assert("nikita-607", coord != NULL);
50046 +       assert("nikita-1650",
50047 +              item_plugin_by_coord(coord) ==
50048 +              item_plugin_by_id(NODE_POINTER_ID));
50049 +       return (internal_item_layout *) item_body_by_coord(coord);
50050 +}
50051 +/* store *@blocknr, converted with cpu_to_le64(), into the internal item at @coord */
50052 +void reiser4_update_internal(const coord_t * coord,
50053 +                            const reiser4_block_nr * blocknr)
50054 +{
50055 +       internal_item_layout *item = internal_at(coord);
50056 +       assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
50057 +
50058 +       put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
50059 +}
50060 +
50061 +/* return child block number stored in the internal item at @coord, converted with le64_to_cpu() */
50062 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
50063 +{
50064 +       assert("nikita-608", coord != NULL);
50065 +       return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
50066 +}
50067 +
50068 +/* get znode pointed to by internal @item: thin wrapper for child_znode(@item, @parent, 1, 0) */
50069 +static znode *znode_at(const coord_t * item /* coord of item */ ,
50070 +                      znode * parent /* parent node */ )
50071 +{
50072 +       return child_znode(item, parent, 1, 0);
50073 +}
50074 +
50075 +/* store pointer from internal item into "block". Implementation of
50076 +    ->down_link() method. @key is only looked at by a debugging assertion */
50077 +void down_link_internal(const coord_t * coord /* coord of item */ ,
50078 +                       const reiser4_key * key UNUSED_ARG      /* key to get
50079 +                                                                * pointer for */ ,
50080 +                       reiser4_block_nr * block /* resulting block number */ )
50081 +{
50082 +       ON_DEBUG(reiser4_key item_key); /* used by assertion nikita-612 only */
50083 +
50084 +       assert("nikita-609", coord != NULL);
50085 +       assert("nikita-611", block != NULL);
50086 +       assert("nikita-612", (key == NULL) ||
50087 +              /* twig horrors */
50088 +              (znode_get_level(coord->node) == TWIG_LEVEL)
50089 +              || keyle(item_key_by_coord(coord, &item_key), key));
50090 +
50091 +       *block = pointer_at(coord);
50092 +       assert("nikita-2960", reiser4_blocknr_is_sane(block));
50093 +}
50094 +
50095 +/* Get the child's block number, or 0 if reiser4_blocknr_is_fake() holds for it. Always returns 0. */
50096 +int
50097 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
50098 +                                reiser4_block_nr * block)
50099 +{
50100 +       assert("jmacd-2059", coord != NULL);
50101 +
50102 +       *block = pointer_at(coord);
50103 +       assert("nikita-2961", reiser4_blocknr_is_sane(block));
50104 +
50105 +       if (reiser4_blocknr_is_fake(block)) {
50106 +               *block = 0;
50107 +       }
50108 +
50109 +       return 0;
50110 +}
50111 +
50112 +/* Return the child: zlook() the block this item points to and store its jnode in *@childp. */
50113 +int
50114 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
50115 +                     jnode ** childp)
50116 +{
50117 +       reiser4_block_nr block = pointer_at(coord);
50118 +       znode *child;
50119 +
50120 +       assert("jmacd-2059", childp != NULL);
50121 +       assert("nikita-2962", reiser4_blocknr_is_sane(&block));
50122 +
50123 +       child = zlook(znode_get_tree(coord->node), &block);
50124 +
50125 +       if (IS_ERR(child)) {
50126 +               return PTR_ERR(child);
50127 +       }
50128 +       /* NOTE(review): a NULL result from zlook(), if possible, is passed on through ZJNODE() unchecked */
50129 +       *childp = ZJNODE(child);
50130 +
50131 +       return 0;
50132 +}
50133 +
50134 +#if REISER4_DEBUG
50135 +/* follow ->right pointers from @left towards @right, asserting that sibling links are consistent */
50136 +static void check_link(znode * left, znode * right)
50137 +{
50138 +       znode *scan;
50139 +
50140 +       for (scan = left; scan != right; scan = scan->right) {
50141 +               if (ZF_ISSET(scan, JNODE_RIP))
50142 +                       break;
50143 +               if (znode_is_right_connected(scan) && scan->right != NULL) {
50144 +                       if (ZF_ISSET(scan->right, JNODE_RIP))
50145 +                               break;
50146 +                       assert("nikita-3285",
50147 +                              znode_is_left_connected(scan->right));
50148 +                       assert("nikita-3265",
50149 +                              ergo(scan != left,
50150 +                                   ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
50151 +                       assert("nikita-3284", scan->right->left == scan);
50152 +               } else  /* not right-connected, or no right neighbor: stop */
50153 +                       break;
50154 +       }
50155 +}
50156 +
50157 +int check__internal(const coord_t * coord, const char **error)
50158 +{
50159 +       reiser4_block_nr blk;
50160 +       znode *child;
50161 +       coord_t cpy;
50162 +
50163 +       blk = pointer_at(coord);
50164 +       if (!reiser4_blocknr_is_sane(&blk)) {
50165 +               *error = "Invalid pointer";
50166 +               return -1;
50167 +       }
50168 +       coord_dup(&cpy, coord);
50169 +       child = znode_at(&cpy, cpy.node);
50170 +       if (child != NULL) {
50171 +               znode *left_child;
50172 +               znode *right_child;
50173 +
50174 +               left_child = right_child = NULL;
50175 +
50176 +               assert("nikita-3256", znode_invariant(child));
50177 +               if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
50178 +                       left_child = znode_at(&cpy, cpy.node);
50179 +                       if (left_child != NULL) {
50180 +                               read_lock_tree(znode_get_tree(child));
50181 +                               check_link(left_child, child);
50182 +                               read_unlock_tree(znode_get_tree(child));
50183 +                               zput(left_child);
50184 +                       }
50185 +               }
50186 +               coord_dup(&cpy, coord);
50187 +               if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
50188 +                       right_child = znode_at(&cpy, cpy.node);
50189 +                       if (right_child != NULL) {
50190 +                               read_lock_tree(znode_get_tree(child));
50191 +                               check_link(child, right_child);
50192 +                               read_unlock_tree(znode_get_tree(child));
50193 +                               zput(right_child);
50194 +                       }
50195 +               }
50196 +               zput(child);
50197 +       }
50198 +       return 0;
50199 +}
50200 +
50201 +#endif  /*  REISER4_DEBUG  */
50202 +
50203 +/* return true only if the block number stored in this item equals *@block */
50204 +/* Audited by: green(2002.06.14) */
50205 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
50206 +                           const reiser4_block_nr * block      /* block number to
50207 +                                                                * check */ )
50208 +{
50209 +       assert("nikita-613", coord != NULL);
50210 +       assert("nikita-614", block != NULL);
50211 +
50212 +       return pointer_at(coord) == *block;
50213 +}
50214 +
50215 +/* hook called by ->create_item() method of node plugin after new internal
50216 +   item was just created.
50217 +
50218 +   This is point where pointer to new node is inserted into tree. Initialize
50219 +   parent pointer in child znode, insert child into sibling list and slum.
50220 +   Returns 0, PTR_ERR() of an error child pointer, or -EIO if child is NULL.
50221 +*/
50222 +int create_hook_internal(const coord_t * item /* coord of item */ ,
50223 +                        void *arg /* child's left neighbor, if any */ )
50224 +{
50225 +       znode *child;
50226 +       __u64 child_ptr;
50227 +
50228 +       assert("nikita-1252", item != NULL);
50229 +       assert("nikita-1253", item->node != NULL);
50230 +       assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
50231 +       assert("nikita-1450", item->unit_pos == 0);
50232 +
50233 +       /*
50234 +        * preparing to item insertion build_child_ptr_data sets pointer to
50235 +        * data to be inserted to jnode's blocknr which is in cpu byte
50236 +        * order. Node's create_item simply copied those data. As result we
50237 +        * have child pointer in cpu's byte order. Convert content of internal
50238 +        * item to little endian byte order.
50239 +        */
50240 +       child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
50241 +       reiser4_update_internal(item, &child_ptr);
50242 +
50243 +       child = znode_at(item, item->node);
50244 +       if (child != NULL && !IS_ERR(child)) {
50245 +               znode *left;
50246 +               int result = 0; /* never modified below */
50247 +               reiser4_tree *tree;
50248 +
50249 +               left = arg;
50250 +               tree = znode_get_tree(item->node);
50251 +               write_lock_tree(tree);
50252 +               write_lock_dk(tree);
50253 +               assert("nikita-1400", (child->in_parent.node == NULL)
50254 +                      || (znode_above_root(child->in_parent.node)));
50255 +               ++item->node->c_count;
50256 +               coord_to_parent_coord(item, &child->in_parent);
50257 +               sibling_list_insert_nolock(child, left);
50258 +               /* child now has a parent coord and is in the sibling list: drop JNODE_ORPHAN */
50259 +               assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
50260 +               ZF_CLR(child, JNODE_ORPHAN);
50261 +
50262 +               if ((left != NULL) && !keyeq(znode_get_rd_key(left),
50263 +                                            znode_get_rd_key(child))) {
50264 +                       znode_set_rd_key(child, znode_get_rd_key(left));
50265 +               }
50266 +               write_unlock_dk(tree);
50267 +               write_unlock_tree(tree);
50268 +               zput(child);
50269 +               return result;
50270 +       } else {
50271 +               if (child == NULL)      /* no znode for the child: report -EIO */
50272 +                       child = ERR_PTR(-EIO);
50273 +               return PTR_ERR(child);
50274 +       }
50275 +}
50276 +
50277 +/* hook called by ->cut_and_kill() method of node plugin just before internal
50278 +   item is removed.
50279 +
50280 +   This is point where empty node is removed from the tree. Clear parent
50281 +   pointer in child, and mark node for pending deletion.
50282 +
50283 +   Node will be actually deleted later and in several installments:
50284 +
50285 +    . when last lock on this node will be released, node will be removed from
50286 +    the sibling list and its lock will be invalidated
50287 +
50288 +    . when last reference to this node will be dropped, bitmap will be updated
50289 +    and node will be actually removed from the memory.
50290 +   Returns 0, an error from znode_at()/zload(), or -EIO if the child is not empty.
50291 +*/
50292 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
50293 +                      pos_in_node_t from UNUSED_ARG /* start unit */ ,
50294 +                      pos_in_node_t count UNUSED_ARG /* unit count */ ,
50295 +                      struct carry_kill_data *p UNUSED_ARG)
50296 +{
50297 +       znode *child;
50298 +       int result = 0;
50299 +
50300 +       assert("nikita-1222", item != NULL);
50301 +       assert("nikita-1224", from == 0);
50302 +       assert("nikita-1225", count == 1);
50303 +
50304 +       child = znode_at(item, item->node);
50305 +       if (IS_ERR(child))
50306 +               return PTR_ERR(child);
50307 +       assert("edward-1560", child != NULL);
50308 +       /* NOTE(review): child != NULL is enforced only by the assertion above */
50309 +       result = zload(child);
50310 +       if (result) {
50311 +               zput(child);
50312 +               return result;
50313 +       }
50314 +       if (node_is_empty(child)) {
50315 +               reiser4_tree *tree;
50316 +
50317 +               assert("nikita-1397", znode_is_write_locked(child));
50318 +               assert("nikita-1398", child->c_count == 0);
50319 +               assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
50320 +
50321 +               tree = znode_get_tree(item->node);
50322 +               write_lock_tree(tree);
50323 +               init_parent_coord(&child->in_parent, NULL);
50324 +               --item->node->c_count;
50325 +               write_unlock_tree(tree);
50326 +       } else {
50327 +               warning("nikita-1223",
50328 +                       "Cowardly refuse to remove link to non-empty node");
50329 +               result = RETERR(-EIO);
50330 +       }
50331 +       zrelse(child);
50332 +       zput(child);
50333 +       return result;
50334 +}
50335 +
50336 +/* hook called by ->shift() node plugin method when internal item was just
50337 +   moved from one node to another.
50338 +
50339 +   Update parent pointer in child and c_counts in old and new parent
50340 +   Returns 0 (also when no child znode is found) or PTR_ERR() of the child.
50341 +*/
50342 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
50343 +                       unsigned from UNUSED_ARG /* start unit */ ,
50344 +                       unsigned count UNUSED_ARG /* unit count */ ,
50345 +                       znode * old_node /* old parent */ )
50346 +{
50347 +       znode *child;
50348 +       znode *new_node;
50349 +       reiser4_tree *tree;
50350 +
50351 +       assert("nikita-1276", item != NULL);
50352 +       assert("nikita-1277", from == 0);
50353 +       assert("nikita-1278", count == 1);
50354 +       assert("nikita-1451", item->unit_pos == 0);
50355 +
50356 +       new_node = item->node;
50357 +       assert("nikita-2132", new_node != old_node);
50358 +       tree = znode_get_tree(item->node);
50359 +       child = child_znode(item, old_node, 1, 0);
50360 +       if (child == NULL)      /* no child znode found: nothing to update */
50361 +               return 0;
50362 +       if (!IS_ERR(child)) {
50363 +               write_lock_tree(tree);
50364 +               ++new_node->c_count;
50365 +               assert("nikita-1395", znode_parent(child) == old_node);
50366 +               assert("nikita-1396", old_node->c_count > 0);
50367 +               coord_to_parent_coord(item, &child->in_parent);
50368 +               assert("nikita-1781", znode_parent(child) == new_node);
50369 +               assert("nikita-1782",
50370 +                      check_tree_pointer(item, child) == NS_FOUND);
50371 +               --old_node->c_count;
50372 +               write_unlock_tree(tree);
50373 +               zput(child);
50374 +               return 0;
50375 +       } else
50376 +               return PTR_ERR(child);
50377 +}
50378 +
50379 +/* plugin->u.item.b.max_key_inside - not defined */
50380 +
50381 +/* plugin->u.item.b.nr_units - item.c:nr_units_single_unit */
50382 +
50383 +/* Make Linus happy.
50384 +   Local variables:
50385 +   c-indentation-style: "K&R"
50386 +   mode-name: "LC"
50387 +   c-basic-offset: 8
50388 +   tab-width: 8
50389 +   fill-column: 120
50390 +   End:
50391 +*/
50392 diff -puN /dev/null fs/reiser4/plugin/item/internal.h
50393 --- /dev/null
50394 +++ a/fs/reiser4/plugin/item/internal.h
50395 @@ -0,0 +1,57 @@
50396 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50397 +/* Internal item contains down-link to the child of the internal/twig
50398 +   node in a tree. It is internal items that are actually used during
50399 +   tree traversal. */
50400 +
50401 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
50402 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
50403 +
50404 +#include "../../forward.h"
50405 +#include "../../dformat.h"
50406 +
50407 +/* on-disk layout of internal item: a single child block pointer, written via cpu_to_le64() */
50408 +typedef struct internal_item_layout {
50409 +       /*  0 */ reiser4_dblock_nr pointer;
50410 +       /*  8 */
50411 +} internal_item_layout;
50412 +
50413 +struct cut_list;
50414 +/* methods of the internal item plugin; definitions are in internal.c unless noted */
50415 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
50416 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
50417 +                             coord_t * coord);
50418 +/* store pointer from internal item into "block". Implementation of
50419 +    ->down_link() method */
50420 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
50421 +                              reiser4_block_nr * block);
50422 +extern int has_pointer_to_internal(const coord_t * coord,
50423 +                                  const reiser4_block_nr * block);
50424 +extern int create_hook_internal(const coord_t * item, void *arg);
50425 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
50426 +                             pos_in_node_t count, struct carry_kill_data *);
50427 +extern int shift_hook_internal(const coord_t * item, unsigned from,
50428 +                              unsigned count, znode * old_node);
50429 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
50430 +/* NOTE(review): reiser4_print_internal() above has no definition in internal.c */
50431 +extern int utmost_child_internal(const coord_t * coord, sideof side,
50432 +                                jnode ** child);
50433 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
50434 +                                    reiser4_block_nr * block);
50435 +
50436 +extern void reiser4_update_internal(const coord_t * coord,
50437 +                                   const reiser4_block_nr * blocknr);
50438 +/* FIXME: reiserfs has check_internal */
50439 +extern int check__internal(const coord_t * coord, const char **error);
50440 +
50441 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
50442 +#endif
50443 +
50444 +/* Make Linus happy.
50445 +   Local variables:
50446 +   c-indentation-style: "K&R"
50447 +   mode-name: "LC"
50448 +   c-basic-offset: 8
50449 +   tab-width: 8
50450 +   fill-column: 120
50451 +   End:
50452 +*/
50453 diff -puN /dev/null fs/reiser4/plugin/item/item.c
50454 --- /dev/null
50455 +++ a/fs/reiser4/plugin/item/item.c
50456 @@ -0,0 +1,719 @@
50457 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50458 +
50459 +/* definition of item plugins. */
50460 +
50461 +#include "../../forward.h"
50462 +#include "../../debug.h"
50463 +#include "../../key.h"
50464 +#include "../../coord.h"
50465 +#include "../plugin_header.h"
50466 +#include "sde.h"
50467 +#include "internal.h"
50468 +#include "item.h"
50469 +#include "static_stat.h"
50470 +#include "../plugin.h"
50471 +#include "../../znode.h"
50472 +#include "../../tree.h"
50473 +#include "../../context.h"
50474 +#include "ctail.h"
50475 +
50476 +/* compute offset of item body from zdata(@coord->node) and cache it in @coord->offset; returns nothing */
50477 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
50478 +{
50479 +       assert("nikita-324", coord != NULL);
50480 +       assert("nikita-325", coord->node != NULL);
50481 +       assert("nikita-326", znode_is_loaded(coord->node));
50482 +       assert("nikita-3200", coord->offset == INVALID_OFFSET);
50483 +       /* the assertion above requires that no offset has been cached yet */
50484 +       coord->offset =
50485 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
50486 +           zdata(coord->node);
50487 +       ON_DEBUG(coord->body_v = coord->node->times_locked);
50488 +}
50489 +/* return pointer to item body, using the offset already stored in @coord->offset */
50490 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
50491 +{
50492 +       return zdata(coord->node) + coord->offset;
50493 +}
50494 +
50495 +#if REISER4_DEBUG
50496 +/* debugging: true if @coord->offset equals the offset recomputed through the node plugin */
50497 +int item_body_is_valid(const coord_t * coord)
50498 +{
50499 +       return
50500 +           coord->offset ==
50501 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
50502 +           zdata(coord->node);
50503 +}
50504 +
50505 +#endif
50506 +
50507 +/* return length of item at @coord, as reported by ->length_by_coord() of the node plugin */
50508 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
50509 +{
50510 +       int len;        /* NOTE(review): int here, but the function returns pos_in_node_t */
50511 +
50512 +       assert("nikita-327", coord != NULL);
50513 +       assert("nikita-328", coord->node != NULL);
50514 +       assert("nikita-329", znode_is_loaded(coord->node));
50515 +
50516 +       len = node_plugin_by_node(coord->node)->length_by_coord(coord);
50517 +       return len;
50518 +}
50519 +/* ask the node plugin for the item plugin at @coord and store it in @coord (const is cast away) */
50520 +void obtain_item_plugin(const coord_t * coord)
50521 +{
50522 +       assert("nikita-330", coord != NULL);
50523 +       assert("nikita-331", coord->node != NULL);
50524 +       assert("nikita-332", znode_is_loaded(coord->node));
50525 +
50526 +       coord_set_iplug((coord_t *) coord,
50527 +                       node_plugin_by_node(coord->node)->
50528 +                       plugin_by_coord(coord));
50529 +       assert("nikita-2479",
50530 +              coord_iplug(coord) ==
50531 +              node_plugin_by_node(coord->node)->plugin_by_coord(coord));
50532 +}
50533 +
50534 +/* return id of item at @coord, i.e. item_id_by_plugin() of its item plugin */
50535 +/* Audited by: green(2002.06.15) */
50536 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
50537 +{
50538 +       assert("vs-539", coord != NULL);
50539 +       assert("vs-538", coord->node != NULL);
50540 +       assert("vs-537", znode_is_loaded(coord->node));
50541 +       assert("vs-536", item_plugin_by_coord(coord) != NULL);
50542 +       assert("vs-540",
50543 +              item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
50544 +
50545 +       return item_id_by_plugin(item_plugin_by_coord(coord));
50546 +}
50547 +
50548 +/* return key of item at @coord: result of ->key_at(@coord, @key) of the node plugin */
50549 +/* Audited by: green(2002.06.15) */
50550 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
50551 +                              reiser4_key * key /* result */ )
50552 +{
50553 +       assert("nikita-338", coord != NULL);
50554 +       assert("nikita-339", coord->node != NULL);
50555 +       assert("nikita-340", znode_is_loaded(coord->node));
50556 +
50557 +       return node_plugin_by_node(coord->node)->key_at(coord, key);
50558 +}
50559 +
50560 +/* this returns max key in the item: the max unit key of the item's last unit, stored in @key */
50561 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
50562 +                                  reiser4_key * key /* result */ )
50563 +{
50564 +       coord_t last;
50565 +       /* NOTE(review): the assertion labels below repeat those used in item_key_by_coord() */
50566 +       assert("nikita-338", coord != NULL);
50567 +       assert("nikita-339", coord->node != NULL);
50568 +       assert("nikita-340", znode_is_loaded(coord->node));
50569 +
50570 +       /* make coord pointing to last item's unit */
50571 +       coord_dup(&last, coord);
50572 +       last.unit_pos = coord_num_units(&last) - 1;
50573 +       assert("vs-1560", coord_is_existing_unit(&last));
50574 +
50575 +       max_unit_key_by_coord(&last, key);
50576 +       return key;
50577 +}
50578 +
50579 +/* return key of unit at @coord: ->b.unit_key() of the item plugin if set, the item key otherwise */
50580 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
50581 +                              reiser4_key * key /* result */ )
50582 +{
50583 +       assert("nikita-772", coord != NULL);
50584 +       assert("nikita-774", coord->node != NULL);
50585 +       assert("nikita-775", znode_is_loaded(coord->node));
50586 +
50587 +       if (item_plugin_by_coord(coord)->b.unit_key != NULL)
50588 +               return item_plugin_by_coord(coord)->b.unit_key(coord, key);
50589 +       else
50590 +               return item_key_by_coord(coord, key);
50591 +}
50592 +
50593 +/* return the biggest key contained in the unit @coord: ->b.max_unit_key() if set, else the unit key */
50594 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
50595 +                                  reiser4_key * key /* result */ )
50596 +{
50597 +       assert("nikita-772", coord != NULL);
50598 +       assert("nikita-774", coord->node != NULL);
50599 +       assert("nikita-775", znode_is_loaded(coord->node));
50600 +
50601 +       if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
50602 +               return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
50603 +       else
50604 +               return unit_key_by_coord(coord, key);
50605 +}
50606 +
50607 +/* ->max_key_inside() method for items consisting of exactly one key (like
50608 +    stat-data): the answer is simply the unit key at @coord */
50609 +static reiser4_key *max_key_inside_single_key(const coord_t *
50610 +                                             coord /* coord of item */ ,
50611 +                                             reiser4_key *
50612 +                                             result /* resulting key */ )
50613 +{
50614 +       assert("nikita-604", coord != NULL);
50615 +
50616 +       /* coord -> key is starting key of this item and it has to be already
50617 +          filled in */
50618 +       return unit_key_by_coord(coord, result);
50619 +}
50620 +
50621 +/* ->nr_units() method for items consisting of exactly one unit always: returns 1, @coord is unused */
50622 +pos_in_node_t
50623 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
50624 +{
50625 +       return 1;
50626 +}
50627 +/* ->paste() method that ignores all of its arguments and returns 0 */
50628 +static int
50629 +paste_no_paste(coord_t * coord UNUSED_ARG,
50630 +              reiser4_item_data * data UNUSED_ARG,
50631 +              carry_plugin_info * info UNUSED_ARG)
50632 +{
50633 +       return 0;
50634 +}
50635 +
50636 +/* default ->fast_paste() method: always returns 1, @coord is unused */
50637 +static int
50638 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
50639 +{
50640 +       return 1;
50641 +}
50642 +/* ask ->b.can_contain_key() if the plugin has it, else test item key <= @key <= ->b.max_key_inside() */
50643 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
50644 +                        const reiser4_key * key /* key to check */ ,
50645 +                        const reiser4_item_data * data /* parameters of item
50646 +                                                        * being created */ )
50647 +{
50648 +       item_plugin *iplug;
50649 +       reiser4_key min_key_in_item;
50650 +       reiser4_key max_key_in_item;
50651 +
50652 +       assert("nikita-1658", item != NULL);
50653 +       assert("nikita-1659", key != NULL);
50654 +
50655 +       iplug = item_plugin_by_coord(item);
50656 +       if (iplug->b.can_contain_key != NULL)
50657 +               return iplug->b.can_contain_key(item, key, data);
50658 +       else {
50659 +               assert("nikita-1681", iplug->b.max_key_inside != NULL);
50660 +               item_key_by_coord(item, &min_key_in_item);
50661 +               iplug->b.max_key_inside(item, &max_key_in_item);
50662 +
50663 +               /* can contain key if
50664 +                  min_key_in_item <= key &&
50665 +                  key <= max_key_in_item
50666 +                */
50667 +               return keyle(&min_key_in_item, key)
50668 +                   && keyle(key, &max_key_in_item);
50669 +       }
50670 +}
50671 +
50672 +/* mergeable method for non mergeable items: always returns 0, both coords are unused */
50673 +static int
50674 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
50675 +{
50676 +       return 0;
50677 +}
50678 +
50679 +/* return 0 if items at @i1 and @i2 are not mergeable, !0 - otherwise; plugin of @i1 decides */
50680 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
50681 +                       const coord_t * i2 /* coord of second item */ )
50682 +{
50683 +       item_plugin *iplug;
50684 +       reiser4_key k1;
50685 +       reiser4_key k2;
50686 +
50687 +       assert("nikita-1336", i1 != NULL);
50688 +       assert("nikita-1337", i2 != NULL);
50689 +
50690 +       iplug = item_plugin_by_coord(i1);
50691 +       assert("nikita-1338", iplug != NULL);
50692 +
50693 +       /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
50694 +          shifting code when nodes are in "suspended" state. */
50695 +       assert("nikita-1663",
50696 +              keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
50697 +
50698 +       if (iplug->b.mergeable != NULL) {
50699 +               return iplug->b.mergeable(i1, i2);
50700 +       } else if (iplug->b.max_key_inside != NULL) {
50701 +               /* mergeable if ->max_key_inside() >= key of i2; both keys
50702 +                  are computed exactly once, directly as the arguments of
50703 +                  keyge() */
50704 +
50705 +               return keyge(iplug->b.max_key_inside(i1, &k1),
50706 +                            item_key_by_coord(i2, &k2));
50707 +       } else {
50708 +               item_key_by_coord(i1, &k1);
50709 +               item_key_by_coord(i2, &k2);
50710 +               /* fallback: same locality, same objectid and same plugin */
50711 +               return
50712 +                   (get_key_locality(&k1) == get_key_locality(&k2)) &&
50713 +                   (get_key_objectid(&k1) == get_key_objectid(&k2))
50714 +                   && (iplug == item_plugin_by_coord(i2));
50715 +       }
50716 +}
50717 +/* true if the id of the item at @item is EXTENT_POINTER_ID */
50718 +int item_is_extent(const coord_t * item)
50719 +{
50720 +       assert("vs-482", coord_is_existing_item(item));
50721 +       return item_id_by_coord(item) == EXTENT_POINTER_ID;
50722 +}
50723 +/* true if the id of the item at @item is FORMATTING_ID */
50724 +int item_is_tail(const coord_t * item)
50725 +{
50726 +       assert("vs-482", coord_is_existing_item(item)); /* NOTE(review): label shared with item_is_extent() */
50727 +       return item_id_by_coord(item) == FORMATTING_ID;
50728 +}
50729 +
50730 +#if REISER4_DEBUG
50731 +/* debugging: true if the plugin of the item at @item is in the STAT_DATA_ITEM_TYPE group */
50732 +int item_is_statdata(const coord_t * item)
50733 +{
50734 +       assert("vs-516", coord_is_existing_item(item));
50735 +       return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
50736 +}
50737 +/* debugging: true if the id of the item at @item is CTAIL_ID */
50738 +int item_is_ctail(const coord_t * item)
50739 +{
50740 +       assert("edward-xx", coord_is_existing_item(item));
50741 +       return item_id_by_coord(item) == CTAIL_ID;
50742 +}
50743 +
50744 +#endif  /*  REISER4_DEBUG  */
50745 +/* ->change() method shared by item plugins: always refuses with -EINVAL, arguments are ignored */
50746 +static int change_item(struct inode *inode,
50747 +                      reiser4_plugin * plugin,
50748 +                      pset_member memb)
50749 +{
50750 +       /* cannot change constituent item (sd, or dir_item) */
50751 +       return RETERR(-EINVAL);
50752 +}
50753 +/* plugin operations referenced as .pops by entries of item_plugins[]: only ->change is set */
50754 +static reiser4_plugin_ops item_plugin_ops = {
50755 +       .init = NULL,
50756 +       .load = NULL,
50757 +       .save_len = NULL,
50758 +       .save = NULL,
50759 +       .change = change_item
50760 +};
50761 +
50762 +item_plugin item_plugins[LAST_ITEM_ID] = {
50763 +       [STATIC_STAT_DATA_ID] = {
50764 +               .h = {
50765 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
50766 +                       .id = STATIC_STAT_DATA_ID,
50767 +                       .groups = (1 << STAT_DATA_ITEM_TYPE),
50768 +                       .pops = &item_plugin_ops,
50769 +                       .label = "sd",
50770 +                       .desc = "stat-data",
50771 +                       .linkage = {NULL, NULL}
50772 +               },
50773 +               .b = {
50774 +                       .max_key_inside = max_key_inside_single_key,
50775 +                       .can_contain_key = NULL,
50776 +                       .mergeable = not_mergeable,
50777 +                       .nr_units = nr_units_single_unit,
50778 +                       .lookup = NULL,
50779 +                       .init = NULL,
50780 +                       .paste = paste_no_paste,
50781 +                       .fast_paste = NULL,
50782 +                       .can_shift = NULL,
50783 +                       .copy_units = NULL,
50784 +                       .create_hook = NULL,
50785 +                       .kill_hook = NULL,
50786 +                       .shift_hook = NULL,
50787 +                       .cut_units = NULL,
50788 +                       .kill_units = NULL,
50789 +                       .unit_key = NULL,
50790 +                       .max_unit_key = NULL,
50791 +                       .estimate = NULL,
50792 +                       .item_data_by_flow = NULL,
50793 +#if REISER4_DEBUG
50794 +                       .check = NULL
50795 +#endif
50796 +               },
50797 +               .f = {
50798 +                       .utmost_child = NULL,
50799 +                       .utmost_child_real_block = NULL,
50800 +                       .update = NULL,
50801 +                       .scan = NULL,
50802 +                       .convert = NULL
50803 +               },
50804 +               .s = {
50805 +                       .sd = {
50806 +                               .init_inode = init_inode_static_sd,
50807 +                               .save_len = save_len_static_sd,
50808 +                               .save = save_static_sd
50809 +                       }
50810 +               }
50811 +       },
50812 +       [SIMPLE_DIR_ENTRY_ID] = {
50813 +               .h = {
50814 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
50815 +                       .id = SIMPLE_DIR_ENTRY_ID,
50816 +                       .groups = (1 << DIR_ENTRY_ITEM_TYPE),
50817 +                       .pops = &item_plugin_ops,
50818 +                       .label = "de",
50819 +                       .desc = "directory entry",
50820 +                       .linkage = {NULL, NULL}
50821 +               },
50822 +               .b = {
50823 +                       .max_key_inside = max_key_inside_single_key,
50824 +                       .can_contain_key = NULL,
50825 +                       .mergeable = NULL,
50826 +                       .nr_units = nr_units_single_unit,
50827 +                       .lookup = NULL,
50828 +                       .init = NULL,
50829 +                       .paste = NULL,
50830 +                       .fast_paste = NULL,
50831 +                       .can_shift = NULL,
50832 +                       .copy_units = NULL,
50833 +                       .create_hook = NULL,
50834 +                       .kill_hook = NULL,
50835 +                       .shift_hook = NULL,
50836 +                       .cut_units = NULL,
50837 +                       .kill_units = NULL,
50838 +                       .unit_key = NULL,
50839 +                       .max_unit_key = NULL,
50840 +                       .estimate = NULL,
50841 +                       .item_data_by_flow = NULL,
50842 +#if REISER4_DEBUG
50843 +                       .check = NULL
50844 +#endif
50845 +               },
50846 +               .f = {
50847 +                       .utmost_child = NULL,
50848 +                       .utmost_child_real_block = NULL,
50849 +                       .update = NULL,
50850 +                       .scan = NULL,
50851 +                       .convert = NULL
50852 +               },
50853 +               .s = {
50854 +                       .dir = {
50855 +                               .extract_key = extract_key_de,
50856 +                               .update_key = update_key_de,
50857 +                               .extract_name = extract_name_de,
50858 +                               .extract_file_type = extract_file_type_de,
50859 +                               .add_entry = add_entry_de,
50860 +                               .rem_entry = rem_entry_de,
50861 +                               .max_name_len = max_name_len_de
50862 +                       }
50863 +               }
50864 +       },
50865 +       [COMPOUND_DIR_ID] = {
50866 +               .h = {
50867 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
50868 +                       .id = COMPOUND_DIR_ID,
50869 +                       .groups = (1 << DIR_ENTRY_ITEM_TYPE),
50870 +                       .pops = &item_plugin_ops,
50871 +                       .label = "cde",
50872 +                       .desc = "compressed directory entry",
50873 +                       .linkage = {NULL, NULL}
50874 +               },
50875 +               .b = {
50876 +                       .max_key_inside = max_key_inside_cde,
50877 +                       .can_contain_key = can_contain_key_cde,
50878 +                       .mergeable = mergeable_cde,
50879 +                       .nr_units = nr_units_cde,
50880 +                       .lookup = lookup_cde,
50881 +                       .init = init_cde,
50882 +                       .paste = paste_cde,
50883 +                       .fast_paste = agree_to_fast_op,
50884 +                       .can_shift = can_shift_cde,
50885 +                       .copy_units = copy_units_cde,
50886 +                       .create_hook = NULL,
50887 +                       .kill_hook = NULL,
50888 +                       .shift_hook = NULL,
50889 +                       .cut_units = cut_units_cde,
50890 +                       .kill_units = kill_units_cde,
50891 +                       .unit_key = unit_key_cde,
50892 +                       .max_unit_key = unit_key_cde,
50893 +                       .estimate = estimate_cde,
50894 +                       .item_data_by_flow = NULL,
50895 +#if REISER4_DEBUG
50896 +                       .check = reiser4_check_cde
50897 +#endif
50898 +               },
50899 +               .f = {
50900 +                       .utmost_child = NULL,
50901 +                       .utmost_child_real_block = NULL,
50902 +                       .update = NULL,
50903 +                       .scan = NULL,
50904 +                       .convert = NULL
50905 +               },
50906 +               .s = {
50907 +                       .dir = {
50908 +                               .extract_key = extract_key_cde,
50909 +                               .update_key = update_key_cde,
50910 +                               .extract_name = extract_name_cde,
50911 +                               .extract_file_type = extract_file_type_de,
50912 +                               .add_entry = add_entry_cde,
50913 +                               .rem_entry = rem_entry_cde,
50914 +                               .max_name_len = max_name_len_cde
50915 +                       }
50916 +               }
50917 +       },
50918 +       [NODE_POINTER_ID] = {
50919 +               .h = {
50920 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
50921 +                       .id = NODE_POINTER_ID,
50922 +                       .groups = (1 << INTERNAL_ITEM_TYPE),
50923 +                       .pops = NULL,
50924 +                       .label = "internal",
50925 +                       .desc = "internal item",
50926 +                       .linkage = {NULL, NULL}
50927 +               },
50928 +               .b = {
50929 +                       .max_key_inside = NULL,
50930 +                       .can_contain_key = NULL,
50931 +                       .mergeable = mergeable_internal,
50932 +                       .nr_units = nr_units_single_unit,
50933 +                       .lookup = lookup_internal,
50934 +                       .init = NULL,
50935 +                       .paste = NULL,
50936 +                       .fast_paste = NULL,
50937 +                       .can_shift = NULL,
50938 +                       .copy_units = NULL,
50939 +                       .create_hook = create_hook_internal,
50940 +                       .kill_hook = kill_hook_internal,
50941 +                       .shift_hook = shift_hook_internal,
50942 +                       .cut_units = NULL,
50943 +                       .kill_units = NULL,
50944 +                       .unit_key = NULL,
50945 +                       .max_unit_key = NULL,
50946 +                       .estimate = NULL,
50947 +                       .item_data_by_flow = NULL,
50948 +#if REISER4_DEBUG
50949 +                       .check = check__internal
50950 +#endif
50951 +               },
50952 +               .f = {
50953 +                       .utmost_child = utmost_child_internal,
50954 +                       .utmost_child_real_block =
50955 +                       utmost_child_real_block_internal,
50956 +                       .update = reiser4_update_internal,
50957 +                       .scan = NULL,
50958 +                       .convert = NULL
50959 +               },
50960 +               .s = {
50961 +                       .internal = {
50962 +                               .down_link = down_link_internal,
50963 +                               .has_pointer_to = has_pointer_to_internal
50964 +                       }
50965 +               }
50966 +       },
50967 +       [EXTENT_POINTER_ID] = {
50968 +               .h = {
50969 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
50970 +                       .id = EXTENT_POINTER_ID,
50971 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
50972 +                       .pops = NULL,
50973 +                       .label = "extent",
50974 +                       .desc = "extent item",
50975 +                       .linkage = {NULL, NULL}
50976 +               },
50977 +               .b = {
50978 +                       .max_key_inside = max_key_inside_extent,
50979 +                       .can_contain_key = can_contain_key_extent,
50980 +                       .mergeable = mergeable_extent,
50981 +                       .nr_units = nr_units_extent,
50982 +                       .lookup = lookup_extent,
50983 +                       .init = NULL,
50984 +                       .paste = paste_extent,
50985 +                       .fast_paste = agree_to_fast_op,
50986 +                       .can_shift = can_shift_extent,
50987 +                       .create_hook = create_hook_extent,
50988 +                       .copy_units = copy_units_extent,
50989 +                       .kill_hook = kill_hook_extent,
50990 +                       .shift_hook = NULL,
50991 +                       .cut_units = cut_units_extent,
50992 +                       .kill_units = kill_units_extent,
50993 +                       .unit_key = unit_key_extent,
50994 +                       .max_unit_key = max_unit_key_extent,
50995 +                       .estimate = NULL,
50996 +                       .item_data_by_flow = NULL,
50997 +#if REISER4_DEBUG
50998 +                       .check = reiser4_check_extent
50999 +#endif
51000 +               },
51001 +               .f = {
51002 +                       .utmost_child = utmost_child_extent,
51003 +                       .utmost_child_real_block =
51004 +                       utmost_child_real_block_extent,
51005 +                       .update = NULL,
51006 +                       .scan = reiser4_scan_extent,
51007 +                       .convert = NULL,
51008 +                       .key_by_offset = key_by_offset_extent
51009 +               },
51010 +               .s = {
51011 +                       .file = {
51012 +                               .write = reiser4_write_extent,
51013 +                               .read = reiser4_read_extent,
51014 +                               .readpage = reiser4_readpage_extent,
51015 +                               .get_block = get_block_address_extent,
51016 +                               .append_key = append_key_extent,
51017 +                               .init_coord_extension =
51018 +                               init_coord_extension_extent
51019 +                       }
51020 +               }
51021 +       },
51022 +       [FORMATTING_ID] = {
51023 +               .h = {
51024 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
51025 +                       .id = FORMATTING_ID,
51026 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51027 +                       .pops = NULL,
51028 +                       .label = "body",
51029 +                       .desc = "body (or tail?) item",
51030 +                       .linkage = {NULL, NULL}
51031 +               },
51032 +               .b = {
51033 +                       .max_key_inside = max_key_inside_tail,
51034 +                       .can_contain_key = can_contain_key_tail,
51035 +                       .mergeable = mergeable_tail,
51036 +                       .nr_units = nr_units_tail,
51037 +                       .lookup = lookup_tail,
51038 +                       .init = NULL,
51039 +                       .paste = paste_tail,
51040 +                       .fast_paste = agree_to_fast_op,
51041 +                       .can_shift = can_shift_tail,
51042 +                       .create_hook = NULL,
51043 +                       .copy_units = copy_units_tail,
51044 +                       .kill_hook = kill_hook_tail,
51045 +                       .shift_hook = NULL,
51046 +                       .cut_units = cut_units_tail,
51047 +                       .kill_units = kill_units_tail,
51048 +                       .unit_key = unit_key_tail,
51049 +                       .max_unit_key = unit_key_tail,
51050 +                       .estimate = NULL,
51051 +                       .item_data_by_flow = NULL,
51052 +#if REISER4_DEBUG
51053 +                       .check = NULL
51054 +#endif
51055 +               },
51056 +               .f = {
51057 +                       .utmost_child = NULL,
51058 +                       .utmost_child_real_block = NULL,
51059 +                       .update = NULL,
51060 +                       .scan = NULL,
51061 +                       .convert = NULL
51062 +               },
51063 +               .s = {
51064 +                       .file = {
51065 +                               .write = reiser4_write_tail,
51066 +                               .read = reiser4_read_tail,
51067 +                               .readpage = readpage_tail,
51068 +                               .get_block = get_block_address_tail,
51069 +                               .append_key = append_key_tail,
51070 +                               .init_coord_extension =
51071 +                               init_coord_extension_tail
51072 +                       }
51073 +               }
51074 +       },
51075 +       [CTAIL_ID] = {
51076 +               .h = {
51077 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
51078 +                       .id = CTAIL_ID,
51079 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
51080 +                       .pops = NULL,
51081 +                       .label = "ctail",
51082 +                       .desc = "cryptcompress tail item",
51083 +                       .linkage = {NULL, NULL}
51084 +               },
51085 +               .b = {
51086 +                       .max_key_inside = max_key_inside_tail,
51087 +                       .can_contain_key = can_contain_key_ctail,
51088 +                       .mergeable = mergeable_ctail,
51089 +                       .nr_units = nr_units_ctail,
51090 +                       .lookup = NULL,
51091 +                       .init = init_ctail,
51092 +                       .paste = paste_ctail,
51093 +                       .fast_paste = agree_to_fast_op,
51094 +                       .can_shift = can_shift_ctail,
51095 +                       .create_hook = create_hook_ctail,
51096 +                       .copy_units = copy_units_ctail,
51097 +                       .kill_hook = kill_hook_ctail,
51098 +                       .shift_hook = shift_hook_ctail,
51099 +                       .cut_units = cut_units_ctail,
51100 +                       .kill_units = kill_units_ctail,
51101 +                       .unit_key = unit_key_tail,
51102 +                       .max_unit_key = unit_key_tail,
51103 +                       .estimate = estimate_ctail,
51104 +                       .item_data_by_flow = NULL,
51105 +#if REISER4_DEBUG
51106 +                       .check = check_ctail
51107 +#endif
51108 +               },
51109 +               .f = {
51110 +                       .utmost_child = utmost_child_ctail,
51111 +                       /* FIXME-EDWARD: write this */
51112 +                       .utmost_child_real_block = NULL,
51113 +                       .update = NULL,
51114 +                       .scan = scan_ctail,
51115 +                       .convert = convert_ctail
51116 +               },
51117 +               .s = {
51118 +                       .file = {
51119 +                               .write = NULL,
51120 +                               .read = read_ctail,
51121 +                               .readpage = readpage_ctail,
51122 +                               .get_block = get_block_address_tail,
51123 +                               .append_key = append_key_ctail,
51124 +                               .init_coord_extension =
51125 +                               init_coord_extension_tail
51126 +                       }
51127 +               }
51128 +       },
51129 +       [BLACK_BOX_ID] = {
51130 +               .h = {
51131 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
51132 +                       .id = BLACK_BOX_ID,
51133 +                       .groups = (1 << OTHER_ITEM_TYPE),
51134 +                       .pops = NULL,
51135 +                       .label = "blackbox",
51136 +                       .desc = "black box item",
51137 +                       .linkage = {NULL, NULL}
51138 +               },
51139 +               .b = {
51140 +                       .max_key_inside = NULL,
51141 +                       .can_contain_key = NULL,
51142 +                       .mergeable = not_mergeable,
51143 +                       .nr_units = nr_units_single_unit,
51144 +                       /* no need for ->lookup method */
51145 +                       .lookup = NULL,
51146 +                       .init = NULL,
51147 +                       .paste = NULL,
51148 +                       .fast_paste = NULL,
51149 +                       .can_shift = NULL,
51150 +                       .copy_units = NULL,
51151 +                       .create_hook = NULL,
51152 +                       .kill_hook = NULL,
51153 +                       .shift_hook = NULL,
51154 +                       .cut_units = NULL,
51155 +                       .kill_units = NULL,
51156 +                       .unit_key = NULL,
51157 +                       .max_unit_key = NULL,
51158 +                       .estimate = NULL,
51159 +                       .item_data_by_flow = NULL,
51160 +#if REISER4_DEBUG
51161 +                       .check = NULL
51162 +#endif
51163 +               }
51164 +       }
51165 +};
51166 +
51167 +/* Make Linus happy.
51168 +   Local variables:
51169 +   c-indentation-style: "K&R"
51170 +   mode-name: "LC"
51171 +   c-basic-offset: 8
51172 +   tab-width: 8
51173 +   fill-column: 120
51174 +   End:
51175 +*/
51176 diff -puN /dev/null fs/reiser4/plugin/item/item.h
51177 --- /dev/null
51178 +++ a/fs/reiser4/plugin/item/item.h
51179 @@ -0,0 +1,398 @@
51180 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51181 +
51182 +/* first read balance.c comments before reading this */
51183 +
51184 +/* An item_plugin implements all of the operations required for
51185 +   balancing that are item specific. */
51186 +
51187 +/* an item plugin also implements other operations that are specific to that
51188 +   item.  These go into the item specific operations portion of the item
51189 +   handler, and all of the item specific portions of the item handler are put
51190 +   into a union. */
51191 +
51192 +#if !defined( __REISER4_ITEM_H__ )
51193 +#define __REISER4_ITEM_H__
51194 +
51195 +#include "../../forward.h"
51196 +#include "../plugin_header.h"
51197 +#include "../../dformat.h"
51198 +#include "../../seal.h"
51199 +#include "../../plugin/file/file.h"
51200 +
51201 +#include <linux/fs.h>          /* for struct file, struct inode  */
51202 +#include <linux/mm.h>          /* for struct page */
51203 +#include <linux/dcache.h>      /* for struct dentry */
51204 +
51205 +typedef enum {
51206 +       STAT_DATA_ITEM_TYPE,            /* stat-data item ("sd") */
51207 +       DIR_ENTRY_ITEM_TYPE,            /* directory entry items ("de", "cde") */
51208 +       INTERNAL_ITEM_TYPE,             /* internal (node pointer) item */
51209 +       UNIX_FILE_METADATA_ITEM_TYPE,   /* file body items: extent, tail, ctail */
51210 +       OTHER_ITEM_TYPE                 /* anything else, e.g. the black box item */
51211 +} item_type_id;                        /* values are bit numbers in plugin header ->groups */
51212 +
51213 +/* this is the part of each item plugin that all items are expected to
51214 +   support or at least explicitly fail to support by setting the
51215 +   pointer to null. */
51216 +struct balance_ops {
51217 +       /* operations called by balancing
51218 +
51219 +          It is interesting to consider that some of these item
51220 +          operations could be given sources or targets that are not
51221 +          really items in nodes.  This could be ok/useful.
51222 +
51223 +        */
51224 +       /* maximal key that can _possibly_ be occupied by this item
51225 +
51226 +          When inserting, and node ->lookup() method (called by
51227 +          coord_by_key()) reaches an item after binary search,
51228 +          the  ->max_key_inside() item plugin method is used to determine
51229 +          whether new item should be pasted into existing item
51230 +          (new_key<=max_key_inside()) or new item has to be created
51231 +          (new_key>max_key_inside()).
51232 +
51233 +          For items that occupy exactly one key (like stat-data)
51234 +          this method should return this key. For items that can
51235 +          grow indefinitely (extent, directory item) this should
51236 +          return reiser4_max_key().
51237 +
51238 +          For example extent with the key
51239 +
51240 +          (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
51241 +
51242 +          ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
51243 +        */
51244 +       reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
51245 +
51246 +       /* true if item @coord can merge data at @key. */
51247 +       int (*can_contain_key) (const coord_t *, const reiser4_key *,
51248 +                               const reiser4_item_data *);
51249 +       /* mergeable() - check items for mergeability
51250 +
51251 +          Optional method. Returns true if two items can be merged.
51252 +
51253 +        */
51254 +       int (*mergeable) (const coord_t *, const coord_t *);
51255 +
51256 +       /* number of atomic things in an item.
51257 +          NOTE FOR CONTRIBUTORS: use a generic method
51258 +          nr_units_single_unit() for solid (atomic) items, as
51259 +          tree operations use it as a criterion of solidness
51260 +          (see is_solid_item macro) */
51261 +       pos_in_node_t(*nr_units) (const coord_t *);
51262 +
51263 +       /* search within item for a unit within the item, and return a
51264 +          pointer to it.  This can be used to calculate how many
51265 +          bytes to shrink an item if you use pointer arithmetic and
51266 +          compare to the start of the item body if the item's data
51267 +          are continuous in the node, if the item's data are not
51268 +          continuous in the node, all sorts of other things are maybe
51269 +          going to break as well. */
51270 +        lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
51271 +       /* method called by node_plugin->create_item() to initialise new
51272 +          item */
51273 +       int (*init) (coord_t * target, coord_t * from,
51274 +                    reiser4_item_data * data);
51275 +       /* method called (e.g., by reiser4_resize_item()) to place new data
51276 +          into item when it grows */
51277 +       int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
51278 +       /* return true if paste into @coord is allowed to skip
51279 +          carry. That is, if such paste would require any changes
51280 +          at the parent level
51281 +        */
51282 +       int (*fast_paste) (const coord_t *);
51283 +       /* how many but not more than @want units of @source can be
51284 +          shifted into @target node. If pend == append - we try to
51285 +          append last item of @target by first units of @source. If
51286 +          pend == prepend - we try to "prepend" first item in @target
51287 +          by last units of @source. @target node has @free_space
51288 +          bytes of free space. Total size of those units are returned
51289 +          via @size.
51290 +
51291 +          @target is not NULL if shifting to the mergeable item and
51292 +          NULL if new item will be created during shifting.
51293 +        */
51294 +       int (*can_shift) (unsigned free_space, coord_t *,
51295 +                         znode *, shift_direction, unsigned *size,
51296 +                         unsigned want);
51297 +
51298 +       /* starting off @from-th unit of item @source append or
51299 +          prepend @count units to @target. @target has been already
51300 +          expanded by @free_space bytes. That must be exactly what is
51301 +          needed for those items in @target. If @where_is_free_space
51302 +          == SHIFT_LEFT - free space is at the end of @target item,
51303 +          otherwise - it is in the beginning of it. */
51304 +       void (*copy_units) (coord_t *, coord_t *,
51305 +                           unsigned from, unsigned count,
51306 +                           shift_direction where_is_free_space,
51307 +                           unsigned free_space);
51308 +
51309 +       int (*create_hook) (const coord_t *, void *);
51310 +       /* do whatever is necessary to do when @count units starting
51311 +          from @from-th one are removed from the tree */
51312 +       /* FIXME-VS: this used to be here for, in particular,
51313 +          extents and items of internal type to free blocks they point
51314 +          to at the same time with removing items from a
51315 +          tree. Problems start, however, when dealloc_block fails due
51316 +          to some reason. Item gets removed, but blocks it pointed to
51317 +          are not freed. It is not clear how to fix this for items of
51318 +          internal type because a need to remove internal item may
51319 +          appear in the middle of balancing, and there is no way to
51320 +          undo changes made. OTOH, if space allocator involves
51321 +          balancing to perform dealloc_block - this will probably
51322 +          break balancing due to deadlock issues
51323 +        */
51324 +       int (*kill_hook) (const coord_t *, pos_in_node_t from,
51325 +                         pos_in_node_t count, struct carry_kill_data *);
51326 +       int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
51327 +                          znode * _node);
51328 +
51329 +       /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
51330 +          including boundaries. When units are cut from item beginning - move space which gets freed to head of
51331 +          item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
51332 +          item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
51333 +          @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
51334 +        */
51335 +       int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
51336 +                         struct carry_cut_data *,
51337 +                         reiser4_key * smallest_removed,
51338 +                         reiser4_key * new_first_key);
51339 +
51340 +       /* like cut_units, except that these units are removed from the
51341 +          tree, not only from a node */
51342 +       int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
51343 +                          struct carry_kill_data *,
51344 +                          reiser4_key * smallest_removed,
51345 +                          reiser4_key * new_first);
51346 +
51347 +       /* if @key_of_coord == 1 - returned key of coord, otherwise -
51348 +          key of unit is returned. If @coord is not set to certain
51349 +          unit - ERR_PTR(-ENOENT) is returned */
51350 +       reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
51351 +       reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
51352 +       /* estimate how much space is needed for paste @data into item at
51353 +          @coord. if @coord==0 - estimate insertion, otherwise - estimate
51354 +          pasting
51355 +        */
51356 +       int (*estimate) (const coord_t *, const reiser4_item_data *);
51357 +
51358 +       /* converts flow @f to item data. @coord == 0 on insert */
51359 +       int (*item_data_by_flow) (const coord_t *, const flow_t *,
51360 +                                 reiser4_item_data *);
51361 +
51362 +       /*void (*show) (struct seq_file *, coord_t *); */
51363 +
51364 +#if REISER4_DEBUG
51365 +       /* used for debugging, every item should have here the most
51366 +          complete possible check of the consistency of the item that
51367 +          the inventor can construct */
51368 +       int (*check) (const coord_t *, const char **error);
51369 +#endif
51370 +
51371 +};
51372 +
51373 +struct flush_ops {    /* item methods through which flush operates on items */
51374 +       /* return the right or left child of @coord, only if it is in memory */
51375 +       int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
51376 +
51377 +       /* return whether the right or left child of @coord has a non-fake
51378 +          block number. */
51379 +       int (*utmost_child_real_block) (const coord_t *, sideof side,
51380 +                                       reiser4_block_nr *);
51381 +       /* relocate child at @coord to the @block */
51382 +       void (*update) (const coord_t *, const reiser4_block_nr *);
51383 +       /* count unformatted nodes per item for leave relocation policy, etc. */
51384 +       int (*scan) (flush_scan * scan);
51385 +       /* convert item by flush */
51386 +       int (*convert) (flush_pos_t * pos);
51387 +       /* backward mapping from jnode offset to a key.  */
51388 +       int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
51389 +};
51390 +
51391 +/* operations specific to directory entry items (SIMPLE_DIR_ENTRY_ID, COMPOUND_DIR_ID) */
51392 +struct dir_entry_iops {
51393 +       /* extract stat-data key from directory entry at @coord and place it
51394 +          into @key. */
51395 +       int (*extract_key) (const coord_t *, reiser4_key * key);
51396 +       /* update object key in item. */
51397 +       int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
51398 +       /* extract name from directory entry at @coord and return it */
51399 +       char *(*extract_name) (const coord_t *, char *buf);
51400 +       /* extract file type (DT_* stuff) from directory entry at @coord and
51401 +          return it */
51402 +       unsigned (*extract_file_type) (const coord_t *);
51403 +       int (*add_entry) (struct inode * dir,
51404 +                         coord_t *, lock_handle *,
51405 +                         const struct dentry * name,
51406 +                         reiser4_dir_entry_desc * entry);
51407 +       int (*rem_entry) (struct inode * dir, const struct qstr * name,
51408 +                         coord_t *, lock_handle *,
51409 +                         reiser4_dir_entry_desc * entry);
51410 +       int (*max_name_len) (const struct inode * dir);
51411 +};
51412 +
51413 +/* operations specific to the items that regular (unix) file bodies are built of */
51414 +struct file_iops{
51415 +       int (*write) (struct file *, struct inode *,
51416 +                     const char __user *, size_t, loff_t *pos);
51417 +       int (*read) (struct file *, flow_t *, hint_t *);
51418 +       int (*readpage) (void *, struct page *);
51419 +       int (*get_block) (const coord_t *, sector_t, sector_t *);
51420 +       /*
51421 +        * key of the first byte which is not addressed by the item that
51422 +        * @coord is set to.
51423 +        * For example, for extent item with the key
51424 +        *
51425 +        * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
51426 +        *
51427 +        * ->append_key is
51428 +        *
51429 +        * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
51430 +        */
51431 +       reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
51432 +
51433 +       void (*init_coord_extension) (uf_coord_t *, loff_t);
51434 +};
51435 +
51436 +/* operations specific to items of stat data type (used by STATIC_STAT_DATA_ID) */
51437 +struct sd_iops {
51438 +       int (*init_inode) (struct inode * inode, char *sd, int len);
51439 +       int (*save_len) (struct inode * inode);
51440 +       int (*save) (struct inode * inode, char **area);
51441 +};
51442 +
51443 +/* operations specific to internal item */
51444 +struct internal_iops{
51445 +       /* all that tree traversal wants to know from an internal item is
51446 +          where to go next. */
51447 +       void (*down_link) (const coord_t * coord,
51448 +                          const reiser4_key * key, reiser4_block_nr * block);
51449 +       /* check that given internal item contains given pointer. */
51450 +       int (*has_pointer_to) (const coord_t * coord,
51451 +                              const reiser4_block_nr * block);
51452 +};
51453 +
51454 +struct item_plugin {
51455 +       /* generic fields; h.id is what item_id_by_plugin() returns */
51456 +       plugin_header h;
51457 +       /* methods common for all item types */
51458 +       struct balance_ops b; /* balance operations */
51459 +       struct flush_ops f;   /* flush operates on items via these methods */
51460 +
51461 +       /* methods specific to particular type of item */
51462 +       union {
51463 +               struct dir_entry_iops dir;
51464 +               struct      file_iops file;
51465 +               struct        sd_iops sd;
51466 +               struct  internal_iops internal;
51467 +       } s;
51468 +};
51469 +
51470 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
51471 +
51472 +static inline item_id item_id_by_plugin(item_plugin * plugin)
51473 +{
51474 +       return plugin->h.id;    /* id is kept in the generic plugin header */
51475 +}
51476 +
51477 +static inline char get_iplugid(item_plugin * iplug)
51478 +{
51479 +       assert("nikita-2838", iplug != NULL);
51480 +       assert("nikita-2839", iplug->h.id < 0xff); /* id is narrowed below */
51481 +       return (char)item_id_by_plugin(iplug);
51482 +}
51483 +
51484 +extern unsigned long znode_times_locked(const znode * z);
51485 +
51486 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
51487 +{
51488 +       assert("nikita-2837", coord != NULL);
51489 +       assert("nikita-2838", iplug != NULL);
51490 +       coord->iplugid = get_iplugid(iplug);     /* cache id in @coord */
51491 +       ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
51492 +}
51493 +
51494 +static inline item_plugin *coord_iplug(const coord_t * coord)
51495 +{
51496 +       assert("nikita-2833", coord != NULL);
51497 +       assert("nikita-2834", coord->iplugid != INVALID_PLUGID); /* id set */
51498 +       assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
51499 +       return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
51500 +                                           coord->iplugid);
51501 +}
51502 +
51503 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
51504 +                               const reiser4_item_data *);
51505 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
51506 +extern int item_is_extent(const coord_t *);
51507 +extern int item_is_tail(const coord_t *);
51508 +extern int item_is_statdata(const coord_t * item);
51509 +extern int item_is_ctail(const coord_t *);
51510 +
51511 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
51512 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
51513 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
51514 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
51515 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
51516 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
51517 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
51518 +                                         reiser4_key * key);
51519 +extern void obtain_item_plugin(const coord_t * coord);
51520 +
51521 +#if defined(REISER4_DEBUG)
51522 +extern int znode_is_loaded(const znode * node);
51523 +#endif
51524 +
51525 +/* return plugin of item at @coord; the node of @coord must be loaded */
51526 +static inline item_plugin *item_plugin_by_coord(const coord_t *
51527 +                                               coord /* coord to query */ )
51528 +{
51529 +       assert("nikita-330", coord != NULL);
51530 +       assert("nikita-331", coord->node != NULL);
51531 +       assert("nikita-332", znode_is_loaded(coord->node));
51532 +       /* plugin not set in @coord yet: obtain_item_plugin() is expected to set it */
51533 +       if (unlikely(!coord_is_iplug_set(coord)))
51534 +               obtain_item_plugin(coord);
51535 +       return coord_iplug(coord);
51536 +}
51537 +
51538 +/* true iff the plugin of the item at @item is in group INTERNAL_ITEM_TYPE */
51539 +static inline int item_is_internal(const coord_t * item)
51540 +{
51541 +       assert("vs-483", coord_is_existing_item(item));
51542 +       return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
51543 +}
51544 +
51545 +extern void item_body_by_coord_hard(coord_t * coord);
51546 +extern void *item_body_by_coord_easy(const coord_t * coord);
51547 +#if REISER4_DEBUG
51548 +extern int item_body_is_valid(const coord_t * coord);
51549 +#endif
51550 +
51551 +/* return pointer to item body; the node of @coord must be loaded */
51552 +static inline void *item_body_by_coord(const coord_t *
51553 +                                      coord /* coord to query */ )
51554 +{
51555 +       assert("nikita-324", coord != NULL);
51556 +       assert("nikita-325", coord->node != NULL);
51557 +       assert("nikita-326", znode_is_loaded(coord->node));
51558 +       /* no cached offset: slow path gets non-const @coord, presumably to set it */
51559 +       if (coord->offset == INVALID_OFFSET)
51560 +               item_body_by_coord_hard((coord_t *) coord);
51561 +       assert("nikita-3201", item_body_is_valid(coord));
51562 +       assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
51563 +       return item_body_by_coord_easy(coord);
51564 +}
51565 +
51566 +/* __REISER4_ITEM_H__ */
51567 +#endif
51568 +/* Make Linus happy.
51569 +   Local variables:
51570 +   c-indentation-style: "K&R"
51571 +   mode-name: "LC"
51572 +   c-basic-offset: 8
51573 +   tab-width: 8
51574 +   fill-column: 120
51575 +   scroll-step: 1
51576 +   End:
51577 +*/
51578 diff -puN /dev/null fs/reiser4/plugin/item/sde.c
51579 --- /dev/null
51580 +++ a/fs/reiser4/plugin/item/sde.c
51581 @@ -0,0 +1,190 @@
51582 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51583 +
51584 +/* Directory entry implementation */
51585 +#include "../../forward.h"
51586 +#include "../../debug.h"
51587 +#include "../../dformat.h"
51588 +#include "../../kassign.h"
51589 +#include "../../coord.h"
51590 +#include "sde.h"
51591 +#include "item.h"
51592 +#include "../plugin.h"
51593 +#include "../../znode.h"
51594 +#include "../../carry.h"
51595 +#include "../../tree.h"
51596 +#include "../../inode.h"
51597 +
51598 +#include <linux/fs.h>          /* for struct inode */
51599 +#include <linux/dcache.h>      /* for struct dentry */
51600 +#include <linux/quotaops.h>
51601 +
51602 +/* ->extract_key() method of simple directory item plugin: unpacks into @key
51603 +   the object key id stored in the body of the entry at @coord. */
51604 +int extract_key_de(const coord_t * coord, reiser4_key * key)
51605 +{
51606 +       directory_entry_format *dent;
51607 +
51608 +       assert("nikita-1458", coord != NULL);
51609 +       assert("nikita-1459", key != NULL);
51610 +
51611 +       dent = (directory_entry_format *) item_body_by_coord(coord);
51612 +       assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
51613 +       return extract_key_from_id(&dent->id, key);
51614 +}
51615 +
51616 +/* Rewrite the object id of the directory entry at @coord from @key. */
51617 +int update_key_de(const coord_t * coord, const reiser4_key * key,
51618 +                 lock_handle * lh UNUSED_ARG)
51619 +{
51620 +       directory_entry_format *dent;
51621 +       obj_key_id obj_id;
51622 +       int result;
51623 +
51624 +       assert("nikita-2342", coord != NULL);
51625 +       assert("nikita-2343", key != NULL);
51626 +
51627 +       dent = (directory_entry_format *) item_body_by_coord(coord);
51628 +       result = build_obj_key_id(key, &obj_id);
51629 +       if (result == 0) {
51630 +               dent->id = obj_id;
51631 +               znode_make_dirty(coord->node);
51632 +       }
51633 +       return result;  /* was 0: build_obj_key_id() failure got lost */
51634 +}
51635 +
51636 +/* Name of the entry at @coord: a long name is read from @dent, "." is
51637 +   returned as a literal, anything else is extracted from the key via @buf. */
51638 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
51639 +                       char *buf)
51640 +{
51641 +       reiser4_key unit_key;
51642 +
51643 +       unit_key_by_coord(coord, &unit_key);
51644 +       if (get_key_type(&unit_key) != KEY_FILE_NAME_MINOR)
51645 +               reiser4_print_address("oops", znode_get_block(coord->node));
51646 +       if (is_longname_key(&unit_key))
51647 +               return (char *)dent->name;
51648 +       if (is_dot_key(&unit_key))
51649 +               return (char *)".";
51650 +       return extract_name_from_key(&unit_key, buf);
51651 +}
51652 +
51653 +/* ->extract_name() method of simple directory item plugin: hands the body of
51654 +   the item at @coord, together with @buf, to extract_dent_name(). */
51655 +char *extract_name_de(const coord_t * coord, char *buf)
51656 +{
51657 +       directory_entry_format *body;
51658 +
51659 +       assert("nikita-1460", coord != NULL);
51660 +       body = (directory_entry_format *) item_body_by_coord(coord);
51661 +       return extract_dent_name(coord, body, buf);
51662 +}
51663 +
51664 +/* ->extract_file_type() method of simple directory item plugin. */
51665 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
51666 +                                                                * item */ )
51667 +{
51668 +       assert("nikita-1764", coord != NULL);
51669 +       /* we don't store file type in the directory entry yet, so every
51670 +          entry is reported as DT_UNKNOWN.
51671 +          But see comments at kassign.h:obj_key_id
51672 +        */
51673 +       return DT_UNKNOWN;
51674 +}
51675 +
51676 +/* Add entry for @de to @dir at @coord; quota is uncharged if insert fails. */
51677 +int add_entry_de(struct inode *dir /* directory of item */ ,
51678 +                coord_t * coord /* coord of item */ ,
51679 +                lock_handle * lh /* insertion lock handle */ ,
51680 +                const struct dentry *de /* name to add */ ,
51681 +                reiser4_dir_entry_desc * entry /* new entry parameters */ )
51682 +{
51683 +       reiser4_item_data data;
51684 +       directory_entry_format *dent;
51685 +       int result;
51686 +       const char *name;
51687 +       int len;
51688 +       int longname;
51689 +
51690 +       name = de->d_name.name;
51691 +       len = de->d_name.len;
51692 +       assert("nikita-1163", strlen(name) == len);
51693 +       longname = is_longname(name, len);
51694 +
51695 +       data.length = sizeof *dent;
51696 +       if (longname)
51697 +               data.length += len + 1; /* name and its terminating NUL */
51698 +       data.data = NULL;
51699 +       data.user = 0;
51700 +       data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
51701 +
51702 +       /* NOTE-NIKITA quota plugin */
51703 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
51704 +               return -EDQUOT;
51705 +
51706 +       result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
51707 +       if (result != 0) {
51708 +               DQUOT_FREE_SPACE_NODIRTY(dir, data.length); /* undo charge */
51709 +               return result;
51710 +       }
51711 +       dent = (directory_entry_format *) item_body_by_coord(coord);
51712 +       build_inode_key_id(entry->obj, &dent->id);
51713 +       if (longname) {
51714 +               memcpy(dent->name, name, len);
51715 +               put_unaligned(0, &dent->name[len]);
51716 +       }
51717 +       return 0;
51718 +}
51719 +
51720 +/* Remove the directory entry at @coord from @dir.  Fails with -EIO when @dir
51721 +   accounts for fewer bytes than the item occupies; otherwise kills the item
51722 +   and, on success, returns its length to the quota of @dir. */
51723 +int rem_entry_de(struct inode *dir /* directory of item */ ,
51724 +                const struct qstr *name UNUSED_ARG,
51725 +                coord_t * coord /* coord of item */ ,
51726 +                lock_handle * lh UNUSED_ARG /* lock handle for removal */ ,
51727 +                reiser4_dir_entry_desc * entry UNUSED_ARG /* removed entry */ )
51728 +{
51729 +       coord_t shadow;
51730 +       int result;
51731 +       int length;
51732 +
51733 +       length = item_length_by_coord(coord);
51734 +       if (inode_get_bytes(dir) < length) {
51735 +               warning("nikita-2627", "Dir is broke: %llu: %llu",
51736 +                       (unsigned long long)get_inode_oid(dir),
51737 +                       (unsigned long long)inode_get_bytes(dir));
51738 +
51739 +               return RETERR(-EIO);
51740 +       }
51741 +
51742 +       /* cut_node() is supposed to take pointers to _different_
51743 +          coords, because it will modify them without respect to
51744 +          possible aliasing. To work around this, create temporary copy
51745 +          of @coord.
51746 +        */
51747 +       coord_dup(&shadow, coord);
51748 +       result =
51749 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
51750 +       if (result == 0) {
51751 +               /* NOTE-NIKITA quota plugin */
51752 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
51753 +       }
51754 +       return result;
51755 +}
51756 +
51757 +int max_name_len_de(const struct inode *dir)
51758 +{
51759 +       return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
51760 +               sizeof(directory_entry_format) - 2; /* NOTE(review): why 2? */
51761 +}
51762 +
51763 +/* Make Linus happy.
51764 +   Local variables:
51765 +   c-indentation-style: "K&R"
51766 +   mode-name: "LC"
51767 +   c-basic-offset: 8
51768 +   tab-width: 8
51769 +   fill-column: 120
51770 +   End:
51771 +*/
51772 diff -puN /dev/null fs/reiser4/plugin/item/sde.h
51773 --- /dev/null
51774 +++ a/fs/reiser4/plugin/item/sde.h
51775 @@ -0,0 +1,66 @@
51776 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51777 +
51778 +/* Directory entry. */
51779 +
51780 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
51781 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
51782 +
51783 +#include "../../forward.h"
51784 +#include "../../dformat.h"
51785 +#include "../../kassign.h"
51786 +#include "../../key.h"
51787 +
51788 +#include <linux/fs.h>
51789 +#include <linux/dcache.h>      /* for struct dentry */
51790 +
51791 +typedef struct directory_entry_format {
51792 +       /* key of object stat-data. It's not necessary to store whole
51793 +          key here, because it's always key of stat-data, so minor
51794 +          packing locality and offset can be omitted here. But this
51795 +          relies on particular key allocation scheme for stat-data, so,
51796 +          for extensibility's sake, whole key can be stored here.
51797 +
51798 +          We store key as array of bytes, because we don't want 8-byte
51799 +          alignment of dir entries.
51800 +        */
51801 +       obj_key_id id;
51802 +       /* file name, NUL-terminated; add_entry_de() fills it for long names */
51803 +       d8 name[0];
51804 +} directory_entry_format;
51805 +
51806 +void print_de(const char *prefix, coord_t * coord);
51807 +int extract_key_de(const coord_t * coord, reiser4_key * key);
51808 +int update_key_de(const coord_t * coord, const reiser4_key * key,
51809 +                 lock_handle * lh);
51810 +char *extract_name_de(const coord_t * coord, char *buf);
51811 +unsigned extract_file_type_de(const coord_t * coord);
51812 +int add_entry_de(struct inode *dir, coord_t * coord,
51813 +                lock_handle * lh, const struct dentry *name,
51814 +                reiser4_dir_entry_desc * entry);
51815 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
51816 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
51817 +int max_name_len_de(const struct inode *dir);
51818 +
51819 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
51820 +
51821 +char *extract_dent_name(const coord_t * coord,
51822 +                       directory_entry_format * dent, char *buf);
51823 +
51824 +#if REISER4_LARGE_KEY
51825 +#define DE_NAME_BUF_LEN (24)
51826 +#else
51827 +#define DE_NAME_BUF_LEN (16)
51828 +#endif
51829 +
51830 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
51831 +#endif
51832 +
51833 +/* Make Linus happy.
51834 +   Local variables:
51835 +   c-indentation-style: "K&R"
51836 +   mode-name: "LC"
51837 +   c-basic-offset: 8
51838 +   tab-width: 8
51839 +   fill-column: 120
51840 +   End:
51841 +*/
51842 diff -puN /dev/null fs/reiser4/plugin/item/static_stat.c
51843 --- /dev/null
51844 +++ a/fs/reiser4/plugin/item/static_stat.c
51845 @@ -0,0 +1,1107 @@
51846 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51847 +
51848 +/* stat data manipulation. */
51849 +
51850 +#include "../../forward.h"
51851 +#include "../../super.h"
51852 +#include "../../vfs_ops.h"
51853 +#include "../../inode.h"
51854 +#include "../../debug.h"
51855 +#include "../../dformat.h"
51856 +#include "../object.h"
51857 +#include "../plugin.h"
51858 +#include "../plugin_header.h"
51859 +#include "static_stat.h"
51860 +#include "item.h"
51861 +
51862 +#include <linux/types.h>
51863 +#include <linux/fs.h>
51864 +
51865 +/* see static_stat.h for explanation */
51866 +
51867 +/* Helper used while dumping/loading inode/plugin state to/from the
51868 +    stat-data: steps over @size_of bytes by advancing *@area and shrinking
51869 +    *@length, which is asserted to stay non-negative. */
51870 +static void move_on(int *length /* space remaining in stat-data */ ,
51871 +                   char **area /* current coord in stat data */ ,
51872 +                   int size_of /* how many bytes to move forward */ )
51873 +{
51874 +       assert("nikita-615", length != NULL);
51875 +       assert("nikita-616", area != NULL);
51876 +
51877 +       *area += size_of;
51878 +       *length -= size_of;
51879 +
51880 +       assert("nikita-617", *length >= 0);
51881 +}
51882 +
51883 +/* helper function used while loading inode/plugin state from stat-data.
51884 +    Complain if there is less space in stat-data than was expected.
51885 +    Can only happen on disk corruption. Always returns -EINVAL. */
51886 +static int not_enough_space(struct inode *inode /* object being processed */ ,
51887 +                           const char *where /* part being loaded */ )
51888 +{
51889 +       assert("nikita-618", inode != NULL);
51890 +
51891 +       warning("nikita-619", "Not enough space in %llu while loading %s",
51892 +               (unsigned long long)get_inode_oid(inode), where);
51893 +
51894 +       return RETERR(-EINVAL);
51895 +}
51896 +
51897 +/* helper function used while loading inode/plugin state from stat-data.
51898 +    Call it if invalid plugin id was found: warns and returns -EINVAL. */
51899 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
51900 +                         struct inode *inode /* object being processed */ )
51901 +{
51902 +       warning("nikita-620", "Unknown plugin %i in %llu",
51903 +               id, (unsigned long long)get_inode_oid(inode));
51904 +
51905 +       return RETERR(-EINVAL);
51906 +}
51907 +
51908 +/* this is installed as ->init_inode() method of
51909 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
51910 +    Copies data from on-disk stat-data format into inode: walks the extension
51911 +    bitmask and calls ->present()/->absent() of each stat-data extension. */
51912 +/* was sd_load; returns 0, -EINVAL, or the first extension error */
51913 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
51914 +                        char *sd /* stat-data body */ ,
51915 +                        int len /* length of stat-data */ )
51916 +{
51917 +       int result;
51918 +       int bit;
51919 +       int chunk;
51920 +       __u16 mask;
51921 +       __u64 bigmask;
51922 +       reiser4_stat_data_base *sd_base;
51923 +       reiser4_inode *state;
51924 +
51925 +       assert("nikita-625", inode != NULL);
51926 +       assert("nikita-626", sd != NULL);
51927 +
51928 +       result = 0;
51929 +       sd_base = (reiser4_stat_data_base *) sd;
51930 +       state = reiser4_inode_data(inode);
51931 +       mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
51932 +       bigmask = mask;
51933 +       reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
51934 +       /* step over the base stat-data; extensions follow it */
51935 +       move_on(&len, &sd, sizeof *sd_base);
51936 +       for (bit = 0, chunk = 0;
51937 +            mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
51938 +            ++bit, mask >>= 1) {
51939 +               if (((bit + 1) % 16) != 0) {
51940 +                       /* handle extension */
51941 +                       sd_ext_plugin *sdplug;
51942 +
51943 +                       if (bit >= LAST_SD_EXTENSION) {
51944 +                               warning("vpf-1904",
51945 +                                       "No such extension %i in inode %llu",
51946 +                                       bit,
51947 +                                       (unsigned long long)
51948 +                                       get_inode_oid(inode));
51949 +
51950 +                               result = RETERR(-EINVAL);
51951 +                               break;
51952 +                       }
51953 +
51954 +                       sdplug = sd_ext_plugin_by_id(bit);
51955 +                       if (sdplug == NULL) {
51956 +                               warning("nikita-627",
51957 +                                       "No such extension %i in inode %llu",
51958 +                                       bit,
51959 +                                       (unsigned long long)
51960 +                                       get_inode_oid(inode));
51961 +
51962 +                               result = RETERR(-EINVAL);
51963 +                               break;
51964 +                       }
51965 +                       if (mask & 1) {
51966 +                               assert("nikita-628", sdplug->present);
51967 +                               /* alignment is not supported in node layout
51968 +                                  plugin yet.
51969 +                                  result = align( inode, &len, &sd,
51970 +                                  sdplug -> alignment );
51971 +                                  if( result != 0 )
51972 +                                  return result; */
51973 +                               result = sdplug->present(inode, &sd, &len);
51974 +                       } else if (sdplug->absent != NULL)
51975 +                               result = sdplug->absent(inode);
51976 +                       if (result)
51977 +                               break;
51978 +                       /* else, we are looking at the last bit in 16-bit
51979 +                          portion of bitmask */
51980 +               } else if (mask & 1) {
51981 +                       /* next portion of bitmask */
51982 +                       if (len < (int)sizeof(d16)) {
51983 +                               warning("nikita-629",
51984 +                                       "No space for bitmap in inode %llu",
51985 +                                       (unsigned long long)
51986 +                                       get_inode_oid(inode));
51987 +
51988 +                               result = RETERR(-EINVAL);
51989 +                               break;
51990 +                       }
51991 +                       mask = le16_to_cpu(get_unaligned((d16 *)sd));
51992 +                       bigmask <<= 16;
51993 +                       bigmask |= mask;
51994 +                       move_on(&len, &sd, sizeof(d16));
51995 +                       ++chunk;
51996 +                       if (chunk == 3) {
51997 +                               if (!(mask & 0x8000)) {
51998 +                                       /* NOTE(review): bit is clear already */
51999 +                                       mask &= ~0x8000;
52000 +                                       continue;
52001 +                               }
52002 +                               /* too much */
52003 +                               warning("nikita-630",
52004 +                                       "Too many extensions in %llu",
52005 +                                       (unsigned long long)
52006 +                                       get_inode_oid(inode));
52007 +
52008 +                               result = RETERR(-EINVAL);
52009 +                               break;
52010 +                       }
52011 +               } else
52012 +                       /* bitmask exhausted */
52013 +                       break;
52014 +       }
52015 +       state->extmask = bigmask;
52016 +       /* NOTE(review): sizeof() makes this an unsigned compare (true if != 0) */
52017 +       if (len - (bit / 16 * sizeof(d16)) > 0) {
52018 +               /* alignment in save_len_static_sd() is taken into account
52019 +                  -edward */
52020 +               warning("nikita-631", "unused space in inode %llu",
52021 +                       (unsigned long long)get_inode_oid(inode));
52022 +       }
52023 +
52024 +       return result;
52025 +}
52026 +
52027 +/* estimates size of stat-data required to store inode: the base stat-data
52028 +    plus ->save_len() of every extension set in the inode's extmask.
52029 +    Installed as ->save_len() method of item_plugins[ STATIC_STAT_DATA_IT ]. */
52030 +/* was sd_len */
52031 +int save_len_static_sd(struct inode *inode /* object being processed */ )
52032 +{
52033 +       unsigned int result;
52034 +       __u64 mask;
52035 +       int bit;
52036 +
52037 +       assert("nikita-632", inode != NULL);
52038 +
52039 +       result = sizeof(reiser4_stat_data_base);
52040 +       mask = reiser4_inode_data(inode)->extmask;
52041 +       for (bit = 0; mask != 0; ++bit, mask >>= 1) {
52042 +               sd_ext_plugin *sdplug;
52043 +
52044 +               if (!(mask & 1))
52045 +                       continue;
52046 +               sdplug = sd_ext_plugin_by_id(bit);
52047 +               assert("nikita-633", sdplug != NULL);
52048 +               /* no alignment support:
52049 +                  result += round_up(result, sdplug->alignment) - result; */
52050 +               result += sdplug->save_len(inode);
52051 +       }
52052 +       /* one more d16 of bitmask for every 16 extension bits walked */
52053 +       result += bit / 16 * sizeof(d16);
52054 +       return result;
52055 +}
52056 +
52057 +/* saves inode into stat-data.
52058 +    Installed as ->save() method of
52059 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
52060 +    Writes the base stat-data at *@area, then every extension whose bit is
52061 +    set in the inode's extmask; *@area is advanced past the bytes written.
52062 +    Returns 0, or the first error returned by an extension's ->save(). */
52063 +/* was sd_save */
52064 +int save_static_sd(struct inode *inode /* object being processed */ ,
52065 +                  char **area /* where to save stat-data */ )
52066 +{
52067 +       int result;
52068 +       __u64 emask;
52069 +       int bit;
52070 +       reiser4_stat_data_base *sd_base;
52071 +
52072 +       assert("nikita-634", inode != NULL);
52073 +       assert("nikita-635", area != NULL);
52074 +
52075 +       result = 0;
52076 +       emask = reiser4_inode_data(inode)->extmask;
52077 +       sd_base = (reiser4_stat_data_base *) * area;
52078 +       /* low 16 bits of the extension mask live in the base stat-data */
52079 +       put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
52080 +
52081 +       *area += sizeof *sd_base;
52082 +       for (bit = 0; emask != 0; ++bit, emask >>= 1) {
52083 +               if (emask & 1) {
52084 +                       if ((bit + 1) % 16 != 0) {
52085 +                               sd_ext_plugin *sdplug;
52086 +
52087 +                               sdplug = sd_ext_plugin_by_id(bit);
52088 +                               assert("nikita-636", sdplug != NULL);
52089 +                               /* no alignment support yet */
52090 +                               result = sdplug->save(inode, area);
52091 +                               if (result)
52092 +                                       break;
52093 +                       } else {
52094 +                               /* last bit of a 16-bit portion: the next
52095 +                                  portion of the mask is stored in-line */
52096 +                               put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
52097 +                                             (d16 *)(*area));
52098 +                               *area += sizeof(d16);
52099 +                       }
52100 +               }
52101 +       }
52102 +       return result;
52103 +}
52104 +
52105 +/* stat-data extension handling functions. */
52106 +
52107 +/* Loader for the light-weight stat-data extension: copies mode, nlink and
52108 +   size from *@area into @inode and steps @area/@len over the extension. */
52109 +static int present_lw_sd(struct inode *inode, char **area, int *len)
52110 +{
52111 +       reiser4_light_weight_stat *sd_lw;
52112 +
52113 +       if (*len < (int)sizeof(reiser4_light_weight_stat))
52114 +               return not_enough_space(inode, "lw sd");
52115 +       sd_lw = (reiser4_light_weight_stat *) * area;
52116 +       inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
52117 +       inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
52118 +       inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
52119 +       /* S_IFREG | S_IFIFO on disk marks a partially converted file (see
52120 +          save_lw_sd()): strip the marker and remember it as an inode flag */
52121 +       if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
52122 +               inode->i_mode &= ~S_IFIFO;
52123 +               warning("", "partially converted file is encountered");
52124 +               reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
52125 +       }
52126 +       move_on(len, area, sizeof *sd_lw);
52127 +       return 0;
52128 +}
52129 +
52130 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG       /* object being
52131 +                                                                * processed */ )
52132 +{
52133 +       return sizeof(reiser4_light_weight_stat);       /* fixed size */
52134 +}
52135 +
52136 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
52137 +                     char **area /* position in stat-data */ )
52138 +{
52139 +       reiser4_light_weight_stat *sd;
52140 +       mode_t delta;
52141 +
52142 +       assert("nikita-2705", inode != NULL);
52143 +       assert("nikita-2706", area != NULL);
52144 +       assert("nikita-2707", *area != NULL);
52145 +
52146 +       sd = (reiser4_light_weight_stat *) * area;
52147 +       /* REISER4_PART_MIXED is kept on disk as an extra S_IFIFO bit in mode */
52148 +       delta = (reiser4_inode_get_flag(inode,
52149 +                                       REISER4_PART_MIXED) ? S_IFIFO : 0);
52150 +       put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
52151 +       put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
52152 +       put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
52153 +       *area += sizeof *sd;    /* step over what was just written */
52154 +       return 0;
52155 +}
52156 +
52157 +/* Loader for the unix stat-data extension: copies uid, gid and the three
52158 +   timestamps (seconds only) from *@area into @inode, then either the device
52159 +   number (block/char devices) or the byte count, and steps over the data. */
52160 +static int present_unix_sd(struct inode *inode, char **area, int *len)
52161 +{
52162 +       reiser4_unix_stat *sd;
52163 +
52164 +       assert("nikita-637", inode != NULL);
52165 +       assert("nikita-638", area != NULL);
52166 +       assert("nikita-639", *area != NULL);
52167 +       assert("nikita-640", len != NULL);
52168 +       assert("nikita-641", *len > 0);
52169 +
52170 +       if (*len < (int)sizeof(reiser4_unix_stat))
52171 +               return not_enough_space(inode, "unix sd");
52172 +       sd = (reiser4_unix_stat *) * area;
52173 +       inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
52174 +       inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
52175 +       inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
52176 +       inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
52177 +       inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
52178 +       if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
52179 +               inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
52180 +       else
52181 +               inode_set_bytes(inode,
52182 +                               (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
52183 +       move_on(len, area, sizeof *sd);
52184 +       return 0;
52185 +}
52186 +
52187 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
52188 +{
52189 +       inode->i_uid = get_super_private(inode->i_sb)->default_uid;
52190 +       inode->i_gid = get_super_private(inode->i_sb)->default_gid;
52191 +       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
52192 +       inode_set_bytes(inode, inode->i_size);  /* bytes default to size */
52193 +       /* mark inode as lightweight, so that caller (lookup_common) will
52194 +          complete initialisation by copying [ug]id from a parent. */
52195 +       reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
52196 +       return 0;
52197 +}
52198 +
52199 +/* Audited by: green(2002.06.14) */
52200 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG     /* object being
52201 +                                                                * processed */ )
52202 +{
52203 +       return sizeof(reiser4_unix_stat);
52204 +}
52205 +
52206 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
52207 +                       char **area /* position in stat-data */ )
52208 +{
52209 +       reiser4_unix_stat *sd;
52210 +
52211 +       assert("nikita-642", inode != NULL);
52212 +       assert("nikita-643", area != NULL);
52213 +       assert("nikita-644", *area != NULL);
52214 +
52215 +       sd = (reiser4_unix_stat *) * area;
52216 +       put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
52217 +       put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
52218 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
52219 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
52220 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
52221 +       if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
52222 +               put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
52223 +       else
52224 +               put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
52225 +       *area += sizeof *sd;
52226 +       return 0;
52227 +}
52228 +
52229 +static int
52230 +present_large_times_sd(struct inode *inode /* object being processed */ ,
52231 +                      char **area /* position in stat-data */ ,
52232 +                      int *len /* remaining length */ )
52233 +{
52234 +       if (*len >= (int)sizeof(reiser4_large_times_stat)) {
52235 +               reiser4_large_times_stat *sd_lt;
52236 +
52237 +               sd_lt = (reiser4_large_times_stat *) * area;
52238 +
52239 +               inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
52240 +               inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
52241 +               inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
52242 +
52243 +               move_on(len, area, sizeof *sd_lt);
52244 +               return 0;
52245 +       } else
52246 +               return not_enough_space(inode, "large times sd");
52247 +}
52248 +
52249 +static int
52250 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
52251 +                       /* object being processed */ )
52252 +{
52253 +       return sizeof(reiser4_large_times_stat);
52254 +}
52255 +
52256 +static int
52257 +save_large_times_sd(struct inode *inode /* object being processed */ ,
52258 +                   char **area /* position in stat-data */ )
52259 +{
52260 +       reiser4_large_times_stat *sd;
52261 +
52262 +       assert("nikita-2817", inode != NULL);
52263 +       assert("nikita-2818", area != NULL);
52264 +       assert("nikita-2819", *area != NULL);
52265 +
52266 +       sd = (reiser4_large_times_stat *) * area;
52267 +
52268 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
52269 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
52270 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
52271 +
52272 +       *area += sizeof *sd;
52273 +       return 0;
52274 +}
52275 +
52276 +/* symlink stat data extension */
52277 +
52278 +/* allocate memory for symlink target and attach it to inode->i_private */
52279 +static int
52280 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
52281 +{
52282 +       assert("vs-845", inode->i_private == NULL);
52283 +       assert("vs-846", !reiser4_inode_get_flag(inode,
52284 +                                                REISER4_GENERIC_PTR_USED));
52285 +       /* FIXME-VS: this is prone to deadlock. Not more than other similar
52286 +          places, though */
52287 +       inode->i_private = kmalloc((size_t) len + 1,
52288 +                                  reiser4_ctx_gfp_mask_get());
52289 +       if (!inode->i_private)
52290 +               return RETERR(-ENOMEM);
52291 +
52292 +       memcpy((char *)(inode->i_private), target, (size_t) len);
52293 +       ((char *)(inode->i_private))[len] = 0;
52294 +       reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
52295 +       return 0;
52296 +}
52297 +
52298 +/* this is called on read_inode: sanity-check the symlink body kept in stat
52299 +   data and attach a copy of it to the inode (symlink_target_to_inode()) */
52300 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
52301 +{
52302 +       int result;
52303 +       int length;
52304 +       reiser4_symlink_stat *sd;
52305 +
52306 +       length = (int)inode->i_size;
52307 +       /*
52308 +        * *len is number of bytes left in the item from *area. It must hold
52309 +        * the symlink body plus the ending 0, i.e. 0 <= length < *len
52310 +        */
52311 +       if (length < 0 || length >= *len)
52312 +               return not_enough_space(inode, "symlink");
52313 +
52314 +       if (*(*area + length) != 0) {
52315 +               warning("vs-840", "Symlink is not zero terminated");
52316 +               return RETERR(-EIO);
52317 +       }
52318 +
52319 +       sd = (reiser4_symlink_stat *) * area;
52320 +       result = symlink_target_to_inode(inode, sd->body, length);
52321 +
52322 +       move_on(len, area, length + 1);
52323 +       return result;
52324 +}
52325 +
52326 +static int save_len_symlink_sd(struct inode *inode)
52327 +{
52328 +       return inode->i_size + 1;
52329 +}
52330 +
52331 +/* this is called on create and update stat data. Do nothing on update but
52332 +   update @area */
52333 +static int save_symlink_sd(struct inode *inode, char **area)
52334 +{
52335 +       int result;
52336 +       int length;
52337 +       reiser4_symlink_stat *sd;
52338 +
52339 +       length = (int)inode->i_size;
52340 +       /* inode->i_size must be set already */
52341 +       assert("vs-841", length);
52342 +
52343 +       result = 0;
52344 +       sd = (reiser4_symlink_stat *) * area;
52345 +       if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
52346 +               const char *target;
52347 +
52348 +               target = (const char *)(inode->i_private);
52349 +               inode->i_private = NULL;
52350 +
52351 +               result = symlink_target_to_inode(inode, target, length);
52352 +
52353 +               /* copy symlink to stat data */
52354 +               memcpy(sd->body, target, (size_t) length);
52355 +               (*area)[length] = 0;
52356 +       } else {
52357 +               /* there is nothing to do in update but move area */
52358 +               assert("vs-844",
52359 +                      !memcmp(inode->i_private, sd->body,
52360 +                              (size_t) length + 1));
52361 +       }
52362 +
52363 +       *area += (length + 1);
52364 +       return result;
52365 +}
52366 +
52367 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
52368 +                           char **area /* position in stat-data */ ,
52369 +                           int *len /* remaining length */ )
52370 +{
52371 +       assert("nikita-645", inode != NULL);
52372 +       assert("nikita-646", area != NULL);
52373 +       assert("nikita-647", *area != NULL);
52374 +       assert("nikita-648", len != NULL);
52375 +       assert("nikita-649", *len > 0);
52376 +
52377 +       if (*len >= (int)sizeof(reiser4_flags_stat)) {
52378 +               reiser4_flags_stat *sd;
52379 +
52380 +               sd = (reiser4_flags_stat *) * area;
52381 +               inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
52382 +               move_on(len, area, sizeof *sd);
52383 +               return 0;
52384 +       } else
52385 +               return not_enough_space(inode, "generation and attrs");
52386 +}
52387 +
52388 +/* Audited by: green(2002.06.14) */
52389 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG    /* object being
52390 +                                                                * processed */ )
52391 +{
52392 +       return sizeof(reiser4_flags_stat);
52393 +}
52394 +
52395 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
52396 +                        char **area /* position in stat-data */ )
52397 +{
52398 +       reiser4_flags_stat *sd;
52399 +
52400 +       assert("nikita-650", inode != NULL);
52401 +       assert("nikita-651", area != NULL);
52402 +       assert("nikita-652", *area != NULL);
52403 +
52404 +       sd = (reiser4_flags_stat *) * area;
52405 +       put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
52406 +       *area += sizeof *sd;
52407 +       return 0;
52408 +}
52409 +
52410 +static int absent_plugin_sd(struct inode *inode);
52411 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
52412 +                            char **area /* position in stat-data */ ,
52413 +                            int *len /* remaining length */,
52414 +                            int is_pset /* 1 if plugin set, 0 if heir set. */)
52415 +{
52416 +       reiser4_plugin_stat *sd;
52417 +       reiser4_plugin *plugin;
52418 +       reiser4_inode *info;
52419 +       int i;
52420 +       __u16 mask;
52421 +       int result;
52422 +       int num_of_plugins;
52423 +
52424 +       assert("nikita-653", inode != NULL);
52425 +       assert("nikita-654", area != NULL);
52426 +       assert("nikita-655", *area != NULL);
52427 +       assert("nikita-656", len != NULL);
52428 +       assert("nikita-657", *len > 0);
52429 +
52430 +       if (*len < (int)sizeof(reiser4_plugin_stat))
52431 +               return not_enough_space(inode, "plugin");
52432 +
52433 +       sd = (reiser4_plugin_stat *) * area;
52434 +       info = reiser4_inode_data(inode);
52435 +
52436 +       mask = 0;
52437 +       num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
52438 +       move_on(len, area, sizeof *sd);
52439 +       result = 0;
52440 +       for (i = 0; i < num_of_plugins; ++i) {
52441 +               reiser4_plugin_slot *slot;
52442 +               reiser4_plugin_type type;
52443 +               pset_member memb;
52444 +
52445 +               slot = (reiser4_plugin_slot *) * area;
52446 +               if (*len < (int)sizeof *slot)
52447 +                       return not_enough_space(inode, "additional plugin");
52448 +
52449 +               memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
52450 +               type = aset_member_to_type_unsafe(memb);
52451 +
52452 +               if (type == REISER4_PLUGIN_TYPES) {
52453 +                       warning("nikita-3502",
52454 +                               "wrong %s member (%i) for %llu", is_pset ?
52455 +                               "pset" : "hset", memb,
52456 +                               (unsigned long long)get_inode_oid(inode));
52457 +                       return RETERR(-EINVAL);
52458 +               }
52459 +               plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
52460 +                                          type, &slot->id);
52461 +               if (plugin == NULL)
52462 +                       return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
52463 +
52464 +               /* plugin is loaded into inode, mark this into inode's
52465 +                  bitmask of loaded non-standard plugins */
52466 +               if (!(mask & (1 << memb))) {
52467 +                       mask |= (1 << memb);
52468 +               } else {
52469 +                       warning("nikita-658", "duplicate plugin for %llu",
52470 +                               (unsigned long long)get_inode_oid(inode));
52471 +                       return RETERR(-EINVAL);
52472 +               }
52473 +               move_on(len, area, sizeof *slot);
52474 +               /* load plugin data, if any */
52475 +               if (plugin->h.pops != NULL && plugin->h.pops->load)
52476 +                       result = plugin->h.pops->load(inode, plugin, area, len);
52477 +               else
52478 +                       result = aset_set_unsafe(is_pset ? &info->pset :
52479 +                                                &info->hset, memb, plugin);
52480 +               if (result)
52481 +                       return result;
52482 +       }
52483 +       if (is_pset) {
52484 +               /* if object plugin wasn't loaded from stat-data, guess it by
52485 +                  mode bits */
52486 +               plugin = file_plugin_to_plugin(inode_file_plugin(inode));
52487 +               if (plugin == NULL)
52488 +                       result = absent_plugin_sd(inode);
52489 +               info->plugin_mask = mask;
52490 +       } else
52491 +               info->heir_mask = mask;
52492 +
52493 +       return result;
52494 +}
52495 +
52496 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
52497 +       return present_plugin_sd(inode, area, len, 1 /* pset */);
52498 +}
52499 +
52500 +/* Determine object plugin for @inode based on i_mode.
52501 +
52502 +   Many objects in reiser4 file system are controlled by standard object
52503 +   plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
52504 +
52505 +   For such files we don't explicitly store plugin id in object stat
52506 +   data. Rather required plugin is guessed from mode bits, where file "type"
52507 +   is encoded (see stat(2)).
52508 +*/
52509 +static int
52510 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
52511 +{
52512 +       int fplug_id;
52513 +       int dplug_id;
52514 +       reiser4_inode *info;
52515 +
52516 +       assert("nikita-736", inode != NULL);
52517 +
52518 +       dplug_id = fplug_id = -1;
52519 +
52520 +       switch (inode->i_mode & S_IFMT) {
52521 +       case S_IFSOCK:
52522 +       case S_IFBLK:
52523 +       case S_IFCHR:
52524 +       case S_IFIFO:
52525 +               fplug_id = SPECIAL_FILE_PLUGIN_ID;
52526 +               break;
52527 +       case S_IFLNK:
52528 +               fplug_id = SYMLINK_FILE_PLUGIN_ID;
52529 +               break;
52530 +       case S_IFDIR:
52531 +               fplug_id = DIRECTORY_FILE_PLUGIN_ID;
52532 +               dplug_id = HASHED_DIR_PLUGIN_ID;
52533 +               break;
52534 +       default:
52535 +               warning("nikita-737", "wrong file mode: %o", inode->i_mode);
52536 +               return RETERR(-EIO);
52537 +       case S_IFREG:
52538 +               fplug_id = UNIX_FILE_PLUGIN_ID;
52539 +               break;
52540 +       }
52541 +       info = reiser4_inode_data(inode);
52542 +       set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
52543 +                  plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
52544 +       set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
52545 +                  plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
52546 +       return 0;
52547 +}
52548 +
52549 +/* Audited by: green(2002.06.14) */
52550 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
52551 +{
52552 +       int result;
52553 +
52554 +       assert("nikita-659", inode != NULL);
52555 +
52556 +       result = guess_plugin_by_mode(inode);
52557 +       /* if mode was wrong, guess_plugin_by_mode() warns and returns -EIO
52558 +          without setting any plugin (it does not fall back to "regular
52559 +          file"); original note: setup_inode_ops() calls make_bad_inode().
52560 +          A more logical but more complex fix is a "bad-file plugin". */
52561 +       /* FIXME-VS: activate was called here */
52562 +       return result;
52563 +}
52564 +
52565 +/* helper function for save_len_plugin_sd(): calculate how much space is
52566 +    required to save state of given plugin */
52567 +/* Audited by: green(2002.06.14) */
52568 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
52569 +                  struct inode *inode /* object being processed */ ,
52570 +                  pset_member memb,
52571 +                  int len, int is_pset)
52572 +{
52573 +       reiser4_inode *info;
52574 +       assert("nikita-661", inode != NULL);
52575 +
52576 +       if (plugin == NULL)
52577 +               return len;
52578 +
52579 +       info = reiser4_inode_data(inode);
52580 +       if (is_pset ?
52581 +           info->plugin_mask & (1 << memb) :
52582 +           info->heir_mask & (1 << memb)) {
52583 +               len += sizeof(reiser4_plugin_slot);
52584 +               if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
52585 +                       /* non-standard plugin, call method */
52586 +                       /* commented as it is incompatible with alignment
52587 +                        * policy in save_plug() -edward */
52588 +                       /* len = round_up(len, plugin->h.pops->alignment); */
52589 +                       len += plugin->h.pops->save_len(inode, plugin);
52590 +               }
52591 +       }
52592 +       return len;
52593 +}
52594 +
52595 +/* calculate how much space is required to save state of all plugins,
52596 +    associated with inode */
52597 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
52598 +                             int is_pset)
52599 +{
52600 +       int len;
52601 +       int last;
52602 +       reiser4_inode *state;
52603 +       pset_member memb;
52604 +
52605 +       assert("nikita-663", inode != NULL);
52606 +
52607 +       state = reiser4_inode_data(inode);
52608 +
52609 +       /* common case: no non-standard plugins */
52610 +       if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
52611 +               return 0;
52612 +       len = sizeof(reiser4_plugin_stat);
52613 +       last = PSET_LAST;
52614 +
52615 +       for (memb = 0; memb < last; ++memb) {
52616 +             len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
52617 +                           inode, memb, len, is_pset);
52618 +       }
52619 +       assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
52620 +       return len;
52621 +}
52622 +
52623 +static int save_len_pset_sd(struct inode *inode) {
52624 +       return save_len_plugin_sd(inode, 1 /* pset */);
52625 +}
52626 +
52627 +/* helper function for save_plugin_sd(): save plugin, associated with
52628 +    inode. */
52629 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
52630 +                    struct inode *inode /* object being processed */ ,
52631 +                    int memb /* what element of pset/hset is saved */ ,
52632 +                    char **area /* position in stat-data */ ,
52633 +                    int *count /* incremented if plugin was actually saved. */,
52634 +                    int is_pset /* 1 for plugin set, 0 for heir set */)
52635 +{
52636 +       reiser4_plugin_slot *slot;
52637 +       int fake_len;
52638 +       int result;
52639 +
52640 +       assert("nikita-665", inode != NULL);
52641 +       assert("nikita-666", area != NULL);
52642 +       assert("nikita-667", *area != NULL);
52643 +
52644 +       if (plugin == NULL)
52645 +               return 0; /* empty set member: nothing is written */
52646 +
52647 +       if (is_pset ?
52648 +           !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
52649 +           !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
52650 +               return 0; /* member is not marked in the mask: skip it */
52651 +       slot = (reiser4_plugin_slot *) * area;
52652 +       put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
52653 +       put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
52654 +       fake_len = (int)0xffff; /* dummy length for move_on(); not read later */
52655 +       move_on(&fake_len, area, sizeof *slot);
52656 +       ++*count;
52657 +       result = 0;
52658 +       if (plugin->h.pops != NULL) {
52659 +               if (plugin->h.pops->save != NULL)
52660 +                       result = plugin->h.pops->save(inode, plugin, area);
52661 +       }
52662 +       return result;
52663 +}
52664 +
52665 +/* save state of all non-standard plugins associated with inode */
52666 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
52667 +                         char **area /* position in stat-data */,
52668 +                         int is_pset /* 1 for pset, 0 for hset */)
52669 +{
52670 +       int fake_len;
52671 +       int result = 0;
52672 +       int num_of_plugins;
52673 +       reiser4_plugin_stat *sd;
52674 +       reiser4_inode *state;
52675 +       pset_member memb;
52676 +
52677 +       assert("nikita-669", inode != NULL);
52678 +       assert("nikita-670", area != NULL);
52679 +       assert("nikita-671", *area != NULL);
52680 +
52681 +       state = reiser4_inode_data(inode);
52682 +       if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
52683 +               return 0; /* empty mask: nothing is written, *area is unchanged */
52684 +       sd = (reiser4_plugin_stat *) * area; /* header; plugins_no is set below */
52685 +       fake_len = (int)0xffff; /* dummy length for move_on(); not read later */
52686 +       move_on(&fake_len, area, sizeof *sd);
52687 +
52688 +       num_of_plugins = 0;
52689 +       for (memb = 0; memb < PSET_LAST; ++memb) {
52690 +               result = save_plug(aset_get(is_pset ? state->pset : state->hset,
52691 +                                           memb),
52692 +                                  inode, memb, area, &num_of_plugins, is_pset);
52693 +               if (result != 0)
52694 +                       break;
52695 +       }
52696 +
52697 +       put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
52698 +       return result;
52699 +}
52700 +
52701 +static int save_pset_sd(struct inode *inode, char **area) {
52702 +       return save_plugin_sd(inode, area, 1 /* pset */);
52703 +}
52704 +
52705 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
52706 +       return present_plugin_sd(inode, area, len, 0 /* hset */);
52707 +}
52708 +
52709 +static int save_len_hset_sd(struct inode *inode) {
52710 +       return save_len_plugin_sd(inode, 0 /* hset */);
52711 +}
52712 +
52713 +static int save_hset_sd(struct inode *inode, char **area) {
52714 +       return save_plugin_sd(inode, area, 0 /* hset */);
52715 +}
52716 +
52717 +/* helper function for present_crypto_sd().
52718 +   Extract crypto info from stat-data and attach it to inode */
52719 +static int extract_crypto_info (struct inode * inode,
52720 +                               reiser4_crypto_stat * sd)
52721 +{
52722 +       struct reiser4_crypto_info * info;
52723 +       assert("edward-11", !inode_crypto_info(inode));
52724 +       assert("edward-1413",
52725 +              !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
52726 +       /* create and attach a crypto-stat without secret key loaded */
52727 +       info = reiser4_alloc_crypto_info(inode);
52728 +       if (IS_ERR(info))
52729 +               return PTR_ERR(info);
52730 +       info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
52731 +       memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
52732 +       reiser4_attach_crypto_info(inode, info);
52733 +       reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
52734 +       return 0;
52735 +}
52736 +
52737 +/* crypto stat-data extension */
52738 +
52739 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
52740 +{
52741 +       int result;
52742 +       reiser4_crypto_stat *sd;
52743 +       digest_plugin *dplug = inode_digest_plugin(inode);
52744 +
52745 +       assert("edward-06", dplug != NULL);
52746 +       assert("edward-684", dplug->fipsize);
52747 +       assert("edward-07", area != NULL);
52748 +       assert("edward-08", *area != NULL);
52749 +       assert("edward-09", len != NULL);
52750 +       assert("edward-10", *len > 0);
52751 +
52752 +       /* *len is number of bytes in stat data item from *area to the end of
52753 +          item. It must be not less than size of this extension, key id
52754 +          included; checked at run time, not only by the assert below */
52755 +       if (*len < (int)(sizeof(reiser4_crypto_stat) + dplug->fipsize))
52756 +               return not_enough_space(inode, "crypto-sd");
52757 +       assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
52758 +
52759 +       sd = (reiser4_crypto_stat *) * area;
52760 +       result = extract_crypto_info(inode, sd);
52761 +       move_on(len, area, sizeof(*sd) + dplug->fipsize);
52762 +
52763 +       return result;
52764 +}
52765 +
52766 +static int save_len_crypto_sd(struct inode *inode)
52767 +{
52768 +       return sizeof(reiser4_crypto_stat) +
52769 +               inode_digest_plugin(inode)->fipsize;
52770 +}
52771 +
52772 +static int save_crypto_sd(struct inode *inode, char **area)
52773 +{
52774 +       int result = 0;
52775 +       reiser4_crypto_stat *sd;
52776 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
52777 +       digest_plugin *dplug = inode_digest_plugin(inode);
52778 +
52779 +       assert("edward-12", dplug != NULL);
52780 +       assert("edward-13", area != NULL);
52781 +       assert("edward-14", *area != NULL);
52782 +       assert("edward-15", info != NULL);
52783 +       assert("edward-1414", info->keyid != NULL);
52784 +       assert("edward-1415", info->keysize != 0);
52785 +       assert("edward-76", reiser4_inode_data(inode) != NULL);
52786 +
52787 +       if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
52788 +               /* file is just created */
52789 +               sd = (reiser4_crypto_stat *) *area;
52790 +               /* copy everything but private key to the disk stat-data */
52791 +               put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
52792 +               memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
52793 +               reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
52794 +       }
52795 +       *area += (sizeof(*sd) + dplug->fipsize);
52796 +       return result;
52797 +}
52798 +
52799 +static int eio(struct inode *inode, char **area, int *len)
52800 +{
52801 +       return RETERR(-EIO);
52802 +}
52803 +
52804 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
52805 +       [LIGHT_WEIGHT_STAT] = {
52806 +               .h = {
52807 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52808 +                       .id = LIGHT_WEIGHT_STAT,
52809 +                       .pops = NULL,
52810 +                       .label = "light-weight sd",
52811 +                       .desc = "sd for light-weight files",
52812 +                       .linkage = {NULL,NULL}
52813 +               },
52814 +               .present = present_lw_sd,
52815 +               .absent = NULL,
52816 +               .save_len = save_len_lw_sd,
52817 +               .save = save_lw_sd,
52818 +               .alignment = 8
52819 +       },
52820 +       [UNIX_STAT] = {
52821 +               .h = {
52822 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52823 +                       .id = UNIX_STAT,
52824 +                       .pops = NULL,
52825 +                       .label = "unix-sd",
52826 +                       .desc = "unix stat-data fields",
52827 +                       .linkage = {NULL,NULL}
52828 +               },
52829 +               .present = present_unix_sd,
52830 +               .absent = absent_unix_sd,
52831 +               .save_len = save_len_unix_sd,
52832 +               .save = save_unix_sd,
52833 +               .alignment = 8
52834 +       },
52835 +       [LARGE_TIMES_STAT] = {
52836 +               .h = {
52837 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52838 +                       .id = LARGE_TIMES_STAT,
52839 +                       .pops = NULL,
52840 +                       .label = "64time-sd",
52841 +                       .desc = "nanosecond resolution for times",
52842 +                       .linkage = {NULL,NULL}
52843 +               },
52844 +               .present = present_large_times_sd,
52845 +               .absent = NULL,
52846 +               .save_len = save_len_large_times_sd,
52847 +               .save = save_large_times_sd,
52848 +               .alignment = 8
52849 +       },
52850 +       [SYMLINK_STAT] = {
52851 +               /* stat data of symlink has this extension */
52852 +               .h = {
52853 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52854 +                       .id = SYMLINK_STAT,
52855 +                       .pops = NULL,
52856 +                       .label = "symlink-sd",
52857 +                       .desc =
52858 +                       "stat data is appended with symlink name",
52859 +                       .linkage = {NULL,NULL}
52860 +               },
52861 +               .present = present_symlink_sd,
52862 +               .absent = NULL,
52863 +               .save_len = save_len_symlink_sd,
52864 +               .save = save_symlink_sd,
52865 +               .alignment = 8
52866 +       },
52867 +       [PLUGIN_STAT] = {
52868 +               .h = {
52869 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52870 +                       .id = PLUGIN_STAT,
52871 +                       .pops = NULL,
52872 +                       .label = "plugin-sd",
52873 +                       .desc = "plugin stat-data fields",
52874 +                       .linkage = {NULL,NULL}
52875 +               },
52876 +               .present = present_pset_sd,
52877 +               .absent = absent_plugin_sd,
52878 +               .save_len = save_len_pset_sd,
52879 +               .save = save_pset_sd,
52880 +               .alignment = 8
52881 +       },
52882 +       [HEIR_STAT] = {
52883 +               .h = {
52884 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52885 +                       .id = HEIR_STAT,
52886 +                       .pops = NULL,
52887 +                       .label = "heir-plugin-sd",
52888 +                       .desc = "heir plugin stat-data fields",
52889 +                       .linkage = {NULL,NULL}
52890 +               },
52891 +               .present = present_hset_sd,
52892 +               .absent = NULL,
52893 +               .save_len = save_len_hset_sd,
52894 +               .save = save_hset_sd,
52895 +               .alignment = 8
52896 +       },
52897 +       [FLAGS_STAT] = {
52898 +               .h = {
52899 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52900 +                       .id = FLAGS_STAT,
52901 +                       .pops = NULL,
52902 +                       .label = "flags-sd",
52903 +                       .desc = "inode bit flags",
52904 +                       .linkage = {NULL, NULL}
52905 +               },
52906 +               .present = present_flags_sd,
52907 +               .absent = NULL,
52908 +               .save_len = save_len_flags_sd,
52909 +               .save = save_flags_sd,
52910 +               .alignment = 8
52911 +       },
52912 +       [CAPABILITIES_STAT] = {
52913 +               .h = {
52914 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52915 +                       .id = CAPABILITIES_STAT,
52916 +                       .pops = NULL,
52917 +                       .label = "capabilities-sd",
52918 +                       .desc = "capabilities",
52919 +                       .linkage = {NULL, NULL}
52920 +               },
52921 +               .present = eio,
52922 +               .absent = NULL,
52923 +               .save_len = save_len_flags_sd,
52924 +               .save = save_flags_sd,
52925 +               .alignment = 8
52926 +       },
52927 +       [CRYPTO_STAT] = {
52928 +               .h = {
52929 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
52930 +                       .id = CRYPTO_STAT,
52931 +                       .pops = NULL,
52932 +                       .label = "crypto-sd",
52933 +                       .desc = "secret key size and id",
52934 +                       .linkage = {NULL, NULL}
52935 +               },
52936 +               .present = present_crypto_sd,
52937 +               .absent = NULL,
52938 +               .save_len = save_len_crypto_sd,
52939 +               .save = save_crypto_sd,
52940 +               .alignment = 8
52941 +       }
52942 +};
52943 +
52944 +/* Make Linus happy.
52945 +   Local variables:
52946 +   c-indentation-style: "K&R"
52947 +   mode-name: "LC"
52948 +   c-basic-offset: 8
52949 +   tab-width: 8
52950 +   fill-column: 120
52951 +   End:
52952 +*/
52953 diff -puN /dev/null fs/reiser4/plugin/item/static_stat.h
52954 --- /dev/null
52955 +++ a/fs/reiser4/plugin/item/static_stat.h
52956 @@ -0,0 +1,224 @@
52957 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52958 +
52959 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
52960 +
52961 +In the case where each file has at least the fields needed by the
52962 +stat() syscall, it is more compact to store those fields in this
52963 +struct.
52964 +
52965 +If this item does not exist, then all stats are dynamically resolved.
52966 +At the moment, we either resolve all stats dynamically or all of them
52967 +statically.  If you think this is not fully optimal, and the rest of
52968 +reiser4 is working, then fix it...:-)
52969 +
52970 +*/
52971 +
52972 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
52973 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
52974 +
52975 +#include "../../forward.h"
52976 +#include "../../dformat.h"
52977 +
52978 +#include <linux/fs.h>          /* for struct inode */
52979 +
52980 +/* Stat data layout: goals and implementation.
52981 +
52982 +   We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
52983 +   them, including not having semantic metadata attached to them.
52984 +
52985 +   There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
52986 +   want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
52987 +   sized structure because the statically sized structure knows without recording it what the names and lengths of the
52988 +   attributes are.
52989 +
52990 +   This leads to a natural compromise, which is to special case those files which have simply the standard unix file
52991 +   attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
52992 +   file in their use of file attributes.
52993 +
52994 +   Yet this compromise deserves to be compromised a little.
52995 +
52996 +   We accommodate the case where you have no more than the standard unix file attributes by using an "extension
52997 +   bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
52998 +
52999 +   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
53000 +   from parent directory (as uid, gid) or initialised to some sane values.
53001 +
53002 +   To capitalize on existing code infrastructure, extensions are
53003 +   implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
53004 +   Each stat-data extension plugin implements four methods:
53005 +
53006 +    ->present() called by sd_load() when this extension is found in stat-data
53007 +    ->absent() called by sd_load() when this extension is not found in stat-data
53008 +    ->save_len() called by sd_len() to calculate total length of stat-data
53009 +    ->save() called by sd_save() to store extension data into stat-data
53010 +
53011 +    Implementation is in fs/reiser4/plugin/item/static_stat.c
53012 +*/
53013 +
53014 +/* stat-data extension bits (positions in the extension bitmask). Please order this by presumed frequency of use */
53015 +typedef enum {
53016 +       /* support for light-weight files */
53017 +       LIGHT_WEIGHT_STAT,
53018 +       /* data required to implement unix stat(2) call. Layout is in
53019 +          reiser4_unix_stat. If this is not present, file is light-weight */
53020 +       UNIX_STAT,
53021 +       /* this contains additional set of 32bit [anc]time fields to implement
53022 +          nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
53023 +          of this extension is governed by 32bittimes mount option. */
53024 +       LARGE_TIMES_STAT,
53025 +       /* stat data has link name included */
53026 +       SYMLINK_STAT,
53027 +       /* on-disk slots of non-standard plugins for main plugin table
53028 +          (@reiser4_inode->pset), that is, plugins that cannot be deduced
53029 +          from file mode bits, for example, aggregation, interpolation etc. */
53030 +       PLUGIN_STAT,
53031 +       /* this extension contains persistent inode flags. These flags are
53032 +          single bits: immutable, append-only, etc. Layout is in
53033 +          reiser4_flags_stat. */
53034 +       FLAGS_STAT,
53035 +       /* this extension contains capabilities sets, associated with this
53036 +          file. Layout is in reiser4_capabilities_stat */
53037 +       CAPABILITIES_STAT,
53038 +       /* this extension contains size and public id of the secret key.
53039 +          Layout is in reiser4_crypto_stat */
53040 +       CRYPTO_STAT,
53041 +       /* on-disk slots of non-default plugins for inheritance, which
53042 +          are extracted to special plugin table (@reiser4_inode->hset).
53043 +          By default, children of the object will inherit plugins from
53044 +          its main plugin table (pset). */
53045 +       HEIR_STAT,
53046 +       LAST_SD_EXTENSION,
53047 +       /*
53048 +        * init_inode_static_sd() iterates over extension mask until all
53049 +        * non-zero bits are processed. This means, that neither ->present(),
53050 +        * nor ->absent() methods will be called for stat-data extensions that
53051 +        * go after last present extension. But for some basic extensions we want
53052 +        * either ->absent() or ->present() method to be called, because these
53053 +        * extensions set up something in inode even when they are not
53054 +        * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
53055 +        * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
53056 +        * ->present(), or ->absent() method will be called, independently of
53057 +        * what other extensions are present.
53058 +        */
53059 +       LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
53060 +} sd_ext_bits;
53061 +
53062 +/* minimal stat-data: just the extension bitmask. This allows supporting light-weight files. */
53063 +typedef struct reiser4_stat_data_base {
53064 +       /*  0 */ __le16 extmask;
53065 +       /*  2 */
53066 +} PACKED reiser4_stat_data_base;
53067 +
53068 +typedef struct reiser4_light_weight_stat {
53069 +       /*  0 */ __le16 mode;
53070 +       /*  2 */ __le32 nlink;
53071 +       /*  6 */ __le64 size;
53072 +       /* size (the field above) is in bytes */
53073 +       /* 14 */
53074 +} PACKED reiser4_light_weight_stat;
53075 +
53076 +typedef struct reiser4_unix_stat {
53077 +       /* owner id */
53078 +       /*  0 */ __le32 uid;
53079 +       /* group id */
53080 +       /*  4 */ __le32 gid;
53081 +       /* access time */
53082 +       /*  8 */ __le32 atime;
53083 +       /* modification time */
53084 +       /* 12 */ __le32 mtime;
53085 +       /* change time */
53086 +       /* 16 */ __le32 ctime;
53087 +       union { /* both members overlay the same 8 bytes at offset 20 */
53088 +               /* minor:major for device files */
53089 +               /* 20 */ __le64 rdev;
53090 +               /* bytes used by file */
53091 +               /* 20 */ __le64 bytes;
53092 +       } u;
53093 +       /* 28 */
53094 +} PACKED reiser4_unix_stat;
53095 +
53096 +/* symlink stored as part of inode: the link name is a variable-length trailing array */
53097 +typedef struct reiser4_symlink_stat {
53098 +       char body[0];
53099 +} PACKED reiser4_symlink_stat;
53100 +
53101 +typedef struct reiser4_plugin_slot { /* one on-disk (pset member, plugin id) pair */
53102 +       /*  0 */ __le16 pset_memb;
53103 +       /*  2 */ __le16 id;
53104 +       /*  4 *//* here plugin stores its persistent state */
53105 +} PACKED reiser4_plugin_slot;
53106 +
53107 +/* stat-data extension for files with non-standard plugins: a count followed by a variable-length slot array. */
53108 +typedef struct reiser4_plugin_stat {
53109 +       /* number of additional plugins, associated with this object */
53110 +       /*  0 */ __le16 plugins_no;
53111 +       /*  2 */ reiser4_plugin_slot slot[0];
53112 +       /*  2 */
53113 +} PACKED reiser4_plugin_stat;
53114 +
53115 +/* stat-data extension for inode flags. Currently it is just a fixed-width 32
53116 + * bit mask. If the need arises, this can be replaced with a variable-width
53117 + * bitmask. */
53118 +typedef struct reiser4_flags_stat {
53119 +       /*  0 */ __le32 flags;
53120 +       /*  4 */
53121 +} PACKED reiser4_flags_stat;
53122 +
53123 +typedef struct reiser4_capabilities_stat {
53124 +       /*  0 */ __le32 effective;
53125 +       /*  4 */ __le32 permitted;
53126 +       /*  8 */
53127 +} PACKED reiser4_capabilities_stat;
53128 +
53129 +typedef struct reiser4_cluster_stat {
53130 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
53131 +       /* 0 */ d8 cluster_shift;
53132 +       /* 1 */
53133 +} PACKED reiser4_cluster_stat;
53134 +
53135 +typedef struct reiser4_crypto_stat {
53136 +       /* secret key size, bits */
53137 +       /*  0 */ d16 keysize;
53138 +       /* secret key id (variable-length trailing array) */
53139 +       /*  2 */ d8 keyid[0];
53140 +       /* 2 */
53141 +} PACKED reiser4_crypto_stat;
53142 +
53143 +typedef struct reiser4_large_times_stat { /* layout of the LARGE_TIMES_STAT extension */
53144 +       /* access time */
53145 +       /* 0 */ d32 atime;
53146 +       /* modification time */
53147 +       /* 4 */ d32 mtime;
53148 +       /* change time */
53149 +       /* 8 */ d32 ctime;
53150 +       /* 12 */
53151 +} PACKED reiser4_large_times_stat;
53152 +
53153 +/* this structure is filled by sd_item_stat (NOTE(review): presumably item_stat_static_sd() - confirm) */
53154 +typedef struct sd_stat {
53155 +       int dirs;
53156 +       int files;
53157 +       int others;
53158 +} sd_stat;
53159 +
53160 +/* plugin->item.common.* methods of the static stat-data item */
53161 +extern void print_sd(const char *prefix, coord_t * coord);
53162 +extern void item_stat_static_sd(const coord_t * coord, void *vp); /* NOTE(review): @vp is presumably an sd_stat - confirm */
53163 +
53164 +/* plugin->item.s.sd.* (stat-data specific item methods) */
53165 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
53166 +extern int save_len_static_sd(struct inode *inode);
53167 +extern int save_static_sd(struct inode *inode, char **area);
53168 +
53169 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
53170 +#endif
53171 +
53172 +/* Make Linus happy.
53173 +   Local variables:
53174 +   c-indentation-style: "K&R"
53175 +   mode-name: "LC"
53176 +   c-basic-offset: 8
53177 +   tab-width: 8
53178 +   fill-column: 120
53179 +   End:
53180 +*/
53181 diff -puN /dev/null fs/reiser4/plugin/item/tail.c
53182 --- /dev/null
53183 +++ a/fs/reiser4/plugin/item/tail.c
53184 @@ -0,0 +1,807 @@
53185 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53186 +
53187 +#include "item.h"
53188 +#include "../../inode.h"
53189 +#include "../../page_cache.h"
53190 +#include "../../carry.h"
53191 +#include "../../vfs_ops.h"
53192 +
53193 +#include <linux/quotaops.h>
53194 +#include <asm/uaccess.h>
53195 +#include <linux/swap.h>
53196 +#include <linux/writeback.h>
53197 +
53198 +/* plugin->u.item.b.max_key_inside: @key = key of the item with offset set to that of reiser4_max_key() */
53199 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
53200 +{
53201 +       item_key_by_coord(coord, key);
53202 +       set_key_offset(key, get_key_offset(reiser4_max_key()));
53203 +       return key;
53204 +}
53205 +
53206 +/* plugin->u.item.b.can_contain_key: 1 iff item plugin, locality and objectid all match; 0 otherwise */
53207 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
53208 +                        const reiser4_item_data *data)
53209 +{
53210 +       reiser4_key item_key;
53211 +
53212 +       if (item_plugin_by_coord(coord) != data->iplug)
53213 +               return 0;
53214 +
53215 +       item_key_by_coord(coord, &item_key);
53216 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
53217 +           get_key_objectid(key) != get_key_objectid(&item_key))
53218 +               return 0;
53219 +
53220 +       return 1;
53221 +}
53222 +
53223 +/* plugin->u.item.b.mergeable
53224 +   first item is of tail type; returns 1 only if @p2 is a tail of the same object starting right after @p1 */
53225 +/* Audited by: green(2002.06.14) */
53226 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
53227 +{
53228 +       reiser4_key key1, key2;
53229 +
53230 +       assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
53231 +                                        UNIX_FILE_METADATA_ITEM_TYPE));
53232 +       assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
53233 +
53234 +       if (item_id_by_coord(p2) != FORMATTING_ID) {
53235 +               /* second item is of another type */
53236 +               return 0;
53237 +       }
53238 +
53239 +       item_key_by_coord(p1, &key1);
53240 +       item_key_by_coord(p2, &key2);
53241 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
53242 +           get_key_objectid(&key1) != get_key_objectid(&key2)
53243 +           || get_key_type(&key1) != get_key_type(&key2)) {
53244 +               /* items of different objects */
53245 +               return 0;
53246 +       }
53247 +       if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
53248 +               /* not adjacent items: @p2 does not start at the byte following the last byte of @p1 */
53249 +               return 0;
53250 +       }
53251 +       return 1;
53252 +}
53253 +
53254 +/* plugin->u.item.b.print
53255 +   plugin->u.item.b.check */
53256 +
53257 +/* plugin->u.item.b.nr_units: for tail items the unit count equals the item length in bytes */
53258 +pos_in_node_t nr_units_tail(const coord_t * coord)
53259 +{
53260 +       return item_length_by_coord(coord);
53261 +}
53262 +
53263 +/* plugin->u.item.b.lookup: sets @coord at the byte whose offset is that of @key, or after the last unit */
53264 +lookup_result
53265 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
53266 +{
53267 +       reiser4_key item_key;
53268 +       __u64 lookuped, offset;
53269 +       unsigned nr_units;
53270 +
53271 +       item_key_by_coord(coord, &item_key); /* NOTE(review): redundant, the same call is repeated on the next line */
53272 +       offset = get_key_offset(item_key_by_coord(coord, &item_key));
53273 +       nr_units = nr_units_tail(coord);
53274 +
53275 +       /* key we are looking for must be greater than key of item @coord */
53276 +       assert("vs-416", keygt(key, &item_key));
53277 +
53278 +       /* offset we are looking for */
53279 +       lookuped = get_key_offset(key);
53280 +
53281 +       if (lookuped >= offset && lookuped < offset + nr_units) {
53282 +               /* byte we are looking for is in this item */
53283 +               coord->unit_pos = lookuped - offset;
53284 +               coord->between = AT_UNIT;
53285 +               return CBK_COORD_FOUND;
53286 +       }
53287 +
53288 +       /* set coord after last unit; this counts as found only for FIND_MAX_NOT_MORE_THAN bias */
53289 +       coord->unit_pos = nr_units - 1;
53290 +       coord->between = AFTER_UNIT;
53291 +       return bias ==
53292 +           FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
53293 +}
53294 +
53295 +/* plugin->u.item.b.paste: fills the already-resized item with @data (user data, kernel data, or zeros) */
53296 +int
53297 +paste_tail(coord_t *coord, reiser4_item_data *data,
53298 +          carry_plugin_info *info UNUSED_ARG)
53299 +{
53300 +       unsigned old_item_length;
53301 +       char *item;
53302 +
53303 +       /* length the item had before resizing has been performed */
53304 +       old_item_length = item_length_by_coord(coord) - data->length;
53305 +
53306 +       /* tail items never get pasted in the middle */
53307 +       assert("vs-363",
53308 +              (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
53309 +              (coord->unit_pos == old_item_length - 1 &&
53310 +               coord->between == AFTER_UNIT) ||
53311 +              (coord->unit_pos == 0 && old_item_length == 0
53312 +               && coord->between == AT_UNIT));
53313 +
53314 +       item = item_body_by_coord(coord);
53315 +       if (coord->unit_pos == 0)
53316 +               /* make space for pasted data when pasting at the beginning of
53317 +                  the item */
53318 +               memmove(item + data->length, item, old_item_length);
53319 +
53320 +       if (coord->between == AFTER_UNIT)
53321 +               coord->unit_pos++;
53322 +
53323 +       if (data->data) {
53324 +               assert("vs-554", data->user == 0 || data->user == 1);
53325 +               if (data->user) {
53326 +                       assert("nikita-3035", reiser4_schedulable());
53327 +                       /* copy from user space; a failed copy is reported as -EFAULT */
53328 +                       if (__copy_from_user(item + coord->unit_pos,
53329 +                                            (const char __user *)data->data,
53330 +                                            (unsigned)data->length))
53331 +                               return RETERR(-EFAULT);
53332 +               } else
53333 +                       /* copy from kernel space */
53334 +                       memcpy(item + coord->unit_pos, data->data,
53335 +                              (unsigned)data->length);
53336 +       } else { /* no source buffer: the pasted range is zero-filled */
53337 +               memset(item + coord->unit_pos, 0, (unsigned)data->length);
53338 +       }
53339 +       return 0;
53340 +}
53341 +
53342 +/* plugin->u.item.b.fast_paste */
53343 +
53344 +/* plugin->u.item.b.can_shift
53345 +   number of units is returned via return value, number of bytes via @size. For
53346 +   tail items they coincide */
53347 +int
53348 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
53349 +              znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
53350 +              unsigned *size, unsigned want)
53351 +{
53352 +       /* make sure that we do not want to shift more than we have */
53353 +       assert("vs-364", want > 0
53354 +              && want <= (unsigned)item_length_by_coord(source));
53355 +
53356 +       *size = min(want, free_space);
53357 +       return *size;
53358 +}
53359 +
53360 +/* plugin->u.item.b.copy_units: copies @count bytes of @source, starting at @from, into already-expanded @target */
53361 +void
53362 +copy_units_tail(coord_t * target, coord_t * source,
53363 +               unsigned from, unsigned count,
53364 +               shift_direction where_is_free_space,
53365 +               unsigned free_space UNUSED_ARG)
53366 +{
53367 +       /* make sure that item @target is expanded already */
53368 +       assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
53369 +       assert("vs-370", free_space >= count);
53370 +
53371 +       if (where_is_free_space == SHIFT_LEFT) {
53372 +               /* append item @target with @count first bytes of @source */
53373 +               assert("vs-365", from == 0);
53374 +
53375 +               memcpy((char *)item_body_by_coord(target) +
53376 +                      item_length_by_coord(target) - count,
53377 +                      (char *)item_body_by_coord(source), count);
53378 +       } else {
53379 +               /* target item is moved to right already */
53380 +               reiser4_key key;
53381 +
53382 +               assert("vs-367",
53383 +                      (unsigned)item_length_by_coord(source) == from + count);
53384 +
53385 +               memcpy((char *)item_body_by_coord(target),
53386 +                      (char *)item_body_by_coord(source) + from, count);
53387 +
53388 +               /* new units are inserted before first unit in an item,
53389 +                  therefore, we have to update item key */
53390 +               item_key_by_coord(source, &key);
53391 +               set_key_offset(&key, get_key_offset(&key) + from);
53392 +
53393 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
53394 +                                                                  NULL /*info */);
53395 +       }
53396 +}
53397 +
53398 +/* plugin->u.item.b.create_hook */
53399 +
53400 +/* item_plugin->b.kill_hook
53401 +   this is called when @count units starting from @from-th one are going to be removed
53402 +   Passes the byte range [start, end) to fake_kill_hook_tail(); always returns 0 */
53403 +int
53404 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
53405 +              pos_in_node_t count, struct carry_kill_data *kdata)
53406 +{
53407 +       reiser4_key key;
53408 +       loff_t start, end;
53409 +
53410 +       assert("vs-1577", kdata);
53411 +       assert("vs-1579", kdata->inode);
53412 +
53413 +       item_key_by_coord(coord, &key);
53414 +       start = get_key_offset(&key) + from;
53415 +       end = start + count;
53416 +       fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
53417 +       return 0;
53418 +}
53419 +
53420 +/* plugin->u.item.b.shift_hook */
53421 +
53422 +/* helper for kill_units_tail and cut_units_tail; returns the number of units (bytes) in [@from, @to] */
53423 +static int
53424 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
53425 +              reiser4_key * smallest_removed, reiser4_key * new_first)
53426 +{
53427 +       pos_in_node_t count;
53428 +
53429 +       /* this method is only called to remove part of item */
53430 +       assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
53431 +       /* tail items are never cut from the middle of an item */
53432 +       assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
53433 +       assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
53434 +
53435 +       count = to - from + 1;
53436 +
53437 +       if (smallest_removed) {
53438 +               /* store smallest key removed */
53439 +               item_key_by_coord(coord, smallest_removed);
53440 +               set_key_offset(smallest_removed,
53441 +                              get_key_offset(smallest_removed) + from);
53442 +       }
53443 +       if (new_first) {
53444 +               /* head of item is cut */
53445 +               assert("vs-1529", from == 0);
53446 +
53447 +               item_key_by_coord(coord, new_first);
53448 +               set_key_offset(new_first,
53449 +                              get_key_offset(new_first) + from + count);
53450 +       }
53451 +
53452 +       if (REISER4_DEBUG) /* debug builds zero the removed bytes */
53453 +               memset((char *)item_body_by_coord(coord) + from, 0, count);
53454 +       return count;
53455 +}
53456 +
53457 +/* plugin->u.item.b.cut_units: thin wrapper around do_cut_or_kill(); @cdata is unused */
53458 +int
53459 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
53460 +              struct carry_cut_data *cdata UNUSED_ARG,
53461 +              reiser4_key * smallest_removed, reiser4_key * new_first)
53462 +{
53463 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
53464 +}
53465 +
53466 +/* plugin->u.item.b.kill_units: runs kill_hook_tail() on units [@from, @to], then do_cut_or_kill() */
53467 +int
53468 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
53469 +               struct carry_kill_data *kdata, reiser4_key * smallest_removed,
53470 +               reiser4_key * new_first)
53471 +{
53472 +       kill_hook_tail(coord, from, to - from + 1, kdata);
53473 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
53474 +}
53475 +
53476 +/* plugin->u.item.b.unit_key: @key = item key with offset advanced by coord->unit_pos */
53477 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
53478 +{
53479 +       assert("vs-375", coord_is_existing_unit(coord));
53480 +
53481 +       item_key_by_coord(coord, key);
53482 +       set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
53483 +
53484 +       return key;
53485 +}
53486 +
53487 +/* plugin->u.item.b.estimate
53488 +   plugin->u.item.b.item_data_by_flow */
53489 +
53490 +/* tail readpage function. It is called from readpage_tail(). */
53491 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
53492 +{
53493 +       tap_t tap;
53494 +       int result;
53495 +       coord_t coord;
53496 +       lock_handle lh;
53497 +       int count, mapped;
53498 +       struct inode *inode;
53499 +       char *pagedata;
53500 +
53501 +       /* work on copies of the passed coord and lock handle so that the tap does not move the caller's coord */
53502 +       init_lh(&lh);
53503 +       copy_lh(&lh, uf_coord->lh);
53504 +       inode = page->mapping->host;
53505 +       coord_dup(&coord, &uf_coord->coord);
53506 +
53507 +       reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
53508 +
53509 +       if ((result = reiser4_tap_load(&tap)))
53510 +               goto out_tap_done; /* NOTE(review): this path skips unlock_page() - confirm the caller handles it */
53511 +
53512 +       /* lookup until page is filled up. */
53513 +       for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
53514 +               /* number of bytes to be copied to page */
53515 +               count = item_length_by_coord(&coord) - coord.unit_pos;
53516 +               if (count > PAGE_CACHE_SIZE - mapped)
53517 +                       count = PAGE_CACHE_SIZE - mapped;
53518 +
53519 +               /* map @page with kmap_atomic() and get data address */
53520 +               pagedata = kmap_atomic(page, KM_USER0);
53521 +
53522 +               /* copy tail item to page */
53523 +               memcpy(pagedata + mapped,
53524 +                      ((char *)item_body_by_coord(&coord) + coord.unit_pos),
53525 +                      count);
53526 +               mapped += count;
53527 +
53528 +               flush_dcache_page(page);
53529 +
53530 +               /* unmap the page */
53531 +               kunmap_atomic(pagedata, KM_USER0);
53532 +
53533 +               /* Getting next tail item. */
53534 +               if (mapped < PAGE_CACHE_SIZE) {
53535 +                       /*
53536 +                        * unlock page in order to avoid keep it locked
53537 +                        * during tree lookup, which takes long term locks
53538 +                        */
53539 +                       unlock_page(page);
53540 +
53541 +                       /* getting right neighbour. */
53542 +                       result = go_dir_el(&tap, RIGHT_SIDE, 0);
53543 +
53544 +                       /* lock page back */
53545 +                       lock_page(page);
53546 +                       if (PageUptodate(page)) {
53547 +                               /*
53548 +                                * another thread read the page, we have
53549 +                                * nothing to do
53550 +                                */
53551 +                               result = 0;
53552 +                               goto out_unlock_page;
53553 +                       }
53554 +
53555 +                       if (result) {
53556 +                               if (result == -E_NO_NEIGHBOR) {
53557 +                                       /*
53558 +                                        * right neighbor is not a formatted
53559 +                                        * node
53560 +                                        */
53561 +                                       result = 0;
53562 +                                       goto done;
53563 +                               } else {
53564 +                                       goto out_tap_relse; /* NOTE(review): page stays locked on this path */
53565 +                               }
53566 +                       } else {
53567 +                               if (!inode_file_plugin(inode)->
53568 +                                   owns_item(inode, &coord)) {
53569 +                                       /* item of another file is found */
53570 +                                       result = 0;
53571 +                                       goto done;
53572 +                               }
53573 +                       }
53574 +               }
53575 +       }
53576 +
53577 + done:
53578 +       if (mapped != PAGE_CACHE_SIZE)
53579 +               zero_user_segment(page, mapped, PAGE_CACHE_SIZE);
53580 +       SetPageUptodate(page);
53581 + out_unlock_page:
53582 +       unlock_page(page);
53583 + out_tap_relse:
53584 +       reiser4_tap_relse(&tap);
53585 + out_tap_done:
53586 +       reiser4_tap_done(&tap);
53587 +       return result;
53588 +}
53589 +
53590 +/*
53591 +   plugin->s.file.readpage
53592 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
53593 +   or
53594 +   filemap_fault->reiser4_readpage->readpage_unix_file->readpage_tail
53595 +
53596 +   At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
53597 +   item. */
53598 +int readpage_tail(void *vp, struct page *page)
53599 +{
53600 +       uf_coord_t *uf_coord = vp;
53601 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
53602 +       ON_DEBUG(reiser4_key key);
53603 +
53604 +       assert("umka-2515", PageLocked(page));
53605 +       assert("umka-2516", !PageUptodate(page));
53606 +       assert("umka-2517", !jprivate(page) && !PagePrivate(page));
53607 +       assert("umka-2518", page->mapping && page->mapping->host);
53608 +
53609 +       assert("umka-2519", znode_is_loaded(coord->node));
53610 +       assert("umka-2520", item_is_tail(coord));
53611 +       assert("umka-2521", coord_is_existing_unit(coord));
53612 +       assert("umka-2522", znode_is_rlocked(coord->node));
53613 +       assert("umka-2523",
53614 +              page->mapping->host->i_ino ==
53615 +              get_key_objectid(item_key_by_coord(coord, &key)));
53616 +
53617 +       return do_readpage_tail(uf_coord, page);
53618 +}
53619 +
53620 +/**
53621 + * overwrite_tail
53622 + * @flow: user-space data to write (flow->user must be 1, flow->data non-NULL)
53623 + * @coord: existing unit in a write-locked node at which overwriting starts
53624 + *
53625 + * Overwrites tail item or its part by user data. Returns number of bytes
53626 + * written or error code.
53627 + */
53628 +static int overwrite_tail(flow_t *flow, coord_t *coord)
53629 +{
53630 +       unsigned count;
53631 +
53632 +       assert("vs-570", flow->user == 1);
53633 +       assert("vs-946", flow->data);
53634 +       assert("vs-947", coord_is_existing_unit(coord));
53635 +       assert("vs-948", znode_is_write_locked(coord->node));
53636 +       assert("nikita-3036", reiser4_schedulable());
53637 +
53638 +       count = item_length_by_coord(coord) - coord->unit_pos; /* bytes from unit_pos to the end of the item */
53639 +       if (count > flow->length)
53640 +               count = flow->length;
53641 +
53642 +       if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
53643 +                            (const char __user *)flow->data, count))
53644 +               return RETERR(-EFAULT);
53645 +
53646 +       znode_make_dirty(coord->node);
53647 +       return count;
53648 +}
53649 +
53650 +/**
53651 + * insert_first_tail
53652 + * @inode: file being written; charged for quota, and its container state may be updated
53653 + * @flow: data to insert; flow->length remaining after insertion is treated as not written
53654 + * @coord: insertion position, passed to reiser4_insert_flow()
53655 + * @lh: lock handle, passed to reiser4_insert_flow()
53656 + *
53657 + * Returns number of bytes written or error code (hole padding returns reiser4_insert_flow()'s result).
53658 + */
53659 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
53660 +                                coord_t *coord, lock_handle *lh)
53661 +{
53662 +       int result;
53663 +       loff_t to_write;
53664 +       struct unix_file_info *uf_info;
53665 +
53666 +       if (get_key_offset(&flow->key) != 0) {
53667 +               /*
53668 +                * file is empty and we have to write not to the beginning of
53669 +                * file. Create a hole at the beginning of file. On success
53670 +                * insert_flow returns 0 as number of written bytes which is
53671 +                * what we have to return on padding a file with holes
53672 +                */
53673 +               flow->data = NULL;
53674 +               flow->length = get_key_offset(&flow->key);
53675 +               set_key_offset(&flow->key, 0);
53676 +               /*
53677 +                * holes in files built of tails are stored just like if there
53678 +                * were real data which are all zeros. Therefore we have to
53679 +                * allocate quota here as well
53680 +                */
53681 +               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
53682 +                       return RETERR(-EDQUOT);
53683 +               result = reiser4_insert_flow(coord, lh, flow);
53684 +               if (flow->length) /* give back quota for the part that was not inserted */
53685 +                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
53686 +
53687 +               uf_info = unix_file_inode_data(inode);
53688 +
53689 +               /*
53690 +                * first item insertion is only possible when writing to empty
53691 +                * file or performing tail conversion
53692 +                * (NOTE(review): the assertion below has an empty label)
53693 +                */
53694 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
53695 +                           (reiser4_inode_get_flag(inode,
53696 +                                                   REISER4_PART_MIXED) &&
53697 +                            reiser4_inode_get_flag(inode,
53698 +                                                   REISER4_PART_IN_CONV))));
53699 +               /* if file was empty - update its state */
53700 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
53701 +                       uf_info->container = UF_CONTAINER_TAILS;
53702 +               return result;
53703 +       }
53704 +
53705 +       /* check quota before appending data */
53706 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
53707 +               return RETERR(-EDQUOT);
53708 +
53709 +       to_write = flow->length;
53710 +       result = reiser4_insert_flow(coord, lh, flow);
53711 +       if (flow->length) /* give back quota for the part that was not inserted */
53712 +               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
53713 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
53714 +}
53714 +
53715 +/**
53716 + * append_tail - append data or a hole after the item at @coord
53717 + * @inode: inode of the file; used for quota accounting
53718 + * @flow: data to append; a hole is inserted if its key is not the append key
53719 + * @coord: coord of the item to append to, passed to reiser4_insert_flow()
53720 + * @lh: lock handle, passed on to reiser4_insert_flow()
53721 + *
53722 + * Returns number of bytes written (0 for a hole) or error code.
53723 + */
53724 +static ssize_t append_tail(struct inode *inode,
53725 +                          flow_t *flow, coord_t *coord, lock_handle *lh)
53726 +{
53727 +       int result;
53728 +       reiser4_key append_key;
53729 +       loff_t to_write;
53730 +
53731 +       if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
53732 +               flow->data = NULL;
53733 +               flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
53734 +               set_key_offset(&flow->key, get_key_offset(&append_key));
53735 +               /*
53736 +                * holes in files built of tails are stored just like if there
53737 +                * were real data which are all zeros. Therefore we have to
53738 +                * allocate quota here as well
53739 +                */
53740 +               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
53741 +                       return RETERR(-EDQUOT);
53742 +               result = reiser4_insert_flow(coord, lh, flow);
53743 +               if (flow->length)
53744 +                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
53745 +               return result;
53746 +       }
53747 +
53748 +       /* charge quota up front; the part left uninserted is released below */
53749 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
53750 +               return RETERR(-EDQUOT);
53751 +
53752 +       to_write = flow->length;
53753 +       result = reiser4_insert_flow(coord, lh, flow);
53754 +       if (flow->length)
53755 +               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
53756 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
53757 +}
53758 +
53759 +/**
53760 + * write_extent_reserve_space - reserve space for tail write operation
53761 + * @inode: inode of the file to write to; its tree is used for the estimate
53762 + *
53763 + * Estimates and reserves space which may be required for writing one flow to a
53764 + * file by tails (despite the name). Returns what reiser4_grab_space() returns.
53765 + */
53766 +static int write_extent_reserve_space(struct inode *inode)
53767 +{
53768 +       __u64 count;
53769 +       reiser4_tree *tree;
53770 +
53771 +       /*
53772 +        * to write one flow to a file by tails we have to reserve disk space for:
53773 +        *
53774 +        * 1. find_file_item may have to insert empty node to the tree (empty
53775 +        * leaf node between two extent items). This requires 1 block and
53776 +        * number of blocks which are necessary to perform insertion of an
53777 +        * internal item into twig level.
53778 +        *
53779 +        * 2. flow insertion
53780 +        *
53781 +        * 3. stat data update
53782 +        */
53783 +       tree = reiser4_tree_by_inode(inode);
53784 +       count = estimate_one_insert_item(tree) +
53785 +               estimate_insert_flow(tree->height) +
53786 +               estimate_one_insert_item(tree);
53787 +       grab_space_enable();
53788 +       return reiser4_grab_space(count, 0 /* flags */);
53789 +}
53790 +
53791 +#define PAGE_PER_FLOW 4
53792 +
53793 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
53794 +{
53795 +       loff_t done = 0;
53796 +       int chunk;
53797 +
53798 +       /* at most PAGE_PER_FLOW pages of @buf are faulted in per call */
53799 +       if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
53800 +               count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
53801 +
53802 +       /* touch the buffer piecewise, no more than one page size at a time */
53803 +       for (; count > 0; count -= chunk, done += chunk) {
53804 +               chunk = PAGE_CACHE_SIZE;
53805 +               if (count < chunk)
53806 +                       chunk = count;
53807 +               fault_in_pages_readable(buf + done, chunk);
53808 +       }
53809 +       /* number of bytes covered, i.e. the (possibly clamped) @count */
53810 +       return done;
53811 +}
53811 +
53812 +/**
53813 + * reiser4_write_tail - write method of tail item plugin
53814 + * @file: file to write to
53815 + * @inode: inode of the file being written
53816 + * @buf: address of user-space buffer
53817 + * @count: number of bytes to write
53818 + * @pos: position in file to write to
53819 + *
53820 + * Returns number of written bytes or error code.
53821 + */
53822 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
53823 +                          const char __user *buf, size_t count, loff_t *pos)
53824 +{
53825 +       struct hint hint;
53826 +       int result;
53827 +       flow_t flow;
53828 +       coord_t *coord;
53829 +       lock_handle *lh;
53830 +       znode *loaded;
53831 +
53832 +       assert("edward-1548", inode != NULL);
53833 +
53834 +       if (write_extent_reserve_space(inode))
53835 +               return RETERR(-ENOSPC);
53836 +
53837 +       result = load_file_hint(file, &hint);
53838 +       if (result != 0)
53839 +               return result;
53840 +
53841 +       flow.length = faultin_user_pages(buf, count);
53842 +       flow.user = 1;
53843 +       memcpy(&flow.data, &buf, sizeof(buf));
53844 +       flow.op = WRITE_OP;
53845 +       key_by_inode_and_offset_common(inode, *pos, &flow.key);
53846 +
53847 +       result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
53848 +       if (IS_CBKERR(result))
53849 +               return result;
53850 +
53851 +       coord = &hint.ext_coord.coord;
53852 +       lh = hint.ext_coord.lh;
53853 +
53854 +       result = zload(coord->node);
53855 +       if (result != 0)
53856 +               goto unlock;
53857 +       loaded = coord->node;
53858 +
53859 +       if (coord->between == AFTER_UNIT) {
53860 +               /* append with data or hole */
53861 +               result = append_tail(inode, &flow, coord, lh);
53862 +       } else if (coord->between == AT_UNIT) {
53863 +               /* overwrite */
53864 +               result = overwrite_tail(&flow, coord);
53865 +       } else {
53866 +               /* no items of this file yet. insert data or hole */
53867 +               result = insert_first_tail(inode, &flow, coord, lh);
53868 +       }
53869 +       zrelse(loaded);
53870 +       if (result < 0)
53871 +               goto unlock;
53872 +
53873 +       /* the hint is not kept valid across writes: invalidate and unset it */
53874 +       hint.ext_coord.valid = 0;
53875 +       reiser4_unset_hint(&hint);
53876 +       save_file_hint(file, &hint);
53877 +       return result;
53878 + unlock:
53879 +       done_lh(lh);
53880 +       return result;
53881 +}
53882 +
53883 +#if REISER4_DEBUG
53884 +/* debug: @key offset == item key (fetched in vs-1355 assert) + unit_pos? */
53885 +static int
53886 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
53887 +{
53888 +       reiser4_key item_key;
53889 +
53890 +       assert("vs-1356", coord_is_existing_unit(coord));
53891 +       assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
53892 +       assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
53893 +       return get_key_offset(key) ==
53894 +           get_key_offset(&item_key) + coord->unit_pos;
53895 +
53896 +}
53897 +
53898 +#endif
53899 +
53900 +/* plugin->u.item.s.file.read: copy item bytes to user buffer; 0 or -EFAULT */
53901 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
53902 +{
53903 +       unsigned count;
53904 +       int item_length;
53905 +       coord_t *coord;
53906 +       uf_coord_t *uf_coord;
53907 +
53908 +       uf_coord = &hint->ext_coord;
53909 +       coord = &uf_coord->coord;
53910 +
53911 +       assert("vs-571", f->user == 1);
53912 +       assert("vs-571", f->data);
53913 +       assert("vs-967", coord && coord->node);
53914 +       assert("vs-1117", znode_is_rlocked(coord->node));
53915 +       assert("vs-1118", znode_is_loaded(coord->node));
53916 +
53917 +       assert("nikita-3037", reiser4_schedulable());
53918 +       assert("vs-1357", coord_matches_key_tail(coord, &f->key));
53919 +
53920 +       /* number of bytes to read off the item: rest of item, capped by flow */
53921 +       item_length = item_length_by_coord(coord);
53922 +       count = item_length - coord->unit_pos;
53923 +       if (count > f->length)
53924 +               count = f->length;
53925 +
53926 +       /* user page has to be brought in so that major page fault does not
53927 +        * occur here when long term lock is held */
53928 +       if (__copy_to_user((char __user *)f->data,
53929 +                          ((char *)item_body_by_coord(coord) + coord->unit_pos),
53930 +                          count))
53931 +               return RETERR(-EFAULT);
53932 +
53933 +       /* probably mark_page_accessed() should only be called if
53934 +        * coord->unit_pos is zero. */
53935 +       mark_page_accessed(znode_page(coord->node));
53936 +       move_flow_forward(f, count);
53937 +       /* advance within the item; at its end point after the last unit */
53938 +       coord->unit_pos += count;
53939 +       if (item_length == coord->unit_pos) {
53940 +               coord->unit_pos--;
53941 +               coord->between = AFTER_UNIT;
53942 +       }
53943 +       reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
53944 +       return 0;
53945 +}
53946 +
53947 +/*
53948 +   plugin->u.item.s.file.append_key
53949 +   key of the first byte following the last byte addressed by this item
53950 +*/
53951 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
53952 +{
53953 +       item_key_by_coord(coord, key);
53954 +       set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
53955 +       return key;
53956 +}
53957 +
53958 +/* plugin->u.item.s.file.init_coord_extension: marks @uf_coord as valid */
53959 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
53960 +{
53961 +       uf_coord->valid = 1;
53962 +}
53963 +
53964 +/*
53965 +  plugin->u.item.s.file.get_block: store block number of @coord's node in @block
53966 +*/
53967 +int
53968 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
53969 +{
53970 +       assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
53971 +
53972 +       if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
53973 +               /* if node hasn't obtained its block number yet, return 0.
53974 +                * Let's avoid upsetting users with some cosmic numbers beyond
53975 +                * the device capacity. */
53976 +               *block = 0;
53977 +       else
53978 +               *block = *znode_get_block(coord->node);
53979 +       return 0;
53980 +}
53981 +
53982 +/*
53983 + * Local variables:
53984 + * c-indentation-style: "K&R"
53985 + * mode-name: "LC"
53986 + * c-basic-offset: 8
53987 + * tab-width: 8
53988 + * fill-column: 79
53989 + * scroll-step: 1
53990 + * End:
53991 + */
53992 diff -puN /dev/null fs/reiser4/plugin/item/tail.h
53993 --- /dev/null
53994 +++ a/fs/reiser4/plugin/item/tail.h
53995 @@ -0,0 +1,58 @@
53996 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53997 +
53998 +#if !defined( __REISER4_TAIL_H__ )
53999 +#define __REISER4_TAIL_H__
54000 +
54001 +struct tail_coord_extension {
54002 +       int not_used;
54003 +};
54004 +
54005 +struct cut_list;
54006 +
54007 +/* plugin->u.item.b.* */
54008 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
54009 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
54010 +                        const reiser4_item_data *);
54011 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
54012 +pos_in_node_t nr_units_tail(const coord_t *);
54013 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
54014 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
54015 +int can_shift_tail(unsigned free_space, coord_t * source,
54016 +                  znode * target, shift_direction, unsigned *size,
54017 +                  unsigned want);
54018 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
54019 +                    unsigned count, shift_direction, unsigned free_space);
54020 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
54021 +                  struct carry_kill_data *);
54022 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
54023 +                  struct carry_cut_data *, reiser4_key * smallest_removed,
54024 +                  reiser4_key * new_first);
54025 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
54026 +                   struct carry_kill_data *, reiser4_key * smallest_removed,
54027 +                   reiser4_key * new_first);
54028 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
54029 +
54030 +/* plugin->u.item.s.* */
54031 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
54032 +                          const char __user *buf, size_t count, loff_t *pos);
54033 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
54034 +int readpage_tail(void *vp, struct page *page);
54035 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
54036 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
54037 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
54038 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
54039 +                            hint_t *, int back_to_dirty, int set_hint);
54040 +
54041 +/* __REISER4_TAIL_H__ */
54042 +#endif
54043 +
54044 +/* Make Linus happy.
54045 +   Local variables:
54046 +   c-indentation-style: "K&R"
54047 +   mode-name: "LC"
54048 +   c-basic-offset: 8
54049 +   tab-width: 8
54050 +   fill-column: 120
54051 +   scroll-step: 1
54052 +   End:
54053 +*/
54054 diff -puN /dev/null fs/reiser4/plugin/node/Makefile
54055 --- /dev/null
54056 +++ a/fs/reiser4/plugin/node/Makefile
54057 @@ -0,0 +1,5 @@
54058 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
54059 +
54060 +node_plugins-objs :=   \
54061 +       node.o          \
54062 +       node40.o
54063 diff -puN /dev/null fs/reiser4/plugin/node/node.c
54064 --- /dev/null
54065 +++ a/fs/reiser4/plugin/node/node.c
54066 @@ -0,0 +1,131 @@
54067 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54068 +
54069 +/* Node plugin interface.
54070 +
54071 +   Description: The tree provides the abstraction of flows, which it
54072 +   internally fragments into items which it stores in nodes.
54073 +
54074 +   A key_atom is a piece of data bound to a single key.
54075 +
54076 +   For reasonable space efficiency to be achieved it is often
54077 +   necessary to store key_atoms in the nodes in the form of items, where
54078 +   an item is a sequence of key_atoms of the same or similar type. It is
54079 +   more space-efficient, because the item can implement (very)
54080 +   efficient compression of key_atom's bodies using internal knowledge
54081 +   about their semantics, and it can often avoid having a key for each
54082 +   key_atom. Each type of item has specific operations implemented by its
54083 +   item handler (see balance.c).
54084 +
54085 +   Rationale: the rest of the code (specifically balancing routines)
54086 +   accesses leaf level nodes through this interface. This way we can
54087 +   implement various block layouts and even combine various layouts
54088 +   within the same tree. Balancing/allocating algorithms should not
54089 +   care about peculiarities of splitting/merging specific item types,
54090 +   but rather should leave that to the item's item handler.
54091 +
54092 +   Items, including those that provide the abstraction of flows, have
54093 +   the property that if you move them in part or in whole to another
54094 +   node, the balancing code invokes their is_left_mergeable()
54095 +   item_operation to determine if they are mergeable with their new
54096 +   neighbor in the node you have moved them to.  For some items the
54097 +   is_left_mergeable() function always returns null.
54098 +
54099 +   When moving the bodies of items from one node to another:
54100 +
54101 +     if a partial item is shifted to another node the balancing code invokes
54102 +     an item handler method to handle the item splitting.
54103 +
54104 +     if the balancing code needs to merge with an item in the node it
54105 +     is shifting to, it will invoke an item handler method to handle
54106 +     the item merging.
54107 +
54108 +     if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
54109 +     adjusting the item headers after the move is done using the node handler.
54110 +*/
54111 +
54112 +#include "../../forward.h"
54113 +#include "../../debug.h"
54114 +#include "../../key.h"
54115 +#include "../../coord.h"
54116 +#include "../plugin_header.h"
54117 +#include "../item/item.h"
54118 +#include "node.h"
54119 +#include "../plugin.h"
54120 +#include "../../znode.h"
54121 +#include "../../tree.h"
54122 +#include "../../super.h"
54123 +#include "../../reiser4.h"
54124 +
54125 +/**
54126 + * leftmost_key_in_node - get the smallest key in node
54127 + * @node: node to take the key from
54128 + * @key: store result here
54129 + *
54130 + * Stores first item's key (*reiser4_max_key() for empty @node) in @key, returns it.
54131 + */
54132 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
54133 +{
54134 +       assert("nikita-1634", node != NULL);
54135 +       assert("nikita-1635", key != NULL);
54136 +
54137 +       if (!node_is_empty(node)) {
54138 +               coord_t first_item;
54139 +
54140 +               coord_init_first_unit(&first_item, (znode *) node);
54141 +               item_key_by_coord(&first_item, key);
54142 +       } else
54143 +               *key = *reiser4_max_key();
54144 +       return key;
54145 +}
54146 +
54147 +node_plugin node_plugins[LAST_NODE_ID] = {
54148 +       [NODE40_ID] = {
54149 +               .h = {
54150 +                       .type_id = REISER4_NODE_PLUGIN_TYPE,
54151 +                       .id = NODE40_ID,
54152 +                       .pops = NULL,
54153 +                       .label = "unified",
54154 +                       .desc = "unified node layout",
54155 +                       .linkage = {NULL, NULL}
54156 +               },
54157 +               .item_overhead = item_overhead_node40,
54158 +               .free_space = free_space_node40,
54159 +               .lookup = lookup_node40,
54160 +               .num_of_items = num_of_items_node40,
54161 +               .item_by_coord = item_by_coord_node40,
54162 +               .length_by_coord = length_by_coord_node40,
54163 +               .plugin_by_coord = plugin_by_coord_node40,
54164 +               .key_at = key_at_node40,
54165 +               .estimate = estimate_node40,
54166 +               .check = check_node40,
54167 +               .parse = parse_node40,
54168 +               .init = init_node40,
54169 +#ifdef GUESS_EXISTS
54170 +               .guess = guess_node40,
54171 +#endif
54172 +               .change_item_size = change_item_size_node40,
54173 +               .create_item = create_item_node40,
54174 +               .update_item_key = update_item_key_node40,
54175 +               .cut_and_kill = kill_node40,
54176 +               .cut = cut_node40,
54177 +               .shift = shift_node40,
54178 +               .shrink_item = shrink_item_node40,
54179 +               .fast_insert = fast_insert_node40,
54180 +               .fast_paste = fast_paste_node40,
54181 +               .fast_cut = fast_cut_node40,
54182 +               .max_item_size = max_item_size_node40,
54183 +               .prepare_removal = prepare_removal_node40,
54184 +               .set_item_plugin = set_item_plugin_node40
54185 +       }
54186 +};
54187 +
54188 +/*
54189 +   Local variables:
54190 +   c-indentation-style: "K&R"
54191 +   mode-name: "LC"
54192 +   c-basic-offset: 8
54193 +   tab-width: 8
54194 +   fill-column: 120
54195 +   scroll-step: 1
54196 +   End:
54197 +*/
54198 diff -puN /dev/null fs/reiser4/plugin/node/node.h
54199 --- /dev/null
54200 +++ a/fs/reiser4/plugin/node/node.h
54201 @@ -0,0 +1,272 @@
54202 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54203 +
54204 +/* We need a definition of the default node layout here. */
54205 +
54206 +/* Generally speaking, it is best to have free space in the middle of the
54207 +   node so that two sets of things can grow towards it, and to have the
54208 +   item bodies on the left so that the last one of them grows into free
54209 +   space.  We optimize for the case where we append new items to the end
54210 +   of the node, or grow the last item, because it hurts nothing to so
54211 +   optimize and it is a common special case to do massive insertions in
54212 +   increasing key order (and one of cases more likely to have a real user
54213 +   notice the delay time for).
54214 +
54215 +   formatted leaf default layout: (ldef1)
54216 +
54217 +   |node header:item bodies:free space:key + pluginid + item offset|
54218 +
54219 +   We grow towards the middle, optimizing layout for the case where we
54220 +   append new items to the end of the node.  The node header is fixed
54221 +   length.  Keys, and item offsets plus pluginids for the items
54222 +   corresponding to them are in increasing key order, and are fixed
54223 +   length.  Item offsets are relative to start of node (16 bits creating
54224 +   a node size limit of 64k, 12 bits might be a better choice....).  Item
54225 +   bodies are in decreasing key order.  Item bodies have a variable size.
54226 +   There is a one to one to one mapping of keys to item offsets to item
54227 +   bodies.  Item offsets consist of pointers to the zeroth byte of the
54228 +   item body.  Item length equals the start of the next item minus the
54229 +   start of this item, except the zeroth item whose length equals the end
54230 +   of the node minus the start of that item (plus a byte).  In other
54231 +   words, the item length is not recorded anywhere, and it does not need
54232 +   to be since it is computable.
54233 +
54234 +   Leaf variable length items and keys layout : (lvar)
54235 +
54236 +   |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
54237 +
54238 +   We grow towards the middle, optimizing layout for the case where we
54239 +   append new items to the end of the node.  The node header is fixed
54240 +   length.  Keys and item offsets for the items corresponding to them are
54241 +   in increasing key order, and keys are variable length.  Item offsets
54242 +   are relative to start of node (16 bits).  Item bodies are in
54243 +   decreasing key order.  Item bodies have a variable size.  There is a
54244 +   one to one to one mapping of keys to item offsets to item bodies.
54245 +   Item offsets consist of pointers to the zeroth byte of the item body.
54246 +   Item length equals the start of the next item's key minus the start of
54247 +   this item, except the zeroth item whose length equals the end of the
54248 +   node minus the start of that item (plus a byte).
54249 +
54250 +   leaf compressed keys layout: (lcomp)
54251 +
54252 +   |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
54253 +
54254 +   We grow towards the middle, optimizing layout for the case where we
54255 +   append new items to the end of the node.  The node header is fixed
54256 +   length.  Keys and item offsets for the items corresponding to them are
54257 +   in increasing key order, and keys are variable length.  The "key
54258 +   inherit" field indicates how much of the key prefix is identical to
54259 +   the previous key (stem compression as described in "Managing
54260 +   Gigabytes" is used).  key_inherit is a one byte integer.  The
54261 +   intra-node searches performed through this layout are linear searches,
54262 +   and this is theorized to not hurt performance much due to the high
54263 +   cost of processor stalls on modern CPUs, and the small number of keys
54264 +   in a single node.  Item offsets are relative to start of node (16
54265 +   bits).  Item bodies are in decreasing key order.  Item bodies have a
54266 +   variable size.  There is a one to one to one mapping of keys to item
54267 +   offsets to item bodies.  Item offsets consist of pointers to the
54268 +   zeroth byte of the item body.  Item length equals the start of the
54269 +   next item minus the start of this item, except the zeroth item whose
54270 +   length equals the end of the node minus the start of that item (plus a
54271 +   byte).  In other words, item length and key length is not recorded
54272 +   anywhere, and it does not need to be since it is computable.
54273 +
54274 +   internal node default layout: (idef1)
54275 +
54276 +   just like ldef1 except that item bodies are either blocknrs of
54277 +   children or extents, and moving them may require updating parent
54278 +   pointers in the nodes that they point to.
54279 +*/
54280 +
54281 +/* There is an inherent 3-way tradeoff between optimizing and
54282 +   exchanging disks between different architectures and code
54283 +   complexity.  This is optimal and simple and inexchangeable.
54284 +   Someone else can do the code for exchanging disks and make it
54285 +   complex. It would not be that hard.  Using other than the PAGE_SIZE
54286 +   might be suboptimal.
54287 +*/
54288 +
54289 +#if !defined( __REISER4_NODE_H__ )
54290 +#define __REISER4_NODE_H__
54291 +
54292 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
54293 +
54294 +#include "../../dformat.h"
54295 +#include "../plugin_header.h"
54296 +
54297 +#include <linux/types.h>
54298 +
54299 +typedef enum {
54300 +       NS_FOUND = 0,
54301 +       NS_NOT_FOUND = -ENOENT
54302 +} node_search_result;
54303 +
54304 +/* Maximal possible space overhead for creation of new item in a node */
54305 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
54306 +
54307 +typedef enum {
54308 +       REISER4_NODE_DKEYS = (1 << 0),
54309 +       REISER4_NODE_TREE_STABLE = (1 << 1)
54310 +} reiser4_node_check_flag;
54311 +
54312 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on stack */
54313 +struct cut_list {
54314 +       coord_t *from;
54315 +       coord_t *to;
54316 +       const reiser4_key *from_key;
54317 +       const reiser4_key *to_key;
54318 +       reiser4_key *smallest_removed;
54319 +       carry_plugin_info *info;
54320 +       __u32 flags;
54321 +       struct inode *inode;    /* this is to pass list of eflushed jnodes down to extent_kill_hook */
54322 +       lock_handle *left;
54323 +       lock_handle *right;
54324 +};
54325 +
54326 +struct carry_cut_data;
54327 +struct carry_kill_data;
54328 +
54329 +/* The responsibility of the node plugin is to store and give access
54330 +   to the sequence of items within the node.  */
54331 +typedef struct node_plugin {
54332 +       /* generic plugin fields */
54333 +       plugin_header h;
54334 +
54335 +       /* calculates the amount of space that will be required to store an
54336 +          item which is in addition to the space consumed by the item body.
54337 +          (the space consumed by the item body can be gotten by calling
54338 +          item->estimate) */
54339 +        size_t(*item_overhead) (const znode * node, flow_t * f);
54340 +
54341 +       /* returns free space by looking into node (i.e., without using
54342 +          znode->free_space). */
54343 +        size_t(*free_space) (znode * node);
54344 +       /* search within the node for the one item which might
54345 +          contain the key, invoking item->search_within to search within
54346 +          that item to see if it is in there */
54347 +        node_search_result(*lookup) (znode * node, const reiser4_key * key,
54348 +                                     lookup_bias bias, coord_t * coord);
54349 +       /* number of items in node */
54350 +       int (*num_of_items) (const znode * node);
54351 +
54352 +       /* store information about item in @coord in @data */
54353 +       /* break into several node ops, don't add any more uses of this before doing so */
54354 +       /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
54355 +       char *(*item_by_coord) (const coord_t * coord);
54356 +       int (*length_by_coord) (const coord_t * coord);
54357 +       item_plugin *(*plugin_by_coord) (const coord_t * coord);
54358 +
54359 +       /* store item key in @key */
54360 +       reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
54361 +       /* conservatively estimate whether unit of what size can fit
54362 +          into node. This estimation should be performed without
54363 +          actually looking into the node's content (free space is saved in
54364 +          znode). */
54365 +        size_t(*estimate) (znode * node);
54366 +
54367 +       /* performs every consistency check the node plugin author could
54368 +          imagine. Optional. */
54369 +       int (*check) (const znode * node, __u32 flags, const char **error);
54370 +
54371 +       /* Called when node is read into memory and node plugin is
54372 +          already detected. This should read some data into znode (like free
54373 +          space counter) and, optionally, check data consistency.
54374 +        */
54375 +       int (*parse) (znode * node);
54376 +       /* This method is called on a new node to initialise plugin specific
54377 +          data (header, etc.) */
54378 +       int (*init) (znode * node);
54379 +       /* Check whether @node content conforms to this plugin format.
54380 +          Probably only useful after support for old V3.x formats is added.
54381 +          Uncomment after 4.0 only.
54382 +        */
54383 +       /*      int ( *guess )( const znode *node ); */
54384 +#if REISER4_DEBUG
54385 +       void (*print) (const char *prefix, const znode * node, __u32 flags);
54386 +#endif
54387 +       /* change size of @item by @by bytes. @item->node has enough free
54388 +          space. When @by > 0 - free space is appended to end of item. When
54389 +          @by < 0 - item is truncated - it is assumed that last @by bytes if
54390 +          the item are freed already */
54391 +       void (*change_item_size) (coord_t * item, int by);
54392 +
54393 +       /* create new item @length bytes long in coord @target */
54394 +       int (*create_item) (coord_t * target, const reiser4_key * key,
54395 +                           reiser4_item_data * data, carry_plugin_info * info);
54396 +
54397 +       /* update key of item. */
54398 +       void (*update_item_key) (coord_t * target, const reiser4_key * key,
54399 +                                carry_plugin_info * info);
54400 +
54401 +       int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
54402 +       int (*cut) (struct carry_cut_data *, carry_plugin_info *);
54403 +
54404 +       /*
54405 +        * shrink item pointed to by @coord by @delta bytes.
54406 +        */
54407 +       int (*shrink_item) (coord_t * coord, int delta);
54408 +
54409 +       /* copy as much as possible but not more than up to @stop from
54410 +          @stop->node to @target. If (pend == append) then data from beginning of
54411 +          @stop->node are copied to the end of @target. If (pend == prepend) then
54412 +          data from the end of @stop->node are copied to the beginning of
54413 +          @target. Copied data are removed from @stop->node. Information
54414 +          about what to do on upper level is stored in @todo */
54415 +       int (*shift) (coord_t * stop, znode * target, shift_direction pend,
54416 +                     int delete_node, int including_insert_coord,
54417 +                     carry_plugin_info * info);
54418 +       /* return true if this node allows skip carry() in some situations
54419 +          (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
54420 +          emulation doesn't.
54421 +
54422 +          This will speed up insertions that don't require updates to the
54423 +          parent, by bypassing initialisation of carry() structures. It's
54424 +          believed that majority of insertions will fit there.
54425 +
54426 +        */
54427 +       int (*fast_insert) (const coord_t * coord);
54428 +       int (*fast_paste) (const coord_t * coord);
54429 +       int (*fast_cut) (const coord_t * coord);
54430 +       /* this limits max size of item which can be inserted into a node and
54431 +          number of bytes item in a node may be appended with */
54432 +       int (*max_item_size) (void);
54433 +       int (*prepare_removal) (znode * empty, carry_plugin_info * info);
54434 +       /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
54435 +        * files */
54436 +       int (*set_item_plugin) (coord_t * coord, item_id);
54437 +} node_plugin;
54438 +
54439 +typedef enum {
54440 +       /* standard unified node layout used for both leaf and internal
54441 +          nodes */
54442 +       NODE40_ID,
54443 +       LAST_NODE_ID    /* == number of node plugin ids defined above */
54444 +} reiser4_node_id;
54445 +
54446 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
54447 +#if REISER4_DEBUG
54448 +extern void print_node_content(const char *prefix, const znode * node,
54449 +                              __u32 flags);
54450 +#endif
54451 +
54452 +extern void indent_znode(const znode * node);
54453 +
54454 +typedef struct common_node_header {
54455 +       /*
54456 +        * 16-bit little-endian identifier of node plugin. Must be located at
54457 +        * the very beginning of a node.
54458 +        */
54459 +       __le16 plugin_id;
54460 +} common_node_header;
54461 +
54462 +/* __REISER4_NODE_H__ */
54463 +#endif
54464 +/*
54465 + * Local variables:
54466 + * c-indentation-style: "K&R"
54467 + * mode-name: "LC"
54468 + * c-basic-offset: 8
54469 + * tab-width: 8
54470 + * fill-column: 79
54471 + * scroll-step: 1
54472 + * End:
54473 + */
54474 diff -puN /dev/null fs/reiser4/plugin/node/node40.c
54475 --- /dev/null
54476 +++ a/fs/reiser4/plugin/node/node40.c
54477 @@ -0,0 +1,2924 @@
54478 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54479 +
54480 +#include "../../debug.h"
54481 +#include "../../key.h"
54482 +#include "../../coord.h"
54483 +#include "../plugin_header.h"
54484 +#include "../item/item.h"
54485 +#include "node.h"
54486 +#include "node40.h"
54487 +#include "../plugin.h"
54488 +#include "../../jnode.h"
54489 +#include "../../znode.h"
54490 +#include "../../pool.h"
54491 +#include "../../carry.h"
54492 +#include "../../tap.h"
54493 +#include "../../tree.h"
54494 +#include "../../super.h"
54495 +#include "../../reiser4.h"
54496 +
54497 +#include <asm/uaccess.h>
54498 +#include <linux/types.h>
54499 +#include <linux/prefetch.h>
54500 +
54501 +/* leaf 40 format:
54502 +
54503 +  [node header | item 0, item 1, .., item N-1 |  free space | item_head N-1, .. item_head 1, item head 0 ]
54504 +   plugin_id (16)                                                key
54505 +   free_space (16)                                               pluginid (16)
54506 +   free_space_start (16)                                         offset (16)
54507 +   level (8)
54508 +   num_items (16)
54509 +   magic (32)
54510 +   flush_time (32)
54511 +*/
54512 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs.  Change to "ReIs". */
54513 +/* magic number that is stored in ->magic field of node header */
54514 +static const __u32 REISER4_NODE_MAGIC = 0x52344653;    /* (*(__u32 *)"R4FS"); */
54515 +
54516 +static int prepare_for_update(znode * left, znode * right,
54517 +                             carry_plugin_info * info);
54518 +
54519 +/* return node40 header of @node: it sits at the very start of zdata(@node) */
54520 +static inline node40_header *node40_node_header(const znode * node     /* node to
54521 +                                                                        * query */ )
54522 +{
54523 +       assert("nikita-567", node != NULL);
54524 +       assert("nikita-568", znode_page(node) != NULL);
54525 +       assert("nikita-569", zdata(node) != NULL);
54526 +       return (node40_header *) zdata(node);
54527 +}
54528 +
54529 +/* get/set fields of node40_header; access is unaligned, 16/32/64-bit fields are little-endian */
54530 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
54531 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
54532 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
54533 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
54534 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
54535 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
54536 +
54537 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
54538 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
54539 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
54540 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
54541 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
54542 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
54543 +
54544 +/* plugin field of node header should be read/set by
54545 +   plugin_by_disk_id/save_disk_plugin */
54546 +
54547 +/* item headers are at the end of node, in reverse order: header of item 0 is last */
54548 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
54549 +{
54550 +       return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
54551 +}
54552 +
54553 +/* item header of the item @coord is set to: counted backwards from the end
54554 +   of coord->node data, exactly as node40_ih_at() does for an explicit pos */
54555 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
54556 +{
54557 +       char *node_end = zdata(coord->node) + znode_size(coord->node);
54558 +
54559 +       return (item_header40 *) node_end - coord->item_pos - 1;
54560 +}
54561 +
54562 +/* functions to get/set fields of item_header40 */
54563 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
54564 +
54565 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
54566 +
54567 +/* plugin field of item header should be read/set by
54568 +   plugin_by_disk_id/save_disk_plugin */
54569 +
54570 +/* plugin methods */
54571 +
54572 +/* plugin->u.node.item_overhead: always sizeof(item_header40), both arguments
54573 +   are unused. Look for description of this method in plugin/node/node.h */
54574 +size_t
54575 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
54576 +{
54577 +       return sizeof(item_header40);
54578 +}
54579 +
54580 +/* plugin->u.node.free_space: free_space counter read from the node40 header.
54581 +   look for description of this method in plugin/node/node.h */
54582 +size_t free_space_node40(znode * node)
54583 +{
54584 +       assert("nikita-577", node != NULL);
54585 +       assert("nikita-578", znode_is_loaded(node));
54586 +       assert("nikita-579", zdata(node) != NULL);
54587 +
54588 +       return nh40_get_free_space(node40_node_header(node));
54589 +}
54590 +
54591 +/* private inline version of num_of_items_node40() for use in this file. This
54592 +   is necessary, because address of num_of_items_node40() is taken and it is
54593 +   never inlined as a result. */
54594 +static inline short node40_num_of_items_internal(const znode * node)
54595 +{
54596 +       return nh40_get_num_items(node40_node_header(node));
54597 +}
54598 +
54599 +#if REISER4_DEBUG      /* item counter consistency check, debug builds only */
54600 +static inline void check_num_items(const znode * node)
54601 +{
54602 +       assert("nikita-2749",
54603 +              node40_num_of_items_internal(node) == node->nr_items);
54604 +       assert("nikita-2746", znode_is_write_locked(node));
54605 +}
54606 +#else  /* !REISER4_DEBUG */
54607 +#define check_num_items(node) noop
54608 +#endif /* REISER4_DEBUG */
54609 +
54610 +/* plugin->u.node.num_of_items: item counter read from the node40 header.
54611 +   look for description of this method in plugin/node/node.h */
54612 +int num_of_items_node40(const znode * node)
54613 +{
54614 +       return node40_num_of_items_internal(node);
54615 +}
54616 +
54617 +static void
54618 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
54619 +{
54620 +       assert("nikita-2751", node != NULL);
54621 +       assert("nikita-2750", nh == node40_node_header(node));
54622 +
54623 +       check_num_items(node);
54624 +       nh40_set_num_items(nh, value);  /* counter in the node header */
54625 +       node->nr_items = value;         /* copy kept in the znode */
54626 +       check_num_items(node);
54627 +}
54628 +
54629 +/* plugin->u.node.item_by_coord: address of the item body, i.e. start of node
54630 +   data plus the offset stored in the item header. See plugin/node/node.h */
54631 +char *item_by_coord_node40(const coord_t * coord)
54632 +{
54633 +       item_header40 *ih;
54634 +       char *p;
54635 +
54636 +       /* @coord is set to existing item */
54637 +       assert("nikita-596", coord != NULL);
54638 +       assert("vs-255", coord_is_existing_item(coord));
54639 +
54640 +       ih = node40_ih_at_coord(coord);
54641 +       p = zdata(coord->node) + ih40_get_offset(ih);
54642 +       return p;
54643 +}
54644 +
54645 +/* plugin->u.node.length_by_coord: distance from this item's offset to the
54646 +   next item's offset, or to free_space_start for the last item. See node.h */
54647 +int length_by_coord_node40(const coord_t * coord)
54648 +{
54649 +       item_header40 *ih;
54650 +       int result;
54651 +
54652 +       /* @coord is set to existing item */
54653 +       assert("vs-256", coord != NULL);
54654 +       assert("vs-257", coord_is_existing_item(coord));
54655 +
54656 +       ih = node40_ih_at_coord(coord);
54657 +       if ((int)coord->item_pos ==
54658 +           node40_num_of_items_internal(coord->node) - 1)
54659 +               result =
54660 +                   nh40_get_free_space_start(node40_node_header(coord->node)) -
54661 +                   ih40_get_offset(ih);
54662 +       else    /* headers are stored in reverse: ih - 1 is the next item */
54663 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
54664 +
54665 +       return result;
54666 +}
54667 +
54668 +static pos_in_node_t
54669 +node40_item_length(const znode * node, pos_in_node_t item_pos)
54670 +{
54671 +       item_header40 *ih;
54672 +       pos_in_node_t result;
54673 +
54674 +       /* @item_pos must be the position of an existing item in @node */
54675 +       assert("vs-256", node != NULL); /* NOTE(review): ids vs-256/vs-257 are also used in length_by_coord_node40() */
54676 +       assert("vs-257", node40_num_of_items_internal(node) > item_pos);
54677 +
54678 +       ih = node40_ih_at(node, item_pos);
54679 +       if (item_pos == node40_num_of_items_internal(node) - 1)
54680 +               result =
54681 +                   nh40_get_free_space_start(node40_node_header(node)) -
54682 +                   ih40_get_offset(ih);
54683 +       else    /* headers are stored in reverse: ih - 1 is item_pos + 1 */
54684 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
54685 +
54686 +       return result;
54687 +}
54688 +
54689 +/* plugin->u.node.plugin_by_coord: item plugin looked up by the plugin id
54690 +   stored in the item header. See description in plugin/node/node.h */
54691 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
54692 +{
54693 +       item_header40 *ih;
54694 +       item_plugin *result;
54695 +
54696 +       /* @coord is set to existing item */
54697 +       assert("vs-258", coord != NULL);
54698 +       assert("vs-259", coord_is_existing_item(coord));
54699 +
54700 +       ih = node40_ih_at_coord(coord);
54701 +       /* pass NULL instead of current tree. This is time critical call. */
54702 +       result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
54703 +       return result;
54704 +}
54705 +
54706 +/* plugin->u.node.key_at: copy the key stored in the item header into @key
54707 +   and return @key. Look for description in plugin/node/node.h */
54708 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
54709 +{
54710 +       item_header40 *ih;
54711 +
54712 +       assert("nikita-1765", coord_is_existing_item(coord));
54713 +
54714 +       /* @coord is set to existing item */
54715 +       ih = node40_ih_at_coord(coord);
54716 +       memcpy(key, &ih->key, sizeof(reiser4_key));
54717 +       return key;
54718 +}
54719 +
54720 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
54721 +
54722 +#define NODE_INCSTAT(n, counter)                                               \
54723 +       reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
54724 +
54725 +#define NODE_ADDSTAT(n, counter, val)                                          \
54726 +       reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
54727 +
54728 +/* plugin->u.node.lookup: binary search over item headers, finished by a
54729 +   sequential scan. Look for description of this method in plugin/node/node.h */
54730 +node_search_result lookup_node40(znode * node /* node to query */ ,
54731 +                                const reiser4_key * key /* key to look for */ ,
54732 +                                lookup_bias bias /* search bias */ ,
54733 +                                coord_t * coord /* resulting coord */ )
54734 +{
54735 +       int left;
54736 +       int right;
54737 +       int found;
54738 +       int items;
54739 +
54740 +       item_header40 *lefth;
54741 +       item_header40 *righth;
54742 +
54743 +       item_plugin *iplug;
54744 +       item_header40 *bstop;
54745 +       item_header40 *ih;
54746 +       cmp_t order;
54747 +
54748 +       assert("nikita-583", node != NULL);
54749 +       assert("nikita-584", key != NULL);
54750 +       assert("nikita-585", coord != NULL);
54751 +       assert("nikita-2693", znode_is_any_locked(node));
54752 +       cassert(REISER4_SEQ_SEARCH_BREAK > 2);
54753 +
54754 +       items = node_num_items(node);
54755 +
54756 +       if (unlikely(items == 0)) {
54757 +               coord_init_first_unit(coord, node);
54758 +               return NS_NOT_FOUND;
54759 +       }
54760 +
54761 +       /* binary search for item that can contain given key */
54762 +       left = 0;
54763 +       right = items - 1;
54764 +       coord->node = node;
54765 +       coord_clear_iplug(coord);
54766 +       found = 0;
54767 +
54768 +       lefth = node40_ih_at(node, left);
54769 +       righth = node40_ih_at(node, right);
54770 +
54771 +       /* It is known that for small arrays sequential search is on average
54772 +          more efficient than binary. This is because sequential search is
54773 +          coded as tight loop that can be better optimized by compilers and
54774 +          for small array size gain from this optimization makes sequential
54775 +          search the winner. Another, maybe more important, reason for this,
54776 +          is that sequential array is more CPU cache friendly, whereas binary
54777 +          search effectively destroys CPU caching.
54778 +
54779 +          Critical here is the notion of "smallness". Reasonable value of
54780 +          REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
54781 +          fs/reiser4/ulevel/ulevel.c:test_search().
54782 +
54783 +          Don't try to further optimize sequential search by scanning from
54784 +          right to left in attempt to use more efficient loop termination
54785 +          condition (comparison with 0). This doesn't work.
54786 +
54787 +        */
54788 +
54789 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
54790 +               int median;
54791 +               item_header40 *medianh;
54792 +
54793 +               median = (left + right) / 2;
54794 +               medianh = node40_ih_at(node, median);
54795 +
54796 +               assert("nikita-1084", median >= 0);
54797 +               assert("nikita-1085", median < items);
54798 +               switch (keycmp(key, &medianh->key)) {
54799 +               case LESS_THAN:
54800 +                       right = median;
54801 +                       righth = medianh;
54802 +                       break;
54803 +               default:
54804 +                       wrong_return_value("nikita-586", "keycmp");     /* falls through to GREATER_THAN */
54805 +               case GREATER_THAN:
54806 +                       left = median;
54807 +                       lefth = medianh;
54808 +                       break;
54809 +               case EQUAL_TO:
54810 +                       do {
54811 +                               --median;
54812 +                               /* headers are ordered from right to left */
54813 +                               ++medianh;
54814 +                       } while (median >= 0 && keyeq(key, &medianh->key));
54815 +                       right = left = median + 1;      /* lowest-numbered item with an equal key; also ends the loop */
54816 +                       ih = lefth = righth = medianh - 1;
54817 +                       found = 1;
54818 +                       break;
54819 +               }
54820 +       }
54821 +       /* sequential scan. Item headers, and, therefore, keys are stored at
54822 +          the rightmost part of a node from right to left. We are trying to
54823 +          access memory from left to right, and hence, scan in _descending_
54824 +          order of item numbers.
54825 +        */
54826 +       if (!found) {
54827 +               for (left = right, ih = righth; left >= 0; ++ih, --left) {
54828 +                       cmp_t comparison;
54829 +
54830 +                       prefetchkey(&(ih + 1)->key);
54831 +                       comparison = keycmp(&ih->key, key);
54832 +                       if (comparison == GREATER_THAN)
54833 +                               continue;
54834 +                       if (comparison == EQUAL_TO) {
54835 +                               found = 1;
54836 +                               do {
54837 +                                       --left;
54838 +                                       ++ih;
54839 +                               } while (left >= 0 && keyeq(&ih->key, key));
54840 +                               ++left;
54841 +                               --ih;
54842 +                       } else {
54843 +                               assert("nikita-1256", comparison == LESS_THAN);
54844 +                       }
54845 +                       break;
54846 +               }
54847 +               if (unlikely(left < 0))
54848 +                       left = 0;
54849 +       }
54850 +
54851 +       assert("nikita-3212", right >= left);
54852 +       assert("nikita-3214",
54853 +              equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
54854 +
54855 +       coord_set_item_pos(coord, left);
54856 +       coord->unit_pos = 0;
54857 +       coord->between = AT_UNIT;
54858 +
54859 +       /* key < leftmost key in a node or node is corrupted and keys
54860 +          are not sorted  */
54861 +       bstop = node40_ih_at(node, (unsigned)left);
54862 +       order = keycmp(&bstop->key, key);
54863 +       if (unlikely(order == GREATER_THAN)) {
54864 +               if (unlikely(left != 0)) {
54865 +                       /* screw up */
54866 +                       warning("nikita-587", "Key less than %i key in a node",
54867 +                               left);
54868 +                       reiser4_print_key("key", key);
54869 +                       reiser4_print_key("min", &bstop->key);
54870 +                       print_coord_content("coord", coord);
54871 +                       return RETERR(-EIO);
54872 +               } else {
54873 +                       coord->between = BEFORE_UNIT;
54874 +                       return NS_NOT_FOUND;
54875 +               }
54876 +       }
54877 +       /* left <= key, ok */
54878 +       iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
54879 +
54880 +       if (unlikely(iplug == NULL)) {
54881 +               warning("nikita-588", "Unknown plugin %i",
54882 +                       le16_to_cpu(get_unaligned(&bstop->plugin_id)));
54883 +               reiser4_print_key("key", key);
54884 +               print_coord_content("coord", coord);
54885 +               return RETERR(-EIO);
54886 +       }
54887 +
54888 +       coord_set_iplug(coord, iplug);
54889 +
54890 +       /* if exact key from item header was found by either search above,
54891 +          no further checks are necessary. */
54892 +       if (found) {
54893 +               assert("nikita-1259", order == EQUAL_TO);
54894 +               return NS_FOUND;
54895 +       }
54896 +       if (iplug->b.max_key_inside != NULL) {
54897 +               reiser4_key max_item_key;
54898 +
54899 +               /* key > max_item_key --- outside of an item */
54900 +               if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
54901 +                       coord->unit_pos = 0;
54902 +                       coord->between = AFTER_ITEM;
54903 +                       /* FIXME-VS: key we are looking for does not fit into
54904 +                          found item. Return NS_NOT_FOUND then. Without that
54905 +                          the following case does not work: there is extent of
54906 +                          file 10000, 10001. File 10000, 10002 has been just
54907 +                          created. When writing to position 0 in that file -
54908 +                          traverse_tree will stop here on twig level. When we
54909 +                          want it to go down to leaf level
54910 +                        */
54911 +                       return NS_NOT_FOUND;
54912 +               }
54913 +       }
54914 +
54915 +       if (iplug->b.lookup != NULL) {
54916 +               return iplug->b.lookup(key, bias, coord);
54917 +       } else {
54918 +               assert("nikita-1260", order == LESS_THAN);
54919 +               coord->between = AFTER_UNIT;
54920 +               return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
54921 +       }
54922 +}
54923 +
54924 +#undef NODE_ADDSTAT
54925 +#undef NODE_INCSTAT
54926 +
54927 +/* plugin->u.node.estimate: free space of @node minus room for one item
54928 +   header, or 0 when not even the header fits. See plugin/node/node.h */
54929 +size_t estimate_node40(znode * node)
54930 +{
54931 +       size_t result;
54932 +
54933 +       assert("nikita-597", node != NULL);
54934 +
54935 +       result = free_space_node40(node);
54936 +       /* size_t is unsigned: a plain subtraction would wrap, not go below 0 */
54937 +       return (result > sizeof(item_header40)) ? result - sizeof(item_header40) : 0;
54938 +}
54939 +
54940 +/* plugin->u.node.check: returns 0 if no inconsistency is found, -1 otherwise.
54941 +   look for description of this method in plugin/node/node.h */
54942 +int check_node40(const znode * node /* node to check */ ,
54943 +                __u32 flags /* check flags */ ,
54944 +                const char **error /* where to store error message */ )
54945 +{
54946 +       int nr_items;
54947 +       int i;
54948 +       reiser4_key prev;
54949 +       unsigned old_offset;
54950 +       tree_level level;
54951 +       coord_t coord;
54952 +       int result;
54953 +
54954 +       assert("nikita-580", node != NULL);
54955 +       assert("nikita-581", error != NULL);
54956 +       assert("nikita-2948", znode_is_loaded(node));
54957 +
54958 +       if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
54959 +               return 0;
54960 +
54961 +       assert("nikita-582", zdata(node) != NULL);
54962 +
54963 +       nr_items = node40_num_of_items_internal(node);
54964 +       if (nr_items < 0) {
54965 +               *error = "Negative number of items";
54966 +               return -1;
54967 +       }
54968 +
54969 +       if (flags & REISER4_NODE_DKEYS)
54970 +               prev = *znode_get_ld_key((znode *) node);
54971 +       else
54972 +               prev = *reiser4_min_key();
54973 +
54974 +       old_offset = 0;
54975 +       coord_init_zero(&coord);
54976 +       coord.node = (znode *) node;
54977 +       coord.unit_pos = 0;
54978 +       coord.between = AT_UNIT;
54979 +       level = znode_get_level(node);
54980 +       for (i = 0; i < nr_items; i++) {
54981 +               item_header40 *ih;
54982 +               reiser4_key unit_key;
54983 +               unsigned j;
54984 +
54985 +               ih = node40_ih_at(node, (unsigned)i);
54986 +               coord_set_item_pos(&coord, i);
54987 +               if ((ih40_get_offset(ih) >=
54988 +                    znode_size(node) - nr_items * sizeof(item_header40)) ||
54989 +                   (ih40_get_offset(ih) < sizeof(node40_header))) {
54990 +                       *error = "Offset is out of bounds";
54991 +                       return -1;
54992 +               }
54993 +               if (ih40_get_offset(ih) <= old_offset) {
54994 +                       *error = "Offsets are in wrong order";
54995 +                       return -1;
54996 +               }
54997 +               if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
54998 +                       *error = "Wrong offset of first item";
54999 +                       return -1;
55000 +               }
55001 +               old_offset = ih40_get_offset(ih);
55002 +
55003 +               if (keygt(&prev, &ih->key)) {
55004 +                       *error = "Keys are in wrong order";
55005 +                       return -1;
55006 +               }
55007 +               if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
55008 +                       *error = "Wrong key of first unit";
55009 +                       return -1;
55010 +               }
55011 +               prev = ih->key;
55012 +               for (j = 0; j < coord_num_units(&coord); ++j) {
55013 +                       coord.unit_pos = j;
55014 +                       unit_key_by_coord(&coord, &unit_key);
55015 +                       if (keygt(&prev, &unit_key)) {
55016 +                               *error = "Unit keys are in wrong order";
55017 +                               return -1;
55018 +                       }
55019 +                       prev = unit_key;
55020 +               }
55021 +               coord.unit_pos = 0;
55022 +               if (level != TWIG_LEVEL && item_is_extent(&coord)) {
55023 +                       *error = "extent on the wrong level";
55024 +                       return -1;
55025 +               }
55026 +               if (level == LEAF_LEVEL && item_is_internal(&coord)) {
55027 +                       *error = "internal item on the wrong level";
55028 +                       return -1;
55029 +               }
55030 +               if (level != LEAF_LEVEL &&
55031 +                   !item_is_internal(&coord) && !item_is_extent(&coord)) {
55032 +                       *error = "wrong item on the internal level";
55033 +                       return -1;
55034 +               }
55035 +               if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
55036 +                       *error = "non-internal item on the internal level";
55037 +                       return -1;
55038 +               }
55039 +#if REISER4_DEBUG
55040 +               if (item_plugin_by_coord(&coord)->b.check
55041 +                   && item_plugin_by_coord(&coord)->b.check(&coord, error))
55042 +                       return -1;
55043 +#endif
55044 +               if (i) {
55045 +                       coord_t prev_coord;
55046 +                       /* two neighboring items can not be mergeable */
55047 +                       coord_dup(&prev_coord, &coord);
55048 +                       coord_prev_item(&prev_coord);
55049 +                       if (are_items_mergeable(&prev_coord, &coord)) {
55050 +                               *error = "mergeable items in one node";
55051 +                               return -1;
55052 +                       }
55053 +
55054 +               }
55055 +       }
55056 +
55057 +       if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
55058 +               coord_t coord;  /* NOTE(review): shadows the function-level coord */
55059 +               item_plugin *iplug;
55060 +
55061 +               coord_init_last_unit(&coord, node);
55062 +               iplug = item_plugin_by_coord(&coord);
55063 +               if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
55064 +                   iplug->s.file.append_key != NULL) {
55065 +                       reiser4_key mkey;
55066 +
55067 +                       iplug->s.file.append_key(&coord, &mkey);
55068 +                       set_key_offset(&mkey, get_key_offset(&mkey) - 1);
55069 +                       read_lock_dk(current_tree);
55070 +                       result = keygt(&mkey, znode_get_rd_key((znode *) node));
55071 +                       read_unlock_dk(current_tree);
55072 +                       if (result) {
55073 +                               *error = "key of rightmost item is too large";
55074 +                               return -1;
55075 +                       }
55076 +               }
55077 +       }
55078 +       if (flags & REISER4_NODE_DKEYS) {
55079 +               read_lock_tree(current_tree);
55080 +               read_lock_dk(current_tree);
55081 +               /* NOTE(review): flag is forced on, so the TREE_STABLE test right below is always true */
55082 +               flags |= REISER4_NODE_TREE_STABLE;
55083 +
55084 +               if (keygt(&prev, znode_get_rd_key((znode *) node))) {
55085 +                       if (flags & REISER4_NODE_TREE_STABLE) {
55086 +                               *error = "Last key is greater than rdkey";
55087 +                               read_unlock_dk(current_tree);
55088 +                               read_unlock_tree(current_tree);
55089 +                               return -1;
55090 +                       }
55091 +               }
55092 +               if (keygt
55093 +                   (znode_get_ld_key((znode *) node),
55094 +                    znode_get_rd_key((znode *) node))) {
55095 +                       *error = "ldkey is greater than rdkey";
55096 +                       read_unlock_dk(current_tree);
55097 +                       read_unlock_tree(current_tree);
55098 +                       return -1;
55099 +               }
55100 +               if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
55101 +                   (node->left != NULL) &&
55102 +                   !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
55103 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
55104 +                        !keyeq(znode_get_rd_key(node->left),
55105 +                               znode_get_ld_key((znode *) node)))
55106 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55107 +                           keygt(znode_get_rd_key(node->left),
55108 +                                 znode_get_ld_key((znode *) node)))) {
55109 +                       *error = "left rdkey or ldkey is wrong";
55110 +                       read_unlock_dk(current_tree);
55111 +                       read_unlock_tree(current_tree);
55112 +                       return -1;
55113 +               }
55114 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
55115 +                   (node->right != NULL) &&
55116 +                   !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
55117 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
55118 +                        !keyeq(znode_get_rd_key((znode *) node),
55119 +                               znode_get_ld_key(node->right)))
55120 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
55121 +                           keygt(znode_get_rd_key((znode *) node),
55122 +                                 znode_get_ld_key(node->right)))) {
55123 +                       *error = "rdkey or right ldkey is wrong";
55124 +                       read_unlock_dk(current_tree);
55125 +                       read_unlock_tree(current_tree);
55126 +                       return -1;
55127 +               }
55128 +
55129 +               read_unlock_dk(current_tree);
55130 +               read_unlock_tree(current_tree);
55131 +       }
55132 +
55133 +       return 0;
55134 +}
55135 +
55136 +/* plugin->u.node.parse: check level and magic kept in the node40 header and set node->nr_items;
55137 +   result is 0 on success, -EIO otherwise. Look for description of this method in plugin/node/node.h */
55138 +int parse_node40(znode * node /* node to parse */ )
55139 +{
55140 +       node40_header *header;
55141 +       int result;
55142 +       d8 level;
55143 +
55144 +       header = node40_node_header((znode *) node);
55145 +       result = -EIO;  /* stays -EIO unless both checks below pass */
55146 +       level = nh40_get_level(header);
55147 +       if (unlikely(((__u8) znode_get_level(node)) != level))
55148 +               warning("nikita-494", "Wrong level found in node: %i != %i",
55149 +                       znode_get_level(node), level);
55150 +       else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
55151 +               warning("nikita-495",
55152 +                       "Wrong magic in tree node: want %x, got %x",
55153 +                       REISER4_NODE_MAGIC, nh40_get_magic(header));
55154 +       else {
55155 +               node->nr_items = node40_num_of_items_internal(node);
55156 +               result = 0;
55157 +       }
55158 +       return RETERR(result);
55159 +}
55160 +
55161 +/* plugin->u.node.init: zero the node40 header and fill it in for a node without items;
55162 +   always returns 0. Look for description of this method in plugin/node/node.h */
55163 +int init_node40(znode * node /* node to initialise */ )
55164 +{
55165 +       node40_header *header;
55166 +
55167 +       assert("nikita-570", node != NULL);
55168 +       assert("nikita-572", zdata(node) != NULL);
55169 +
55170 +       header = node40_node_header(node);
55171 +       memset(header, 0, sizeof(node40_header));
55172 +       nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
55173 +       nh40_set_free_space_start(header, sizeof(node40_header));
55174 +       /* sane hypothesis: 0 in CPU format is 0 in disk format */
55175 +       /* items: 0 (header field is left as zeroed by the memset above) */
55176 +       save_plugin_id(node_plugin_to_plugin(node->nplug),
55177 +                      &header->common_header.plugin_id);
55178 +       nh40_set_level(header, znode_get_level(node));
55179 +       nh40_set_magic(header, REISER4_NODE_MAGIC);
55180 +       node->nr_items = 0;
55181 +       nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
55182 +
55183 +       /* flags: 0 (likewise left as zeroed by the memset above) */
55184 +       return 0;
55185 +}
55186 +
55187 +#ifdef GUESS_EXISTS /* guess_node40() is compiled only when GUESS_EXISTS is defined */
55188 +int guess_node40(const znode * node /* node to guess plugin of */ )
55189 +{
55190 +       node40_header *nethack;
55191 +
55192 +       assert("nikita-1058", node != NULL);
55193 +       nethack = node40_node_header(node);
55194 +       return /* non-zero iff magic matches and the stored plugin id resolves to NODE40_ID */
55195 +           (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
55196 +           (plugin_by_disk_id(znode_get_tree(node), /* NOTE(review): result is dereferenced without a NULL check - confirm it cannot fail */
55197 +                              REISER4_NODE_PLUGIN_TYPE,
55198 +                              &nethack->common_header.plugin_id)->h.id ==
55199 +            NODE40_ID);
55200 +}
55201 +#endif /* GUESS_EXISTS */
55202 +
55203 +/* plugin->u.node.change_item_size: change size of item at @coord by @by bytes by shifting
55204 +   the bodies of all following items. Look for description of this method in plugin/node/node.h */
55205 +void change_item_size_node40(coord_t * coord, int by)
55206 +{
55207 +       node40_header *nh;
55208 +       item_header40 *ih;
55209 +       char *item_data;
55210 +       int item_length;
55211 +       unsigned i;
55212 +
55213 +       /* make sure that @coord is coord of existing item */
55214 +       assert("vs-210", coord_is_existing_item(coord));
55215 +
55216 +       nh = node40_node_header(coord->node);
55217 +
55218 +       item_data = item_by_coord_node40(coord);
55219 +       item_length = length_by_coord_node40(coord);
55220 +
55221 +       /* move bodies of items following @coord by @by bytes */
55222 +       ih = node40_ih_at_coord(coord);
55223 +       memmove(item_data + item_length + by, item_data + item_length,
55224 +               nh40_get_free_space_start(node40_node_header(coord->node)) -
55225 +               (ih40_get_offset(ih) + item_length));
55226 +
55227 +       /* update offsets of moved items */
55228 +       for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
55229 +               ih = node40_ih_at(coord->node, i);
55230 +               ih40_set_offset(ih, ih40_get_offset(ih) + by);
55231 +       }
55232 +
55233 +       /* update node header: free space changes by -@by, its start by +@by */
55234 +       nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
55235 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
55236 +}
55237 +
55238 +static int should_notify_parent(const znode * node)
55239 +{
55240 +       /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
55241 +       return !disk_addr_eq(znode_get_block(node),
55242 +                            &znode_get_tree(node)->root_block); /* non-zero unless @node's block is the tree's root_block */
55243 +}
55244 +
55245 +/* plugin->u.node.create_item: make room at @target, write item header for @key and fill the
55246 +   item body from @data; always returns 0. Look for description of this method in plugin/node/node.h */
55247 +int
55248 +create_item_node40(coord_t *target, const reiser4_key *key,
55249 +                  reiser4_item_data *data, carry_plugin_info *info)
55250 +{
55251 +       node40_header *nh;
55252 +       item_header40 *ih;
55253 +       unsigned offset;
55254 +       unsigned i;
55255 +
55256 +       nh = node40_node_header(target->node);
55257 +
55258 +       assert("vs-212", coord_is_between_items(target));
55259 +       /* node must have enough free space */
55260 +       assert("vs-254",
55261 +              free_space_node40(target->node) >=
55262 +              data->length + sizeof(item_header40));
55263 +       assert("vs-1410", data->length >= 0);
55264 +
55265 +       if (coord_set_to_right(target))
55266 +               /* there are no items to the right of @target, so, new item
55267 +                  will be inserted after last one */
55268 +               coord_set_item_pos(target, nh40_get_num_items(nh));
55269 +
55270 +       if (target->item_pos < nh40_get_num_items(nh)) {
55271 +               /* there are items to be moved to prepare space for new
55272 +                  item */
55273 +               ih = node40_ih_at_coord(target);
55274 +               /* new item will start at this offset */
55275 +               offset = ih40_get_offset(ih);
55276 +
55277 +               memmove(zdata(target->node) + offset + data->length,
55278 +                       zdata(target->node) + offset,
55279 +                       nh40_get_free_space_start(nh) - offset);
55280 +               /* update headers of moved items */
55281 +               for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
55282 +                       ih = node40_ih_at(target->node, i);
55283 +                       ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
55284 +               }
55285 +
55286 +               /* @ih is set to item header of the last item, move item headers by one slot */
55287 +               memmove(ih - 1, ih,
55288 +                       sizeof(item_header40) * (nh40_get_num_items(nh) -
55289 +                                                target->item_pos));
55290 +       } else {
55291 +               /* new item will start at this offset */
55292 +               offset = nh40_get_free_space_start(nh);
55293 +       }
55294 +
55295 +       /* make item header for the new item */
55296 +       ih = node40_ih_at_coord(target);
55297 +       memcpy(&ih->key, key, sizeof(reiser4_key));
55298 +       ih40_set_offset(ih, offset);
55299 +       save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
55300 +
55301 +       /* update node header */
55302 +       nh40_set_free_space(nh,
55303 +                           nh40_get_free_space(nh) - data->length -
55304 +                           sizeof(item_header40));
55305 +       nh40_set_free_space_start(nh,
55306 +                                 nh40_get_free_space_start(nh) + data->length);
55307 +       node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
55308 +
55309 +       /* FIXME: check how create_item works when between is set to BEFORE_UNIT */
55310 +       target->unit_pos = 0;
55311 +       target->between = AT_UNIT;
55312 +       coord_clear_iplug(target);
55313 +
55314 +       /* initialize item */
55315 +       if (data->iplug->b.init != NULL) {
55316 +               data->iplug->b.init(target, NULL, data);
55317 +       }
55318 +       /* copy item body */
55319 +       if (data->iplug->b.paste != NULL) {
55320 +               data->iplug->b.paste(target, data, info);
55321 +       } else if (data->data != NULL) {
55322 +               if (data->user) {
55323 +                       /* AUDIT: Should we really not check that the pointer
55324 +                          from userspace was valid and data bytes were
55325 +                          available? How will we return -EFAULT of some kind
55326 +                          without this check? */
55327 +                       assert("nikita-3038", reiser4_schedulable());
55328 +                       /* copy data from user space; the return value is not checked */
55329 +                       __copy_from_user(zdata(target->node) + offset,
55330 +                                        (const char __user *)data->data,
55331 +                                        (unsigned)data->length);
55332 +               } else
55333 +                       /* copy from kernel space */
55334 +                       memcpy(zdata(target->node) + offset, data->data,
55335 +                              (unsigned)data->length);
55336 +       }
55337 +
55338 +       if (target->item_pos == 0) {
55339 +               /* left delimiting key has to be updated */
55340 +               prepare_for_update(NULL, target->node, info);
55341 +       }
55342 +
55343 +       if (item_plugin_by_coord(target)->b.create_hook != NULL) {
55344 +               item_plugin_by_coord(target)->b.create_hook(target, data->arg);
55345 +       }
55346 +
55347 +       return 0;
55348 +}
55349 +
55350 +/* plugin->u.node.update_item_key: copy @key into the header of item at @target;
55351 +   look for description of this method in plugin/node/node.h */
55352 +void
55353 +update_item_key_node40(coord_t * target, const reiser4_key * key,
55354 +                      carry_plugin_info * info)
55355 +{
55356 +       item_header40 *ih;
55357 +
55358 +       ih = node40_ih_at_coord(target);
55359 +       memcpy(&ih->key, key, sizeof(reiser4_key));
55360 +
55361 +       if (target->item_pos == 0) { /* key of the first item in the node changed */
55362 +               prepare_for_update(NULL, target->node, info);
55363 +       }
55364 +}
55365 +
55366 +/* these bits encode cut mode; they are OR-ed together in cut40_info.mode */
55367 +#define CMODE_TAIL 1 /* an item gets its tail cut */
55368 +#define CMODE_WHOLE 2 /* one or more items are removed completely */
55369 +#define CMODE_HEAD 4 /* an item gets its head cut */
55370 +
55371 +struct cut40_info {
55372 +       int mode;       /* OR of CMODE_* bits */
55373 +       pos_in_node_t tail_removed;     /* position of item which gets tail removed */
55374 +       pos_in_node_t first_removed;    /* position of the leftmost item among items removed completely */
55375 +       pos_in_node_t removed_count;    /* number of items removed completely */
55376 +       pos_in_node_t head_removed;     /* position of item which gets head removed */
55377 +
55378 +       pos_in_node_t freed_space_start;        /* offset in node where the freed range starts */
55379 +       pos_in_node_t freed_space_end;  /* offset in node where the freed range ends; data from here on is moved by compact() */
55380 +       pos_in_node_t first_moved;      /* position of first item whose offset compact() updates */
55381 +       pos_in_node_t head_removed_location;    /* if set, offset compact() stores for the item at first_moved */
55382 +};
55383 +
55384 +static void init_cinfo(struct cut40_info *cinfo)
55385 +{
55386 +       cinfo->mode = 0;        /* no CMODE_* bits set */
55387 +       cinfo->tail_removed = MAX_POS_IN_NODE;  /* MAX_POS_IN_NODE marks a field as not set */
55388 +       cinfo->first_removed = MAX_POS_IN_NODE;
55389 +       cinfo->removed_count = MAX_POS_IN_NODE;
55390 +       cinfo->head_removed = MAX_POS_IN_NODE;
55391 +       cinfo->freed_space_start = MAX_POS_IN_NODE;
55392 +       cinfo->freed_space_end = MAX_POS_IN_NODE;
55393 +       cinfo->first_moved = MAX_POS_IN_NODE;
55394 +       cinfo->head_removed_location = MAX_POS_IN_NODE;
55395 +}
55396 +
55397 +/* complete cut_node40/kill_node40 by removing the gap described by @cinfo and updating item and node headers */
55398 +static void compact(znode * node, struct cut40_info *cinfo)
55399 +{
55400 +       node40_header *nh;
55401 +       item_header40 *ih;
55402 +       pos_in_node_t freed;
55403 +       pos_in_node_t pos, nr_items;
55404 +
55405 +       assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
55406 +                          cinfo->freed_space_end != MAX_POS_IN_NODE &&
55407 +                          cinfo->first_moved != MAX_POS_IN_NODE));
55408 +       assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
55409 +
55410 +       nh = node40_node_header(node);
55411 +       nr_items = nh40_get_num_items(nh);
55412 +
55413 +       /* remove gap made up by removal */
55414 +       memmove(zdata(node) + cinfo->freed_space_start,
55415 +               zdata(node) + cinfo->freed_space_end,
55416 +               nh40_get_free_space_start(nh) - cinfo->freed_space_end);
55417 +
55418 +       /* update item headers of moved items - change their locations */
55419 +       pos = cinfo->first_moved;
55420 +       ih = node40_ih_at(node, pos);
55421 +       if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
55422 +               assert("vs-1580", pos == cinfo->head_removed);
55423 +               ih40_set_offset(ih, cinfo->head_removed_location);
55424 +               pos++;
55425 +               ih--;
55426 +       }
55427 +
55428 +       freed = cinfo->freed_space_end - cinfo->freed_space_start;
55429 +       for (; pos < nr_items; pos++, ih--) {
55430 +               assert("vs-1581", ih == node40_ih_at(node, pos));
55431 +               ih40_set_offset(ih, ih40_get_offset(ih) - freed);
55432 +       }
55433 +
55434 +       /* free space start moved to left */
55435 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
55436 +
55437 +       if (cinfo->removed_count != MAX_POS_IN_NODE) {
55438 +               /* number of items changed. Remove item headers of those items */
55439 +               ih = node40_ih_at(node, nr_items - 1);
55440 +               memmove(ih + cinfo->removed_count, ih,
55441 +                       sizeof(item_header40) * (nr_items -
55442 +                                                cinfo->removed_count -
55443 +                                                cinfo->first_removed));
55444 +               freed += sizeof(item_header40) * cinfo->removed_count;
55445 +               node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
55446 +       }
55447 +
55448 +       /* total amount of free space increased (by freed bodies plus removed item headers) */
55449 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
55450 +}
55451 +
55452 +int shrink_item_node40(coord_t * coord, int delta) /* drop last @delta bytes of item at @coord; always returns 0 */
55453 +{
55454 +       node40_header *nh;
55455 +       item_header40 *ih;
55456 +       pos_in_node_t pos;
55457 +       pos_in_node_t nr_items;
55458 +       char *end;
55459 +       znode *node;
55460 +       int off;
55461 +
55462 +       assert("nikita-3487", coord != NULL);
55463 +       assert("nikita-3488", delta >= 0);
55464 +
55465 +       node = coord->node;
55466 +       nh = node40_node_header(node);
55467 +       nr_items = nh40_get_num_items(nh);
55468 +
55469 +       ih = node40_ih_at_coord(coord);
55470 +       assert("nikita-3489", delta <= length_by_coord_node40(coord));
55471 +       off = ih40_get_offset(ih) + length_by_coord_node40(coord);
55472 +       end = zdata(node) + off;
55473 +
55474 +       /* remove gap made up by removal */
55475 +       memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
55476 +
55477 +       /* update item headers of moved items - change their locations */
55478 +       pos = coord->item_pos + 1;
55479 +       ih = node40_ih_at(node, pos);
55480 +       for (; pos < nr_items; pos++, ih--) {
55481 +               assert("nikita-3490", ih == node40_ih_at(node, pos));
55482 +               ih40_set_offset(ih, ih40_get_offset(ih) - delta);
55483 +       }
55484 +
55485 +       /* free space start moved to left */
55486 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
55487 +       /* total amount of free space increased */
55488 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
55489 +       /*
55490 +        * This method does _not_ change the number of items. Hence, it cannot
55491 +        * make node empty. Also it doesn't remove items at all, which means
55492 +        * that no keys have to be updated either.
55493 +        */
55494 +       return 0;
55495 +}
55496 +
55497 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
55498 +   of cut. First is when a unit is removed from the middle of an item.  In this case this function returns 1. All the
55499 +   rest fits into second case: 0 or 1 item getting its tail cut, 0 or more items removed completely and 0 or 1 item
55500 +   getting its head cut. Function returns 0 in this case and describes the cut in @cinfo */
55501 +static int
55502 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
55503 +{
55504 +       reiser4_key left_key, right_key;
55505 +       reiser4_key min_from_key, max_to_key;
55506 +       const reiser4_key *from_key, *to_key;
55507 +
55508 +       init_cinfo(cinfo);
55509 +
55510 +       /* calculate minimal key stored in first item of items to be cut (params->from) */
55511 +       item_key_by_coord(params->from, &min_from_key);
55512 +       /* and max key stored in last item of items to be cut (params->to) */
55513 +       max_item_key_by_coord(params->to, &max_to_key);
55514 +
55515 +       /* if cut key range is not defined in input parameters - define it using cut coord range */
55516 +       if (params->from_key == NULL) {
55517 +               assert("vs-1513", params->to_key == NULL);
55518 +               unit_key_by_coord(params->from, &left_key);
55519 +               from_key = &left_key;
55520 +               max_unit_key_by_coord(params->to, &right_key);
55521 +               to_key = &right_key;
55522 +       } else {
55523 +               from_key = params->from_key;
55524 +               to_key = params->to_key;
55525 +       }
55526 +
55527 +       if (params->from->item_pos == params->to->item_pos) {
55528 +               if (keylt(&min_from_key, from_key)
55529 +                   && keylt(to_key, &max_to_key))
55530 +                       return 1;
55531 +
55532 +               if (keygt(from_key, &min_from_key)) {
55533 +                       /* tail of item is to be cut */
55534 +                       cinfo->tail_removed = params->from->item_pos;
55535 +                       cinfo->mode |= CMODE_TAIL;
55536 +               } else if (keylt(to_key, &max_to_key)) {
55537 +                       /* head of item is to be cut */
55538 +                       cinfo->head_removed = params->from->item_pos;
55539 +                       cinfo->mode |= CMODE_HEAD;
55540 +               } else {
55541 +                       /* item is removed completely */
55542 +                       cinfo->first_removed = params->from->item_pos;
55543 +                       cinfo->removed_count = 1;
55544 +                       cinfo->mode |= CMODE_WHOLE;
55545 +               }
55546 +       } else {
55547 +               cinfo->first_removed = params->from->item_pos + 1;
55548 +               cinfo->removed_count =
55549 +                   params->to->item_pos - params->from->item_pos - 1;
55550 +
55551 +               if (keygt(from_key, &min_from_key)) {
55552 +                       /* first item is not cut completely */
55553 +                       cinfo->tail_removed = params->from->item_pos;
55554 +                       cinfo->mode |= CMODE_TAIL;
55555 +               } else {
55556 +                       cinfo->first_removed--;
55557 +                       cinfo->removed_count++;
55558 +               }
55559 +               if (keylt(to_key, &max_to_key)) {
55560 +                       /* last item is not cut completely */
55561 +                       cinfo->head_removed = params->to->item_pos;
55562 +                       cinfo->mode |= CMODE_HEAD;
55563 +               } else {
55564 +                       cinfo->removed_count++;
55565 +               }
55566 +               if (cinfo->removed_count)
55567 +                       cinfo->mode |= CMODE_WHOLE;
55568 +       }
55569 +
55570 +       return 0;
55571 +}
55572 +
55573 +static void
55574 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
55575 +               carry_kill_data * kdata)
55576 +{
55577 +       coord_t coord;
55578 +       item_plugin *iplug;
55579 +       pos_in_node_t pos;
55580 +
55581 +       coord.node = node;
55582 +       coord.unit_pos = 0;
55583 +       coord.between = AT_UNIT;
55584 +       for (pos = 0; pos < count; pos++) { /* visit @count items of @node starting at position @from */
55585 +               coord_set_item_pos(&coord, from + pos);
55586 +               coord.unit_pos = 0;
55587 +               coord.between = AT_UNIT;
55588 +               iplug = item_plugin_by_coord(&coord);
55589 +               if (iplug->b.kill_hook) { /* kill_hook is optional; it is called for all units of the item */
55590 +                       iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
55591 +                                          kdata);
55592 +               }
55593 +       }
55594 +}
55595 +
55596 +/* this is used to kill item partially: passes units @from..@to of item at @coord to the item plugin's kill_units method */
55597 +static pos_in_node_t
55598 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
55599 +          reiser4_key * smallest_removed, reiser4_key * new_first_key)
55600 +{
55601 +       struct carry_kill_data *kdata;
55602 +       item_plugin *iplug;
55603 +
55604 +       kdata = data;
55605 +       iplug = item_plugin_by_coord(coord);
55606 +
55607 +       assert("vs-1524", iplug->b.kill_units);
55608 +       return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
55609 +                                  new_first_key);
55610 +}
55611 +
55612 +/* call item plugin to kill tail of item: units from coord->unit_pos to the last unit position */
55613 +static pos_in_node_t
55614 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
55615 +{
55616 +       struct carry_kill_data *kdata;
55617 +       pos_in_node_t to;
55618 +
55619 +       kdata = data;
55620 +       to = coord_last_unit_pos(coord);
55621 +       return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
55622 +                         NULL);
55623 +}
55624 +
55625 +/* call item plugin to kill head of item: units from 0 to coord->unit_pos */
55626 +static pos_in_node_t
55627 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
55628 +         reiser4_key * new_first_key)
55629 +{
55630 +       return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
55631 +                         new_first_key);
55632 +}
55633 +
55634 +/* this is used to cut item partially: passes units @from..@to of item at @coord to the item plugin's cut_units method */
55635 +static pos_in_node_t
55636 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
55637 +         reiser4_key * smallest_removed, reiser4_key * new_first_key)
55638 +{
55639 +       carry_cut_data *cdata;
55640 +       item_plugin *iplug;
55641 +
55642 +       cdata = data;
55643 +       iplug = item_plugin_by_coord(coord);
55644 +       assert("vs-302", iplug->b.cut_units);
55645 +       return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
55646 +                                 new_first_key);
55647 +}
55648 +
55649 +/* call item plugin to cut tail of item: units from coord->unit_pos to the last unit position */
55650 +static pos_in_node_t
55651 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
55652 +{
55653 +       carry_cut_data *cdata;
55654 +       pos_in_node_t to;
55655 +
55656 +       cdata = data;
55657 +       to = coord_last_unit_pos(cdata->params.from); /* NOTE(review): kill_tail() uses @coord here; presumably params.from is the same coord - verify against caller */
55658 +       return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
55659 +}
55660 +
55661 +/* call item plugin to cut head of item: units from 0 to coord->unit_pos */
55662 +static pos_in_node_t
55663 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
55664 +        reiser4_key * new_first_key)
55665 +{
55666 +       return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
55667 +                        new_first_key);
55668 +}
55669 +
55670 +/* this returns 1 of key of first item changed, 0 - if it did not */
55671 +static int
55672 +prepare_for_compact(struct cut40_info *cinfo,
55673 +                   const struct cut_kill_params *params, int is_cut,
55674 +                   void *data, carry_plugin_info * info)
55675 +{
55676 +       znode *node;
55677 +       item_header40 *ih;
55678 +       pos_in_node_t freed;
55679 +       pos_in_node_t item_pos;
55680 +       coord_t coord;
55681 +       reiser4_key new_first_key;
55682 +       pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
55683 +                                     void *, reiser4_key *, reiser4_key *);
55684 +       pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
55685 +       pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
55686 +                                    reiser4_key *);
55687 +       int retval;
55688 +
55689 +       retval = 0;
55690 +
55691 +       node = params->from->node;
55692 +
55693 +       assert("vs-184", node == params->to->node);
55694 +       assert("vs-312", !node_is_empty(node));
55695 +       assert("vs-297",
55696 +              coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
55697 +
55698 +       if (is_cut) {
55699 +               kill_units_f = cut_units;
55700 +               kill_tail_f = cut_tail;
55701 +               kill_head_f = cut_head;
55702 +       } else {
55703 +               kill_units_f = kill_units;
55704 +               kill_tail_f = kill_tail;
55705 +               kill_head_f = kill_head;
55706 +       }
55707 +
55708 +       if (parse_cut(cinfo, params) == 1) {
55709 +               /* cut from the middle of item */
55710 +               freed =
55711 +                   kill_units_f(params->from, params->from->unit_pos,
55712 +                                params->to->unit_pos, data,
55713 +                                params->smallest_removed, NULL);
55714 +
55715 +               item_pos = params->from->item_pos;
55716 +               ih = node40_ih_at(node, item_pos);
55717 +               cinfo->freed_space_start =
55718 +                   ih40_get_offset(ih) + node40_item_length(node,
55719 +                                                            item_pos) - freed;
55720 +               cinfo->freed_space_end = cinfo->freed_space_start + freed;
55721 +               cinfo->first_moved = item_pos + 1;
55722 +       } else {
55723 +               assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
55724 +                                  cinfo->first_removed != MAX_POS_IN_NODE ||
55725 +                                  cinfo->head_removed != MAX_POS_IN_NODE));
55726 +
55727 +               switch (cinfo->mode) {
55728 +               case CMODE_TAIL:
55729 +                       /* one item gets cut partially from its end */
55730 +                       assert("vs-1562",
55731 +                              cinfo->tail_removed == params->from->item_pos);
55732 +
55733 +                       freed =
55734 +                           kill_tail_f(params->from, data,
55735 +                                       params->smallest_removed);
55736 +
55737 +                       item_pos = cinfo->tail_removed;
55738 +                       ih = node40_ih_at(node, item_pos);
55739 +                       cinfo->freed_space_start =
55740 +                           ih40_get_offset(ih) + node40_item_length(node,
55741 +                                                                    item_pos) -
55742 +                           freed;
55743 +                       cinfo->freed_space_end =
55744 +                           cinfo->freed_space_start + freed;
55745 +                       cinfo->first_moved = cinfo->tail_removed + 1;
55746 +                       break;
55747 +
55748 +               case CMODE_WHOLE:
55749 +                       /* one or more items get removed completely */
55750 +                       assert("vs-1563",
55751 +                              cinfo->first_removed == params->from->item_pos);
55752 +                       assert("vs-1564", cinfo->removed_count > 0
55753 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
55754 +
55755 +                       /* call kill hook for all items removed completely */
55756 +                       if (is_cut == 0)
55757 +                               call_kill_hooks(node, cinfo->first_removed,
55758 +                                               cinfo->removed_count, data);
55759 +
55760 +                       item_pos = cinfo->first_removed;
55761 +                       ih = node40_ih_at(node, item_pos);
55762 +
55763 +                       if (params->smallest_removed)
55764 +                               memcpy(params->smallest_removed, &ih->key,
55765 +                                      sizeof(reiser4_key));
55766 +
55767 +                       cinfo->freed_space_start = ih40_get_offset(ih);
55768 +
55769 +                       item_pos += (cinfo->removed_count - 1);
55770 +                       ih -= (cinfo->removed_count - 1);
55771 +                       cinfo->freed_space_end =
55772 +                           ih40_get_offset(ih) + node40_item_length(node,
55773 +                                                                    item_pos);
55774 +                       cinfo->first_moved = item_pos + 1;
55775 +                       if (cinfo->first_removed == 0)
55776 +                               /* key of first item of the node changes */
55777 +                               retval = 1;
55778 +                       break;
55779 +
55780 +               case CMODE_HEAD:
55781 +                       /* one item gets cut partially from its head */
55782 +                       assert("vs-1565",
55783 +                              cinfo->head_removed == params->from->item_pos);
55784 +
55785 +                       freed =
55786 +                           kill_head_f(params->to, data,
55787 +                                       params->smallest_removed,
55788 +                                       &new_first_key);
55789 +
55790 +                       item_pos = cinfo->head_removed;
55791 +                       ih = node40_ih_at(node, item_pos);
55792 +                       cinfo->freed_space_start = ih40_get_offset(ih);
55793 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
55794 +                       cinfo->first_moved = cinfo->head_removed + 1;
55795 +
55796 +                       /* item head is removed, therefore, item key changed */
55797 +                       coord.node = node;
55798 +                       coord_set_item_pos(&coord, item_pos);
55799 +                       coord.unit_pos = 0;
55800 +                       coord.between = AT_UNIT;
55801 +                       update_item_key_node40(&coord, &new_first_key, NULL);
55802 +                       if (item_pos == 0)
55803 +                               /* key of first item of the node changes */
55804 +                               retval = 1;
55805 +                       break;
55806 +
55807 +               case CMODE_TAIL | CMODE_WHOLE:
55808 +                       /* one item gets cut from its end and one or more items get removed completely */
55809 +                       assert("vs-1566",
55810 +                              cinfo->tail_removed == params->from->item_pos);
55811 +                       assert("vs-1567",
55812 +                              cinfo->first_removed == cinfo->tail_removed + 1);
55813 +                       assert("vs-1564", cinfo->removed_count > 0
55814 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
55815 +
55816 +                       freed =
55817 +                           kill_tail_f(params->from, data,
55818 +                                       params->smallest_removed);
55819 +
55820 +                       item_pos = cinfo->tail_removed;
55821 +                       ih = node40_ih_at(node, item_pos);
55822 +                       cinfo->freed_space_start =
55823 +                           ih40_get_offset(ih) + node40_item_length(node,
55824 +                                                                    item_pos) -
55825 +                           freed;
55826 +
55827 +                       /* call kill hook for all items removed completely */
55828 +                       if (is_cut == 0)
55829 +                               call_kill_hooks(node, cinfo->first_removed,
55830 +                                               cinfo->removed_count, data);
55831 +
55832 +                       item_pos += cinfo->removed_count;
55833 +                       ih -= cinfo->removed_count;
55834 +                       cinfo->freed_space_end =
55835 +                           ih40_get_offset(ih) + node40_item_length(node,
55836 +                                                                    item_pos);
55837 +                       cinfo->first_moved = item_pos + 1;
55838 +                       break;
55839 +
55840 +               case CMODE_WHOLE | CMODE_HEAD:
55841 +                       /* one or more items get removed completely and one item gets cut partially from its head */
55842 +                       assert("vs-1568",
55843 +                              cinfo->first_removed == params->from->item_pos);
55844 +                       assert("vs-1564", cinfo->removed_count > 0
55845 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
55846 +                       assert("vs-1569",
55847 +                              cinfo->head_removed ==
55848 +                              cinfo->first_removed + cinfo->removed_count);
55849 +
55850 +                       /* call kill hook for all items removed completely */
55851 +                       if (is_cut == 0)
55852 +                               call_kill_hooks(node, cinfo->first_removed,
55853 +                                               cinfo->removed_count, data);
55854 +
55855 +                       item_pos = cinfo->first_removed;
55856 +                       ih = node40_ih_at(node, item_pos);
55857 +
55858 +                       if (params->smallest_removed)
55859 +                               memcpy(params->smallest_removed, &ih->key,
55860 +                                      sizeof(reiser4_key));
55861 +
55862 +                       freed =
55863 +                           kill_head_f(params->to, data, NULL, &new_first_key);
55864 +
55865 +                       cinfo->freed_space_start = ih40_get_offset(ih);
55866 +
55867 +                       ih = node40_ih_at(node, cinfo->head_removed);
55868 +                       /* this is the most complex case. Item which got head removed and items which are to be moved
55869 +                          intact change their location differently. */
55870 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
55871 +                       cinfo->first_moved = cinfo->head_removed;
55872 +                       cinfo->head_removed_location = cinfo->freed_space_start;
55873 +
55874 +                       /* item head is removed, therefore, item key changed */
55875 +                       coord.node = node;
55876 +                       coord_set_item_pos(&coord, cinfo->head_removed);
55877 +                       coord.unit_pos = 0;
55878 +                       coord.between = AT_UNIT;
55879 +                       update_item_key_node40(&coord, &new_first_key, NULL);
55880 +
55881 +                       assert("vs-1579", cinfo->first_removed == 0);
55882 +                       /* key of first item of the node changes */
55883 +                       retval = 1;
55884 +                       break;
55885 +
55886 +               case CMODE_TAIL | CMODE_HEAD:
55887 +                       /* one item gets cut from its tail and its neighbor gets cut from its head */
55888 +                       impossible("vs-1576", "this can not happen currently");
55889 +                       break;
55890 +
55891 +               case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
55892 +                       impossible("vs-1577", "this can not happen currently");
55893 +                       break;
55894 +               default:
55895 +                       impossible("vs-1578", "unexpected cut mode");
55896 +                       break;
55897 +               }
55898 +       }
55899 +       return retval;
55900 +}
55901 +
55902 +/* plugin->u.node.kill: runs prepare_for_compact() with is_cut == 0 and then compact() on the node of kdata->params.from.
55903 +   return value is number of items removed completely (0 when cinfo.removed_count == MAX_POS_IN_NODE) */
55904 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
55905 +{
55906 +       znode *node;
55907 +       struct cut40_info cinfo;
55908 +       int first_key_changed;
55909 +
55910 +       node = kdata->params.from->node;
55911 +
55912 +       first_key_changed =
55913 +           prepare_for_compact(&cinfo, &kdata->params, 0 /* is_cut == 0: kill, not cut */ , kdata,
55914 +                               info);
55915 +       compact(node, &cinfo);
55916 +
55917 +       if (info) {
55918 +               /* it is not called by node40_shift, so we have to take care
55919 +                  of changes on upper levels */
55920 +               if (node_is_empty(node)
55921 +                   && !(kdata->flags & DELETE_RETAIN_EMPTY))
55922 +                       /* all contents of node are deleted and DELETE_RETAIN_EMPTY is not set */
55923 +                       prepare_removal_node40(node, info);
55924 +               else if (first_key_changed) {
55925 +                       prepare_for_update(NULL, node, info);
55926 +               }
55927 +       }
55928 +
55929 +       coord_clear_iplug(kdata->params.from);
55930 +       coord_clear_iplug(kdata->params.to);
55931 +
55932 +       znode_make_dirty(node);
55933 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
55934 +}
55935 +
55936 +/* plugin->u.node.cut: runs prepare_for_compact() with is_cut == 1 and then compact() on the node of cdata->params.from.
55937 +   return value is number of items removed completely (0 when cinfo.removed_count == MAX_POS_IN_NODE) */
55938 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
55939 +{
55940 +       znode *node;
55941 +       struct cut40_info cinfo;
55942 +       int first_key_changed;
55943 +
55944 +       node = cdata->params.from->node;
55945 +
55946 +       first_key_changed =
55947 +           prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
55948 +                               info);
55949 +       compact(node, &cinfo);
55950 +
55951 +       if (info) {
55952 +               /* it is not called by node40_shift, so we have to take care
55953 +                  of changes on upper levels */
55954 +               if (node_is_empty(node))
55955 +                       /* all contents of node are deleted */
55956 +                       prepare_removal_node40(node, info);
55957 +               else if (first_key_changed) {
55958 +                       prepare_for_update(NULL, node, info);
55959 +               }
55960 +       }
55961 +
55962 +       coord_clear_iplug(cdata->params.from);
55963 +       coord_clear_iplug(cdata->params.to);
55964 +
55965 +       znode_make_dirty(node);
55966 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
55967 +}
55968 +
55969 +/* this structure is used by shift method of node40 plugin */
55970 +struct shift_params {
55971 +       shift_direction pend;   /* when @pend == SHIFT_LEFT - we are shifting to
55972 +                                  left, when @pend == SHIFT_RIGHT - to right */
55973 +       coord_t wish_stop;      /* when shifting to left this is last unit we
55974 +                                  want shifted, when shifting to right - this
55975 +                                  is set to unit we want to start shifting
55976 +                                  from */
55977 +       znode *target;          /* node the units are shifted into */
55978 +       int everything;         /* it is set to 1 if everything we have to shift is
55979 +                                  shifted, 0 - otherwise */
55980 +
55981 +       /* FIXME-VS: get rid of read_stop (presumably real_stop is meant - verify) */
55982 +
55983 +       /* these are set by estimate_shift */
55984 +       coord_t real_stop;      /* this will be set to last unit which will be
55985 +                                  really shifted */
55986 +
55987 +       /* coordinate in source node before operation of unit which becomes
55988 +          first after shift to left or last after shift to right */
55989 +       union {
55990 +               coord_t future_first;
55991 +               coord_t future_last;
55992 +       } u;
55993 +
55994 +       unsigned merging_units; /* number of units of boundary item of source which have to
55995 +                                  be merged with boundary item of target node */
55996 +       unsigned merging_bytes; /* number of bytes in those units */
55997 +
55998 +       unsigned entire;        /* number of items shifted in their entirety */
55999 +       unsigned entire_bytes;  /* number of bytes in those items */
56000 +
56001 +       unsigned part_units;    /* number of units of partially copied item */
56002 +       unsigned part_bytes;    /* number of bytes in those units */
56003 +
56004 +       unsigned shift_bytes;   /* total number of bytes in items shifted (item
56005 +                                  headers not included) */
56006 +
56007 +};
56008 +
56009 +static int item_creation_overhead(coord_t *item)
56010 +{      /* returns what ->item_overhead() of the node plugin of @item reports for item->node */
56011 +       return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
56012 +}
56013 +
56014 +/* how many units of item @source we want shifted: counted from source->unit_pos
56015 +   (first unit for SHIFT_LEFT, last unit otherwise - asserted below) but not further than @stop_coord */
56016 +static int
56017 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
56018 +{
56019 +       if (pend == SHIFT_LEFT) {
56020 +               assert("vs-181", source->unit_pos == 0);
56021 +       } else {
56022 +               assert("vs-182",
56023 +                      source->unit_pos == coord_last_unit_pos(source));
56024 +       }
56025 +
56026 +       if (source->item_pos != stop_coord->item_pos) {
56027 +               /* @source and @stop_coord are different items: all units of @source are wanted */
56028 +               return coord_last_unit_pos(source) + 1;
56029 +       }
56030 +
56031 +       if (pend == SHIFT_LEFT) {
56032 +               return stop_coord->unit_pos + 1;
56033 +       } else {
56034 +               return source->unit_pos - stop_coord->unit_pos + 1;
56035 +       }
56036 +}
56037 +
56038 +/* this calculates what can be copied from @shift->wish_stop.node to
56039 +   @shift->target: sets ->real_stop, ->everything and, where applicable, ->merging_* and ->part_*; ->shift_bytes, ->entire and ->entire_bytes are only added to (presumably zeroed by caller - verify). @ctx is not used here */
56040 +static void
56041 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
56042 +{
56043 +       unsigned target_free_space, size;
56044 +       pos_in_node_t stop_item;        /* item which estimating should not consider */
56045 +       unsigned want;          /* number of units of item we want shifted */
56046 +       coord_t source;         /* item being estimated */
56047 +       item_plugin *iplug;
56048 +
56049 +       /* shifting to left/right starts from first/last units of
56050 +          @shift->wish_stop.node */
56051 +       if (shift->pend == SHIFT_LEFT) {
56052 +               coord_init_first_unit(&source, shift->wish_stop.node);
56053 +       } else {
56054 +               coord_init_last_unit(&source, shift->wish_stop.node);
56055 +       }
56056 +       shift->real_stop = source;
56057 +
56058 +       /* free space in target node */
56059 +       target_free_space = znode_free_space(shift->target);
56060 +
56061 +       shift->everything = 0;
56062 +       if (!node_is_empty(shift->target)) {
56063 +               /* target node is not empty, check for boundary items
56064 +                  mergeability */
56065 +               coord_t to;
56066 +
56067 +               /* item we try to merge @source with */
56068 +               if (shift->pend == SHIFT_LEFT) {
56069 +                       coord_init_last_unit(&to, shift->target);
56070 +               } else {
56071 +                       coord_init_first_unit(&to, shift->target);
56072 +               }
56073 +
56074 +               if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
56075 +                                                                     &source) :
56076 +                   are_items_mergeable(&source, &to)) {
56077 +                       /* how many units of @source do we want to merge to
56078 +                          item @to */
56079 +                       want =
56080 +                           wanted_units(&source, &shift->wish_stop,
56081 +                                        shift->pend);
56082 +
56083 +                       /* how many units of @source we can merge to item
56084 +                          @to */
56085 +                       iplug = item_plugin_by_coord(&source);
56086 +                       if (iplug->b.can_shift != NULL)
56087 +                               shift->merging_units =
56088 +                                   iplug->b.can_shift(target_free_space,
56089 +                                                      &source, shift->target,
56090 +                                                      shift->pend, &size,
56091 +                                                      want);
56092 +                       else {
56093 +                               shift->merging_units = 0;
56094 +                               size = 0;
56095 +                       }
56096 +                       shift->merging_bytes = size;
56097 +                       shift->shift_bytes += size;
56098 +                       /* update stop coord to be set to last unit of @source
56099 +                          we can merge to @target */
56100 +                       if (shift->merging_units)
56101 +                               /* at least one unit can be shifted */
56102 +                               shift->real_stop.unit_pos =
56103 +                                   (shift->merging_units - source.unit_pos -
56104 +                                    1) * shift->pend;
56105 +                       else {
56106 +                               /* nothing can be shifted */
56107 +                               if (shift->pend == SHIFT_LEFT)
56108 +                                       coord_init_before_first_item(&shift->
56109 +                                                                    real_stop,
56110 +                                                                    source.
56111 +                                                                    node);
56112 +                               else
56113 +                                       coord_init_after_last_item(&shift->
56114 +                                                                  real_stop,
56115 +                                                                  source.node);
56116 +                       }
56117 +                       assert("nikita-2081", shift->real_stop.unit_pos + 1);
56118 +
56119 +                       if (shift->merging_units != want) {
56120 +                               /* we could not copy as many as we want, so,
56121 +                                  there is no reason for estimating any
56122 +                                  longer */
56123 +                               return;
56124 +                       }
56125 +
56126 +                       target_free_space -= size;
56127 +                       coord_add_item_pos(&source, shift->pend);
56128 +               }
56129 +       }
56130 +
56131 +       /* position of the first item nothing of which we want to shift */
56132 +       stop_item = shift->wish_stop.item_pos + shift->pend;
56133 +
56134 +       /* calculate how many items can be copied into given free
56135 +          space as whole */
56136 +       for (; source.item_pos != stop_item;
56137 +            coord_add_item_pos(&source, shift->pend)) {
56138 +               if (shift->pend == SHIFT_RIGHT)
56139 +                       source.unit_pos = coord_last_unit_pos(&source);
56140 +
56141 +               /* how many units of @source do we want to copy */
56142 +               want = wanted_units(&source, &shift->wish_stop, shift->pend);
56143 +
56144 +               if (want == coord_last_unit_pos(&source) + 1) {
56145 +                       /* we want this item to be copied entirely */
56146 +                       size =
56147 +                           item_length_by_coord(&source) +
56148 +                           item_creation_overhead(&source);
56149 +                       if (size <= target_free_space) {
56150 +                               /* item fits into target node as whole */
56151 +                               target_free_space -= size;
56152 +                               shift->shift_bytes +=
56153 +                                   size - item_creation_overhead(&source);
56154 +                               shift->entire_bytes +=
56155 +                                   size - item_creation_overhead(&source);
56156 +                               shift->entire++;
56157 +
56158 +                               /* update shift->real_stop coord to be set to
56159 +                                  last unit of @source we can copy to
56160 +                                  @target */
56161 +                               shift->real_stop = source;
56162 +                               if (shift->pend == SHIFT_LEFT)
56163 +                                       shift->real_stop.unit_pos =
56164 +                                           coord_last_unit_pos(&shift->
56165 +                                                               real_stop);
56166 +                               else
56167 +                                       shift->real_stop.unit_pos = 0;
56168 +                               continue;
56169 +                       }
56170 +               }
56171 +
56172 +               /* we reach here only for an item which does not fit into
56173 +                  target node in its entirety. This item may be either
56174 +                  partially shifted, or not shifted at all. We will have to
56175 +                  create new item in target node, so decrease amount of free
56176 +                  space by an item creation overhead. We can reach here also
56177 +                  if stop coord is in this item */
56178 +               if (target_free_space >=
56179 +                   (unsigned)item_creation_overhead(&source)) {
56180 +                       target_free_space -= item_creation_overhead(&source);
56181 +                       iplug = item_plugin_by_coord(&source);
56182 +                       if (iplug->b.can_shift) {
56183 +                               shift->part_units = iplug->b.can_shift(target_free_space,
56184 +                                                                      &source,
56185 +                                                                      NULL, /* target */
56186 +                                                                      shift->pend,
56187 +                                                                      &size,
56188 +                                                                      want);
56189 +                       } else {
56190 +                               target_free_space = 0;
56191 +                               shift->part_units = 0;
56192 +                               size = 0;
56193 +                       }
56194 +               } else {
56195 +                       target_free_space = 0;
56196 +                       shift->part_units = 0;
56197 +                       size = 0;
56198 +               }
56199 +               shift->part_bytes = size;
56200 +               shift->shift_bytes += size;
56201 +
56202 +               /* set @shift->real_stop to last unit of @source we can copy
56203 +                  to @shift->target */
56204 +               if (shift->part_units) {
56205 +                       shift->real_stop = source;
56206 +                       shift->real_stop.unit_pos =
56207 +                           (shift->part_units - source.unit_pos -
56208 +                            1) * shift->pend;
56209 +                       assert("nikita-2082", shift->real_stop.unit_pos + 1);
56210 +               }
56211 +
56212 +               if (want != shift->part_units)
56213 +                       /* not everything wanted was shifted */
56214 +                       return;
56215 +               break;
56216 +       }
56217 +
56218 +       shift->everything = 1;
56219 +}
56220 +
56221 +static void
56222 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
56223 +          shift_direction dir, unsigned free_space)
56224 +{      /* calls ->b.copy_units() of the item plugin of @source; for SHIFT_RIGHT also re-sets the item key of @target from unit_key_by_coord(@target) */
56225 +       item_plugin *iplug;
56226 +
56227 +       assert("nikita-1463", target != NULL);
56228 +       assert("nikita-1464", source != NULL);
56229 +       assert("nikita-1465", from + count <= coord_num_units(source));
56230 +
56231 +       iplug = item_plugin_by_coord(source);
56232 +       assert("nikita-1468", iplug == item_plugin_by_coord(target));
56233 +       iplug->b.copy_units(target, source, from, count, dir, free_space);
56234 +
56235 +       if (dir == SHIFT_RIGHT) {
56236 +               /* FIXME-VS: this looks not necessary. update_item_key was
56237 +                  called already by copy_units method */
56238 +               reiser4_key split_key;
56239 +
56240 +               assert("nikita-1469", target->unit_pos == 0);
56241 +
56242 +               unit_key_by_coord(target, &split_key);
56243 +               node_plugin_by_coord(target)->update_item_key(target,
56244 +                                                             &split_key, NULL);
56245 +       }
56246 +}
56247 +
56248 +/* copy part of @shift->real_stop.node starting either from its beginning or
56249 +   from its end and ending at @shift->real_stop to either the end or the
56250 +   beginning of @shift->target */
56251 +static void copy(struct shift_params *shift)
56252 +{
56253 +       node40_header *nh;
56254 +       coord_t from;
56255 +       coord_t to;
56256 +       item_header40 *from_ih, *to_ih;
56257 +       int free_space_start;
56258 +       int new_items;
56259 +       unsigned old_items;
56260 +       int old_offset;
56261 +       unsigned i;
56262 +
56263 +       nh = node40_node_header(shift->target);
56264 +       free_space_start = nh40_get_free_space_start(nh);
56265 +       old_items = nh40_get_num_items(nh);
56266 +       new_items = shift->entire + (shift->part_units ? 1 : 0);
56267 +       assert("vs-185",
56268 +              shift->shift_bytes ==
56269 +              shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
56270 +
56271 +       from = shift->wish_stop;
56272 +
56273 +       coord_init_first_unit(&to, shift->target);
56274 +
56275 +       /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
56276 +          hence to.between is set to EMPTY_NODE above. Looks like we want it
56277 +          to be AT_UNIT.
56278 +
56279 +          Oh, wonders of ->betweeness...
56280 +
56281 +        */
56282 +       to.between = AT_UNIT;
56283 +
56284 +       if (shift->pend == SHIFT_LEFT) {
56285 +               /* copying to left */
56286 +
56287 +               coord_set_item_pos(&from, 0);
56288 +               from_ih = node40_ih_at(from.node, 0);
56289 +
56290 +               coord_set_item_pos(&to,
56291 +                                  node40_num_of_items_internal(to.node) - 1);
56292 +               if (shift->merging_units) {
56293 +                       /* expand last item, so that plugin methods will see
56294 +                          correct data */
56295 +                       free_space_start += shift->merging_bytes;
56296 +                       nh40_set_free_space_start(nh,
56297 +                                                 (unsigned)free_space_start);
56298 +                       nh40_set_free_space(nh,
56299 +                                           nh40_get_free_space(nh) -
56300 +                                           shift->merging_bytes);
56301 +
56302 +                       /* appending last item of @target */
56303 +                       copy_units(&to, &from, 0,       /* starting from 0-th unit */
56304 +                                  shift->merging_units, SHIFT_LEFT,
56305 +                                  shift->merging_bytes);
56306 +                       coord_inc_item_pos(&from);
56307 +                       from_ih--;
56308 +                       coord_inc_item_pos(&to);
56309 +               }
56310 +
56311 +               to_ih = node40_ih_at(shift->target, old_items);
56312 +               if (shift->entire) {
56313 +                       /* copy @entire items entirely */
56314 +
56315 +                       /* copy item headers */
56316 +                       memcpy(to_ih - shift->entire + 1,
56317 +                              from_ih - shift->entire + 1,
56318 +                              shift->entire * sizeof(item_header40));
56319 +                       /* update item header offset */
56320 +                       old_offset = ih40_get_offset(from_ih);
56321 +                       /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
56322 +                       for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
56323 +                               ih40_set_offset(to_ih,
56324 +                                               ih40_get_offset(from_ih) -
56325 +                                               old_offset + free_space_start);
56326 +
56327 +                       /* copy item bodies */
56328 +                       memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset,  /*ih40_get_offset (from_ih), */
56329 +                              shift->entire_bytes);
56330 +
56331 +                       coord_add_item_pos(&from, (int)shift->entire);
56332 +                       coord_add_item_pos(&to, (int)shift->entire);
56333 +               }
56334 +
56335 +               nh40_set_free_space_start(nh,
56336 +                                         free_space_start +
56337 +                                         shift->shift_bytes -
56338 +                                         shift->merging_bytes);
56339 +               nh40_set_free_space(nh,
56340 +                                   nh40_get_free_space(nh) -
56341 +                                   (shift->shift_bytes - shift->merging_bytes +
56342 +                                    sizeof(item_header40) * new_items));
56343 +
56344 +               /* update node header */
56345 +               node40_set_num_items(shift->target, nh, old_items + new_items);
56346 +               assert("vs-170",
56347 +                      nh40_get_free_space(nh) < znode_size(shift->target));
56348 +
56349 +               if (shift->part_units) {
56350 +                       /* copy heading part (@part units) of @source item as
56351 +                          a new item into @target->node */
56352 +
56353 +                       /* copy item header of partially copied item */
56354 +                       coord_set_item_pos(&to,
56355 +                                          node40_num_of_items_internal(to.node)
56356 +                                          - 1);
56357 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
56358 +                       ih40_set_offset(to_ih,
56359 +                                       nh40_get_free_space_start(nh) -
56360 +                                       shift->part_bytes);
56361 +                       if (item_plugin_by_coord(&to)->b.init)
56362 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
56363 +                                                                 NULL);
56364 +                       copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
56365 +                                  shift->part_bytes);
56366 +               }
56367 +
56368 +       } else {
56369 +               /* copying to right */
56370 +
56371 +               coord_set_item_pos(&from,
56372 +                                  node40_num_of_items_internal(from.node) - 1);
56373 +               from_ih = node40_ih_at_coord(&from);
56374 +
56375 +               coord_set_item_pos(&to, 0);
56376 +
56377 +               /* prepare space for new items */
56378 +               memmove(zdata(to.node) + sizeof(node40_header) +
56379 +                       shift->shift_bytes,
56380 +                       zdata(to.node) + sizeof(node40_header),
56381 +                       free_space_start - sizeof(node40_header));
56382 +               /* update item headers of moved items */
56383 +               to_ih = node40_ih_at(to.node, 0);
56384 +               /* first item gets @merging_bytes longer. free space appears
56385 +                  at its beginning */
56386 +               if (!node_is_empty(to.node))
56387 +                       ih40_set_offset(to_ih,
56388 +                                       ih40_get_offset(to_ih) +
56389 +                                       shift->shift_bytes -
56390 +                                       shift->merging_bytes);
56391 +
56392 +               for (i = 1; i < old_items; i++)
56393 +                       ih40_set_offset(to_ih - i,
56394 +                                       ih40_get_offset(to_ih - i) +
56395 +                                       shift->shift_bytes);
56396 +
56397 +               /* move item headers to make space for new items */
56398 +               memmove(to_ih - old_items + 1 - new_items,
56399 +                       to_ih - old_items + 1,
56400 +                       sizeof(item_header40) * old_items);
56401 +               to_ih -= (new_items - 1);
56402 +
56403 +               nh40_set_free_space_start(nh,
56404 +                                         free_space_start +
56405 +                                         shift->shift_bytes);
56406 +               nh40_set_free_space(nh,
56407 +                                   nh40_get_free_space(nh) -
56408 +                                   (shift->shift_bytes +
56409 +                                    sizeof(item_header40) * new_items));
56410 +
56411 +               /* update node header */
56412 +               node40_set_num_items(shift->target, nh, old_items + new_items);
56413 +               assert("vs-170",
56414 +                      nh40_get_free_space(nh) < znode_size(shift->target));
56415 +
56416 +               if (shift->merging_units) {
56417 +                       coord_add_item_pos(&to, new_items);
56418 +                       to.unit_pos = 0;
56419 +                       to.between = AT_UNIT;
56420 +                       /* prepend first item of @to */
56421 +                       copy_units(&to, &from,
56422 +                                  coord_last_unit_pos(&from) -
56423 +                                  shift->merging_units + 1,
56424 +                                  shift->merging_units, SHIFT_RIGHT,
56425 +                                  shift->merging_bytes);
56426 +                       coord_dec_item_pos(&from);
56427 +                       from_ih++;
56428 +               }
56429 +
56430 +               if (shift->entire) {
56431 +                       /* copy @entire items entirely */
56432 +
56433 +                       /* copy item headers */
56434 +                       memcpy(to_ih, from_ih,
56435 +                              shift->entire * sizeof(item_header40));
56436 +
56437 +                       /* update item header offset */
56438 +                       old_offset =
56439 +                           ih40_get_offset(from_ih + shift->entire - 1);
56440 +                       /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
56441 +                       for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
56442 +                               ih40_set_offset(to_ih,
56443 +                                               ih40_get_offset(from_ih) -
56444 +                                               old_offset +
56445 +                                               sizeof(node40_header) +
56446 +                                               shift->part_bytes);
56447 +                       /* copy item bodies */
56448 +                       coord_add_item_pos(&from, -(int)(shift->entire - 1));
56449 +                       memcpy(zdata(to.node) + sizeof(node40_header) +
56450 +                              shift->part_bytes, item_by_coord_node40(&from),
56451 +                              shift->entire_bytes);
56452 +                       coord_dec_item_pos(&from);
56453 +               }
56454 +
56455 +               if (shift->part_units) {
56456 +                       coord_set_item_pos(&to, 0);
56457 +                       to.unit_pos = 0;
56458 +                       to.between = AT_UNIT;
56459 +                       /* copy heading part (@part units) of @source item as
56460 +                          a new item into @target->node */
56461 +
56462 +                       /* copy item header of partially copied item */
56463 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
56464 +                       ih40_set_offset(to_ih, sizeof(node40_header));
56465 +                       if (item_plugin_by_coord(&to)->b.init)
56466 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
56467 +                                                                 NULL);
56468 +                       copy_units(&to, &from,
56469 +                                  coord_last_unit_pos(&from) -
56470 +                                  shift->part_units + 1, shift->part_units,
56471 +                                  SHIFT_RIGHT, shift->part_bytes);
56472 +               }
56473 +       }
56474 +}
56475 +
56476 +/* remove from the source node what was copied to the target: everything on
56477 +   one side of @shift->real_stop. Number of items removed completely is returned */
56478 +static int delete_copied(struct shift_params *shift)
56479 +{
56480 +       coord_t from;
56481 +       coord_t to;
56482 +       struct carry_cut_data cdata;
56483 +
56484 +       if (shift->pend == SHIFT_LEFT) {
56485 +               /* we were shifting to left, remove everything from the
56486 +                  beginning of @shift->real_stop.node up to
56487 +                  @shift->real_stop */
56488 +               coord_init_first_unit(&from, shift->real_stop.node);
56489 +               to = shift->real_stop;
56490 +
56491 +               /* store old coordinate of unit which will be first after
56492 +                  shift to left */
56493 +               shift->u.future_first = to;
56494 +               coord_next_unit(&shift->u.future_first);
56495 +       } else {
56496 +               /* we were shifting to right, remove everything from
56497 +                  @shift->real_stop up to the end of
56498 +                  @shift->real_stop.node */
56499 +               from = shift->real_stop;
56500 +               coord_init_last_unit(&to, from.node);
56501 +
56502 +               /* store old coordinate of unit which will be last after
56503 +                  shift to right */
56504 +               shift->u.future_last = from;
56505 +               coord_prev_unit(&shift->u.future_last);
56506 +       }
56507 +       /* range to cut is given by coords only, all key fields are NULL */
56508 +       cdata.params.from = &from;
56509 +       cdata.params.to = &to;
56510 +       cdata.params.from_key = NULL;
56511 +       cdata.params.to_key = NULL;
56512 +       cdata.params.smallest_removed = NULL;
56513 +       return cut_node40(&cdata, NULL);
56514 +}
56515 +
56516 +/* something was moved between @left and @right. Post COP_UPDATE for @right
56517 +   to @info so that carry updates delimiting key between them. 0 or -errno */
56518 +static int
56519 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
56520 +{
56521 +       carry_op *op;
56522 +       carry_node *cn;
56523 +
56524 +       if (info == NULL)
56525 +               /* nowhere to send operation to. */
56526 +               return 0;
56527 +
56528 +       if (!should_notify_parent(right))
56529 +               return 0;
56530 +
56531 +       op = node_post_carry(info, COP_UPDATE, right, 1);
56532 +       if (IS_ERR(op) || op == NULL)
56533 +               return op ? PTR_ERR(op) : -EIO;
56534 +       /* if @left is given, attach a carry node for it to the operation */
56535 +       if (left != NULL) {
56536 +               carry_node *reference;
56537 +
56538 +               if (info->doing)
56539 +                       reference = insert_carry_node(info->doing,
56540 +                                                     info->todo, left);
56541 +               else
56542 +                       reference = op->node;
56543 +               assert("nikita-2992", reference != NULL);
56544 +               cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
56545 +               if (IS_ERR(cn))
56546 +                       return PTR_ERR(cn);
56547 +               cn->parent = 1;
56548 +               cn->node = left;
56549 +               if (ZF_ISSET(left, JNODE_ORPHAN))
56550 +                       cn->left_before = 1;
56551 +               op->u.update.left = cn;
56552 +       } else
56553 +               op->u.update.left = NULL;
56554 +       return 0;
56555 +}
56556 +
56557 +/* plugin->u.node.prepare_removal
56558 +   post COP_DELETE carry operation to @info to have the pointer to @empty
56559 +   removed from the tree. Returns 0, or error if posting failed */
56560 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
56561 +{
56562 +       carry_op *op;
56563 +       reiser4_tree *tree;
56564 +
56565 +       if (!should_notify_parent(empty))
56566 +               return 0;
56567 +       /* removal was already requested for this node */
56568 +       if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
56569 +               return 0;
56570 +       op = node_post_carry(info, COP_DELETE, empty, 1);
56571 +       if (IS_ERR(op) || op == NULL)
56572 +               return RETERR(op ? PTR_ERR(op) : -EIO);
56573 +
56574 +       op->u.delete.child = NULL;
56575 +       op->u.delete.flags = 0;
56576 +       /* collapse key range of @empty: its left delimiting key is set to its
56577 +          right delimiting key, which left neighbor (if any) inherits */
56578 +       tree = znode_get_tree(empty);
56579 +       read_lock_tree(tree);
56580 +       write_lock_dk(tree);
56581 +       znode_set_ld_key(empty, znode_get_rd_key(empty));
56582 +       if (znode_is_left_connected(empty) && empty->left)
56583 +               znode_set_rd_key(empty->left, znode_get_rd_key(empty));
56584 +       write_unlock_dk(tree);
56585 +       read_unlock_tree(tree);
56586 +       /* mark the node so that its removal is not requested again */
56587 +       ZF_SET(empty, JNODE_HEARD_BANSHEE);
56588 +       return 0;
56589 +}
56590 +
56591 +/* something was shifted from @insert_coord->node to @shift->target, update
56592 +   @insert_coord accordingly. @removed: number of items shifted entirely */
56593 +static void
56594 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
56595 +            int including_insert_coord)
56596 +{
56597 +       /* item plugin was invalidated by shifting */
56598 +       coord_clear_iplug(insert_coord);
56599 +
56600 +       if (node_is_empty(shift->wish_stop.node)) {
56601 +               assert("vs-242", shift->everything);
56602 +               if (including_insert_coord) {
56603 +                       if (shift->pend == SHIFT_RIGHT) {
56604 +                               /* set @insert_coord before first unit of
56605 +                                  @shift->target node */
56606 +                               coord_init_before_first_item(insert_coord,
56607 +                                                            shift->target);
56608 +                       } else {
56609 +                               /* set @insert_coord after last in target node */
56610 +                               coord_init_after_last_item(insert_coord,
56611 +                                                          shift->target);
56612 +                       }
56613 +               } else {
56614 +                       /* set @insert_coord inside of empty node. There is
56615 +                          only one possible coord within an empty
56616 +                          node. init_first_unit will set that coord */
56617 +                       coord_init_first_unit(insert_coord,
56618 +                                             shift->wish_stop.node);
56619 +               }
56620 +               return;
56621 +       }
56622 +
56623 +       if (shift->pend == SHIFT_RIGHT) {
56624 +               /* there was shifting to right */
56625 +               if (shift->everything) {
56626 +                       /* everything wanted was shifted */
56627 +                       if (including_insert_coord) {
56628 +                               /* @insert_coord is set before first unit of
56629 +                                  @shift->target node */
56630 +                               coord_init_before_first_item(insert_coord,
56631 +                                                            shift->target);
56632 +                               insert_coord->between = BEFORE_UNIT;
56633 +                       } else {
56634 +                               /* @insert_coord is set after last unit of
56635 +                                  @shift->wish_stop.node */
56636 +                               coord_init_last_unit(insert_coord,
56637 +                                                    shift->wish_stop.node);
56638 +                               insert_coord->between = AFTER_UNIT;
56639 +                       }
56640 +               }
56641 +               return;
56642 +       }
56643 +
56644 +       /* there was shifting to left */
56645 +       if (shift->everything) {
56646 +               /* everything wanted was shifted */
56647 +               if (including_insert_coord) {
56648 +                       /* @insert_coord is set after last unit in target node */
56649 +                       coord_init_after_last_item(insert_coord, shift->target);
56650 +               } else {
56651 +                       /* @insert_coord is set before first unit in the same
56652 +                          node */
56653 +                       coord_init_before_first_item(insert_coord,
56654 +                                                    shift->wish_stop.node);
56655 +               }
56656 +               return;
56657 +       }
56658 +
56659 +       /* FIXME-VS: the code below is complicated because with between ==
56660 +          AFTER_ITEM unit_pos is set to 0 */
56661 +
56662 +       if (!removed) {
56663 +               /* no items were shifted entirely */
56664 +               assert("vs-195", shift->merging_units == 0
56665 +                      || shift->part_units == 0);
56666 +
56667 +               if (shift->real_stop.item_pos == insert_coord->item_pos) {
56668 +                       if (shift->merging_units) {
56669 +                               if (insert_coord->between == AFTER_UNIT) {
56670 +                                       assert("nikita-1441",
56671 +                                              insert_coord->unit_pos >=
56672 +                                              shift->merging_units);
56673 +                                       insert_coord->unit_pos -=
56674 +                                           shift->merging_units;
56675 +                               } else if (insert_coord->between == BEFORE_UNIT) {
56676 +                                       assert("nikita-2090",
56677 +                                              insert_coord->unit_pos >
56678 +                                              shift->merging_units);
56679 +                                       insert_coord->unit_pos -=
56680 +                                           shift->merging_units;
56681 +                               }
56682 +
56683 +                               assert("nikita-2083",
56684 +                                      insert_coord->unit_pos + 1);
56685 +                       } else {
56686 +                               if (insert_coord->between == AFTER_UNIT) {
56687 +                                       assert("nikita-1442",
56688 +                                              insert_coord->unit_pos >=
56689 +                                              shift->part_units);
56690 +                                       insert_coord->unit_pos -=
56691 +                                           shift->part_units;
56692 +                               } else if (insert_coord->between == BEFORE_UNIT) {
56693 +                                       assert("nikita-2089",
56694 +                                              insert_coord->unit_pos >
56695 +                                              shift->part_units);
56696 +                                       insert_coord->unit_pos -=
56697 +                                           shift->part_units;
56698 +                               }
56699 +
56700 +                               assert("nikita-2084",
56701 +                                      insert_coord->unit_pos + 1);
56702 +                       }
56703 +               }
56704 +               return;
56705 +       }
56706 +
56707 +       /* we shifted to left and there was not enough space for everything */
56708 +       switch (insert_coord->between) {
56709 +       case AFTER_UNIT:
56710 +       case BEFORE_UNIT:
56711 +               if (shift->real_stop.item_pos == insert_coord->item_pos)
56712 +                       insert_coord->unit_pos -= shift->part_units;
56713 +       case AFTER_ITEM:        /* the two *_UNIT cases above fall through */
56714 +               coord_add_item_pos(insert_coord, -removed);
56715 +               break;
56716 +       default:
56717 +               impossible("nikita-2087", "not ready");
56718 +       }
56719 +       assert("nikita-2085", insert_coord->unit_pos + 1);
56720 +}
56721 +
56722 +static int call_shift_hooks(struct shift_params *shift)
56723 +{
56724 +       unsigned i, shifted;
56725 +       coord_t coord;
56726 +       item_plugin *iplug;
56727 +       /* call ->b.shift_hook, if any, of each item touched by the shift */
56728 +       assert("vs-275", !node_is_empty(shift->target));
56729 +
56730 +       /* number of items shift touches */
56731 +       shifted =
56732 +           shift->entire + (shift->merging_units ? 1 : 0) +
56733 +           (shift->part_units ? 1 : 0);
56734 +
56735 +       if (shift->pend == SHIFT_LEFT) {
56736 +               /* moved items are at the end */
56737 +               coord_init_last_unit(&coord, shift->target);
56738 +               coord.unit_pos = 0;
56739 +
56740 +               assert("vs-279", shift->pend == 1);
56741 +               for (i = 0; i < shifted; i++) {
56742 +                       unsigned from, count;
56743 +                       /* @from, @count: unit range passed to ->shift_hook */
56744 +                       iplug = item_plugin_by_coord(&coord);
56745 +                       if (i == 0 && shift->part_units) {
56746 +                               assert("vs-277",
56747 +                                      coord_num_units(&coord) ==
56748 +                                      shift->part_units);
56749 +                               count = shift->part_units;
56750 +                               from = 0;
56751 +                       } else if (i == shifted - 1 && shift->merging_units) {
56752 +                               count = shift->merging_units;
56753 +                               from = coord_num_units(&coord) - count;
56754 +                       } else {
56755 +                               count = coord_num_units(&coord);
56756 +                               from = 0;
56757 +                       }
56758 +
56759 +                       if (iplug->b.shift_hook) {
56760 +                               iplug->b.shift_hook(&coord, from, count,
56761 +                                                   shift->wish_stop.node);
56762 +                       }
56763 +                       coord_add_item_pos(&coord, -shift->pend);
56764 +               }
56765 +       } else {
56766 +               /* moved items are at the beginning */
56767 +               coord_init_first_unit(&coord, shift->target);
56768 +
56769 +               assert("vs-278", shift->pend == -1);
56770 +               for (i = 0; i < shifted; i++) {
56771 +                       unsigned from, count;
56772 +                       /* @from, @count: unit range passed to ->shift_hook */
56773 +                       iplug = item_plugin_by_coord(&coord);
56774 +                       if (i == 0 && shift->part_units) {
56775 +                               assert("vs-277",
56776 +                                      coord_num_units(&coord) ==
56777 +                                      shift->part_units);
56778 +                               count = coord_num_units(&coord);
56779 +                               from = 0;
56780 +                       } else if (i == shifted - 1 && shift->merging_units) {
56781 +                               count = shift->merging_units;
56782 +                               from = 0;
56783 +                       } else {
56784 +                               count = coord_num_units(&coord);
56785 +                               from = 0;
56786 +                       }
56787 +
56788 +                       if (iplug->b.shift_hook) {
56789 +                               iplug->b.shift_hook(&coord, from, count,
56790 +                                                   shift->wish_stop.node);
56791 +                       }
56792 +                       coord_add_item_pos(&coord, -shift->pend);
56793 +               }
56794 +       }
56795 +
56796 +       return 0;
56797 +}
56798 +
56799 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
56800 +static int
56801 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
56802 +{
56803 +       assert("vs-944", shift->real_stop.node == old->node);
56804 +       /* @old was moved unless it is to the right of @shift->real_stop */
56805 +       if (shift->real_stop.item_pos < old->item_pos)
56806 +               return 0;
56807 +       if (shift->real_stop.item_pos == old->item_pos) {
56808 +               if (shift->real_stop.unit_pos < old->unit_pos)
56809 +                       return 0;
56810 +       }
56811 +       return 1;
56812 +}
56813 +
56814 +/* shift to right is completed. Return 1 if unit @old was moved to right
56815 +   neighbor */
56816 +static int
56817 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
56818 +{
56819 +       assert("vs-944", shift->real_stop.node == old->node);
56820 +       /* @old was moved unless it is to the left of @shift->real_stop */
56821 +       if (shift->real_stop.item_pos > old->item_pos)
56822 +               return 0;
56823 +       if (shift->real_stop.item_pos == old->item_pos) {
56824 +               if (shift->real_stop.unit_pos > old->unit_pos)
56825 +                       return 0;
56826 +       }
56827 +       return 1;
56828 +}
56829 +
56830 +/* coord @old was set in a node participating in the shift described by
56831 +   @shift. Compute into @new where @old points after the shift; return @new */
56832 +static coord_t *adjust_coord2(const struct shift_params *shift,
56833 +                             const coord_t * old, coord_t * new)
56834 +{
56835 +       coord_clear_iplug(new);
56836 +       new->between = old->between;
56837 +
56838 +       /* coords set in target node are affected by shift to right only */
56839 +       if (old->node == shift->target) {
56840 +               if (shift->pend == SHIFT_LEFT) {
56841 +                       /* coord which is set inside of left neighbor does not
56842 +                          change during shift to left */
56843 +                       coord_dup(new, old);
56844 +                       return new;
56845 +               }
56846 +               new->node = old->node;
56847 +               coord_set_item_pos(new,
56848 +                                  old->item_pos + shift->entire +
56849 +                                  (shift->part_units ? 1 : 0));
56850 +               new->unit_pos = old->unit_pos;
56851 +               if (old->item_pos == 0 && shift->merging_units)
56852 +                       new->unit_pos += shift->merging_units;
56853 +               return new;
56854 +       }
56855 +
56856 +       assert("vs-977", old->node == shift->wish_stop.node);
56857 +       if (shift->pend == SHIFT_LEFT) {
56858 +               if (unit_moved_left(shift, old)) {
56859 +                       /* unit @old moved to left neighbor. Calculate its
56860 +                          coordinate there */
56861 +                       new->node = shift->target;
56862 +                       coord_set_item_pos(new,
56863 +                                          node_num_items(shift->target) -
56864 +                                          shift->entire -
56865 +                                          (shift->part_units ? 1 : 0) +
56866 +                                          old->item_pos);
56867 +
56868 +                       new->unit_pos = old->unit_pos;
56869 +                       if (shift->merging_units) {
56870 +                               coord_dec_item_pos(new);
56871 +                               if (old->item_pos == 0) {
56872 +                                       /* unit_pos only changes if item got
56873 +                                          merged */
56874 +                                       new->unit_pos =
56875 +                                           coord_num_units(new) -
56876 +                                           (shift->merging_units -
56877 +                                            old->unit_pos);
56878 +                               }
56879 +                       }
56880 +               } else {
56881 +                       /* unit @old did not move to left neighbor.
56882 +
56883 +                          Use _nocheck, because @old is outside of its node.
56884 +                        */
56885 +                       coord_dup_nocheck(new, old);
56886 +                       coord_add_item_pos(new,
56887 +                                          -shift->u.future_first.item_pos);
56888 +                       if (new->item_pos == 0)
56889 +                               new->unit_pos -= shift->u.future_first.unit_pos;
56890 +               }
56891 +       } else {
56892 +               if (unit_moved_right(shift, old)) {
56893 +                       /* unit @old moved to right neighbor */
56894 +                       new->node = shift->target;
56895 +                       coord_set_item_pos(new,
56896 +                                          old->item_pos -
56897 +                                          shift->real_stop.item_pos);
56898 +                       /* unit pos changes only within the item containing
56899 +                          @shift->real_stop: units before it stayed behind */
56900 +                       new->unit_pos = old->unit_pos;
56901 +                       if (new->item_pos == 0) {
56902 +                               new->unit_pos -= shift->real_stop.unit_pos;
56903 +                       }
56904 +               } else {
56905 +                       /* unit @old did not move to right neighbor, therefore
56906 +                          it did not change */
56907 +                       coord_dup(new, old);
56908 +               }
56909 +       }
56910 +       coord_set_iplug(new, item_plugin_by_coord(new));
56911 +       return new;
56912 +}
56913 +
56914 +/* this is called when shift is completed (something of source node is copied
56915 +   to target and deleted in source) to re-point, via adjust_coord2, all taps
56916 +   of current context which are set to source or target node */
56917 +static void update_taps(const struct shift_params *shift)
56918 +{
56919 +       tap_t *tap;
56920 +       coord_t new;
56921 +
56922 +       for_all_taps(tap) {
56923 +               /* update only taps set to nodes participating in shift */
56924 +               if (tap->coord->node == shift->wish_stop.node
56925 +                   || tap->coord->node == shift->target)
56926 +                       tap_to_coord(tap,
56927 +                                    adjust_coord2(shift, tap->coord, &new));
56928 +       }
56929 +}
56930 +
56931 +#if REISER4_DEBUG
56932 +
56933 +struct shift_check {           /* what shift_check_prepare records per item */
56934 +       reiser4_key key;        /* key copied from item header */
56935 +       __u16 plugin_id;        /* plugin id read from item header */
56936 +       union {
56937 +               __u64 bytes;    /* CTAIL_ID, FORMATTING_ID, EXTENT_POINTER_ID */
56938 +               __u64 entries;  /* COMPOUND_DIR_ID */
56939 +               void *unused;   /* any other plugin id */
56940 +       } u;
56941 +};
56942 +
56943 +void *shift_check_prepare(const znode * left, const znode * right)
56944 +{
56945 +       pos_in_node_t i, nr_items;
56946 +       int mergeable;
56947 +       struct shift_check *data;
56948 +       item_header40 *ih;
56949 +       /* record items of @left and @right; NULL is returned if kmalloc failed */
56950 +       if (node_is_empty(left) || node_is_empty(right))
56951 +               mergeable = 0;
56952 +       else {
56953 +               coord_t l, r;
56954 +
56955 +               coord_init_last_unit(&l, left);
56956 +               coord_init_first_unit(&r, right);
56957 +               mergeable = are_items_mergeable(&l, &r);
56958 +       }
56959 +       nr_items =
56960 +           node40_num_of_items_internal(left) +
56961 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
56962 +       data =
56963 +               kmalloc(sizeof(struct shift_check) * nr_items,
56964 +                       reiser4_ctx_gfp_mask_get());
56965 +       if (data != NULL) {
56966 +               coord_t coord;
56967 +               pos_in_node_t item_pos;
56968 +
56969 +               coord_init_first_unit(&coord, left);
56970 +               i = 0;
56971 +               /* one record per item of @left */
56972 +               for (item_pos = 0;
56973 +                    item_pos < node40_num_of_items_internal(left);
56974 +                    item_pos++) {
56975 +
56976 +                       coord_set_item_pos(&coord, item_pos);
56977 +                       ih = node40_ih_at_coord(&coord);
56978 +
56979 +                       data[i].key = ih->key;
56980 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
56981 +                       switch (data[i].plugin_id) {
56982 +                       case CTAIL_ID:
56983 +                       case FORMATTING_ID:
56984 +                               data[i].u.bytes = coord_num_units(&coord);
56985 +                               break;
56986 +                       case EXTENT_POINTER_ID:
56987 +                               data[i].u.bytes =
56988 +                                       reiser4_extent_size(&coord,
56989 +                                                      coord_num_units(&coord));
56990 +                               break;
56991 +                       case COMPOUND_DIR_ID:
56992 +                               data[i].u.entries = coord_num_units(&coord);
56993 +                               break;
56994 +                       default:
56995 +                               data[i].u.unused = NULL;
56996 +                               break;
56997 +                       }
56998 +                       i++;
56999 +               }
57000 +
57001 +               coord_init_first_unit(&coord, right);
57002 +               /* mergeable first item of @right is added to last record of @left */
57003 +               if (mergeable) {
57004 +                       assert("vs-1609", i != 0);
57005 +
57006 +                       ih = node40_ih_at_coord(&coord);
57007 +
57008 +                       assert("vs-1589",
57009 +                              data[i - 1].plugin_id ==
57010 +                              le16_to_cpu(get_unaligned(&ih->plugin_id)));
57011 +                       switch (data[i - 1].plugin_id) {
57012 +                       case CTAIL_ID:
57013 +                       case FORMATTING_ID:
57014 +                               data[i - 1].u.bytes += coord_num_units(&coord);
57015 +                               break;
57016 +                       case EXTENT_POINTER_ID:
57017 +                               data[i - 1].u.bytes +=
57018 +                                   reiser4_extent_size(&coord,
57019 +                                               coord_num_units(&coord));
57020 +                               break;
57021 +                       case COMPOUND_DIR_ID:
57022 +                               data[i - 1].u.entries +=
57023 +                                   coord_num_units(&coord);
57024 +                               break;
57025 +                       default:
57026 +                               impossible("vs-1605", "wrong mergeable item");
57027 +                               break;
57028 +                       }
57029 +                       item_pos = 1;
57030 +               } else
57031 +                       item_pos = 0;
57032 +               for (; item_pos < node40_num_of_items_internal(right);
57033 +                    item_pos++) {
57034 +                       /* one record per remaining item of @right */
57035 +                       assert("vs-1604", i < nr_items);
57036 +                       coord_set_item_pos(&coord, item_pos);
57037 +                       ih = node40_ih_at_coord(&coord);
57038 +
57039 +                       data[i].key = ih->key;
57040 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
57041 +                       switch (data[i].plugin_id) {
57042 +                       case CTAIL_ID:
57043 +                       case FORMATTING_ID:
57044 +                               data[i].u.bytes = coord_num_units(&coord);
57045 +                               break;
57046 +                       case EXTENT_POINTER_ID:
57047 +                               data[i].u.bytes =
57048 +                                   reiser4_extent_size(&coord,
57049 +                                               coord_num_units(&coord));
57050 +                               break;
57051 +                       case COMPOUND_DIR_ID:
57052 +                               data[i].u.entries = coord_num_units(&coord);
57053 +                               break;
57054 +                       default:
57055 +                               data[i].u.unused = NULL;
57056 +                               break;
57057 +                       }
57058 +                       i++;
57059 +               }
57060 +               assert("vs-1606", i == nr_items);
57061 +       }
57062 +       return data;
57063 +}
57064 +/* assert that items of @left, then of @right, match snapshot @vp (no-op if @vp is NULL); frees @vp */
57065 +void shift_check(void *vp, const znode * left, const znode * right)
57066 +{
57067 +       pos_in_node_t i, nr_items;
57068 +       coord_t coord;
57069 +       __u64 last_bytes;       /* size of last item of @left when it merges with first item of @right */
57070 +       int mergeable;
57071 +       item_header40 *ih;
57072 +       pos_in_node_t item_pos;
57073 +       struct shift_check *data;
57074 +
57075 +       data = (struct shift_check *)vp;
57076 +
57077 +       if (data == NULL)
57078 +               return;
57079 +
57080 +       if (node_is_empty(left) || node_is_empty(right))
57081 +               mergeable = 0;
57082 +       else {
57083 +               coord_t l, r;
57084 +
57085 +               coord_init_last_unit(&l, left);
57086 +               coord_init_first_unit(&r, right);
57087 +               mergeable = are_items_mergeable(&l, &r);
57088 +       }
57089 +       /* a mergeable boundary pair is counted as one item */
57090 +       nr_items =
57091 +           node40_num_of_items_internal(left) +
57092 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
57093 +
57094 +       i = 0;
57095 +       last_bytes = 0;
57096 +
57097 +       coord_init_first_unit(&coord, left);
57098 +
57099 +       for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
57100 +            item_pos++) {
57101 +
57102 +               coord_set_item_pos(&coord, item_pos);
57103 +               ih = node40_ih_at_coord(&coord);
57104 +
57105 +               assert("vs-1611", i == item_pos);
57106 +               assert("vs-1590", keyeq(&ih->key, &data[i].key));
57107 +               assert("vs-1591",
57108 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57109 +               if ((i < (node40_num_of_items_internal(left) - 1))
57110 +                   || !mergeable) {
57111 +                       switch (data[i].plugin_id) {
57112 +                       case CTAIL_ID:
57113 +                       case FORMATTING_ID:
57114 +                               assert("vs-1592",
57115 +                                      data[i].u.bytes ==
57116 +                                      coord_num_units(&coord));
57117 +                               break;
57118 +                       case EXTENT_POINTER_ID:
57119 +                               assert("vs-1593",
57120 +                                      data[i].u.bytes ==
57121 +                                      reiser4_extent_size(&coord,
57122 +                                                          coord_num_units
57123 +                                                          (&coord)));
57124 +                               break;
57125 +                       case COMPOUND_DIR_ID:
57126 +                               assert("vs-1594",
57127 +                                      data[i].u.entries ==
57128 +                                      coord_num_units(&coord));
57129 +                               break;
57130 +                       default:
57131 +                               break;
57132 +                       }
57133 +               }
57134 +               if (item_pos == (node40_num_of_items_internal(left) - 1)
57135 +                   && mergeable) {
57136 +                       switch (data[i].plugin_id) {
57137 +                       case CTAIL_ID:
57138 +                       case FORMATTING_ID:
57139 +                               last_bytes = coord_num_units(&coord);
57140 +                               break;
57141 +                       case EXTENT_POINTER_ID:
57142 +                               last_bytes =
57143 +                                   reiser4_extent_size(&coord,
57144 +                                               coord_num_units(&coord));
57145 +                               break;
57146 +                       case COMPOUND_DIR_ID:
57147 +                               last_bytes = coord_num_units(&coord);
57148 +                               break;
57149 +                       default:
57150 +                               impossible("vs-1595", "wrong mergeable item");
57151 +                               break;
57152 +                       }
57153 +               }
57154 +               i++;
57155 +       }
57156 +       /* when mergeable, first item of @right is checked together with last item of @left */
57157 +       coord_init_first_unit(&coord, right);
57158 +       if (mergeable) {
57159 +               ih = node40_ih_at_coord(&coord);
57160 +
57161 +               assert("vs-1589",
57162 +                      data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
57163 +               assert("vs-1608", last_bytes != 0);
57164 +               switch (data[i - 1].plugin_id) {
57165 +               case CTAIL_ID:
57166 +               case FORMATTING_ID:
57167 +                       assert("vs-1596",
57168 +                              data[i - 1].u.bytes ==
57169 +                              last_bytes + coord_num_units(&coord));
57170 +                       break;
57171 +
57172 +               case EXTENT_POINTER_ID:
57173 +                       assert("vs-1597",
57174 +                              data[i - 1].u.bytes ==
57175 +                              last_bytes + reiser4_extent_size(&coord,
57176 +                                                               coord_num_units
57177 +                                                               (&coord)));
57178 +                       break;
57179 +
57180 +               case COMPOUND_DIR_ID:
57181 +                       assert("vs-1598",
57182 +                              data[i - 1].u.entries ==
57183 +                              last_bytes + coord_num_units(&coord));
57184 +                       break;
57185 +               default:
57186 +                       impossible("vs-1599", "wrong mergeable item");
57187 +                       break;
57188 +               }
57189 +               item_pos = 1;
57190 +       } else
57191 +               item_pos = 0;
57192 +
57193 +       for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
57194 +
57195 +               coord_set_item_pos(&coord, item_pos);
57196 +               ih = node40_ih_at_coord(&coord);
57197 +
57198 +               assert("vs-1612", keyeq(&ih->key, &data[i].key));
57199 +               assert("vs-1613",
57200 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
57201 +               switch (data[i].plugin_id) {
57202 +               case CTAIL_ID:
57203 +               case FORMATTING_ID:
57204 +                       assert("vs-1600",
57205 +                              data[i].u.bytes == coord_num_units(&coord));
57206 +                       break;
57207 +               case EXTENT_POINTER_ID:
57208 +                       assert("vs-1601",
57209 +                              data[i].u.bytes ==
57210 +                              reiser4_extent_size(&coord,
57211 +                                                  coord_num_units
57212 +                                                  (&coord)));
57213 +                       break;
57214 +               case COMPOUND_DIR_ID:
57215 +                       assert("vs-1602",
57216 +                              data[i].u.entries == coord_num_units(&coord));
57217 +                       break;
57218 +               default:
57219 +                       break;
57220 +               }
57221 +               i++;
57222 +       }
57223 +
57224 +       assert("vs-1603", i == nr_items);
57225 +       kfree(data);
57226 +}
57227 +
57228 +#endif
57229 +
57230 +/* plugin->u.node.shift: shift units from @from->node into @to in direction @pend. Returns number of bytes
57231 +   shifted (0 if none), or non-zero status of a failed prepare_*() call; see also plugin/node/node.h */
57232 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
57233 +                int delete_child, /* if @from->node becomes empty - it will be deleted from the tree if this is set to 1 */
57234 +                int including_stop_coord, carry_plugin_info * info)
57235 +{
57236 +       struct shift_params shift;
57237 +       int result;
57238 +       znode *left, *right;
57239 +       znode *source;
57240 +       int target_empty;
57241 +
57242 +       assert("nikita-2161", coord_check(from));
57243 +
57244 +       memset(&shift, 0, sizeof(shift));
57245 +       shift.pend = pend;
57246 +       shift.wish_stop = *from;
57247 +       shift.target = to;
57248 +
57249 +       assert("nikita-1473", znode_is_write_locked(from->node));
57250 +       assert("nikita-1474", znode_is_write_locked(to));
57251 +
57252 +       source = from->node;
57253 +
57254 +       /* set @shift.wish_stop to rightmost/leftmost unit among units we want
57255 +          shifted */
57256 +       if (pend == SHIFT_LEFT) {
57257 +               result = coord_set_to_left(&shift.wish_stop);
57258 +               left = to;
57259 +               right = from->node;
57260 +       } else {
57261 +               result = coord_set_to_right(&shift.wish_stop);
57262 +               left = from->node;
57263 +               right = to;
57264 +       }
57265 +
57266 +       if (result) {
57267 +               /* move insertion coord even if there is nothing to move */
57268 +               if (including_stop_coord) {
57269 +                       /* move insertion coord (@from) */
57270 +                       if (pend == SHIFT_LEFT) {
57271 +                               /* after last item in target node */
57272 +                               coord_init_after_last_item(from, to);
57273 +                       } else {
57274 +                               /* before first item in target node */
57275 +                               coord_init_before_first_item(from, to);
57276 +                       }
57277 +               }
57278 +
57279 +               if (delete_child && node_is_empty(shift.wish_stop.node))
57280 +                       result =
57281 +                           prepare_removal_node40(shift.wish_stop.node, info);
57282 +               else
57283 +                       result = 0;
57284 +               /* there is nothing to shift */
57285 +               assert("nikita-2078", coord_check(from));
57286 +               return result;
57287 +       }
57288 +
57289 +       target_empty = node_is_empty(to);
57290 +
57291 +       /* when first node plugin with item body compression is implemented,
57292 +          this must be changed to call node specific plugin */
57293 +
57294 +       /* shift->stop_coord is updated to last unit which really will be
57295 +          shifted */
57296 +       estimate_shift(&shift, get_current_context());
57297 +       if (!shift.shift_bytes) {
57298 +               /* we could not shift anything */
57299 +               assert("nikita-2079", coord_check(from));
57300 +               return 0;
57301 +       }
57302 +
57303 +       copy(&shift);
57304 +
57305 +       /* result value of this is important. It is used by adjust_coord below */
57306 +       result = delete_copied(&shift);
57307 +
57308 +       assert("vs-1610", result >= 0);
57309 +       assert("vs-1471",
57310 +              ((reiser4_context *) current->journal_info)->magic ==
57311 +              context_magic);
57312 +
57313 +       /* item which has been moved from one node to another might want to do
57314 +          something on that event. This can be done by item's shift_hook
57315 +          method, which will be now called for every moved items */
57316 +       call_shift_hooks(&shift);
57317 +
57318 +       assert("vs-1472",
57319 +              ((reiser4_context *) current->journal_info)->magic ==
57320 +              context_magic);
57321 +
57322 +       update_taps(&shift);
57323 +
57324 +       assert("vs-1473",
57325 +              ((reiser4_context *) current->journal_info)->magic ==
57326 +              context_magic);
57327 +
57328 +       /* adjust @from pointer in accordance with @including_stop_coord flag
57329 +          and amount of data which was really shifted */
57330 +       adjust_coord(from, &shift, result, including_stop_coord);
57331 +
57332 +       if (target_empty)
57333 +               /* items were shifted into empty node. Update delimiting key. */
57334 +               result = prepare_for_update(NULL, left, info);
57335 +       else
57336 +               result = 0;
57337 +       /* add update operation to @info, which is the list of operations to be
57338 +          performed on a higher level, unless the previous update already failed */
57339 +       if (!result)
57340 +               result = prepare_for_update(left, right, info);
57341 +       if (!result && node_is_empty(source) && delete_child) {
57342 +               /* all contents of @from->node is moved to @to and @from->node
57343 +                  has to be removed from the tree, so, on higher level we
57344 +                  will be removing the pointer to node @from->node */
57345 +               result = prepare_removal_node40(source, info);
57346 +       }
57347 +       assert("nikita-2080", coord_check(from));
57348 +       return result ? result : (int)shift.shift_bytes;
57349 +}
57350 +
57351 +/* plugin->u.node.fast_insert(): unconditionally returns 1, @coord is not examined;
57352 +   look for description of this method in plugin/node/node.h */
57353 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57354 +{
57355 +       return 1;
57356 +}
57357 +
57358 +/* plugin->u.node.fast_paste(): unconditionally returns 1, @coord is not examined;
57359 +   look for description of this method in plugin/node/node.h */
57360 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57361 +{
57362 +       return 1;
57363 +}
57364 +
57365 +/* plugin->u.node.fast_cut(): unconditionally returns 1, @coord is not examined;
57366 +   look for description of this method in plugin/node/node.h */
57367 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
57368 +{
57369 +       return 1;
57370 +}
57371 +
57372 +/* plugin->u.node.modify - not defined */
57373 +
57374 +/* plugin->u.node.max_item_size: block size of current super block minus node header and one item header */
57375 +int max_item_size_node40(void)
57376 +{
57377 +       const size_t overhead = sizeof(node40_header) + sizeof(item_header40);
57378 +       return reiser4_get_current_sb()->s_blocksize - overhead;
57379 +}
57380 +
57381 +/* plugin->u.node.set_item_plugin: store @id (little-endian) in item header at @coord and in @coord->iplugid; returns 0 */
57382 +int set_item_plugin_node40(coord_t *coord, item_id id)
57383 +{
57384 +       item_header40 *header = node40_ih_at_coord(coord);
57385 +
57386 +       put_unaligned(cpu_to_le16(id), &header->plugin_id);
57387 +       coord->iplugid = id;
57388 +
57389 +       return 0;
57390 +}
57391 +
57392 +/*
57393 +   Local variables:
57394 +   c-indentation-style: "K&R"
57395 +   mode-name: "LC"
57396 +   c-basic-offset: 8
57397 +   tab-width: 8
57398 +   fill-column: 120
57399 +   scroll-step: 1
57400 +   End:
57401 +*/
57402 diff -puN /dev/null fs/reiser4/plugin/node/node40.h
57403 --- /dev/null
57404 +++ a/fs/reiser4/plugin/node/node40.h
57405 @@ -0,0 +1,125 @@
57406 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57407 +
57408 +#if !defined( __REISER4_NODE40_H__ )
57409 +#define __REISER4_NODE40_H__
57410 +
57411 +#include "../../forward.h"
57412 +#include "../../dformat.h"
57413 +#include "node.h"
57414 +
57415 +#include <linux/types.h>
57416 +
57417 +/* format of node header for 40 node layouts. Keep bloat out of this struct.  */
57418 +typedef struct node40_header {
57419 +       /* identifier of node plugin. Must be located at the very beginning
57420 +          of a node. */
57421 +       common_node_header common_header;       /* this is 16 bits */
57422 +       /* number of items. Should be first element in the node header,
57423 +          because we haven't yet finally decided whether it shouldn't go into
57424 +          common_header.
57425 +        */
57426 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
57427 + * node format at compile time, and it is this one, accesses do not function dereference when
57428 + * accessing these fields (and otherwise they do).  Probably 80% of users will only have one node format at a time throughout the life of reiser4.  */
57429 +       d16 nr_items;
57430 +       /* free space in node measured in bytes */
57431 +       d16 free_space;
57432 +       /* offset to start of free space in node */
57433 +       d16 free_space_start;
57434 +       /* for reiser4_fsck.  When information about what is a free
57435 +          block is corrupted, and we try to recover everything even
57436 +          if marked as freed, then old versions of data may
57437 +          duplicate newer versions, and this field allows us to
57438 +          restore the newer version.  Also useful for when users
57439 +          who don't have the new trashcan installed on their linux distro
57440 +          delete the wrong files and send us desperate emails
57441 +          offering $25 for them back.  */
57442 +       /* NOTE(review): no field follows the comment above; presumably it describes the flush stamp below - confirm */
57443 +       /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
57444 +       d32 magic;
57445 +       /* flushstamp is made of mk_id and write_counter. mk_id is an
57446 +          id generated randomly at mkreiserfs time. So we can just
57447 +          skip all nodes with different mk_id. write_counter is d64
57448 +          incrementing counter of writes on disk. It is used for
57449 +          choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
57450 +       /* NOTE(review): mk_id and write_counter above presumably are the mkfs_id and flush_id fields below - confirm */
57451 +       d32 mkfs_id;
57452 +       d64 flush_id;
57453 +       /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
57454 +          and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
57455 +       d16 flags;
57456 +
57457 +       /* 1 is leaf level, 2 is twig level, root is the numerically
57458 +          largest level */
57459 +       d8 level;
57460 +
57461 +       d8 pad;
57462 +} PACKED node40_header;
57463 +
57464 +/* item headers are not standard across all node layouts, pass
57465 +   pos_in_node to functions instead */
57466 +typedef struct item_header40 {
57467 +       /* key of item */
57468 +       /*  0 */ reiser4_key key;
57469 +       /* offset from start of a node measured in 8-byte chunks */
57470 +       /* 24 */ d16 offset;
57471 +       /* 26 */ d16 flags;
57472 +       /* 28 */ d16 plugin_id; /* id of item plugin, little-endian; leading numbers presumably are byte offsets of the fields */
57473 +} PACKED item_header40;
57474 +
57475 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
57476 +size_t free_space_node40(znode * node);
57477 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
57478 +                                lookup_bias bias, coord_t * coord);
57479 +int num_of_items_node40(const znode * node);
57480 +char *item_by_coord_node40(const coord_t * coord);
57481 +int length_by_coord_node40(const coord_t * coord);
57482 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
57483 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
57484 +size_t estimate_node40(znode * node);
57485 +int check_node40(const znode * node, __u32 flags, const char **error);
57486 +int parse_node40(znode * node);
57487 +int init_node40(znode * node);
57488 +#ifdef GUESS_EXISTS
57489 +int guess_node40(const znode * node);
57490 +#endif
57491 +void change_item_size_node40(coord_t * coord, int by);
57492 +int create_item_node40(coord_t * target, const reiser4_key * key,
57493 +                      reiser4_item_data * data, carry_plugin_info * info);
57494 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
57495 +                           carry_plugin_info * info);
57496 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
57497 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
57498 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
57499 +                /* if @from->node becomes
57500 +                   empty - it will be deleted from
57501 +                   the tree if this is set to 1
57502 +                 */
57503 +                int delete_child, int including_stop_coord,
57504 +                carry_plugin_info * info);
57505 +
57506 +int fast_insert_node40(const coord_t * coord);
57507 +int fast_paste_node40(const coord_t * coord);
57508 +int fast_cut_node40(const coord_t * coord);
57509 +int max_item_size_node40(void);
57510 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
57511 +int set_item_plugin_node40(coord_t * coord, item_id id);
57512 +int shrink_item_node40(coord_t * coord, int delta);
57513 +
57514 +#if REISER4_DEBUG
57515 +void *shift_check_prepare(const znode *left, const znode *right);
57516 +void shift_check(void *vp, const znode *left, const znode *right);
57517 +#endif
57518 +
57519 +/* __REISER4_NODE40_H__ */
57520 +#endif
57521 +/*
57522 +   Local variables:
57523 +   c-indentation-style: "K&R"
57524 +   mode-name: "LC"
57525 +   c-basic-offset: 8
57526 +   tab-width: 8
57527 +   fill-column: 120
57528 +   scroll-step: 1
57529 +   End:
57530 +*/
57531 diff -puN /dev/null fs/reiser4/plugin/object.c
57532 --- /dev/null
57533 +++ a/fs/reiser4/plugin/object.c
57534 @@ -0,0 +1,531 @@
57535 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
57536 + * reiser4/README */
57537 +
57538 +/*
57539 + * Examples of object plugins: file, directory, symlink, special file.
57540 + *
57541 + * Plugins associated with inode:
57542 + *
57543 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
57544 + * stat-data. How we store this plugin in in-core inode is not
57545 + * important. Currently pointers are used, another variant is to store offsets
57546 + * and do array lookup on each access.
57547 + *
57548 + * Now, each inode has one selected plugin: object plugin that
57549 + * determines what type of file this object is: directory, regular etc.
57550 + *
57551 + * This main plugin can use other plugins that are thus subordinated to
57552 + * it. Directory instance of object plugin uses hash; regular file
57553 + * instance uses tail policy plugin.
57554 + *
57555 + * Object plugin is either taken from id in stat-data or guessed from
57556 + * i_mode bits. Once it is established we ask it to install its
57557 + * subordinate plugins, by looking again in stat-data or inheriting them
57558 + * from parent.
57559 + *
57560 + * How new inode is initialized during ->read_inode():
57561 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
57562 + *   i_generation, capabilities etc.
57563 + * 2 read plugin id from stat data or try to guess plugin id
57564 + *   from inode->i_mode bits if plugin id is missing.
57565 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
57566 + *
57567 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3?  What
57568 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
57569 + *
57570 + * 4 Call ->activate() method of object's plugin. Plugin is either read
57571 + *    from stat-data or guessed from mode bits.
57572 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
57573 + *    plugins from parent.
57574 + *
57575 + * Easy induction proves that on last step all plugins of inode would be
57576 + * initialized.
57577 + *
57578 + * When creating new object:
57579 + * 1 obtain object plugin id (see next period)
57580 + * NIKITA-FIXME-HANS: period?
57581 + * 2 ->install() this plugin
57582 + * 3 ->inherit() the rest from the parent
57583 + *
57584 + * We need some examples of creating an object with default and non-default
57585 + * plugin ids.  Nikita, please create them.
57586 + */
57587 +
57588 +#include "../inode.h"
57589 +
57590 +static int _bugop(void)
57591 +{
57592 +       BUG_ON(1);      /* unconditional: any call through this stub is treated as a bug */
57593 +       return 0;
57594 +}
57595 +/* _bugop as an untyped pointer, so it can initialize method slots of any signature */
57596 +#define bugop ((void *)_bugop)
57597 +
57598 +static int _dummyop(void)
57599 +{
57600 +       return 0;       /* does nothing and returns 0 */
57601 +}
57602 +/* _dummyop as an untyped pointer, so it can initialize method slots of any signature */
57603 +#define dummyop ((void *)_dummyop)
57604 +
57605 +/* ->change() of file plugins: set member @memb of @inode's plugin set to @plugin; PSET_FILE is refused with -EINVAL */
57606 +static int change_file(struct inode *inode, reiser4_plugin * plugin,
57607 +                      pset_member memb)
57608 +{
57609 +       if (memb != PSET_FILE)
57610 +               /* every other member is handed over to aset_set_unsafe() */
57611 +               return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
57612 +
57613 +       /* cannot change object plugin of already existing object */
57614 +       return RETERR(-EINVAL);
57615 +}
57616 +
57617 +static reiser4_plugin_ops file_plugin_ops = {
57618 +       .change = change_file   /* the only operation set; referenced as .pops by the file_plugins[] entries */
57619 +};
57620 +/* method tables with every method left NULL */
57621 +static struct inode_operations         null_i_ops = {.create = NULL};
57622 +static struct file_operations          null_f_ops = {.owner = NULL};
57623 +static struct address_space_operations null_a_ops = {.writepage = NULL};
57624 +
57625 +/* VFS methods for regular files; installed as .inode_ops/.file_ops/.as_ops of the UNIX_FILE_PLUGIN_ID entry of file_plugins[] */
57626 +static struct inode_operations regular_file_i_ops = {
57627 +       .permission = reiser4_permission_common,
57628 +       .setattr = reiser4_setattr,     /* not the _common variant used by the other inode tables */
57629 +       .getattr = reiser4_getattr_common
57630 +};
57631 +static struct file_operations regular_file_f_ops = {
57632 +       .llseek = generic_file_llseek,
57633 +       .read = reiser4_read_careful,
57634 +       .write = reiser4_write_careful,
57635 +       .aio_read = generic_file_aio_read,
57636 +       .ioctl = reiser4_ioctl_careful,
57637 +       .mmap = reiser4_mmap_careful,
57638 +       .open = reiser4_open_careful,
57639 +       .release = reiser4_release_careful,
57640 +       .fsync = reiser4_sync_file_common,
57641 +       .splice_read = generic_file_splice_read,
57642 +       .splice_write = generic_file_splice_write
57643 +};
57644 +static struct address_space_operations regular_file_a_ops = {
57645 +       .writepage = reiser4_writepage,
57646 +       .readpage = reiser4_readpage,
57647 +       .sync_page = block_sync_page,
57648 +       .writepages = reiser4_writepages,
57649 +       .set_page_dirty = reiser4_set_page_dirty,
57650 +       .readpages = reiser4_readpages,
57651 +       .prepare_write = reiser4_prepare_write,
57652 +       .commit_write = reiser4_commit_write,
57653 +       .bmap = reiser4_bmap_careful,
57654 +       .invalidatepage = reiser4_invalidatepage,
57655 +       .releasepage = reiser4_releasepage
57656 +};
57657 +
57658 +/* VFS methods for symlink files; installed as .inode_ops of the SYMLINK_FILE_PLUGIN_ID entry of file_plugins[] */
57659 +static struct inode_operations symlink_file_i_ops = {
57660 +       .readlink = generic_readlink,
57661 +       .follow_link = reiser4_follow_link_common,
57662 +       .permission = reiser4_permission_common,
57663 +       .setattr = reiser4_setattr_common,
57664 +       .getattr = reiser4_getattr_common
57665 +};
57666 +
57667 +/* VFS methods for special files: only permission, setattr and getattr are provided */
57668 +static struct inode_operations special_file_i_ops = {
57669 +       .permission = reiser4_permission_common,
57670 +       .setattr = reiser4_setattr_common,
57671 +       .getattr = reiser4_getattr_common
57672 +};
57673 +
57674 +/* VFS methods for directories; in directory_a_ops every method is bugop (BUG_ON on call) except writepages, which is the no-op dummyop */
57675 +static struct inode_operations directory_i_ops = {
57676 +       .create = reiser4_create_common,
57677 +       .lookup = reiser4_lookup_common,
57678 +       .link = reiser4_link_common,
57679 +       .unlink = reiser4_unlink_common,
57680 +       .symlink = reiser4_symlink_common,
57681 +       .mkdir = reiser4_mkdir_common,
57682 +       .rmdir = reiser4_unlink_common, /* same handler as .unlink */
57683 +       .mknod = reiser4_mknod_common,
57684 +       .rename = reiser4_rename_common,
57685 +       .permission = reiser4_permission_common,
57686 +       .setattr = reiser4_setattr_common,
57687 +       .getattr = reiser4_getattr_common
57688 +};
57689 +static struct file_operations directory_f_ops = {
57690 +       .llseek = reiser4_llseek_dir_common,
57691 +       .read = generic_read_dir,
57692 +       .readdir = reiser4_readdir_common,
57693 +       .release = reiser4_release_dir_common,
57694 +       .fsync = reiser4_sync_common
57695 +};
57696 +static struct address_space_operations directory_a_ops = {
57697 +       .writepage = bugop,
57698 +       .sync_page = bugop,
57699 +       .writepages = dummyop,
57700 +       .set_page_dirty = bugop,
57701 +       .readpages = bugop,
57702 +       .prepare_write = bugop,
57703 +       .commit_write = bugop,
57704 +       .bmap = bugop,
57705 +       .invalidatepage = bugop,
57706 +       .releasepage = bugop
57707 +};
57708 +
57709 +/*
57710 + * Definitions of object plugins.
57711 + */
57712 +
57713 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
57714 +       [UNIX_FILE_PLUGIN_ID] = {
57715 +               .h = {
57716 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
57717 +                       .id = UNIX_FILE_PLUGIN_ID,
57718 +                       .groups = (1 << REISER4_REGULAR_FILE),
57719 +                       .pops = &file_plugin_ops,
57720 +                       .label = "reg",
57721 +                       .desc = "regular file",
57722 +                       .linkage = {NULL, NULL},
57723 +               },
57724 +               /*
57725 +                * invariant vfs ops
57726 +                */
57727 +               .inode_ops = &regular_file_i_ops,
57728 +               .file_ops = &regular_file_f_ops,
57729 +               .as_ops = &regular_file_a_ops,
57730 +               /*
57731 +                * private i_ops
57732 +                */
57733 +               .setattr = setattr_unix_file,
57734 +               .open = open_unix_file,
57735 +               .read = read_unix_file,
57736 +               .write = write_unix_file,
57737 +               .ioctl = ioctl_unix_file,
57738 +               .mmap = mmap_unix_file,
57739 +               .release = release_unix_file,
57740 +               /*
57741 +                * private f_ops
57742 +                */
57743 +               .readpage = readpage_unix_file,
57744 +               .readpages = readpages_unix_file,
57745 +               .writepages = writepages_unix_file,
57746 +               .prepare_write = prepare_write_unix_file,
57747 +               .commit_write = commit_write_unix_file,
57748 +               /*
57749 +                * private a_ops
57750 +                */
57751 +               .bmap = bmap_unix_file,
57752 +               /*
57753 +                * other private methods
57754 +                */
57755 +               .write_sd_by_inode = write_sd_by_inode_common,
57756 +               .flow_by_inode = flow_by_inode_unix_file,
57757 +               .key_by_inode = key_by_inode_and_offset_common,
57758 +               .set_plug_in_inode = set_plug_in_inode_common,
57759 +               .adjust_to_parent = adjust_to_parent_common,
57760 +               .create_object = reiser4_create_object_common,
57761 +               .delete_object = delete_object_unix_file,
57762 +               .add_link = reiser4_add_link_common,
57763 +               .rem_link = reiser4_rem_link_common,
57764 +               .owns_item = owns_item_unix_file,
57765 +               .can_add_link = can_add_link_common,
57766 +               .detach = dummyop,
57767 +               .bind = dummyop,
57768 +               .safelink = safelink_common,
57769 +               .estimate = {
57770 +                       .create = estimate_create_common,
57771 +                       .update = estimate_update_common,
57772 +                       .unlink = estimate_unlink_common
57773 +               },
57774 +               .init_inode_data = init_inode_data_unix_file,
57775 +               .cut_tree_worker = cut_tree_worker_common,
57776 +               .wire = {
57777 +                       .write = wire_write_common,
57778 +                       .read = wire_read_common,
57779 +                       .get = wire_get_common,
57780 +                       .size = wire_size_common,
57781 +                       .done = wire_done_common
57782 +               }
57783 +       },
57784 +       [DIRECTORY_FILE_PLUGIN_ID] = {
57785 +               .h = {
57786 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
57787 +                       .id = DIRECTORY_FILE_PLUGIN_ID,
57788 +                       .groups = (1 << REISER4_DIRECTORY_FILE),
57789 +                       .pops = &file_plugin_ops,
57790 +                       .label = "dir",
57791 +                       .desc = "directory",
57792 +                       .linkage = {NULL, NULL}
57793 +               },
57794 +               .inode_ops = &null_i_ops,
57795 +               .file_ops = &null_f_ops,
57796 +               .as_ops = &null_a_ops,
57797 +
57798 +               .write_sd_by_inode = write_sd_by_inode_common,
57799 +               .flow_by_inode = bugop,
57800 +               .key_by_inode = bugop,
57801 +               .set_plug_in_inode = set_plug_in_inode_common,
57802 +               .adjust_to_parent = adjust_to_parent_common_dir,
57803 +               .create_object = reiser4_create_object_common,
57804 +               .delete_object = reiser4_delete_dir_common,
57805 +               .add_link = reiser4_add_link_common,
57806 +               .rem_link = rem_link_common_dir,
57807 +               .owns_item = owns_item_common_dir,
57808 +               .can_add_link = can_add_link_common,
57809 +               .can_rem_link = can_rem_link_common_dir,
57810 +               .detach = reiser4_detach_common_dir,
57811 +               .bind = reiser4_bind_common_dir,
57812 +               .safelink = safelink_common,
57813 +               .estimate = {
57814 +                       .create = estimate_create_common_dir,
57815 +                       .update = estimate_update_common,
57816 +                       .unlink = estimate_unlink_common_dir
57817 +               },
57818 +               .wire = {
57819 +                       .write = wire_write_common,
57820 +                       .read = wire_read_common,
57821 +                       .get = wire_get_common,
57822 +                       .size = wire_size_common,
57823 +                       .done = wire_done_common
57824 +               },
57825 +               .init_inode_data = init_inode_ordering,
57826 +               .cut_tree_worker = cut_tree_worker_common,
57827 +       },
57828 +       [SYMLINK_FILE_PLUGIN_ID] = {
57829 +               .h = {
57830 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
57831 +                       .id = SYMLINK_FILE_PLUGIN_ID,
57832 +                       .groups = (1 << REISER4_SYMLINK_FILE),
57833 +                       .pops = &file_plugin_ops,
57834 +                       .label = "symlink",
57835 +                       .desc = "symbolic link",
57836 +                       .linkage = {NULL, NULL}
57837 +               },
57838 +               .inode_ops = &symlink_file_i_ops,
57839 +               /* inode->i_fop of symlink is initialized by NULL in
57840 +                * setup_inode_ops */
57841 +               .file_ops = &null_f_ops,
57842 +               .as_ops = &null_a_ops,
57843 +
57844 +               .write_sd_by_inode = write_sd_by_inode_common,
57845 +               .set_plug_in_inode = set_plug_in_inode_common,
57846 +               .adjust_to_parent = adjust_to_parent_common,
57847 +               .create_object = reiser4_create_symlink,
57848 +               .delete_object = reiser4_delete_object_common,
57849 +               .add_link = reiser4_add_link_common,
57850 +               .rem_link = reiser4_rem_link_common,
57851 +               .can_add_link = can_add_link_common,
57852 +               .detach = dummyop,
57853 +               .bind = dummyop,
57854 +               .safelink = safelink_common,
57855 +               .estimate = {
57856 +                       .create = estimate_create_common,
57857 +                       .update = estimate_update_common,
57858 +                       .unlink = estimate_unlink_common
57859 +               },
57860 +               .init_inode_data = init_inode_ordering,
57861 +               .cut_tree_worker = cut_tree_worker_common,
57862 +               .destroy_inode = destroy_inode_symlink,
57863 +               .wire = {
57864 +                       .write = wire_write_common,
57865 +                       .read = wire_read_common,
57866 +                       .get = wire_get_common,
57867 +                       .size = wire_size_common,
57868 +                       .done = wire_done_common
57869 +               }
57870 +       },
57871 +       [SPECIAL_FILE_PLUGIN_ID] = {
57872 +               .h = {
57873 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
57874 +                       .id = SPECIAL_FILE_PLUGIN_ID,
57875 +                       .groups = (1 << REISER4_SPECIAL_FILE),
57876 +                       .pops = &file_plugin_ops,
57877 +                       .label = "special",
57878 +                       .desc =
57879 +                       "special: fifo, device or socket",
57880 +                       .linkage = {NULL, NULL}
57881 +               },
57882 +               .inode_ops = &special_file_i_ops,
57883 +               /* file_ops of special files (sockets, block, char, fifo) are
57884 +                  initialized by init_special_inode. */
57885 +               .file_ops = &null_f_ops,
57886 +               .as_ops = &null_a_ops,
57887 +
57888 +               .write_sd_by_inode = write_sd_by_inode_common,
57889 +               .set_plug_in_inode = set_plug_in_inode_common,
57890 +               .adjust_to_parent = adjust_to_parent_common,
57891 +               .create_object = reiser4_create_object_common,
57892 +               .delete_object = reiser4_delete_object_common,
57893 +               .add_link = reiser4_add_link_common,
57894 +               .rem_link = reiser4_rem_link_common,
57895 +               .owns_item = owns_item_common,
57896 +               .can_add_link = can_add_link_common,
57897 +               .detach = dummyop,
57898 +               .bind = dummyop,
57899 +               .safelink = safelink_common,
57900 +               .estimate = {
57901 +                       .create = estimate_create_common,
57902 +                       .update = estimate_update_common,
57903 +                       .unlink = estimate_unlink_common
57904 +               },
57905 +               .init_inode_data = init_inode_ordering,
57906 +               .cut_tree_worker = cut_tree_worker_common,
57907 +               .wire = {
57908 +                       .write = wire_write_common,
57909 +                       .read = wire_read_common,
57910 +                       .get = wire_get_common,
57911 +                       .size = wire_size_common,
57912 +                       .done = wire_done_common
57913 +               }
57914 +       },
57915 +       [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
57916 +               .h = {
57917 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
57918 +                       .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
57919 +                       .groups = (1 << REISER4_REGULAR_FILE),
57920 +                       .pops = &file_plugin_ops,
57921 +                       .label = "cryptcompress",
57922 +                       .desc = "cryptcompress file",
57923 +                       .linkage = {NULL, NULL}
57924 +               },
57925 +               .inode_ops = &regular_file_i_ops,
57926 +               .file_ops = &regular_file_f_ops,
57927 +               .as_ops = &regular_file_a_ops,
57928 +
57929 +               .setattr = setattr_cryptcompress,
57930 +               .open = open_cryptcompress,
57931 +               .read = read_cryptcompress,
57932 +               .write = write_cryptcompress,
57933 +               .ioctl = ioctl_cryptcompress,
57934 +               .mmap = mmap_cryptcompress,
57935 +               .release = release_cryptcompress,
57936 +
57937 +               .readpage = readpage_cryptcompress,
57938 +               .readpages = readpages_cryptcompress,
57939 +               .writepages = writepages_cryptcompress,
57940 +               .prepare_write = prepare_write_cryptcompress,
57941 +               .commit_write = commit_write_cryptcompress,
57942 +
57943 +               .bmap = bmap_cryptcompress,
57944 +
57945 +               .write_sd_by_inode = write_sd_by_inode_common,
57946 +               .flow_by_inode = flow_by_inode_cryptcompress,
57947 +               .key_by_inode = key_by_inode_cryptcompress,
57948 +               .set_plug_in_inode = set_plug_in_inode_common,
57949 +               .adjust_to_parent = adjust_to_parent_cryptcompress,
57950 +               .create_object = create_object_cryptcompress,
57951 +               .delete_object = delete_object_cryptcompress,
57952 +               .add_link = reiser4_add_link_common,
57953 +               .rem_link = reiser4_rem_link_common,
57954 +               .owns_item = owns_item_common,
57955 +               .can_add_link = can_add_link_common,
57956 +               .detach = dummyop,
57957 +               .bind = dummyop,
57958 +               .safelink = safelink_common,
57959 +               .estimate = {
57960 +                       .create = estimate_create_common,
57961 +                       .update = estimate_update_common,
57962 +                       .unlink = estimate_unlink_common
57963 +               },
57964 +               .init_inode_data = init_inode_data_cryptcompress,
57965 +               .cut_tree_worker = cut_tree_worker_cryptcompress,
57966 +               .destroy_inode = destroy_inode_cryptcompress,
57967 +               .wire = {
57968 +                       .write = wire_write_common,
57969 +                       .read = wire_read_common,
57970 +                       .get = wire_get_common,
57971 +                       .size = wire_size_common,
57972 +                       .done = wire_done_common
57973 +               }
57974 +       }
57975 +};
57976 +
57977 +static int change_dir(struct inode *inode, /* unused */
57978 +                     reiser4_plugin * plugin, /* unused */
57979 +                     pset_member memb /* unused */)
57980 +{
57981 +       /* dir plugin of an already existing object cannot be changed: always fail */
57982 +       return RETERR(-EINVAL);
57983 +}
57984 +
57985 +static reiser4_plugin_ops dir_plugin_ops = { /* ->pops of every dir_plugins[] entry */
57986 +       .change = change_dir /* unconditionally refuses the change */
57987 +};
57988 +
57989 +/*
57990 + * definition of directory plugins, indexed by directory plugin id
57991 + */
57992 +
57993 +dir_plugin dir_plugins[LAST_DIR_ID] = {
57994 +       /* standard hashed directory plugin */
57995 +       [HASHED_DIR_PLUGIN_ID] = {
57996 +               .h = {
57997 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
57998 +                       .id = HASHED_DIR_PLUGIN_ID,
57999 +                       .pops = &dir_plugin_ops,
58000 +                       .label = "dir",
58001 +                       .desc = "hashed directory",
58002 +                       .linkage = {NULL, NULL}
58003 +               },
58004 +               .inode_ops = &directory_i_ops,
58005 +               .file_ops = &directory_f_ops,
58006 +               .as_ops = &directory_a_ops,
58007 +
58008 +               .get_parent = get_parent_common,
58009 +               .is_name_acceptable = is_name_acceptable_common,
58010 +               .build_entry_key = build_entry_key_hashed,
58011 +               .build_readdir_key = build_readdir_key_common,
58012 +               .add_entry = reiser4_add_entry_common,
58013 +               .rem_entry = reiser4_rem_entry_common,
58014 +               .init = reiser4_dir_init_common,
58015 +               .done = reiser4_dir_done_common,
58016 +               .attach = reiser4_attach_common,
58017 +               .detach = reiser4_detach_common,
58018 +               .estimate = {
58019 +                       .add_entry = estimate_add_entry_common,
58020 +                       .rem_entry = estimate_rem_entry_common,
58021 +                       .unlink = dir_estimate_unlink_common
58022 +               }
58023 +       },
58024 +       /* hashed directory for which seekdir/telldir are guaranteed to work:
58025 +        * same as above except for header and ->build_entry_key() */
58026 +       [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
58027 +               .h = {
58028 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
58029 +                       .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
58030 +                       .pops = &dir_plugin_ops,
58031 +                       .label = "dir32",
58032 +                       .desc = "directory hashed with 31 bit hash",
58033 +                       .linkage = {NULL, NULL}
58034 +               },
58035 +               .inode_ops = &directory_i_ops,
58036 +               .file_ops = &directory_f_ops,
58037 +               .as_ops = &directory_a_ops,
58038 +
58039 +               .get_parent = get_parent_common,
58040 +               .is_name_acceptable = is_name_acceptable_common,
58041 +               .build_entry_key = build_entry_key_seekable,
58042 +               .build_readdir_key = build_readdir_key_common,
58043 +               .add_entry = reiser4_add_entry_common,
58044 +               .rem_entry = reiser4_rem_entry_common,
58045 +               .init = reiser4_dir_init_common,
58046 +               .done = reiser4_dir_done_common,
58047 +               .attach = reiser4_attach_common,
58048 +               .detach = reiser4_detach_common,
58049 +               .estimate = {
58050 +                       .add_entry = estimate_add_entry_common,
58051 +                       .rem_entry = estimate_rem_entry_common,
58052 +                       .unlink = dir_estimate_unlink_common
58053 +               }
58054 +       }
58055 +};
58056 +
58057 +/* Make Linus happy.
58058 +   Local variables:
58059 +   c-indentation-style: "K&R"
58060 +   mode-name: "LC"
58061 +   c-basic-offset: 8
58062 +   tab-width: 8
58063 +   fill-column: 120
58064 +   End:
58065 +*/
58066 diff -puN /dev/null fs/reiser4/plugin/object.h
58067 --- /dev/null
58068 +++ a/fs/reiser4/plugin/object.h
58069 @@ -0,0 +1,120 @@
58070 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
58071 + * reiser4/README */
58072 +
58073 +/* Declaration of object plugin functions. */
58074 +
58075 +#if !defined(__FS_REISER4_PLUGIN_OBJECT_H__)
58076 +#define __FS_REISER4_PLUGIN_OBJECT_H__
58077 +
58078 +#include "../type_safe_hash.h"
58079 +
58080 +/* common implementations of inode operations */
58081 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
58082 +                         int mode, struct nameidata *);
58083 +struct dentry *reiser4_lookup_common(struct inode *parent,
58084 +                                     struct dentry *dentry,
58085 +                                     struct nameidata *nameidata);
58086 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
58087 +                       struct dentry *newname);
58088 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
58089 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
58090 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
58091 +                  const char *linkname);
58092 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
58093 +                int mode, dev_t rdev);
58094 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
58095 +                         struct inode *new_dir, struct dentry *new_name);
58096 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
58097 +int reiser4_permission_common(struct inode *, int mask);
58098 +int reiser4_setattr_common(struct dentry *, struct iattr *);
58099 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
58100 +                          struct kstat *);
58101 +
58102 +/* common implementations of file operations */
58103 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
58104 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
58105 +int reiser4_release_dir_common(struct inode *, struct file *);
58106 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
58107 +
58108 +/* common implementations of address space operations */
58109 +int prepare_write_common(struct file *, struct page *, unsigned from,
58110 +                        unsigned to);
58111 +
58112 +/* file plugin operations: common implementations */
58113 +int write_sd_by_inode_common(struct inode *);
58114 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
58115 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
58116 +                            reiser4_object_create_data *);
58117 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
58118 +                           struct inode *root);
58119 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
58120 +                               struct inode *root);
58121 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
58122 +                                  struct inode *root);
58123 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
58124 +                                reiser4_object_create_data *);
58125 +int reiser4_delete_object_common(struct inode *);
58126 +int reiser4_delete_dir_common(struct inode *);
58127 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
58128 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
58129 +int rem_link_common_dir(struct inode *object, struct inode *parent);
58130 +int owns_item_common(const struct inode *, const coord_t *);
58131 +int owns_item_common_dir(const struct inode *, const coord_t *);
58132 +int can_add_link_common(const struct inode *);
58133 +int can_rem_link_common_dir(const struct inode *);
58134 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
58135 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
58136 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
58137 +reiser4_block_nr estimate_create_common(const struct inode *);
58138 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
58139 +reiser4_block_nr estimate_update_common(const struct inode *);
58140 +reiser4_block_nr estimate_unlink_common(const struct inode *,
58141 +                                       const struct inode *);
58142 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
58143 +                                           const struct inode *);
58144 +char *wire_write_common(struct inode *, char *start);
58145 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
58146 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
58147 +int wire_size_common(struct inode *);
58148 +void wire_done_common(reiser4_object_on_wire *);
58149 +
58150 +/* dir plugin operations: common implementations */
58151 +struct dentry *get_parent_common(struct inode *child);
58152 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
58153 +void build_entry_key_common(const struct inode *,
58154 +                           const struct qstr *qname, reiser4_key *);
58155 +int build_readdir_key_common(struct file *dir, reiser4_key *);
58156 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
58157 +                    reiser4_object_create_data * , reiser4_dir_entry_desc *);
58158 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
58159 +                    reiser4_dir_entry_desc *);
58160 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
58161 +                           reiser4_object_create_data *);
58162 +int reiser4_dir_done_common(struct inode *);
58163 +int reiser4_attach_common(struct inode *child, struct inode *parent);
58164 +int reiser4_detach_common(struct inode *object, struct inode *parent);
58165 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
58166 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
58167 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
58168 +                                           const struct inode *);
58169 +
58170 +/* these are essential parts of common implementations; they are exported
58171 +   to make customized implementations easier */
58172 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
58173 +
58174 +/* miscellaneous helper functions */
58175 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle * ,
58176 +             const reiser4_key * , int silent);
58177 +
58178 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
58179 +#endif
58180 +
58181 +/* Make Linus happy.
58182 +   Local variables:
58183 +   c-indentation-style: "K&R"
58184 +   mode-name: "LC"
58185 +   c-basic-offset: 8
58186 +   tab-width: 8
58187 +   fill-column: 120
58188 +   End:
58189 +*/
58190 diff -puN /dev/null fs/reiser4/plugin/plugin.c
58191 --- /dev/null
58192 +++ a/fs/reiser4/plugin/plugin.c
58193 @@ -0,0 +1,560 @@
58194 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58195 + * reiser4/README */
58196 +
58197 +/* Basic plugin infrastructure, lookup etc. */
58198 +
58199 +/* PLUGINS:
58200 +
58201 +   Plugins are internal Reiser4 "modules" or "objects" used to increase
58202 +   extensibility and allow external users to easily adapt reiser4 to
58203 +   their needs.
58204 +
58205 +   Plugins are classified into several disjoint "types". Plugins
58206 +   belonging to the particular plugin type are termed "instances" of
58207 +   this type. Existing types are listed by enum reiser4_plugin_type
58208 +   (see plugin/plugin_header.h)
58209 +
58210 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
58211 +
58212 +   Object (file) plugin determines how given file-system object serves
58213 +   standard VFS requests for read, write, seek, mmap etc. Instances of
58214 +   file plugins are: regular file, directory, symlink. Another example
58215 +   of file plugin is audit plugin, that optionally records accesses to
58216 +   underlying object and forwards requests to it.
58217 +
58218 +   Hash plugins compute hashes used by reiser4 to store and locate
58219 +   files within directories. Instances of hash plugin type are: r5,
58220 +   tea, rupasov.
58221 +
58222 +   Tail plugins (or, more precisely, tail policy plugins) determine
58223 +   when last part of the file should be stored in a formatted item.
58224 +
58225 +   Scope and lookup:
58226 +
58227 +   Each plugin has a label such that pair ( type_label, plugin_label )
58228 +   is unique.  This pair is a globally persistent and user-visible plugin
58229 +   identifier. Internally kernel maintains plugins and plugin types in
58230 +   arrays using an index into those arrays as plugin and plugin type
58231 +   identifiers. File-system in turn, also maintains persistent
58232 +   "dictionary" which is mapping from plugin label to numerical
58233 +   identifier which is stored in file-system objects.  That is, we
58234 +   store the offset into the plugin array for that plugin type as the
58235 +   plugin id in the stat data of the filesystem object.
58236 +
58237 +   Internal kernel plugin type identifier (index in plugins[] array) is
58238 +   of type reiser4_plugin_type. Set of available plugin types is
58239 +   currently static, but dynamic loading doesn't seem to pose
58240 +   insurmountable problems.
58241 +
58242 +   Within each type plugins are addressed by the identifiers of type
58243 +   reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
58244 +   Such identifiers are only required to be unique within one type,
58245 +   not globally.
58246 +
58247 +   Thus, plugin in memory is uniquely identified by the pair (type_id,
58248 +   id).
58249 +
58250 +   Usage:
58251 +
58252 +   There exists only one instance of each plugin instance, but this
58253 +   single instance can be associated with many entities (file-system
58254 +   objects, items, nodes, transactions, file-descriptors etc.). Entity
58255 +   to which plugin of given type is attached is termed (due to the lack of
58256 +   imagination) "subject" of this plugin type and, by abuse of
58257 +   terminology, subject of particular instance of this type to which
58258 +   it's attached currently. For example, inode is subject of object
58259 +   plugin type. Inode representing directory is subject of directory
58260 +   plugin, hash plugin type and some particular instance of hash plugin
58261 +   type. Inode, representing regular file is subject of "regular file"
58262 +   plugin, tail-policy plugin type etc.
58263 +
58264 +   With each subject the plugin possibly stores some state. For example,
58265 +   the state of a directory plugin (instance of object plugin type) is pointer
58266 +   to hash plugin (if directories always use hashing that is).
58267 +
58268 +   Interface:
58269 +
58270 +   In addition to a scalar identifier, each plugin type and plugin
58271 +   proper has a "label": short string and a "description"---longer
58272 +   descriptive string. Labels and descriptions of plugin types are
58273 +   hard-coded into plugins[] array, declared and defined in
58274 +   plugin.c. Label and description of plugin are stored in .label and
58275 +   .desc fields of reiser4_plugin_header respectively. It's possible to
58276 +   locate plugin by the pair of labels.
58277 +
58278 +   Features (not implemented):
58279 +
58280 +    . user-level plugin manipulations:
58281 +      + reiser4("filename/..file_plugin<='audit'");
58282 +      + write(open("filename/..file_plugin"), "audit", 8);
58283 +
58284 +    . user level utilities lsplug and chplug to manipulate plugins.
58285 +      Utilities are not of primary priority. Possibly they will not be
58286 +      working in v4.0
58287 +
58288 +   NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
58289 +   option, do you agree?  I don't think that specifying it at mount time,
58290 +   and then changing it with each mount, is a good model for usage.
58291 +
58292 +    . mount option "plug" to set-up plugins of root-directory.
58293 +      "plug=foo:bar" will set "bar" as default plugin of type "foo".
58294 +
58295 +   Limitations:
58296 +
58297 +    . each plugin type has to provide at least one builtin
58298 +      plugin. This is technical limitation and it can be lifted in the
58299 +      future.
58300 +
58301 +   TODO:
58302 +
58303 +   New plugin types/plugins:
58304 +   Things we should be able to separately choose to inherit:
58305 +
58306 +   security plugins
58307 +
58308 +   stat data
58309 +
58310 +   file bodies
58311 +
58312 +   file plugins
58313 +
58314 +   dir plugins
58315 +
58316 +    . perm:acl
58317 +
58318 +    . audi---audit plugin intercepting and possibly logging all
58319 +      accesses to object. Requires to put stub functions in file_operations
58320 +      instead of generic_file_*.
58321 +
58322 +NIKITA-FIXME-HANS: why make overflows a plugin?
58323 +    . over---handle hash overflows
58324 +
58325 +    . sqnt---handle different access patterns and instruments read-ahead
58326 +
58327 +NIKITA-FIXME-HANS: describe the line below in more detail.
58328 +
58329 +    . hier---handle inheritance of plugins along file-system hierarchy
58330 +
58331 +   Different kinds of inheritance: on creation vs. on access.
58332 +   Compatible/incompatible plugins.
58333 +   Inheritance for multi-linked files.
58334 +   Layered plugins.
58335 +   Notion of plugin context is abandoned.
58336 +
58337 +Each file is associated
58338 +   with one plugin and dependent plugins (hash, etc.) are stored as
58339 +   main plugin state. Now, if we have plugins used for regular files
58340 +   but not for directories, how such plugins would be inherited?
58341 +    . always store them with directories also
58342 +
58343 +NIKITA-FIXME-HANS: Do the line above.  It is not exclusive of doing
58344 +the line below which is also useful.
58345 +
58346 +    . use inheritance hierarchy, independent of file-system namespace
58347 +*/
58348 +
58349 +#include "../debug.h"
58350 +#include "../dformat.h"
58351 +#include "plugin_header.h"
58352 +#include "item/static_stat.h"
58353 +#include "node/node.h"
58354 +#include "security/perm.h"
58355 +#include "space/space_allocator.h"
58356 +#include "disk_format/disk_format.h"
58357 +#include "plugin.h"
58358 +#include "../reiser4.h"
58359 +#include "../jnode.h"
58360 +#include "../inode.h"
58361 +
58362 +#include <linux/fs.h>          /* for struct super_block  */
58363 +
58364 +/*
58365 + * init_plugins - initialize plugin sub-system.
58366 + * Just call this once on reiser4 startup.
58367 + *
58368 + * For each plugin type, every builtin slot with a non-NULL label gets its
58369 + * h.id set to the slot index, its init method (if any) called, and is added
58370 + * to the type's list. Returns 0 or the first non-zero result of an init call.
58371 + */
58372 +int init_plugins(void)
58373 +{
58374 +       reiser4_plugin_type type_id;
58375 +
58376 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
58377 +               struct reiser4_plugin_type_data *ptype;
58378 +               int i;
58379 +
58380 +               ptype = &plugins[type_id];
58381 +               assert("nikita-3508", ptype->label != NULL);
58382 +               assert("nikita-3509", ptype->type_id == type_id);
58383 +
58384 +               INIT_LIST_HEAD(&ptype->plugins_list);
58385 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term
58386 + * builtin. */
58387 +               for (i = 0; i < ptype->builtin_num; ++i) {
58388 +                       reiser4_plugin *plugin;
58389 +
58390 +                       plugin = plugin_at(ptype, i);
58391 +
58392 +                       if (plugin->h.label == NULL)
58393 +                               /* uninitialized slot encountered: not listed */
58394 +                               continue;
58395 +                       assert("nikita-3445", plugin->h.type_id == type_id);
58396 +                       plugin->h.id = i; /* id is always the slot index */
58397 +                       if (plugin->h.pops != NULL &&
58398 +                           plugin->h.pops->init != NULL) {
58399 +                               int result;
58400 +
58401 +                               result = plugin->h.pops->init(plugin);
58402 +                               if (result != 0)
58403 +                                       return result; /* stop at first failure */
58404 +                       }
58405 +                       INIT_LIST_HEAD(&plugin->h.linkage);
58406 +                       list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
58407 +               }
58408 +       }
58409 +       return 0;
58410 +}
58411 +
58412 +/* true if plugin type id is valid, i.e. below REISER4_PLUGIN_TYPES */
58413 +int is_plugin_type_valid(reiser4_plugin_type type)
58414 +{
58415 +       /* "type" is assumed to be unsigned (TODO confirm), so no
58416 +          comparison with 0 is made */
58417 +       return (type < REISER4_PLUGIN_TYPES);
58418 +}
58419 +
58420 +/* true if plugin id is below builtin_num of @type; only the upper bound is checked */
58421 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
58422 +{
58423 +       assert("nikita-1653", is_plugin_type_valid(type));
58424 +       return id < plugins[type].builtin_num;
58425 +}
58426 +
58427 +/* return plugin by its @type and @id, or NULL if either is out of range.
58428 +
58429 +   Both arguments are checked for validity (a warning is issued on failure):
58430 +   this is supposed to be called from user-level.
58431 +
58432 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
58433 +user space, and passed to the filesystem by use of method files? Your
58434 +comment really confused me on the first reading....
58435 +
58436 +*/
58437 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
58438 +                                                                * unchecked */,
58439 +                                   reiser4_plugin_id id        /* plugin id,
58440 +                                                                * unchecked */)
58441 +{
58442 +       if (is_plugin_type_valid(type)) {
58443 +               if (is_plugin_id_valid(type, id))
58444 +                       return plugin_at(&plugins[type], id);
58445 +               else
58446 +                       /* id out of bounds */
58447 +                       warning("nikita-2913",
58448 +                               "Invalid plugin id: [%i:%i]", type, id);
58449 +       } else
58450 +               /* type_id out of bounds */
58451 +               warning("nikita-2914", "Invalid type_id: %i", type);
58452 +       return NULL;
58453 +}
58454 +
58455 +/**
58456 + * save_plugin_id - store plugin id in disk format
58457 + * @plugin: plugin to convert
58458 + * @area: where to store result
58459 + *
58460 + * Stores @plugin->h.id at @area as unaligned 16-bit little endian; returns 0.
58461 + */
58462 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
58463 +                  d16 * area/* where to store result */)
58464 +{
58465 +       assert("nikita-1261", plugin != NULL);
58466 +       assert("nikita-1262", area != NULL);
58467 +
58468 +       put_unaligned(cpu_to_le16(plugin->h.id), area);
58469 +       return 0;
58470 +}
58471 +
58472 +/* head of the list of all plugins of given @type; @type must be valid */
58473 +struct list_head *get_plugin_list(reiser4_plugin_type type)
58474 +{
58475 +       assert("nikita-1056", is_plugin_type_valid(type));
58476 +       return &plugins[type].plugins_list;
58477 +}
58478 +
58479 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
58480 +{
58481 +       struct dentry *rootdir;
58482 +       reiser4_inode *root;
58483 +
58484 +       assert("edward-1443", memb != PSET_FILE);
58485 +
58486 +       rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
58487 +       if (rootdir != NULL) { /* without s_root the mask is left untouched */
58488 +               root = reiser4_inode_data(rootdir->d_inode);
58489 +               /*
58490 +                * set bit @memb in plugin_mask if this inode's pset member differs
58491 +                * from root's or this is the root inode itself; clear it otherwise
58492 +                */
58493 +               if (aset_get(info->pset, memb) !=
58494 +                   aset_get(root->pset, memb) ||
58495 +                   info == root)
58496 +                       info->plugin_mask |= (1 << memb);
58497 +               else
58498 +                       info->plugin_mask &= ~(1 << memb);
58499 +       }
58500 +}
58501 +
58502 +/* Get specified plugin set member from @ancestor (hset first, then pset),
58503 +   or from fs-defaults (if no ancestor is given) and install the result
58504 +   to pset of @self. Nothing is done if the member is already set. */
58505 +int grab_plugin_pset(struct inode *self,
58506 +                    struct inode *ancestor,
58507 +                    pset_member memb)
58508 +{
58509 +       reiser4_plugin *plug;
58510 +       reiser4_inode *info;
58511 +       int result = 0;
58512 +
58513 +       /* Do not grab if initialised already. */
58514 +       info = reiser4_inode_data(self);
58515 +       if (aset_get(info->pset, memb) != NULL)
58516 +               return 0;
58517 +       if (ancestor) {
58518 +               reiser4_inode *parent;
58519 +
58520 +               parent = reiser4_inode_data(ancestor);
58521 +               plug = aset_get(parent->hset, memb) ? :
58522 +                       aset_get(parent->pset, memb);
58523 +       } else
58524 +               plug = get_default_plugin(memb);
58525 +
58526 +       result = set_plugin(&info->pset, memb, plug);
58527 +       if (result == 0) {
58528 +               if (!ancestor || self->i_sb->s_root->d_inode != self)
58529 +                       update_pset_mask(info, memb);
58530 +       }
58531 +       return result;
58532 +}
58533 +
58534 +/* Take missing pset members (those after PSET_DIR) from root inode */
58535 +int finish_pset(struct inode *inode)
58536 +{
58537 +       reiser4_plugin *plug;
58538 +       reiser4_inode *root;
58539 +       reiser4_inode *info;
58540 +       pset_member memb;
58541 +       int result = 0;
58542 +
58543 +       root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
58544 +       info = reiser4_inode_data(inode);
58545 +
58546 +       assert("edward-1455", root != NULL);
58547 +       assert("edward-1456", info != NULL);
58548 +
58549 +       /* file and directory plugins are already initialized. */
58550 +       for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
58551 +
58552 +               /* Do not grab if initialised already. */
58553 +               if (aset_get(info->pset, memb) != NULL)
58554 +                       continue;
58555 +
58556 +               plug = aset_get(root->pset, memb);
58557 +               result = set_plugin(&info->pset, memb, plug);
58558 +               if (result != 0)
58559 +                       break;
58560 +       }
58561 +       if (result != 0) {
58562 +               warning("nikita-3447",
58563 +                       "Cannot set up plugins for %lli",
58564 +                       (unsigned long long)
58565 +                       get_inode_oid(inode));
58566 +       }
58567 +       return result;
58568 +}
58569 +
58570 +int force_plugin_pset(struct inode *self, pset_member memb,
58571 +                     reiser4_plugin * plug)
58572 +{
58573 +       reiser4_inode *info;
58574 +       int result = 0;
58575 +
58576 +       if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
58577 +               /* No root dentry, or changing pset in the root object. */
58578 +               return RETERR(-EINVAL);
58579 +       }
58580 +
58581 +       info = reiser4_inode_data(self);
58582 +       if (plug->h.pops != NULL && plug->h.pops->change != NULL)
58583 +               result = plug->h.pops->change(self, plug, memb);
58584 +       else
58585 +               result = aset_set_unsafe(&info->pset, memb, plug);
58586 +       if (result == 0) {
58587 +               __u16 oldmask = info->plugin_mask;
58588 +
58589 +               update_pset_mask(info, memb);
58590 +               if (oldmask != info->plugin_mask)
58591 +                       reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
58592 +       }
58593 +       return result;
58594 +}
58595 +
58596 +struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
58597 +       /* C99 designated initializers */
58598 +       [REISER4_FILE_PLUGIN_TYPE] = {
58599 +               .type_id = REISER4_FILE_PLUGIN_TYPE,
58600 +               .label = "file",
58601 +               .desc = "Object plugins",
58602 +               .builtin_num = sizeof_array(file_plugins),
58603 +               .builtin = file_plugins,
58604 +               .plugins_list = {NULL, NULL},
58605 +               .size = sizeof(file_plugin)
58606 +       },
58607 +       [REISER4_DIR_PLUGIN_TYPE] = {
58608 +               .type_id = REISER4_DIR_PLUGIN_TYPE,
58609 +               .label = "dir",
58610 +               .desc = "Directory plugins",
58611 +               .builtin_num = sizeof_array(dir_plugins),
58612 +               .builtin = dir_plugins,
58613 +               .plugins_list = {NULL, NULL},
58614 +               .size = sizeof(dir_plugin)
58615 +       },
58616 +       [REISER4_HASH_PLUGIN_TYPE] = {
58617 +               .type_id = REISER4_HASH_PLUGIN_TYPE,
58618 +               .label = "hash",
58619 +               .desc = "Directory hashes",
58620 +               .builtin_num = sizeof_array(hash_plugins),
58621 +               .builtin = hash_plugins,
58622 +               .plugins_list = {NULL, NULL},
58623 +               .size = sizeof(hash_plugin)
58624 +       },
58625 +       [REISER4_FIBRATION_PLUGIN_TYPE] = {
58626 +               .type_id =
58627 +               REISER4_FIBRATION_PLUGIN_TYPE,
58628 +               .label = "fibration",
58629 +               .desc = "Directory fibrations",
58630 +               .builtin_num = sizeof_array(fibration_plugins),
58631 +               .builtin = fibration_plugins,
58632 +               .plugins_list = {NULL, NULL},
58633 +               .size = sizeof(fibration_plugin)
58634 +       },
58635 +       [REISER4_CIPHER_PLUGIN_TYPE] = {
58636 +               .type_id = REISER4_CIPHER_PLUGIN_TYPE,
58637 +               .label = "cipher",
58638 +               .desc = "Cipher plugins",
58639 +               .builtin_num = sizeof_array(cipher_plugins),
58640 +               .builtin = cipher_plugins,
58641 +               .plugins_list = {NULL, NULL},
58642 +               .size = sizeof(cipher_plugin)
58643 +       },
58644 +       [REISER4_DIGEST_PLUGIN_TYPE] = {
58645 +               .type_id = REISER4_DIGEST_PLUGIN_TYPE,
58646 +               .label = "digest",
58647 +               .desc = "Digest plugins",
58648 +               .builtin_num = sizeof_array(digest_plugins),
58649 +               .builtin = digest_plugins,
58650 +               .plugins_list = {NULL, NULL},
58651 +               .size = sizeof(digest_plugin)
58652 +       },
58653 +       [REISER4_COMPRESSION_PLUGIN_TYPE] = {
58654 +               .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
58655 +               .label = "compression",
58656 +               .desc = "Compression plugins",
58657 +               .builtin_num = sizeof_array(compression_plugins),
58658 +               .builtin = compression_plugins,
58659 +               .plugins_list = {NULL, NULL},
58660 +               .size = sizeof(compression_plugin)
58661 +       },
58662 +       [REISER4_FORMATTING_PLUGIN_TYPE] = {
58663 +               .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
58664 +               .label = "formatting",
58665 +               .desc = "Tail inlining policies",
58666 +               .builtin_num = sizeof_array(formatting_plugins),
58667 +               .builtin = formatting_plugins,
58668 +               .plugins_list = {NULL, NULL},
58669 +               .size = sizeof(formatting_plugin)
58670 +       },
58671 +       [REISER4_PERM_PLUGIN_TYPE] = {
58672 +               .type_id = REISER4_PERM_PLUGIN_TYPE,
58673 +               .label = "perm",
58674 +               .desc = "Permission checks",
58675 +               .builtin_num = sizeof_array(perm_plugins),
58676 +               .builtin = perm_plugins,
58677 +               .plugins_list = {NULL, NULL},
58678 +               .size = sizeof(perm_plugin)
58679 +       },
58680 +       [REISER4_ITEM_PLUGIN_TYPE] = {
58681 +               .type_id = REISER4_ITEM_PLUGIN_TYPE,
58682 +               .label = "item",
58683 +               .desc = "Item handlers",
58684 +               .builtin_num = sizeof_array(item_plugins),
58685 +               .builtin = item_plugins,
58686 +               .plugins_list = {NULL, NULL},
58687 +               .size = sizeof(item_plugin)
58688 +       },
58689 +       [REISER4_NODE_PLUGIN_TYPE] = {
58690 +               .type_id = REISER4_NODE_PLUGIN_TYPE,
58691 +               .label = "node",
58692 +               .desc = "node layout handlers",
58693 +               .builtin_num = sizeof_array(node_plugins),
58694 +               .builtin = node_plugins,
58695 +               .plugins_list = {NULL, NULL},
58696 +               .size = sizeof(node_plugin)
58697 +       },
58698 +       [REISER4_SD_EXT_PLUGIN_TYPE] = {
58699 +               .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
58700 +               .label = "sd_ext",
58701 +               .desc = "Parts of stat-data",
58702 +               .builtin_num = sizeof_array(sd_ext_plugins),
58703 +               .builtin = sd_ext_plugins,
58704 +               .plugins_list = {NULL, NULL},
58705 +               .size = sizeof(sd_ext_plugin)
58706 +       },
58707 +       [REISER4_FORMAT_PLUGIN_TYPE] = {
58708 +               .type_id = REISER4_FORMAT_PLUGIN_TYPE,
58709 +               .label = "disk_layout",
58710 +               .desc = "defines filesystem on disk layout",
58711 +               .builtin_num = sizeof_array(format_plugins),
58712 +               .builtin = format_plugins,
58713 +               .plugins_list = {NULL, NULL},
58714 +               .size = sizeof(disk_format_plugin)
58715 +       },
58716 +       [REISER4_JNODE_PLUGIN_TYPE] = {
58717 +               .type_id = REISER4_JNODE_PLUGIN_TYPE,
58718 +               .label = "jnode",
58719 +               .desc = "defines kind of jnode",
58720 +               .builtin_num = sizeof_array(jnode_plugins),
58721 +               .builtin = jnode_plugins,
58722 +               .plugins_list = {NULL, NULL},
58723 +               .size = sizeof(jnode_plugin)
58724 +       },
58725 +       [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
58726 +               .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58727 +               .label = "compression_mode",
58728 +               .desc = "Defines compression mode",
58729 +               .builtin_num = sizeof_array(compression_mode_plugins),
58730 +               .builtin = compression_mode_plugins,
58731 +               .plugins_list = {NULL, NULL},
58732 +               .size = sizeof(compression_mode_plugin)
58733 +       },
58734 +       [REISER4_CLUSTER_PLUGIN_TYPE] = {
58735 +               .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
58736 +               .label = "cluster",
58737 +               .desc = "Defines cluster size",
58738 +               .builtin_num = sizeof_array(cluster_plugins),
58739 +               .builtin = cluster_plugins,
58740 +               .plugins_list = {NULL, NULL},
58741 +               .size = sizeof(cluster_plugin)
58742 +       }
58743 +};
58744 +
58745 +/*
58746 + * Local variables:
58747 + * c-indentation-style: "K&R"
58748 + * mode-name: "LC"
58749 + * c-basic-offset: 8
58750 + * tab-width: 8
58751 + * fill-column: 120
58752 + * End:
58753 + */
58754 diff -puN /dev/null fs/reiser4/plugin/plugin.h
58755 --- /dev/null
58756 +++ a/fs/reiser4/plugin/plugin.h
58757 @@ -0,0 +1,942 @@
58758 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58759 + * reiser4/README */
58760 +
58761 +/* Basic plugin data-types.
58762 +   see fs/reiser4/plugin/plugin.c for details */
58763 +
58764 +#if !defined(__FS_REISER4_PLUGIN_TYPES_H__)
58765 +#define __FS_REISER4_PLUGIN_TYPES_H__
58766 +
58767 +#include "../forward.h"
58768 +#include "../debug.h"
58769 +#include "../dformat.h"
58770 +#include "../key.h"
58771 +#include "compress/compress.h"
58772 +#include "crypto/cipher.h"
58773 +#include "plugin_header.h"
58774 +#include "item/static_stat.h"
58775 +#include "item/internal.h"
58776 +#include "item/sde.h"
58777 +#include "item/cde.h"
58778 +#include "item/item.h"
58779 +#include "node/node.h"
58780 +#include "node/node40.h"
58781 +#include "security/perm.h"
58782 +#include "fibration.h"
58783 +
58784 +#include "space/bitmap.h"
58785 +#include "space/space_allocator.h"
58786 +
58787 +#include "disk_format/disk_format40.h"
58788 +#include "disk_format/disk_format.h"
58789 +
58790 +#include <linux/fs.h>          /* for struct super_block, address_space  */
58791 +#include <linux/mm.h>          /* for struct page */
58792 +#include <linux/buffer_head.h> /* for struct buffer_head */
58793 +#include <linux/dcache.h>      /* for struct dentry */
58794 +#include <linux/types.h>
58795 +#include <linux/crypto.h>
58796 +
58797 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
58798 +
58799 +/*
58800 + * File plugin.  Defines the set of methods that file plugins implement, some
58801 + * of which are optional.
58802 + *
58803 + * A file plugin offers to the caller an interface for IO (writing to and/or
58804 + * reading from) to what the caller sees as one sequence of bytes.  An IO to it
58805 + * may affect more than one physical sequence of bytes, or no physical sequence
58806 + * of bytes, it may affect sequences of bytes offered by other file plugins to
58807 + * the semantic layer, and the file plugin may invoke other plugins and
58808 + * delegate work to them, but its interface is structured for offering the
58809 + * caller the ability to read and/or write what the caller sees as being a
58810 + * single sequence of bytes.
58811 + *
58812 + * The file plugin must present a sequence of bytes to the caller, but it does
58813 + * not necessarily have to store a sequence of bytes, it does not necessarily
58814 + * have to support efficient tree traversal to any offset in the sequence of
58815 + * bytes (tail and extent items, whose keys contain offsets, do however provide
58816 + * efficient non-sequential lookup of any offset in the sequence of bytes).
58817 + *
58818 + * Directory plugins provide methods for selecting file plugins by resolving a
58819 + * name for them.
58820 + *
58821 + * The functionality other filesystems call an attribute, and rigidly tie
58822 + * together, we decompose into orthogonal selectable features of files.  Using
58823 + * the terminology we will define next, an attribute is a perhaps constrained,
58824 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
58825 + * which might be grandparent-major-packed, and whose parent has a deletion
58826 + * method that deletes it.
58827 + *
58828 + * File plugins can implement constraints.
58829 + *
58830 + * Files can be of variable length (e.g. regular unix files), or of static
58831 + * length (e.g. static sized attributes).
58832 + *
58833 + * An object may have many sequences of bytes, and many file plugins, but, it
58834 + * has exactly one objectid.  It is usually desirable that an object has a
58835 + * deletion method which deletes every item with that objectid.  Items cannot
58836 + * in general be found by just their objectids.  This means that an object must
58837 + * have either a method built into its deletion plugin method for knowing what
58838 + * items need to be deleted, or links stored with the object that provide the
58839 + * plugin with a method for finding those items.  Deleting a file within an
58840 + * object may or may not have the effect of deleting the entire object,
58841 + * depending on the file plugin's deletion method.
58842 + *
58843 + * LINK TAXONOMY:
58844 + *
58845 + * Many objects have a reference count, and when the reference count reaches 0
58846 + * the object's deletion method is invoked.  Some links embody a reference
58847 + * count increase ("countlinks"), and others do not ("nocountlinks").
58848 + *
58849 + * Some links are bi-directional links ("bilinks"), and some are
58850 + * uni-directional ("unilinks").
58851 + *
58852 + * Some links are between parts of the same object ("intralinks"), and some are
58853 + * between different objects ("interlinks").
58854 + *
58855 + * PACKING TAXONOMY:
58856 + *
58857 + * Some items of an object are stored with a major packing locality based on
58858 + * their object's objectid (e.g. unix directory items in plan A), and these are
58859 + * called "self-major-packed".
58860 + *
58861 + * Some items of an object are stored with a major packing locality based on
58862 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
58863 + * and these are called "parent-major-packed".
58864 + *
58865 + * Some items of an object are stored with a major packing locality based on
58866 + * their semantic grandparent, and these are called "grandparent-major-packed".
58867 + * Now carefully notice that we run into trouble with key length if we have to
58868 + * store an 8 byte major+minor grandparent based packing locality, an 8 byte
58869 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
58870 + * a 24 byte key.  One of these fields must be sacrificed if an item is to be
58871 + * grandparent-major-packed, and which to sacrifice is left to the item author
58872 + * choosing to make the item grandparent-major-packed.  You cannot make tail
58873 + * items and extent items grandparent-major-packed, though you could make them
58874 + * self-major-packed (usually they are parent-major-packed).
58875 + *
58876 + * In the case of ACLs (which are composed of fixed length ACEs which consist
58877 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
58878 + * to not have an offset field in the ACE item key, and to allow duplicate keys
58879 + * for ACEs.  Thus, the set of ACEs for a given file is found by looking for a
58880 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
58881 + * a directory together), the minor packing locality of ACE, the objectid of
58882 + * the file, and 0.
58883 + *
58884 + * IO involves moving data from one location to another, which means that two
58885 + * locations must be specified, source and destination.
58886 + *
58887 + * This source and destination can be in the filesystem, or they can be a
58888 + * pointer in the user process address space plus a byte count.
58889 + *
58890 + * If both source and destination are in the filesystem, then at least one of
58891 + * them must be representable as a pure stream of bytes (which we call a flow,
58892 + * and define as a struct containing a key, a data pointer, and a length).
58893 + * This may mean converting one of them into a flow.  We provide a generic
58894 + * cast_into_flow() method, which will work for any plugin supporting
58895 + * read_flow(), though it is inefficiently implemented in that it temporarily
58896 + * stores the flow in a buffer (Question: what to do with huge flows that
58897 + * cannot fit into memory?  Answer: we must not convert them all at once.)
58898 + *
58899 + * Performing a write requires resolving the write request into a flow defining
58900 + * the source, and a method that performs the write, and a key that defines
58901 + * where in the tree the write is to go.
58902 + *
58903 + * Performing a read requires resolving the read request into a flow defining
58904 + * the target, and a method that performs the read, and a key that defines
58905 + * where in the tree the read is to come from.
58906 + *
58907 + * There will exist file plugins which have no pluginid stored on the disk for
58908 + * them, and which are only invoked by other plugins.
58909 + */
58910 +
58911 +/* This should be incremented with each new contributed
58912 +   pair (plugin type, plugin id).
58913 +   NOTE: Make sure there is a release of reiser4progs
58914 +   with the corresponding version number */
58915 +#define PLUGIN_LIBRARY_VERSION 0
58916 +
58917 + /* enumeration of fields within plugin_set; PSET_LAST is their count */
58918 +typedef enum {
58919 +       PSET_FILE,
58920 +       PSET_DIR,               /* PSET_FILE and PSET_DIR should be first
58921 +                                * elements: inode.c:read_inode() depends on
58922 +                                * this. */
58923 +       PSET_PERM,
58924 +       PSET_FORMATTING,
58925 +       PSET_HASH,
58926 +       PSET_FIBRATION,
58927 +       PSET_SD,
58928 +       PSET_DIR_ITEM,
58929 +       PSET_CIPHER,
58930 +       PSET_DIGEST,
58931 +       PSET_COMPRESSION,
58932 +       PSET_COMPRESSION_MODE,
58933 +       PSET_CLUSTER,
58934 +       PSET_CREATE,
58935 +       PSET_LAST
58936 +} pset_member;
58937 +
58938 +/* builtin file-plugins */
58939 +typedef enum {
58940 +       /* regular file */
58941 +       UNIX_FILE_PLUGIN_ID,
58942 +       /* directory */
58943 +       DIRECTORY_FILE_PLUGIN_ID,
58944 +       /* symlink */
58945 +       SYMLINK_FILE_PLUGIN_ID,
58946 +       /* for objects completely handled by the VFS: fifos, devices,
58947 +          sockets  */
58948 +       SPECIAL_FILE_PLUGIN_ID,
58949 +       /* regular cryptcompress file */
58950 +       CRYPTCOMPRESS_FILE_PLUGIN_ID,
58951 +       /* number of file plugins. Used as size of arrays to hold
58952 +          file plugins. */
58953 +       LAST_FILE_PLUGIN_ID
58954 +} reiser4_file_id;
58955 +
58956 +typedef struct file_plugin {
58957 +
58958 +       /* generic fields */
58959 +       plugin_header h;
58960 +
58961 +       /* VFS methods.
58962 +        * Must be invariant with respect to plugin conversion.
58963 +        * It can be achieved by using "common" methods, which
58964 +        * are the same for all plugins that participate in
58965 +        * conversion, or by using "generic" or "careful" methods,
58966 +        * which provide automatic redirection to proper private
58967 +        * plugin methods ("careful" are the same as "generic",
58968 +        * but with protection of pset and other disk structures
58969 +        * from being rebuilt during conversion).
58970 +        */
58971 +       struct inode_operations * inode_ops;
58972 +       struct file_operations * file_ops;
58973 +       struct address_space_operations * as_ops;
58974 +       /**
58975 +        * Private methods. These are optional. If used they will allow you
58976 +        * to minimize the amount of code needed to implement a deviation
58977 +        * from some other method that also uses them.
58978 +        */
58979 +       /*
58980 +        * private inode_ops
58981 +        */
58982 +       int (*setattr)(struct dentry *, struct iattr *);
58983 +       /*
58984 +        * private file_ops
58985 +        */
58986 +       /* do whatever is necessary to do when object is opened */
58987 +       int (*open) (struct inode *inode, struct file *file);
58988 +       ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
58989 +                       loff_t *off);
58990 +       /* write as many bytes as possible from nominated @write_amount
58991 +        * before plugin scheduling occurs. Save scheduling state
58992 +        * in @cont */
58993 +       ssize_t (*write) (struct file *, const char __user *buf,
58994 +                         size_t write_amount, loff_t * off,
58995 +                         struct psched_context * cont);
58996 +       int (*ioctl) (struct inode *inode, struct file *filp,
58997 +                     unsigned int cmd, unsigned long arg);
58998 +       int (*mmap) (struct file *, struct vm_area_struct *);
58999 +       int (*release) (struct inode *, struct file *);
59000 +       /*
59001 +        * private a_ops
59002 +        */
59003 +       int (*readpage) (struct file *file, struct page *page);
59004 +       int (*readpages)(struct file *file, struct address_space *mapping,
59005 +                         struct list_head *pages, unsigned nr_pages);
59006 +       int (*writepages)(struct address_space *mapping,
59007 +                         struct writeback_control *wbc);
59008 +       int (*prepare_write)(struct file *file, struct page *page,
59009 +                            unsigned from, unsigned to);
59010 +       int (*commit_write)(struct file *file, struct page *page,
59011 +                           unsigned from, unsigned to);
59012 +       sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
59013 +       /* other private methods */
59014 +       /* save inode cached stat-data onto disk. It was called
59015 +          reiserfs_update_sd() in 3.x */
59016 +       int (*write_sd_by_inode) (struct inode *);
59017 +       /*
59018 +        * Construct flow into @flow according to user-supplied data.
59019 +        *
59020 +        * This is used by read/write methods to construct a flow to
59021 +        * write/read. ->flow_by_inode() is plugin method, rather than single
59022 +        * global implementation, because key in a flow used by plugin may
59023 +        * depend on data in a @buf.
59024 +        *
59025 +        * NIKITA-FIXME-HANS: please create statistics on what functions are
59026 +        * dereferenced how often for the mongo benchmark.  You can supervise
59027 +        * Elena doing this for you if that helps.  Email me the list of the
59028 +        * top 10, with their counts, and an estimate of the total number of
59029 +        * CPU cycles spent dereferencing as a percentage of CPU cycles spent
59030 +        * processing (non-idle processing).  If the total percent is, say,
59031 +        * less than 1%, it will make our coding discussions much easier, and
59032 +        * keep me from questioning whether functions like the below are too
59033 +        * frequently called to be dereferenced.  If the total percent is more
59034 +        * than 1%, perhaps private methods should be listed in a "required"
59035 +        * comment at the top of each plugin (with stern language about how if
59036 +        * the comment is missing it will not be accepted by the maintainer),
59037 +        * and implemented using macros not dereferenced functions.  How about
59038 +        * replacing this whole private methods part of the struct with a
59039 +        * thorough documentation of what the standard helper functions are for
59040 +        * use in constructing plugins?  I think users have been asking for
59041 +        * that, though not in so many words.
59042 +        */
59043 +       int (*flow_by_inode) (struct inode *, const char __user *buf,
59044 +                             int user, loff_t size,
59045 +                             loff_t off, rw_op op, flow_t *);
59046 +       /*
59047 +        * Return the key used to retrieve an offset of a file. It is used by
59048 +        * default implementation of ->flow_by_inode() method
59049 +        * (common_build_flow()) and, among other things, to get to the extent
59050 +        * from jnode of unformatted node.
59051 +        */
59052 +       int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
59053 +
59054 +       /* NIKITA-FIXME-HANS: this comment is not as clear to others as you
59055 +        * think.... */
59056 +       /*
59057 +        * set the plugin for a file.  Called during file creation in creat()
59058 +        * but not reiser4() unless an inode already exists for the file.
59059 +        */
59060 +       int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
59061 +                                 reiser4_object_create_data *);
59062 +
59063 +       /* NIKITA-FIXME-HANS: comment and name seem to say different things,
59064 +        * are you setting up the object itself also or just adjusting the
59065 +        * parent?.... */
59066 +       /* set up plugins for new @object created in @parent. @root is root
59067 +          directory. */
59068 +       int (*adjust_to_parent) (struct inode *object, struct inode *parent,
59069 +                                struct inode *root);
59070 +       /*
59071 +        * this does whatever is necessary to do when object is created. For
59072 +        * instance, for unix files stat data is inserted. It is supposed to be
59073 +        * called by create of struct inode_operations.
59074 +        */
59075 +       int (*create_object) (struct inode *object, struct inode *parent,
59076 +                             reiser4_object_create_data *);
59077 +       /*
59078 +        * this method should check REISER4_NO_SD and set REISER4_NO_SD on
59079 +        * success. Deletion of an object usually includes removal of items
59080 +        * building file body (for directories this is removal of "." and "..")
59081 +        * and removal of stat-data item.
59082 +        */
59083 +       int (*delete_object) (struct inode *);
59084 +
59085 +       /* add link from @parent to @object */
59086 +       int (*add_link) (struct inode *object, struct inode *parent);
59087 +
59088 +       /* remove link from @parent to @object */
59089 +       int (*rem_link) (struct inode *object, struct inode *parent);
59090 +
59091 +       /*
59092 +        * return true if item addressed by @coord belongs to @inode.  This is
59093 +        * used by read/write to properly slice flow into items in presence of
59094 +        * multiple key assignment policies, because items of a file are not
59095 +        * necessarily contiguous in a key space, for example, in a plan-b.
59096 +        */
59097 +       int (*owns_item) (const struct inode *, const coord_t *);
59098 +
59099 +       /* checks whether yet another hard link to this object can be
59100 +          added  */
59101 +       int (*can_add_link) (const struct inode *);
59102 +
59103 +       /* checks whether hard links to this object can be removed */
59104 +       int (*can_rem_link) (const struct inode *);
59105 +
59106 +       /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
59107 +          detach of directory plugin to remove ".." */
59108 +       int (*detach) (struct inode *child, struct inode *parent);
59109 +
59110 +       /* called when @child was just looked up in the @parent. It is not
59111 +          empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
59112 +          directory plugin */
59113 +       int (*bind) (struct inode *child, struct inode *parent);
59114 +
59115 +       /* process safe-link during mount */
59116 +       int (*safelink) (struct inode *object, reiser4_safe_link_t link,
59117 +                        __u64 value);
59118 +
59119 +       /* The couple of estimate methods for all file operations */
59120 +       struct {
59121 +               reiser4_block_nr(*create) (const struct inode *);
59122 +               reiser4_block_nr(*update) (const struct inode *);
59123 +               reiser4_block_nr(*unlink) (const struct inode *,
59124 +                                          const struct inode *);
59125 +       } estimate;
59126 +
59127 +       /*
59128 +        * reiser4 specific part of inode has a union of structures which are
59129 +        * specific to a plugin. This method is called when inode is read
59130 +        * (read_inode) and when file is created (common_create_child) so that
59131 +        * file plugin could initialize its inode data
59132 +        */
59133 +       void (*init_inode_data) (struct inode *, reiser4_object_create_data * ,
59134 +                                int);
59135 +
59136 +       /*
59137 +        * This method performs progressive deletion of items and whole nodes
59138 +        * from right to left.
59139 +        *
59140 +        * @tap: the point deletion process begins from,
59141 +        * @from_key: the beginning of the deleted key range,
59142 +        * @to_key: the end of the deleted key range,
59143 +        * @smallest_removed: the smallest removed key,
59144 +        *
59145 +        * @return: 0 if success, error code otherwise, -E_REPEAT means that
59146 +        * long cut_tree operation was interrupted for allowing atom commit .
59147 +        */
59148 +       int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
59149 +                               const reiser4_key * to_key,
59150 +                               reiser4_key * smallest_removed, struct inode *,
59151 +                               int, int *);
59152 +
59153 +       /* called from ->destroy_inode() */
59154 +       void (*destroy_inode) (struct inode *);
59155 +
59156 +       /*
59157 +        * methods to serialize object identity. This is used, for example, by
59158 +        * reiser4_{en,de}code_fh().
59159 +        */
59160 +       struct {
59161 +               /* store object's identity at @area */
59162 +               char *(*write) (struct inode *inode, char *area);
59163 +               /* parse object from wire to the @obj */
59164 +               char *(*read) (char *area, reiser4_object_on_wire * obj);
59165 +               /* given object identity in @obj, find or create its dentry */
59166 +               struct dentry *(*get) (struct super_block *s,
59167 +                                      reiser4_object_on_wire * obj);
59168 +               /* how many bytes ->wire.write() consumes */
59169 +               int (*size) (struct inode *inode);
59170 +               /* finish with object identity */
59171 +               void (*done) (reiser4_object_on_wire * obj);
59172 +       } wire;
59173 +} file_plugin;
59174 +
59175 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
59176 +
59177 +struct reiser4_object_on_wire {
59178 +       file_plugin *plugin;
59179 +       union {
59180 +               struct {
59181 +                       obj_key_id key_id;
59182 +               } std;
59183 +               void *generic;
59184 +       } u;
59185 +};
59186 +
59187 +/* builtin dir-plugins */
59188 +typedef enum {
59189 +       HASHED_DIR_PLUGIN_ID,
59190 +       SEEKABLE_HASHED_DIR_PLUGIN_ID,
59191 +       LAST_DIR_ID
59192 +} reiser4_dir_id;
59193 +
59194 +typedef struct dir_plugin {
59195 +       /* generic fields */
59196 +       plugin_header h;
59197 +
59198 +       struct inode_operations * inode_ops;
59199 +       struct file_operations * file_ops;
59200 +       struct address_space_operations * as_ops;
59201 +
59202 +       /*
59203 +        * private methods: These are optional.  If used they will allow you to
59204 +        * minimize the amount of code needed to implement a deviation from
59205 +        * some other method that uses them.  You could logically argue that
59206 +        * they should be a separate type of plugin.
59207 +        */
59208 +
59209 +       struct dentry *(*get_parent) (struct inode *childdir);
59210 +
59211 +       /*
59212 +        * check whether "name" is acceptable name to be inserted into this
59213 +        * object. Optionally implemented by directory-like objects.  Can check
59214 +        * for maximal length, reserved symbols etc
59215 +        */
59216 +       int (*is_name_acceptable) (const struct inode *inode, const char *name,
59217 +                                  int len);
59218 +
59219 +       void (*build_entry_key) (const struct inode *dir /* directory where
59220 +                                                         * entry is (or will
59221 +                                                         * be) in.*/ ,
59222 +                                const struct qstr *name /* name of file
59223 +                                                         * referenced by this
59224 +                                                         * entry */ ,
59225 +                                reiser4_key * result   /* resulting key of
59226 +                                                        * directory entry */ );
59227 +       int (*build_readdir_key) (struct file *dir, reiser4_key * result);
59228 +       int (*add_entry) (struct inode *object, struct dentry *where,
59229 +                         reiser4_object_create_data * data,
59230 +                         reiser4_dir_entry_desc * entry);
59231 +       int (*rem_entry) (struct inode *object, struct dentry *where,
59232 +                         reiser4_dir_entry_desc * entry);
59233 +
59234 +       /*
59235 +        * initialize directory structure for newly created object. For normal
59236 +        * unix directories, insert dot and dotdot.
59237 +        */
59238 +       int (*init) (struct inode *object, struct inode *parent,
59239 +                    reiser4_object_create_data * data);
59240 +
59241 +       /* destroy directory */
59242 +       int (*done) (struct inode *child);
59243 +
59244 +       /* called when @subdir was just looked up in the @dir */
59245 +       int (*attach) (struct inode *subdir, struct inode *dir);
59246 +       int (*detach) (struct inode *subdir, struct inode *dir);
59247 +
59248 +       struct {
59249 +               reiser4_block_nr(*add_entry) (const struct inode *);
59250 +               reiser4_block_nr(*rem_entry) (const struct inode *);
59251 +               reiser4_block_nr(*unlink) (const struct inode *,
59252 +                                          const struct inode *);
59253 +       } estimate;
59254 +} dir_plugin;
59255 +
59256 +extern dir_plugin dir_plugins[LAST_DIR_ID];
59257 +
59258 +typedef struct formatting_plugin {
59259 +       /* generic fields */
59260 +       plugin_header h;
59261 +       /* returns non-zero iff file's tail has to be stored
59262 +          in a direct item. */
59263 +       int (*have_tail) (const struct inode *inode, loff_t size);
59264 +} formatting_plugin;
59265 +
59266 +typedef struct hash_plugin {
59267 +       /* generic fields */
59268 +       plugin_header h;
59269 +       /* computes hash of the given name */
59270 +        __u64(*hash) (const unsigned char *name, int len);
59271 +} hash_plugin;
59272 +
59273 +typedef struct cipher_plugin {
59274 +       /* generic fields */
59275 +       plugin_header h;
59276 +       struct crypto_blkcipher * (*alloc) (void);
59277 +       void (*free) (struct crypto_blkcipher *tfm);
59278 +       /* Offset translator. For each offset this returns (k * offset), where
59279 +          k (k >= 1) is an expansion factor of the cipher algorithm.
59280 +          For all symmetric algorithms k == 1. For asymmetric algorithms (which
59281 +          inflate data) offset translation guarantees that all disk cluster's
59282 +          units will have keys smaller than the next cluster's ones.
59283 +        */
59284 +        loff_t(*scale) (struct inode *inode, size_t blocksize, loff_t src);
59285 +       /* Cipher algorithms can accept data only in chunks of cipher block
59286 +          size. This method aligns any flow up to cipher block size before
59287 +          we pass it to cipher algorithm. To align means to append padding of
59288 +          special format specific to the cipher algorithm */
59289 +       int (*align_stream) (__u8 *tail, int clust_size, int blocksize);
59290 +       /* low-level key manager (check, install, etc..) */
59291 +       int (*setkey) (struct crypto_tfm *tfm, const __u8 *key,
59292 +                      unsigned int keylen);
59293 +       /* main text processing procedures */
59294 +       void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
59295 +       void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
59296 +} cipher_plugin;
59297 +
59298 +typedef struct digest_plugin {
59299 +       /* generic fields */
59300 +       plugin_header h;
59301 +       /* fingerprint size in bytes */
59302 +       int fipsize;
59303 +       struct crypto_hash * (*alloc) (void);
59304 +       void (*free) (struct crypto_hash *tfm);
59305 +} digest_plugin;
59306 +
59307 +typedef struct compression_plugin {
59308 +       /* generic fields */
59309 +       plugin_header h;
59310 +       int (*init) (void);
59311 +       /* the maximum number of bytes the size of the "compressed" data can
59312 +        * exceed the uncompressed data. */
59313 +       int (*overrun) (unsigned src_len);
59314 +        coa_t(*alloc) (tfm_action act);
59315 +       void (*free) (coa_t coa, tfm_action act);
59316 +       /* minimal size of the flow we still try to compress */
59317 +       int (*min_size_deflate) (void);
59318 +        __u32(*checksum) (char *data, __u32 length);
59319 +       /* main transform procedures */
59320 +       void (*compress) (coa_t coa, __u8 *src_first, unsigned src_len,
59321 +                         __u8 *dst_first, unsigned *dst_len);
59322 +       void (*decompress) (coa_t coa, __u8 *src_first, unsigned src_len,
59323 +                           __u8 *dst_first, unsigned *dst_len);
59324 +} compression_plugin;
59325 +
59326 +typedef struct compression_mode_plugin {
59327 +       /* generic fields */
59328 +       plugin_header h;
59329 +       /* this is called when estimating compressibility
59330 +          of a logical cluster by its content */
59331 +       int (*should_deflate) (struct inode *inode, cloff_t index);
59332 +       /* this is called when results of compression should be saved */
59333 +       int (*accept_hook) (struct inode *inode, cloff_t index);
59334 +       /* this is called when results of compression should be discarded */
59335 +       int (*discard_hook) (struct inode *inode, cloff_t index);
59336 +} compression_mode_plugin;
59337 +
59338 +typedef struct cluster_plugin {
59339 +       /* generic fields */
59340 +       plugin_header h;
59341 +       int shift;
59342 +} cluster_plugin;
59343 +
59344 +typedef struct sd_ext_plugin {
59345 +       /* generic fields */
59346 +       plugin_header h;
59347 +       int (*present) (struct inode *inode, char **area, int *len);
59348 +       int (*absent) (struct inode *inode);
59349 +       int (*save_len) (struct inode *inode);
59350 +       int (*save) (struct inode *inode, char **area);
59351 +       /* alignment requirement for this stat-data part */
59352 +       int alignment;
59353 +} sd_ext_plugin;
59354 +
59355 +/* this plugin contains methods to allocate objectid for newly created files,
59356 +   to deallocate objectid when file gets removed, to report number of used and
59357 +   free objectids */
59358 +typedef struct oid_allocator_plugin {
59359 +       /* generic fields */
59360 +       plugin_header h;
59361 +       int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
59362 +                                  __u64 oids);
59363 +       /* used to report statfs->f_files */
59364 +        __u64(*oids_used) (reiser4_oid_allocator * map);
59365 +       /* get next oid to use */
59366 +        __u64(*next_oid) (reiser4_oid_allocator * map);
59367 +       /* used to report statfs->f_ffree */
59368 +        __u64(*oids_free) (reiser4_oid_allocator * map);
59369 +       /* allocate new objectid */
59370 +       int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
59371 +       /* release objectid */
59372 +       int (*release_oid) (reiser4_oid_allocator * map, oid_t);
59373 +       /* how many pages to reserve in transaction for allocation of new
59374 +          objectid */
59375 +       int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
59376 +       /* how many pages to reserve in transaction for freeing of an
59377 +          objectid */
59378 +       int (*oid_reserve_release) (reiser4_oid_allocator * map);
59379 +       void (*print_info) (const char *, reiser4_oid_allocator *);
59380 +} oid_allocator_plugin;
59381 +
59382 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
59383 +   are any) locations, etc */
59384 +typedef struct disk_format_plugin {
59385 +       /* generic fields */
59386 +       plugin_header h;
59387 +       /* replay journal, initialize super_info_data, etc */
59388 +       int (*init_format) (struct super_block *, void *data);
59389 +
59390 +       /* key of root directory stat data */
59391 +       const reiser4_key * (*root_dir_key) (const struct super_block *);
59392 +
59393 +       int (*release) (struct super_block *);
59394 +       jnode * (*log_super) (struct super_block *);
59395 +       int (*check_open) (const struct inode *object);
59396 +       int (*version_update) (struct super_block *);
59397 +} disk_format_plugin;
59398 +
59399 +struct jnode_plugin {
59400 +       /* generic fields */
59401 +       plugin_header h;
59402 +       int (*init) (jnode * node);
59403 +       int (*parse) (jnode * node);
59404 +       struct address_space *(*mapping) (const jnode * node);
59405 +       unsigned long (*index) (const jnode * node);
59406 +       jnode * (*clone) (jnode * node);
59407 +};
59408 +
59409 +/* plugin instance.                                                         */
59410 +/*                                                                          */
59411 +/* This is "wrapper" union for all types of plugins. Most of the code uses  */
59412 +/* plugins of particular type (file_plugin, dir_plugin, etc.)  rather than  */
59413 +/* operates with pointers to reiser4_plugin. This union is only used in     */
59414 +/* some generic code in plugin/plugin.c that operates on all                */
59415 +/* plugins. Technically speaking purpose of this union is to add type       */
59416 +/* safety to said generic code: each plugin type (file_plugin, for          */
59417 +/* example), contains plugin_header as its first member. This first member  */
59418 +/* is located at the same place in memory as .h member of                   */
59419 +/* reiser4_plugin. Generic code obtains pointer to reiser4_plugin and       */
59420 +/* looks in the .h which is header of plugin type located in union. This    */
59421 +/* allows to avoid type-casts.                                              */
59422 +union reiser4_plugin {
59423 +       /* generic fields */
59424 +       plugin_header h;
59425 +       /* file plugin */
59426 +       file_plugin file;
59427 +       /* directory plugin */
59428 +       dir_plugin dir;
59429 +       /* hash plugin, used by directory plugin */
59430 +       hash_plugin hash;
59431 +       /* fibration plugin used by directory plugin */
59432 +       fibration_plugin fibration;
59433 +       /* cipher transform plugin, used by file plugin */
59434 +       cipher_plugin cipher;
59435 +       /* digest transform plugin, used by file plugin */
59436 +       digest_plugin digest;
59437 +       /* compression transform plugin, used by file plugin */
59438 +       compression_plugin compression;
59439 +       /* tail plugin, used by file plugin */
59440 +       formatting_plugin formatting;
59441 +       /* permission plugin */
59442 +       perm_plugin perm;
59443 +       /* node plugin */
59444 +       node_plugin node;
59445 +       /* item plugin */
59446 +       item_plugin item;
59447 +       /* stat-data extension plugin */
59448 +       sd_ext_plugin sd_ext;
59449 +       /* disk layout plugin */
59450 +       disk_format_plugin format;
59451 +       /* object id allocator plugin */
59452 +       oid_allocator_plugin oid_allocator;
59453 +       /* plugin for different jnode types */
59454 +       jnode_plugin jnode;
59455 +       /* compression mode plugin, used by object plugin */
59456 +       compression_mode_plugin compression_mode;
59457 +       /* cluster plugin, used by object plugin */
59458 +       cluster_plugin clust;
59459 +       /* place-holder for new plugin types that can be registered
59460 +          dynamically, and used by other dynamically loaded plugins.  */
59461 +       void *generic;
59462 +};
59463 +
59464 +struct reiser4_plugin_ops {
59465 +       /* called when plugin is initialized */
59466 +       int (*init) (reiser4_plugin * plugin);
59467 +       /* called when plugin is unloaded */
59468 +       int (*done) (reiser4_plugin * plugin);
59469 +       /* load given plugin from disk */
59470 +       int (*load) (struct inode *inode,
59471 +                    reiser4_plugin * plugin, char **area, int *len);
59472 +       /* how much space is required to store this plugin's state
59473 +          in stat-data */
59474 +       int (*save_len) (struct inode *inode, reiser4_plugin * plugin);
59475 +       /* save persistent plugin-data to disk */
59476 +       int (*save) (struct inode *inode, reiser4_plugin * plugin,
59477 +                    char **area);
59478 +       /* alignment requirement for on-disk state of this plugin
59479 +          in number of bytes */
59480 +       int alignment;
59481 +       /* install itself into given inode. This can return error
59482 +          (e.g., you cannot change hash of non-empty directory). */
59483 +       int (*change) (struct inode *inode, reiser4_plugin * plugin,
59484 +                      pset_member memb);
59485 +       /* NOTE(review): this comment used to be a verbatim copy of ->change's;
59486 +          presumably @inode inherits @plugin from @parent - TODO confirm. */
59487 +       int (*inherit) (struct inode *inode, struct inode *parent,
59488 +                       reiser4_plugin * plugin);
59489 +};
59490 +
59491 +/* functions implemented in fs/reiser4/plugin/plugin.c */
59492 +
59493 +/* stores plugin reference in reiser4-specific part of inode */
59494 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
59495 +extern int init_plugins(void);
59496 +
59497 +/* builtin plugins */
59498 +
59499 +/* builtin hash-plugins */
59500 +
59501 +typedef enum {
59502 +       RUPASOV_HASH_ID,
59503 +       R5_HASH_ID,
59504 +       TEA_HASH_ID,
59505 +       FNV1_HASH_ID,
59506 +       DEGENERATE_HASH_ID,
59507 +       LAST_HASH_ID
59508 +} reiser4_hash_id;
59509 +
59510 +/* builtin cipher plugins */
59511 +
59512 +typedef enum {
59513 +       NONE_CIPHER_ID,
59514 +       LAST_CIPHER_ID
59515 +} reiser4_cipher_id;
59516 +
59517 +/* builtin digest plugins */
59518 +
59519 +typedef enum {
59520 +       SHA256_32_DIGEST_ID,
59521 +       LAST_DIGEST_ID
59522 +} reiser4_digest_id;
59523 +
59524 +/* builtin compression mode plugins */
59525 +typedef enum {
59526 +       NONE_COMPRESSION_MODE_ID,
59527 +       LATTD_COMPRESSION_MODE_ID,
59528 +       ULTIM_COMPRESSION_MODE_ID,
59529 +       FORCE_COMPRESSION_MODE_ID,
59530 +       CONVX_COMPRESSION_MODE_ID,
59531 +       LAST_COMPRESSION_MODE_ID
59532 +} reiser4_compression_mode_id;
59533 +
59534 +/* builtin cluster plugins */
59535 +typedef enum {
59536 +       CLUSTER_64K_ID,
59537 +       CLUSTER_32K_ID,
59538 +       CLUSTER_16K_ID,
59539 +       CLUSTER_8K_ID,
59540 +       CLUSTER_4K_ID,
59541 +       LAST_CLUSTER_ID
59542 +} reiser4_cluster_id;
59543 +
59544 +/* builtin tail-plugins */
59545 +
59546 +typedef enum {
59547 +       NEVER_TAILS_FORMATTING_ID,
59548 +       ALWAYS_TAILS_FORMATTING_ID,
59549 +       SMALL_FILE_FORMATTING_ID,
59550 +       LAST_TAIL_FORMATTING_ID
59551 +} reiser4_formatting_id;
59552 +
59553 +/* data type used to pack parameters that we pass to vfs object creation
59554 +   function create_object() */
59555 +struct reiser4_object_create_data {
59556 +       /* plugin to control created object */
59557 +       reiser4_file_id id;
59558 +       /* mode of regular file, directory or special file */
59559 +/* what happens if some other sort of perm plugin is in use? */
59560 +       int mode;
59561 +       /* rdev of special file */
59562 +       dev_t rdev;
59563 +       /* symlink target */
59564 +       const char *name;
59565 +       /* add here something for non-standard objects you invent, like
59566 +          query for interpolation file etc. */
59567 +
59568 +       struct reiser4_crypto_info *crypto;
59569 +
59570 +       struct inode *parent;
59571 +       struct dentry *dentry;
59572 +};
59573 +
59574 +/* description of directory entry being created/destroyed/sought for
59575 +
59576 +   It is passed down to the directory plugin and farther to the
59577 +   directory item plugin methods. Creation of new directory entry is done in
59578 +   several stages: first we search for an entry with the same name, then
59579 +   create new one. reiser4_dir_entry_desc is used to store some information
59580 +   collected at some stage of this process and required later: key of
59581 +   item that we want to insert/delete and pointer to an object that will
59582 +   be bound by the new directory entry. Probably some more fields will
59583 +   be added there.
59584 +
59585 +*/
59586 +struct reiser4_dir_entry_desc {
59587 +       /* key of directory entry */
59588 +       reiser4_key key;
59589 +       /* object bound by this entry. */
59590 +       struct inode *obj;
59591 +};
59592 +
59593 +#define MAX_PLUGIN_TYPE_LABEL_LEN  32
59594 +#define MAX_PLUGIN_PLUG_LABEL_LEN  32
59595 +
59596 +#define PLUGIN_BY_ID(TYPE, ID, FIELD)                                  \
59597 +static inline TYPE *TYPE ## _by_id(reiser4_plugin_id id)               \
59598 +{                                                                      \
59599 +       reiser4_plugin *plugin = plugin_by_id(ID, id);                  \
59600 +       return plugin ? &plugin->FIELD : NULL;                          \
59601 +}                                                                      \
59602 +static inline TYPE *TYPE ## _by_disk_id(reiser4_tree * tree, d16 *id)  \
59603 +{                                                                      \
59604 +       reiser4_plugin *plugin = plugin_by_disk_id(tree, ID, id);       \
59605 +       return plugin ? &plugin->FIELD : NULL;                          \
59606 +}                                                                      \
59607 +static inline TYPE *TYPE ## _by_unsafe_id(reiser4_plugin_id id)                \
59608 +{                                                                      \
59609 +       reiser4_plugin *plugin = plugin_by_unsafe_id(ID, id);           \
59610 +       return plugin ? &plugin->FIELD : NULL;                          \
59611 +}                                                                      \
59612 +static inline reiser4_plugin* TYPE ## _to_plugin(TYPE* plugin)         \
59613 +{                                                                      \
59614 +       return (reiser4_plugin *) plugin;                               \
59615 +}                                                                      \
59616 +static inline reiser4_plugin_id TYPE ## _id(TYPE* plugin)              \
59617 +{                                                                      \
59618 +       return TYPE ## _to_plugin(plugin)->h.id;                        \
59619 +}                                                                      \
59620 +typedef struct { int foo; } TYPE ## _plugin_dummy
59621 +
59622 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
59623 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
59624 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
59625 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
59626 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
59627 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
59628 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
59629 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
59630 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
59631 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
59632 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
59633 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
59634 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
59635 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
59636 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
59637 +            compression_mode);
59638 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
59639 +
59640 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
59641 +
59642 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
59643 +/* iterate @plugin over the list returned by get_plugin_list(@ptype) */
59644 +#define for_all_plugins(ptype, plugin)                                                 \
59645 +for ((plugin) = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage);   \
59646 +     get_plugin_list(ptype) != &(plugin)->h.linkage;                                   \
59647 +     (plugin) = list_entry((plugin)->h.linkage.next, reiser4_plugin, h.linkage))
59648 +
59649 +
59650 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor,
59651 +                           pset_member memb);
59652 +extern int force_plugin_pset(struct inode *self, pset_member memb,
59653 +                            reiser4_plugin *plug);
59654 +extern int finish_pset(struct inode *inode);
59655 +
59656 +/* defined in fs/reiser4/plugin/object.c */
59657 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
59658 +/* defined in fs/reiser4/plugin/object.c */
59659 +extern dir_plugin dir_plugins[LAST_DIR_ID];
59660 +/* defined in fs/reiser4/plugin/item/static_stat.c */
59661 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
59662 +/* defined in fs/reiser4/plugin/hash.c */
59663 +extern hash_plugin hash_plugins[LAST_HASH_ID];
59664 +/* defined in fs/reiser4/plugin/fibration.c */
59665 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
59666 +/* defined in fs/reiser4/plugin/crypt.c */
59667 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
59668 +/* defined in fs/reiser4/plugin/digest.c */
59669 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
59670 +/* defined in fs/reiser4/plugin/compress/compress.c */
59671 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
59672 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
59673 +extern compression_mode_plugin
59674 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
59675 +/* defined in fs/reiser4/plugin/cluster.c */
59676 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
59677 +/* defined in fs/reiser4/plugin/tail.c */
59678 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
59679 +/* defined in fs/reiser4/plugin/security/security.c */
59680 +extern perm_plugin perm_plugins[LAST_PERM_ID];
59681 +/* defined in fs/reiser4/plugin/item/item.c */
59682 +extern item_plugin item_plugins[LAST_ITEM_ID];
59683 +/* defined in fs/reiser4/plugin/node/node.c */
59684 +extern node_plugin node_plugins[LAST_NODE_ID];
59685 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
59686 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
59687 +
59688 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
59689 +#endif
59690 +
59691 +/* Make Linus happy.
59692 +   Local variables:
59693 +   c-indentation-style: "K&R"
59694 +   mode-name: "LC"
59695 +   c-basic-offset: 8
59696 +   tab-width: 8
59697 +   fill-column: 120
59698 +   End:
59699 +*/
59700 diff -puN /dev/null fs/reiser4/plugin/plugin_header.h
59701 --- /dev/null
59702 +++ a/fs/reiser4/plugin/plugin_header.h
59703 @@ -0,0 +1,157 @@
59704 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59705 +
59706 +/* plugin header. Data structures required by all plugin types. */
59707 +
59708 +#if !defined(__PLUGIN_HEADER_H__)
59709 +#define __PLUGIN_HEADER_H__
59710 +
59711 +/* plugin data-types and constants */
59712 +
59713 +#include "../debug.h"
59714 +#include "../dformat.h"
59715 +
59716 +/* Every plugin type can be considered as a class of virtual objects
59717 +   {(type, i) | i = 0, 1, ...}, which has one of the following categories
59718 +   of virtualization:
59719 +   A - no virtualization;
59720 +   F - per-file virtualization;
59721 +   S - per-superblock virtualization;
59722 +   FIXME-EDWARD: Define every such category */
59723 +
59724 +/* Supported plugin types: (id, (virtualization category), short description) */
59725 +typedef enum {
59726 +       REISER4_FILE_PLUGIN_TYPE,             /* (F) service VFS entry-points */
59727 +       REISER4_DIR_PLUGIN_TYPE,              /* (F) service VFS entry-points */
59728 +       REISER4_ITEM_PLUGIN_TYPE,             /* (F) manage items */
59729 +       REISER4_NODE_PLUGIN_TYPE,             /* (S) manage formatted nodes */
59730 +       REISER4_HASH_PLUGIN_TYPE,             /* (F) compute hash */
59731 +       REISER4_FIBRATION_PLUGIN_TYPE,        /* (F) directory fibrations */
59732 +       REISER4_FORMATTING_PLUGIN_TYPE,       /* (F) tail-packing policy */
59733 +       REISER4_PERM_PLUGIN_TYPE,             /*       stub (vacancy)     */
59734 +       REISER4_SD_EXT_PLUGIN_TYPE,           /* (A) stat-data extensions */
59735 +       REISER4_FORMAT_PLUGIN_TYPE,           /* (S) specify disk format */
59736 +       REISER4_JNODE_PLUGIN_TYPE,            /* (A) in-memory node headers */
59737 +       REISER4_CIPHER_PLUGIN_TYPE,           /* (F) cipher transform algs */
59738 +       REISER4_DIGEST_PLUGIN_TYPE,           /* (F) digest transform algs */
59739 +       REISER4_COMPRESSION_PLUGIN_TYPE,      /* (F) compression tfm algs */
59740 +       REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
59741 +       REISER4_CLUSTER_PLUGIN_TYPE,          /* (F) size of logical cluster */
59742 +       REISER4_PLUGIN_TYPES
59743 +} reiser4_plugin_type;
59744 +
59745 +/* Supported plugin groups */
59746 +typedef enum {
59747 +       REISER4_DIRECTORY_FILE,
59748 +       REISER4_REGULAR_FILE,
59749 +       REISER4_SYMLINK_FILE,
59750 +       REISER4_SPECIAL_FILE,
59751 +} file_plugin_group;
59752 +
59753 +struct reiser4_plugin_ops;
59754 +/* generic plugin operations, supported by each
59755 +    plugin type. */
59756 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
59757 +
59758 +/* the common part of all plugin instances. */
59759 +typedef struct plugin_header {
59760 +       /* plugin type */
59761 +       reiser4_plugin_type type_id;
59762 +       /* id of this plugin */
59763 +       reiser4_plugin_id id;
59764 +       /* bitmask of groups the plugin belongs to. */
59765 +       reiser4_plugin_groups groups;
59766 +       /* plugin operations */
59767 +       reiser4_plugin_ops *pops;
59768 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and
59769 + * defined. */
59770 +       /* short label of this plugin */
59771 +       const char *label;
59772 +       /* descriptive string.. */
59773 +       const char *desc;
59774 +       /* list linkage */
59775 +       struct list_head linkage;
59776 +} plugin_header;
59777 +/* non-zero iff bit @group is set in the h.groups bitmask of @plug */
59778 +#define plugin_of_group(plug, group) ((plug)->h.groups & (1 << (group)))
59779 +
59780 +/* PRIVATE INTERFACES */
59781 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in
59782 + * plugin_header? */
59783 +/* plugin type representation. */
59784 +struct reiser4_plugin_type_data {
59785 +       /* internal plugin type identifier. Should coincide with
59786 +          index of this item in plugins[] array. */
59787 +       reiser4_plugin_type type_id;
59788 +       /* short symbolic label of this plugin type. Should be no longer
59789 +          than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
59790 +       const char *label;
59791 +       /* plugin type description longer than .label */
59792 +       const char *desc;
59793 +
59794 +/* NIKITA-FIXME-HANS: define built-in */
59795 +       /* number of built-in plugin instances of this type */
59796 +       int builtin_num;
59797 +       /* array of built-in plugins */
59798 +       void *builtin;
59799 +       struct list_head plugins_list;
59800 +       size_t size;
59801 +};
59802 +
59803 +extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
59804 +
59805 +int is_plugin_type_valid(reiser4_plugin_type type);
59806 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
59807 +/* return @i-th element of ptype->builtin (elements are ptype->size bytes) */
59808 +static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data *ptype,
59809 +                                       int i)
59810 +{
59811 +       char *builtin;
59812 +
59813 +       builtin = ptype->builtin;
59814 +       return (reiser4_plugin *) (builtin + i * ptype->size);
59815 +}
59816 +
59817 +/* return plugin by its @type and @id */
59818 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
59819 +                                          reiser4_plugin_id id)
59820 +{
59821 +       assert("nikita-1651", is_plugin_type_valid(type));
59822 +       assert("nikita-1652", is_plugin_id_valid(type, id));
59823 +       return plugin_at(&plugins[type], id);
59824 +}
59825 +
59826 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
59827 +                                          reiser4_plugin_id id);
59828 +
59829 +/**
59830 + * plugin_by_disk_id - get reiser4_plugin
59831 + * @type_id: plugin type id
59832 + * @plugin_id: pointer to plugin id in disk (little-endian, 16 bit) format
59833 + *
59834 + * Returns reiser4_plugin by plugin type id and plugin id; @tree is unused.
59835 + */
59836 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
59837 +                                               reiser4_plugin_type type_id,
59838 +                                               __le16 *plugin_id)
59839 +{
59840 +       /*
59841 +        * what we should do properly is to maintain within each file-system a
59842 +        * dictionary that maps on-disk plugin ids to "universal" ids. This
59843 +        * dictionary will be resolved at mount time, so that this function
59844 +        * will perform just one additional array lookup.
59845 +        */
59846 +       return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
59847 +}
59848 +
59849 +/* __PLUGIN_HEADER_H__ */
59850 +#endif
59851 +
59852 +/*
59853 + * Local variables:
59854 + * c-indentation-style: "K&R"
59855 + * mode-name: "LC"
59856 + * c-basic-offset: 8
59857 + * tab-width: 8
59858 + * fill-column: 79
59859 + * End:
59860 + */
59861 diff -puN /dev/null fs/reiser4/plugin/plugin_set.c
59862 --- /dev/null
59863 +++ a/fs/reiser4/plugin/plugin_set.c
59864 @@ -0,0 +1,380 @@
59865 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59866 + * reiser4/README */
59867 +/* This file contains Reiser4 plugin set operations */
59868 +
59869 +/* plugin sets
59870 + *
59871 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
59872 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
59873 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
59874 + * set of plugins (so called pset) is described by structure plugin_set (see
59875 + * plugin/plugin_set.h), which contains pointers to all required plugins.
59876 + *
59877 + * Children can inherit some pset members from their parent, however sometimes
59878 + * it is useful to specify members different from parent ones. Since object's
59879 + * pset can not be easily changed without fatal consequences, we use for this
59880 + * purpose another special plugin table (so called hset, or heir set) described
59881 + * by the same structure.
59882 + *
59883 + * Inode only stores pointers to pset and hset. Different inodes with the
59884 + * same set of pset (hset) members point to the same pset (hset). This is
59885 + * achieved by storing psets and hsets in a global hash table. Races are avoided
59886 + * by the simple (and efficient so far) solution of never recycling psets, even
59887 + * when the last inode pointing to one is destroyed.
59888 + */
59889 +
59890 +#include "../debug.h"
59891 +#include "../super.h"
59892 +#include "plugin_set.h"
59893 +
59894 +#include <linux/slab.h>
59895 +#include <linux/stddef.h>
59896 +
59897 +/* slab for plugin sets */
59898 +static struct kmem_cache *plugin_set_slab;
59899 +
59900 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
59901 +       [0 ... 7] = SPIN_LOCK_UNLOCKED
59902 +};
59903 +
59904 +/* hash table support */
59905 +
59906 +#define PS_TABLE_SIZE (32)
59907 +
59908 +static inline plugin_set *cast_to(const unsigned long *a)
59909 +{
59910 +       return container_of(a, plugin_set, hashval);
59911 +}
59912 +
59913 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
59914 +{
59915 +       plugin_set *set1;
59916 +       plugin_set *set2;
59917 +
59918 +       /* make sure fields are not missed in the code below */
59919 +       cassert(sizeof *set1 ==
59920 +               sizeof set1->hashval +
59921 +               sizeof set1->link +
59922 +               sizeof set1->file +
59923 +               sizeof set1->dir +
59924 +               sizeof set1->perm +
59925 +               sizeof set1->formatting +
59926 +               sizeof set1->hash +
59927 +               sizeof set1->fibration +
59928 +               sizeof set1->sd +
59929 +               sizeof set1->dir_item +
59930 +               sizeof set1->cipher +
59931 +               sizeof set1->digest +
59932 +               sizeof set1->compression +
59933 +               sizeof set1->compression_mode +
59934 +               sizeof set1->cluster +
59935 +               sizeof set1->create);
59936 +
59937 +       set1 = cast_to(a1);
59938 +       set2 = cast_to(a2);
59939 +       return
59940 +           set1->hashval == set2->hashval &&
59941 +           set1->file == set2->file &&
59942 +           set1->dir == set2->dir &&
59943 +           set1->perm == set2->perm &&
59944 +           set1->formatting == set2->formatting &&
59945 +           set1->hash == set2->hash &&
59946 +           set1->fibration == set2->fibration &&
59947 +           set1->sd == set2->sd &&
59948 +           set1->dir_item == set2->dir_item &&
59949 +           set1->cipher == set2->cipher &&
59950 +           set1->digest == set2->digest &&
59951 +           set1->compression == set2->compression &&
59952 +           set1->compression_mode == set2->compression_mode &&
59953 +           set1->cluster == set2->cluster &&
59954 +           set1->create == set2->create;
59955 +}
59956 +
59957 +#define HASH_FIELD(hash, set, field)           \
59958 +({                                             \
59959 +       (hash) += (unsigned long)(set)->field >> 2;     \
59960 +})
59961 +
59962 +static inline unsigned long calculate_hash(const plugin_set * set)
59963 +{
59964 +       unsigned long result;
59965 +       /* sum all member pointers (each shifted right by 2), then mask below PS_TABLE_SIZE */
59966 +       result = 0;
59967 +       HASH_FIELD(result, set, file);
59968 +       HASH_FIELD(result, set, dir);
59969 +       HASH_FIELD(result, set, perm);
59970 +       HASH_FIELD(result, set, formatting);
59971 +       HASH_FIELD(result, set, hash);
59972 +       HASH_FIELD(result, set, fibration);
59973 +       HASH_FIELD(result, set, sd);
59974 +       HASH_FIELD(result, set, dir_item);
59975 +       HASH_FIELD(result, set, cipher);
59976 +       HASH_FIELD(result, set, digest);
59977 +       HASH_FIELD(result, set, compression);
59978 +       HASH_FIELD(result, set, compression_mode);
59979 +       HASH_FIELD(result, set, cluster);
59980 +       HASH_FIELD(result, set, create);
59981 +       return result & (PS_TABLE_SIZE - 1);
59982 +}
59983 +
59984 +static inline unsigned long
59985 +pshash(ps_hash_table * table, const unsigned long *a)
59986 +{
59987 +       return *a;
59988 +}
59989 +
59990 +/* The hash table definition */
59991 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
59992 +#define KFREE(ptr, size) kfree(ptr)
59993 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
59994 +                     pseq);
59995 +#undef KFREE
59996 +#undef KMALLOC
59997 +
59998 +static ps_hash_table ps_table;
59999 +static plugin_set empty_set = {
60000 +       .hashval = 0,
60001 +       .file = NULL,
60002 +       .dir = NULL,
60003 +       .perm = NULL,
60004 +       .formatting = NULL,
60005 +       .hash = NULL,
60006 +       .fibration = NULL,
60007 +       .sd = NULL,
60008 +       .dir_item = NULL,
60009 +       .cipher = NULL,
60010 +       .digest = NULL,
60011 +       .compression = NULL,
60012 +       .compression_mode = NULL,
60013 +       .cluster = NULL,
60014 +       .create = NULL,
60015 +       .link = {NULL}
60016 +};
60017 +
60018 +plugin_set *plugin_set_get_empty(void)
60019 +{
60020 +       return &empty_set;
60021 +}
60022 +
60023 +void plugin_set_put(plugin_set * set)
60024 +{
60025 +}
60026 +
60027 +static inline unsigned long *pset_field(plugin_set * set, int offset)
60028 +{
60029 +       return (unsigned long *)(((char *)set) + offset);
60030 +}
60031 +/* Point *@set at the hashed plugin_set that equals the current one with @val stored at byte @offset. */
60032 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
60033 +                           const int offset)
60034 +{
60035 +       unsigned long *spot;
60036 +       spinlock_t *lock;
60037 +       plugin_set replica;
60038 +       plugin_set *twin;
60039 +       plugin_set *psal;
60040 +       plugin_set *orig;
60041 +
60042 +       assert("nikita-2902", set != NULL);
60043 +       assert("nikita-2904", *set != NULL);
60044 +       /* nothing to do if the field already holds @val */
60045 +       spot = pset_field(*set, offset);
60046 +       if (unlikely(*spot == val))
60047 +               return 0;
60048 +       /* build the wanted set on the stack and look it up without taking the lock */
60049 +       replica = *(orig = *set);
60050 +       *pset_field(&replica, offset) = val;
60051 +       replica.hashval = calculate_hash(&replica);
60052 +       rcu_read_lock();
60053 +       twin = ps_hash_find(&ps_table, &replica.hashval);
60054 +       if (unlikely(twin == NULL)) {
60055 +               rcu_read_unlock();
60056 +               psal = kmem_cache_alloc(plugin_set_slab,
60057 +                                       reiser4_ctx_gfp_mask_get());
60058 +               if (psal == NULL)
60059 +                       return RETERR(-ENOMEM);
60060 +               *psal = replica;
60061 +               lock = &plugin_set_lock[replica.hashval & 7];
60062 +               spin_lock(lock);        /* repeat the lookup under the lock */
60063 +               twin = ps_hash_find(&ps_table, &replica.hashval);
60064 +               if (likely(twin == NULL)) {
60065 +                       *set = psal;
60066 +                       ps_hash_insert_rcu(&ps_table, psal);
60067 +               } else {
60068 +                       *set = twin;    /* an equal set got hashed meanwhile */
60069 +                       kmem_cache_free(plugin_set_slab, psal);
60070 +               }
60071 +               spin_unlock(lock);
60072 +       } else {
60073 +               rcu_read_unlock();
60074 +               *set = twin;
60075 +       }
60076 +       return 0;
60077 +}
60078 +
60079 +static struct {
60080 +       int offset;
60081 +       reiser4_plugin_groups groups;
60082 +       reiser4_plugin_type type;
60083 +} pset_descr[PSET_LAST] = {
60084 +       [PSET_FILE] = {
60085 +               .offset = offsetof(plugin_set, file),
60086 +               .type = REISER4_FILE_PLUGIN_TYPE,
60087 +               .groups = 0
60088 +       },
60089 +       [PSET_DIR] = {
60090 +               .offset = offsetof(plugin_set, dir),
60091 +               .type = REISER4_DIR_PLUGIN_TYPE,
60092 +               .groups = 0
60093 +       },
60094 +       [PSET_PERM] = {
60095 +               .offset = offsetof(plugin_set, perm),
60096 +               .type = REISER4_PERM_PLUGIN_TYPE,
60097 +               .groups = 0
60098 +       },
60099 +       [PSET_FORMATTING] = {
60100 +               .offset = offsetof(plugin_set, formatting),
60101 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
60102 +               .groups = 0
60103 +       },
60104 +       [PSET_HASH] = {
60105 +               .offset = offsetof(plugin_set, hash),
60106 +               .type = REISER4_HASH_PLUGIN_TYPE,
60107 +               .groups = 0
60108 +       },
60109 +       [PSET_FIBRATION] = {
60110 +               .offset = offsetof(plugin_set, fibration),
60111 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
60112 +               .groups = 0
60113 +       },
60114 +       [PSET_SD] = {
60115 +               .offset = offsetof(plugin_set, sd),
60116 +               .type = REISER4_ITEM_PLUGIN_TYPE,
60117 +               .groups = (1 << STAT_DATA_ITEM_TYPE)
60118 +       },
60119 +       [PSET_DIR_ITEM] = {
60120 +               .offset = offsetof(plugin_set, dir_item),
60121 +               .type = REISER4_ITEM_PLUGIN_TYPE,
60122 +               .groups = (1 << DIR_ENTRY_ITEM_TYPE)
60123 +       },
60124 +       [PSET_CIPHER] = {
60125 +               .offset = offsetof(plugin_set, cipher),
60126 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
60127 +               .groups = 0
60128 +       },
60129 +       [PSET_DIGEST] = {
60130 +               .offset = offsetof(plugin_set, digest),
60131 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
60132 +               .groups = 0
60133 +       },
60134 +       [PSET_COMPRESSION] = {
60135 +               .offset = offsetof(plugin_set, compression),
60136 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
60137 +               .groups = 0
60138 +       },
60139 +       [PSET_COMPRESSION_MODE] = {
60140 +               .offset = offsetof(plugin_set, compression_mode),
60141 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60142 +               .groups = 0
60143 +       },
60144 +       [PSET_CLUSTER] = {
60145 +               .offset = offsetof(plugin_set, cluster),
60146 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
60147 +               .groups = 0
60148 +       },
60149 +       [PSET_CREATE] = {
60150 +               .offset = offsetof(plugin_set, create),
60151 +               .type = REISER4_FILE_PLUGIN_TYPE,
60152 +               .groups = (1 << REISER4_REGULAR_FILE)
60153 +       }
60154 +};
60155 +
60156 +#define DEFINE_PSET_OPS(PREFIX)                                                       \
60157 +       reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb)   \
60158 +{                                                                             \
60159 +       if (memb > PSET_LAST)                                                  \
60160 +               return REISER4_PLUGIN_TYPES;                                   \
60161 +       return pset_descr[memb].type;                                          \
60162 +}                                                                             \
60163 +                                                                              \
60164 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb,                  \
60165 +                    reiser4_plugin * plugin)                                  \
60166 +{                                                                             \
60167 +       assert("nikita-3492", set != NULL);                                    \
60168 +       assert("nikita-3493", *set != NULL);                                   \
60169 +       assert("nikita-3494", plugin != NULL);                                 \
60170 +       assert("nikita-3495", 0 <= memb && memb < PSET_LAST);                  \
60171 +       assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type);     \
60172 +                                                                              \
60173 +       if (pset_descr[memb].groups)                                           \
60174 +               if (!(pset_descr[memb].groups & plugin->h.groups))             \
60175 +                       return -EINVAL;                                        \
60176 +                                                                              \
60177 +       return plugin_set_field(set,                                           \
60178 +                       (unsigned long)plugin, pset_descr[memb].offset);       \
60179 +}                                                                             \
60180 +                                                                              \
60181 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb)              \
60182 +{                                                                             \
60183 +       assert("nikita-3497", set != NULL);                                    \
60184 +       assert("nikita-3498", 0 <= memb && memb < PSET_LAST);                  \
60185 +                                                                              \
60186 +       return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
60187 +}
60188 +
60189 +DEFINE_PSET_OPS(aset);
60190 +
60191 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
60192 +{
60193 +       return plugin_set_field(set,
60194 +               (unsigned long)plugin, pset_descr[memb].offset);
60195 +}
60196 +
60197 +/**
60198 + * init_plugin_set - create plugin set cache and hash table
60199 + *
60200 + * Part of reiser4 module initialization. Returns 0 on success, the error of
60201 + * ps_hash_init(), or RETERR(-ENOMEM) with the hash table released again.
60202 + */
60203 +int init_plugin_set(void)
60204 +{
60205 +       int result;
60206 +
60207 +       result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
60208 +       if (result != 0)
60209 +               return result;
60210 +       plugin_set_slab = kmem_cache_create("plugin_set", sizeof(plugin_set),
60211 +                                           0, SLAB_HWCACHE_ALIGN, NULL);
60212 +       if (plugin_set_slab == NULL) {
60213 +               ps_hash_done(&ps_table);        /* do not leak the table */
60214 +               result = RETERR(-ENOMEM);
60215 +       }
60216 +       return result;
60217 +}
60218 +
60219 +/**
60220 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
60221 + *
60222 + * This is called on reiser4 module unloading or system shutdown.
60223 + */
60224 +void done_plugin_set(void)
60225 +{
60226 +       plugin_set *cur, *next;
60227 +       /* unhash and free every set before the slab and the table go away */
60228 +       for_all_in_htable(&ps_table, ps, cur, next) {
60229 +               ps_hash_remove(&ps_table, cur);
60230 +               kmem_cache_free(plugin_set_slab, cur);
60231 +       }
60232 +       destroy_reiser4_cache(&plugin_set_slab);
60233 +       ps_hash_done(&ps_table);
60234 +}
60235 +
60236 +/*
60237 + * Local variables:
60238 + * c-indentation-style: "K&R"
60239 + * mode-name: "LC"
60240 + * c-basic-offset: 8
60241 + * tab-width: 8
60242 + * fill-column: 120
60243 + * End:
60244 + */
60245 diff -puN /dev/null fs/reiser4/plugin/plugin_set.h
60246 --- /dev/null
60247 +++ a/fs/reiser4/plugin/plugin_set.h
60248 @@ -0,0 +1,78 @@
60249 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60250 + * reiser4/README */
60251 +
60252 +/* Reiser4 plugin set definition.
60253 +   See fs/reiser4/plugin/plugin_set.c for details */
60254 +
60255 +#if !defined(__PLUGIN_SET_H__)
60256 +#define __PLUGIN_SET_H__
60257 +
60258 +#include "../type_safe_hash.h"
60259 +#include "plugin.h"
60260 +
60261 +#include <linux/rcupdate.h>
60262 +
60263 +struct plugin_set;
60264 +typedef struct plugin_set plugin_set;
60265 +
60266 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
60267 +
60268 +struct plugin_set {
60269 +       unsigned long hashval;
60270 +       /* plugin of file */
60271 +       file_plugin *file;
60272 +       /* plugin of dir */
60273 +       dir_plugin *dir;
60274 +       /* perm plugin for this file */
60275 +       perm_plugin *perm;
60276 +       /* tail policy plugin. Only meaningful for regular files */
60277 +       formatting_plugin *formatting;
60278 +       /* hash plugin. Only meaningful for directories. */
60279 +       hash_plugin *hash;
60280 +       /* fibration plugin. Only meaningful for directories. */
60281 +       fibration_plugin *fibration;
60282 +       /* plugin of stat-data */
60283 +       item_plugin *sd;
60284 +       /* plugin of items a directory is built of */
60285 +       item_plugin *dir_item;
60286 +       /* cipher plugin */
60287 +       cipher_plugin *cipher;
60288 +       /* digest plugin */
60289 +       digest_plugin *digest;
60290 +       /* compression plugin */
60291 +       compression_plugin *compression;
60292 +       /* compression mode plugin */
60293 +       compression_mode_plugin *compression_mode;
60294 +       /* cluster plugin */
60295 +       cluster_plugin *cluster;
60296 +       /* this specifies file plugin of regular children.
60297 +          only meaningful for directories */
60298 +       file_plugin *create;
60299 +       ps_hash_link link;
60300 +};
60301 +
60302 +extern plugin_set *plugin_set_get_empty(void);
60303 +extern void plugin_set_put(plugin_set * set);
60304 +
60305 +extern int init_plugin_set(void);
60306 +extern void done_plugin_set(void);
60307 +
60308 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
60309 +extern int set_plugin(plugin_set ** set, pset_member memb,
60310 +                     reiser4_plugin * plugin);
60311 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
60312 +                          reiser4_plugin * plugin);
60313 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
60314 +
60315 +/* __PLUGIN_SET_H__ */
60316 +#endif
60317 +
60318 +/* Make Linus happy.
60319 +   Local variables:
60320 +   c-indentation-style: "K&R"
60321 +   mode-name: "LC"
60322 +   c-basic-offset: 8
60323 +   tab-width: 8
60324 +   fill-column: 120
60325 +   End:
60326 +*/
60327 diff -puN /dev/null fs/reiser4/plugin/security/Makefile
60328 --- /dev/null
60329 +++ a/fs/reiser4/plugin/security/Makefile
60330 @@ -0,0 +1,4 @@
60331 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
60332 +
60333 +security_plugins-objs :=       \
60334 +       perm.o
60335 diff -puN /dev/null fs/reiser4/plugin/security/perm.c
60336 --- /dev/null
60337 +++ a/fs/reiser4/plugin/security/perm.c
60338 @@ -0,0 +1,33 @@
60339 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60340 +
60341 +/*
60342 + * This file contains implementation of permission plugins.
60343 + * See the comments in perm.h
60344 + */
60345 +
60346 +#include "../plugin.h"
60347 +#include "../plugin_header.h"
60348 +#include "../../debug.h"
60349 +
60350 +perm_plugin perm_plugins[LAST_PERM_ID] = {
60351 +       [NULL_PERM_ID] = {
60352 +               .h = {
60353 +                       .type_id = REISER4_PERM_PLUGIN_TYPE,
60354 +                       .id = NULL_PERM_ID,
60355 +                       .pops = NULL,
60356 +                       .label = "null",
60357 +                       .desc = "stub permission plugin",
60358 +                       .linkage = {NULL, NULL}
60359 +               }
60360 +       }
60361 +};
60362 +
60363 +/*
60364 + * Local variables:
60365 + * c-indentation-style: "K&R"
60366 + * mode-name: "LC"
60367 + * c-basic-offset: 8
60368 + * tab-width: 8
60369 + * fill-column: 79
60370 + * End:
60371 + */
60372 diff -puN /dev/null fs/reiser4/plugin/security/perm.h
60373 --- /dev/null
60374 +++ a/fs/reiser4/plugin/security/perm.h
60375 @@ -0,0 +1,38 @@
60376 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60377 +
60378 +/* Perm (short for "permissions") plugins common stuff. */
60379 +
60380 +#if !defined( __REISER4_PERM_H__ )
60381 +#define __REISER4_PERM_H__
60382 +
60383 +#include "../../forward.h"
60384 +#include "../plugin_header.h"
60385 +
60386 +#include <linux/types.h>
60387 +
60388 +/* Definition of permission plugin */
60389 +/* NIKITA-FIXME-HANS: define what this is targeted for.
60390 +   It does not seem to be intended for use with sys_reiser4.  Explain. */
60391 +
60392 +/* NOTE-EDWARD: This seems to be intended for deprecated sys_reiser4.
60393 +   Consider it like a temporary "seam" and reserved pset member.
60394 +   If you have something useful to add, then rename this plugin and add it here */
60395 +typedef struct perm_plugin {
60396 +       /* generic plugin fields */
60397 +       plugin_header h;
60398 +} perm_plugin;
60399 +
60400 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
60401 +
60402 +/* __REISER4_PERM_H__ */
60403 +#endif
60404 +
60405 +/* Make Linus happy.
60406 +   Local variables:
60407 +   c-indentation-style: "K&R"
60408 +   mode-name: "LC"
60409 +   c-basic-offset: 8
60410 +   tab-width: 8
60411 +   fill-column: 120
60412 +   End:
60413 +*/
60414 diff -puN /dev/null fs/reiser4/plugin/space/Makefile
60415 --- /dev/null
60416 +++ a/fs/reiser4/plugin/space/Makefile
60417 @@ -0,0 +1,4 @@
60418 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
60419 +
60420 +space_plugins-objs := \
60421 +       bitmap.o
60422 diff -puN /dev/null fs/reiser4/plugin/space/bitmap.c
60423 --- /dev/null
60424 +++ a/fs/reiser4/plugin/space/bitmap.c
60425 @@ -0,0 +1,1585 @@
60426 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60427 +
60428 +#include "../../debug.h"
60429 +#include "../../dformat.h"
60430 +#include "../../txnmgr.h"
60431 +#include "../../jnode.h"
60432 +#include "../../block_alloc.h"
60433 +#include "../../tree.h"
60434 +#include "../../super.h"
60435 +#include "../plugin.h"
60436 +#include "space_allocator.h"
60437 +#include "bitmap.h"
60438 +
60439 +#include <linux/types.h>
60440 +#include <linux/fs.h>          /* for struct super_block  */
60441 +#include <linux/mutex.h>
60442 +#include <asm/div64.h>
60443 +
60444 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
60445 + * blocks
60446 +
60447 +   A useful optimization of reiser4 bitmap handling would be dynamic bitmap
60448 +   blocks loading/unloading which is different from v3.x where all bitmap
60449 +   blocks are loaded at mount time.
60450 +
60451 +   To implement bitmap blocks unloading we need to count bitmap block usage
60452 +   and detect currently unused blocks allowing them to be unloaded. It is not
60453 +   a simple task since we allow several threads to modify one bitmap block
60454 +   simultaneously.
60455 +
60456 +   Briefly speaking, the following schema is proposed: we keep a special
60457 +   counter associated with each bitmap block, which counts the block
60458 +   alloc/dealloc operations on that bitmap block. With the deferred block
60459 +   deallocation feature of reiser4 all those operations will be represented in
60460 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
60461 +   nodes.
60462 +
60463 +   So, we increment usage counter for each new node allocated or deleted, and
60464 +   decrement it at atom commit one time for each node from the dirty/deleted
60465 +   atom's list.  Of course, freshly allocated node deletion and node reusing
60466 +   from atom deleted (if we do so) list should decrement bitmap usage counter
60467 +   also.
60468 +
60469 +   This schema seems to be workable, but such reference counting is
60470 +   not easy to debug. I think we should agree with Hans and not implement
60471 +   it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
60472 +
60473 +   For simplicity all bitmap nodes (both commit and working bitmap blocks) are
60474 +   either loaded into memory at fs mount time or each bitmap node is loaded at
60475 +   the first access to it; the "dont_load_bitmap" mount option controls whether
60476 +   bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
60477 +   nodes currently is not supported. */
60478 +
60479 +#define CHECKSUM_SIZE    4
60480 +
60481 +#define BYTES_PER_LONG   (sizeof(long))
60482 +
60483 +#if BITS_PER_LONG == 64
60484 +#  define LONG_INT_SHIFT (6)
60485 +#else
60486 +#  define LONG_INT_SHIFT (5)
60487 +#endif
60488 +
60489 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
60490 +
60491 +typedef unsigned long ulong_t;
60492 +
60493 +#define bmap_size(blocksize)       ((blocksize) - CHECKSUM_SIZE)
60494 +#define bmap_bit_count(blocksize)   (bmap_size(blocksize) << 3)
60495 +
60496 +/* Block allocation/deallocation are done through special bitmap objects which
60497 +   are allocated in an array at fs mount. */
60498 +struct bitmap_node {
60499 +       struct mutex mutex;     /* long term lock object */
60500 +
60501 +       jnode *wjnode;          /* j-nodes for WORKING ... */
60502 +       jnode *cjnode;          /* ... and COMMIT bitmap blocks */
60503 +
60504 +       bmap_off_t first_zero_bit;      /* for skip_busy option implementation */
60505 +
60506 +       atomic_t loaded;        /* a flag which shows that bnode is loaded
60507 +                                * already */
60508 +};
60509 +
60510 +static inline char *bnode_working_data(struct bitmap_node *bnode)
60511 +{
60512 +       char *data;
60513 +
60514 +       data = jdata(bnode->wjnode);
60515 +       assert("zam-429", data != NULL);
60516 +
60517 +       return data + CHECKSUM_SIZE;
60518 +}
60519 +
60520 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
60521 +{
60522 +       char *data;
60523 +
60524 +       data = jdata(bnode->cjnode);
60525 +       assert("zam-430", data != NULL);
60526 +
60527 +       return data + CHECKSUM_SIZE;
60528 +}
60529 +
60530 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
60531 +{
60532 +       char *data;
60533 +
60534 +       data = jdata(bnode->cjnode);
60535 +       assert("vpf-261", data != NULL);
60536 +
60537 +       return le32_to_cpu(get_unaligned((d32 *)data));
60538 +}
60539 +
60540 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
60541 +{
60542 +       char *data;
60543 +
60544 +       data = jdata(bnode->cjnode);
60545 +       assert("vpf-261", data != NULL);
60546 +
60547 +       put_unaligned(cpu_to_le32(crc), (d32 *)data);
60548 +}
60549 +
60550 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
60551 + * written the code, does this added abstraction still have */
60552 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
60553 + * reiser4_space_allocator structure) */
60554 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
60555 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
60556 + * someday?". What they about?  If there is a reason to have a union, it should
60557 + * be a union, if not, it should not be a union.  "..might be someday" means no
60558 + * reason. */
60559 +struct bitmap_allocator_data {
60560 +       /* an array for bitmap blocks direct access */
60561 +       struct bitmap_node *bitmap;
60562 +};
60563 +
60564 +#define get_barray(super) \
60565 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
60566 +/* address of the @i-th struct bitmap_node in the array returned by get_barray() */
60567 +#define get_bnode(super, i) (get_barray(super) + (i))
60568 +
60569 +/* allocate and initialize jnode with JNODE_BITMAP type */
60570 +static jnode *bnew(void)
60571 +{
60572 +       jnode *jal = jalloc();
60573 +
60574 +       if (jal)
60575 +               jnode_init(jal, current_tree, JNODE_BITMAP);
60576 +
60577 +       return jal;
60578 +}
60579 +
60580 +/* this file contains:
60581 +   - bitmap based implementation of space allocation plugin
60582 +   - all the helper functions like set bit, find_first_zero_bit, etc */
60583 +
60584 +/* Audited by: green(2002.06.12) */
60585 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
60586 +{
60587 +       ulong_t mask = 1UL << start_bit;
60588 +       int i = start_bit;
60589 +       /* walk up from @start_bit until a clear bit of @word is met */
60590 +       while ((word & mask) != 0) {
60591 +               mask <<= 1;
60592 +               if (++i >= BITS_PER_LONG)
60593 +                       break;
60594 +       }
60595 +       /* index of the zero bit found, BITS_PER_LONG if there is none */
60596 +       return i;
60597 +}
60598 +
60599 +#include <linux/bitops.h>
60600 +
60601 +#if BITS_PER_LONG == 64
60602 +
60603 +#define OFF(addr)  (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
60604 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
60605 +
60606 +static inline void reiser4_set_bit(int nr, void *addr)
60607 +{
60608 +       ext2_set_bit(nr + OFF(addr), BASE(addr));
60609 +}
60610 +
60611 +static inline void reiser4_clear_bit(int nr, void *addr)
60612 +{
60613 +       ext2_clear_bit(nr + OFF(addr), BASE(addr));
60614 +}
60615 +
60616 +static inline int reiser4_test_bit(int nr, void *addr)
60617 +{
60618 +       return ext2_test_bit(nr + OFF(addr), BASE(addr));
60619 +}
60620 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
60621 +                                            int offset)
60622 +{
60623 +       int off = OFF(addr);
60624 +
60625 +       return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
60626 +                                      offset + off) - off;
60627 +}
60628 +
60629 +#else
60630 +
60631 +#define reiser4_set_bit(nr, addr)    ext2_set_bit(nr, addr)
60632 +#define reiser4_clear_bit(nr, addr)  ext2_clear_bit(nr, addr)
60633 +#define reiser4_test_bit(nr, addr)  ext2_test_bit(nr, addr)
60634 +
60635 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
60636 +ext2_find_next_zero_bit(addr, maxoffset, offset)
60637 +#endif
60638 +
60639 +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
60640 + * are counted from @addr, return the offset of the first bit if it is found,
60641 + * @max_offset otherwise. */
60642 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
60643 +                                             bmap_off_t start_offset)
60644 +{
60645 +       ulong_t *base = addr;
60646 +       /* start_offset is in bits, convert it to word index within bitmap. */
60647 +       int word_nr = start_offset >> LONG_INT_SHIFT;
60648 +       /* bit number within the word. */
60649 +       int bit_nr = start_offset & LONG_INT_MASK;
60650 +       int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
60651 +
60652 +       assert("zam-387", max_offset != 0);
60653 +
60654 +       /* Unaligned @start_offset case.  */
60655 +       if (bit_nr != 0) {
60656 +               bmap_nr_t nr;
60657 +
60658 +               nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
60659 +
60660 +               if (nr < BITS_PER_LONG)
60661 +                       return (word_nr << LONG_INT_SHIFT) + nr;
60662 +
60663 +               ++word_nr;
60664 +       }
60665 +
60666 +       /* Fast scan through aligned words. */
60667 +       while (word_nr <= max_word_nr) {
60668 +               if (base[word_nr] != 0) {
60669 +                       return (word_nr << LONG_INT_SHIFT)
60670 +                           + find_next_zero_bit_in_word(~(base[word_nr]), 0);
60671 +               }
60672 +
60673 +               ++word_nr;
60674 +       }
60675 +
60676 +       return max_offset;
60677 +}
60678 +
60679 +#if BITS_PER_LONG == 64
60680 +
60681 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
60682 +                                           bmap_off_t start_offset)
60683 +{
60684 +       bmap_off_t off = OFF(addr);
60685 +
60686 +       return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
60687 +                                          start_offset + off) - off;
60688 +}
60689 +
60690 +#else
60691 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
60692 +  __reiser4_find_next_set_bit(addr, max_offset, start_offset)
60693 +#endif
60694 +
60695 +/* search for the last set bit at or below @start_bit in a single word. */
60696 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
60697 +{
60698 +       ulong_t bit_mask;
60699 +       int nr = start_bit;
60700 +
60701 +       assert("zam-965", start_bit < BITS_PER_LONG);
60702 +       assert("zam-966", start_bit >= 0);
60703 +
60704 +       bit_mask = (1UL << nr);
60705 +
60706 +       while (bit_mask != 0) {
60707 +               if (bit_mask & word)
60708 +                       return nr;
60709 +               bit_mask >>= 1;
60710 +               nr--;
60711 +       }
60712 +       return BITS_PER_LONG;
60713 +}
60714 +
60715 +/* Search bitmap for a set bit in backward direction from the end to the
60716 + * beginning of given region
60717 + *
60718 + * @result: result offset of the last set bit
60719 + * @addr:   base memory address,
60720 + * @low_off:  low end of the search region, edge bit included into the region,
60721 + * @high_off: high end of the search region, edge bit included into the region,
60722 + *
60723 + * @return: 0 - set bit was found, -1 otherwise.
60724 + */
60725 +static int
60726 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
60727 +                         bmap_off_t high_off)
60728 +{
60729 +       ulong_t *base = addr;
60730 +       int last_word;
60731 +       int first_word;
60732 +       int last_bit;
60733 +       int nr;
60734 +
60735 +       assert("zam-962", high_off >= low_off);
60736 +
60737 +       last_word = high_off >> LONG_INT_SHIFT;
60738 +       last_bit = high_off & LONG_INT_MASK;
60739 +       first_word = low_off >> LONG_INT_SHIFT;
60740 +
60741 +       if (last_bit < BITS_PER_LONG) {
60742 +               nr = find_last_set_bit_in_word(base[last_word], last_bit);
60743 +               if (nr < BITS_PER_LONG) {
60744 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
60745 +                       return 0;
60746 +               }
60747 +               --last_word;
60748 +       }
60749 +       while (last_word >= first_word) {
60750 +               if (base[last_word] != 0x0) {
60751 +                       last_bit =
60752 +                           find_last_set_bit_in_word(base[last_word],
60753 +                                                     BITS_PER_LONG - 1);
60754 +                       assert("zam-972", last_bit < BITS_PER_LONG);
60755 +                       *result = (last_word << LONG_INT_SHIFT) + last_bit;
60756 +                       return 0;
60757 +               }
60758 +               --last_word;
60759 +       }
60760 +
60761 +       return -1;              /* set bit not found */
60762 +}
60763 +
60764 +/* Search bitmap for a clear bit in backward direction from the end to the
60765 + * beginning of given region */
60766 +static int
60767 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
60768 +                          bmap_off_t high_off)
60769 +{
60770 +       ulong_t *base = addr;
60771 +       int last_word;
60772 +       int first_word;
60773 +       int last_bit;
60774 +       int nr;
60775 +
60776 +       last_word = high_off >> LONG_INT_SHIFT;
60777 +       last_bit = high_off & LONG_INT_MASK;
60778 +       first_word = low_off >> LONG_INT_SHIFT;
60779 +
60780 +       if (last_bit < BITS_PER_LONG) {
60781 +               nr = find_last_set_bit_in_word(~base[last_word], last_bit);
60782 +               if (nr < BITS_PER_LONG) {
60783 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
60784 +                       return 0;
60785 +               }
60786 +               --last_word;
60787 +       }
60788 +       while (last_word >= first_word) {
60789 +               if (base[last_word] != (ulong_t) (-1)) {
60790 +                       *result = (last_word << LONG_INT_SHIFT) +
60791 +                           find_last_set_bit_in_word(~base[last_word],
60792 +                                                     BITS_PER_LONG - 1);
60793 +                       return 0;
60794 +               }
60795 +               --last_word;
60796 +       }
60797 +
60798 +       return -1;              /* zero bit not found */
60799 +}
60800 +
60801 +/* Audited by: green(2002.06.12) */
60802 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
60803 +{
60804 +       int first_byte;
60805 +       int last_byte;
60806 +
60807 +       unsigned char first_byte_mask = 0xFF;
60808 +       unsigned char last_byte_mask = 0xFF;
60809 +
60810 +       assert("zam-410", start < end);
60811 +
60812 +       first_byte = start >> 3;
60813 +       last_byte = (end - 1) >> 3;
60814 +
60815 +       if (last_byte > first_byte + 1)
60816 +               memset(addr + first_byte + 1, 0,
60817 +                      (size_t) (last_byte - first_byte - 1));
60818 +
60819 +       first_byte_mask >>= 8 - (start & 0x7);
60820 +       last_byte_mask <<= ((end - 1) & 0x7) + 1;
60821 +
60822 +       if (first_byte == last_byte) {
60823 +               addr[first_byte] &= (first_byte_mask | last_byte_mask);
60824 +       } else {
60825 +               addr[first_byte] &= first_byte_mask;
60826 +               addr[last_byte] &= last_byte_mask;
60827 +       }
60828 +}
60829 +
60830 +/* Audited by: green(2002.06.12) */
60831 +/* ZAM-FIXME-HANS: comment this */
60832 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
60833 +{
60834 +       int first_byte;
60835 +       int last_byte;
60836 +
60837 +       unsigned char first_byte_mask = 0xFF;
60838 +       unsigned char last_byte_mask = 0xFF;
60839 +
60840 +       assert("zam-386", start < end);
60841 +
60842 +       first_byte = start >> 3;
60843 +       last_byte = (end - 1) >> 3;
60844 +
60845 +       if (last_byte > first_byte + 1)
60846 +               memset(addr + first_byte + 1, 0xFF,
60847 +                      (size_t) (last_byte - first_byte - 1));
60848 +
60849 +       first_byte_mask <<= start & 0x7;
60850 +       last_byte_mask >>= 7 - ((end - 1) & 0x7);
60851 +
60852 +       if (first_byte == last_byte) {
60853 +               addr[first_byte] |= (first_byte_mask & last_byte_mask);
60854 +       } else {
60855 +               addr[first_byte] |= first_byte_mask;
60856 +               addr[last_byte] |= last_byte_mask;
60857 +       }
60858 +}
60859 +
60860 +#define ADLER_BASE    65521
60861 +#define ADLER_NMAX    5552
60862 +
60863 +/* Calculates the adler32 checksum for the data pointed by `data` of the
60864 +    length `len`. This function was originally taken from zlib, version 1.1.3,
60865 +    July 9th, 1998.
60866 +
60867 +    Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
60868 +
60869 +    This software is provided 'as-is', without any express or implied
60870 +    warranty.  In no event will the authors be held liable for any damages
60871 +    arising from the use of this software.
60872 +
60873 +    Permission is granted to anyone to use this software for any purpose,
60874 +    including commercial applications, and to alter it and redistribute it
60875 +    freely, subject to the following restrictions:
60876 +
60877 +    1. The origin of this software must not be misrepresented; you must not
60878 +       claim that you wrote the original software. If you use this software
60879 +       in a product, an acknowledgment in the product documentation would be
60880 +       appreciated but is not required.
60881 +    2. Altered source versions must be plainly marked as such, and must not be
60882 +       misrepresented as being the original software.
60883 +    3. This notice may not be removed or altered from any source distribution.
60884 +
60885 +    Jean-loup Gailly        Mark Adler
60886 +    jloup@gzip.org          madler@alumni.caltech.edu
60887 +
60888 +    The above comment applies only to the reiser4_adler32 function.
60889 +*/
60890 +
60891 +__u32 reiser4_adler32(char *data, __u32 len)
60892 +{
60893 +       unsigned char *t = (unsigned char *)data; /* sum bytes as unsigned */
60894 +       __u32 s1 = 1;
60895 +       __u32 s2 = 0;
60896 +       int k;
60897 +
60898 +       while (len > 0) {
60899 +               k = len < ADLER_NMAX ? len : ADLER_NMAX;
60900 +               len -= k;
60901 +
60902 +               while (k--) {
60903 +                       s1 += *t++;
60904 +                       s2 += s1;
60905 +               }
60906 +               /* reduce both sums once per chunk of at most ADLER_NMAX */
60907 +               s1 %= ADLER_BASE;
60908 +               s2 %= ADLER_BASE;
60909 +       }
60910 +       return (s2 << 16) | s1;
60911 +}
60912 +
60913 +#define sb_by_bnode(bnode) \
60914 +       ((struct super_block *)jnode_get_tree((bnode)->wjnode)->super)
60915 +
60916 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
60917 +{
60918 +       return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
60919 +}
60920 +
60921 +static int
60922 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
60923 +{
60924 +       if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
60925 +               bmap_nr_t bmap;
60926 +
60927 +               bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
60928 +
60929 +               warning("vpf-263",
60930 +                       "Checksum for the bitmap block %llu is incorrect",
60931 +                       bmap);
60932 +
60933 +               return RETERR(-EIO);
60934 +       }
60935 +
60936 +       return 0;
60937 +}
60938 +
60939 +#define REISER4_CHECK_BMAP_CRC (0)
60940 +
60941 +#if REISER4_CHECK_BMAP_CRC
60942 +static int bnode_check_crc(const struct bitmap_node *bnode)
60943 +{
60944 +       /* pass the raw block size: bnode_calc_crc() applies bmap_size() */
60945 +       return bnode_check_adler32(bnode, sb_by_bnode(bnode)->s_blocksize);
60946 +}
60947 +
60948 +/* REISER4_CHECK_BMAP_CRC */
60949 +#else
60950 +
60951 +#define bnode_check_crc(bnode) (0)
60952 +
60953 +/* REISER4_CHECK_BMAP_CRC */
60954 +#endif
60955 +
60956 +/* Recalculates the adler32 checksum for only 1 byte change.
60957 +    adler - previous adler checksum
60958 +    old_data, data - old, new byte values.
60959 +    tail == (chunk - offset): the length the checksum was calculated for
60960 +    minus the offset of the changed byte within this chunk.
60961 +    This function can be used for checksum calculation optimisation.
60962 +*/
60963 +
60964 +static __u32
60965 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
60966 +              __u32 tail)
60967 +{
60968 +       __u32 delta = data - old_data + 2 * ADLER_BASE;
60969 +       __u32 s1 = adler & 0xffff;
60970 +       __u32 s2 = (adler >> 16) & 0xffff;
60971 +
60972 +       s1 = (delta + s1) % ADLER_BASE;
60973 +       s2 = (delta * tail + s2) % ADLER_BASE;
60974 +
60975 +       return (s2 << 16) | s1;
60976 +}
60977 +
60978 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
60979 +
60980 +/**
60981 + * get_nr_bmap - calculate number of bitmap blocks
60982 + * @super: super block with initialized blocksize and block count
60983 + *
60984 + * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
60985 + * maintain free disk space. It assumes that each bitmap addresses the same
60986 + * number of blocks which is calculated by the bmap_bit_count macro defined
60987 + * above. Number of blocks in the filesystem has to be initialized in reiser4
60988 + * private data of super block already so that it can be obtained via
60989 + * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
60990 + * is not power of 2 because 4 bytes are used for checksum. Therefore, we have
60991 + * to use special function to divide and modulo 64bits filesystem block
60992 + * counters.
60993 + *
60994 + * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap
60995 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
60996 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
60997 + */
60998 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
60999 +{
61000 +       u64 quotient;
61001 +
61002 +       assert("zam-393", reiser4_block_count(super) != 0);
61003 +
61004 +       quotient = reiser4_block_count(super) - 1;
61005 +       do_div(quotient, bmap_bit_count(super->s_blocksize));
61006 +       return quotient + 1;
61007 +}
61008 +
61009 +/**
61010 + * parse_blocknr - calculate bitmap number and offset in it by block number
61011 + * @block: pointer to block number to calculate location in bitmap of
61012 + * @bmap: pointer where to store bitmap block number
61013 + * @offset: pointer where to store offset within bitmap block
61014 + *
61015 + * Calculates location of bit which is responsible for allocation/freeing of
61016 + * block @*block. That location is represented by bitmap block number and offset
61017 + * within that bitmap block.
61018 + */
61019 +static void
61020 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
61021 +             bmap_off_t *offset)
61022 +{
61023 +       struct super_block *super = get_current_context()->super;
61024 +       u64 quotient = *block;
61025 +
61026 +       *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
61027 +       *bmap = quotient;
61028 +
61029 +       assert("zam-433", *bmap < get_nr_bmap(super));
61030 +       assert("", *offset < bmap_bit_count(super->s_blocksize));
61031 +}
61032 +
61033 +#if REISER4_DEBUG
61034 +/* Audited by: green(2002.06.12) */
61035 +static void
61036 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
61037 +{
61038 +       struct super_block *sb = reiser4_get_current_sb();
61039 +
61040 +       assert("zam-436", sb != NULL);
61041 +
61042 +       assert("zam-455", start != NULL);
61043 +       assert("zam-437", *start != 0);
61044 +       assert("zam-541", !reiser4_blocknr_is_fake(start));
61045 +       assert("zam-441", *start < reiser4_block_count(sb));
61046 +
61047 +       if (len != NULL) {
61048 +               assert("zam-438", *len != 0);
61049 +               assert("zam-442", *start + *len <= reiser4_block_count(sb));
61050 +       }
61051 +}
61052 +
61053 +static void check_bnode_loaded(const struct bitmap_node *bnode)
61054 +{
61055 +       assert("zam-485", bnode != NULL);
61056 +       assert("zam-483", jnode_page(bnode->wjnode) != NULL);
61057 +       assert("zam-484", jnode_page(bnode->cjnode) != NULL);
61058 +       assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
61059 +       assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
61060 +}
61061 +
61062 +#else
61063 +
61064 +#  define check_block_range(start, len) do { /* nothing */} while(0)
61065 +#  define check_bnode_loaded(bnode)     do { /* nothing */} while(0)
61066 +
61067 +#endif
61068 +
61069 +/* modify bnode->first_zero_bit (if we free bits before); bnode should be
61070 +   spin-locked */
61071 +static inline void
61072 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
61073 +{
61074 +       if (offset < bnode->first_zero_bit)
61075 +               bnode->first_zero_bit = offset;
61076 +}
61077 +
61078 +/* return a physical disk address for logical bitmap number @bmap */
61079 +/* FIXME-VS: this is somehow related to disk layout? */
61080 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
61081 + * per block allocation so that performance is not affected.  Probably this
61082 + * whole file should be considered part of the disk layout plugin, and other
61083 + * disk layouts can use other defines and efficiency will not be significantly
61084 + * affected.  */
61085 +
61086 +#define REISER4_FIRST_BITMAP_BLOCK \
61087 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
61088 +
61089 +/* Audited by: green(2002.06.12) */
61090 +static void
61091 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
61092 +                  reiser4_block_nr * bnr)
61093 +{
61094 +
61095 +       assert("zam-390", bmap < get_nr_bmap(super));
61096 +
61097 +#ifdef CONFIG_REISER4_BADBLOCKS
61098 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
61099 +       /* Check if the diskmap have this already, first. */
61100 +       if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
61101 +               return;         /* Found it in diskmap */
61102 +#endif
61103 +       /* FIXME_ZAM: before discussing of disk layouts and disk format
61104 +          plugins I implement bitmap location scheme which is close to scheme
61105 +          used in reiser 3.6 */
61106 +       if (bmap == 0) {
61107 +               *bnr = REISER4_FIRST_BITMAP_BLOCK;
61108 +       } else {
61109 +               *bnr = bmap * bmap_bit_count(super->s_blocksize);
61110 +       }
61111 +}
61112 +
61113 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
61114 +/* Audited by: green(2002.06.12) */
61115 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
61116 +{
61117 +       *bnr =
61118 +           (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
61119 +                               REISER4_BITMAP_BLOCKS_STATUS_VALUE);
61120 +}
61121 +
61122 +/* bnode structure initialization */
61123 +static void
61124 +init_bnode(struct bitmap_node *bnode,
61125 +          struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
61126 +{
61127 +       memset(bnode, 0, sizeof(struct bitmap_node));
61128 +
61129 +       mutex_init(&bnode->mutex);
61130 +       atomic_set(&bnode->loaded, 0);
61131 +}
61132 +
61133 +static void release(jnode * node)
61134 +{
61135 +       jrelse(node);
61136 +       JF_SET(node, JNODE_HEARD_BANSHEE);
61137 +       jput(node);
61138 +}
61139 +
61140 +/* This function is for internal bitmap.c use because it assumes that jnode is
61141 +   under full control of this thread */
61142 +static void done_bnode(struct bitmap_node *bnode)
61143 +{
61144 +       if (bnode) {
61145 +               atomic_set(&bnode->loaded, 0);
61146 +               if (bnode->wjnode != NULL)
61147 +                       release(bnode->wjnode);
61148 +               if (bnode->cjnode != NULL)
61149 +                       release(bnode->cjnode);
61150 +               bnode->wjnode = bnode->cjnode = NULL;
61151 +       }
61152 +}
61153 +
61154 +/* ZAM-FIXME-HANS: comment this.  Called only by load_and_lock_bnode()*/
61155 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
61156 +                        jnode **wjnode_ret)
61157 +{
61158 +       struct super_block *super;
61159 +       jnode *cjnode;
61160 +       jnode *wjnode;
61161 +       bmap_nr_t bmap;
61162 +       int ret;
61163 +
61164 +       super = reiser4_get_current_sb();
61165 +
61166 +       *wjnode_ret = wjnode = bnew();
61167 +       if (wjnode == NULL) {
61168 +               *cjnode_ret = NULL;
61169 +               return RETERR(-ENOMEM);
61170 +       }
61171 +
61172 +       *cjnode_ret = cjnode = bnew();
61173 +       if (cjnode == NULL)
61174 +               return RETERR(-ENOMEM);
61175 +
61176 +       bmap = bnode - get_bnode(super, 0);
61177 +
61178 +       get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
61179 +       get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
61180 +
61181 +       jref(cjnode);
61182 +       jref(wjnode);
61183 +
61184 +       /* load commit bitmap */
61185 +       ret = jload_gfp(cjnode, GFP_NOFS, 1);
61186 +
61187 +       if (ret)
61188 +               goto error;
61189 +
61190 +       /* allocate memory for working bitmap block. Note that for
61191 +        * bitmaps jinit_new() doesn't actually modify node content,
61192 +        * so parallel calls to this are ok. */
61193 +       ret = jinit_new(wjnode, GFP_NOFS);
61194 +
61195 +       if (ret != 0) {
61196 +               jrelse(cjnode);
61197 +               goto error;
61198 +       }
61199 +
61200 +       return 0;
61201 +
61202 +      error:
61203 +       jput(cjnode);
61204 +       jput(wjnode);
61205 +       *wjnode_ret = *cjnode_ret = NULL;
61206 +       return ret;
61207 +
61208 +}
61209 +
61210 +/* Check the bnode data on read. */
61211 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
61212 +{
61213 +       void *data;
61214 +       int ret;
61215 +
61216 +       /* Check CRC */
61217 +       ret = bnode_check_adler32(bnode, blksize);
61218 +
61219 +       if (ret) {
61220 +               return ret;
61221 +       }
61222 +
61223 +       data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
61224 +
61225 +       /* Check the very first bit -- it must be busy. */
61226 +       if (!reiser4_test_bit(0, data)) {
61227 +               warning("vpf-1362", "The allocator block %llu is not marked "
61228 +                       "as used.", (unsigned long long)bnode->cjnode->blocknr);
61229 +
61230 +               return -EINVAL;
61231 +       }
61232 +
61233 +       return 0;
61234 +}
61235 +
61236 +/* load bitmap blocks "on-demand"; on success bnode->mutex is held */
61237 +static int load_and_lock_bnode(struct bitmap_node *bnode)
61238 +{
61239 +       int ret;
61240 +
61241 +       jnode *cjnode;
61242 +       jnode *wjnode;
61243 +
61244 +       assert("nikita-3040", reiser4_schedulable());
61245 +
61246 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
61247 + * need to be atomic, right? Just leave a comment that if bitmaps were
61248 + * unloadable, this would need to be atomic.  */
61249 +       if (atomic_read(&bnode->loaded)) {
61250 +               /* bitmap is already loaded, nothing to do */
61251 +               check_bnode_loaded(bnode);
61252 +               mutex_lock(&bnode->mutex);
61253 +               assert("nikita-2827", atomic_read(&bnode->loaded));
61254 +               return 0;
61255 +       }
61256 +
61257 +       ret = prepare_bnode(bnode, &cjnode, &wjnode);
61258 +       if (ret == 0) {
61259 +               mutex_lock(&bnode->mutex);
61260 +
61261 +               if (!atomic_read(&bnode->loaded)) {
61262 +                       assert("nikita-2822", cjnode != NULL);
61263 +                       assert("nikita-2823", wjnode != NULL);
61264 +                       assert("nikita-2824", jnode_is_loaded(cjnode));
61265 +                       assert("nikita-2825", jnode_is_loaded(wjnode));
61266 +
61267 +                       bnode->wjnode = wjnode;
61268 +                       bnode->cjnode = cjnode;
61269 +
61270 +                       ret = check_struct_bnode(bnode, current_blocksize);
61271 +                       if (!ret) {
61272 +                               cjnode = wjnode = NULL;
61273 +                               atomic_set(&bnode->loaded, 1);
61274 +                               /* working bitmap is initialized by on-disk
61275 +                                * commit bitmap. This should be performed
61276 +                                * under mutex. */
61277 +                               memcpy(bnode_working_data(bnode),
61278 +                                      bnode_commit_data(bnode),
61279 +                                      bmap_size(current_blocksize));
61280 +                       } else {
61281 +                               /* detach rejected jnodes under the mutex */
61282 +                               bnode->wjnode = bnode->cjnode = NULL;
61283 +                               mutex_unlock(&bnode->mutex);
61284 +                       }
61285 +               } else
61286 +                       /* race: someone already loaded bitmap while we were
61287 +                        * busy initializing data. */
61288 +                       check_bnode_loaded(bnode);
61289 +       }
61290 +
61291 +       /* drop unused local jnodes; never touch bnode's pointers here,
61292 +        * they may belong to a bitmap loaded by another thread. */
61293 +       if (wjnode != NULL)
61294 +               release(wjnode);
61295 +       if (cjnode != NULL)
61296 +               release(cjnode);
61297 +       return ret;
61298 +}
61299 +
61300 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
61301 +{
61302 +       check_bnode_loaded(bnode);
61303 +       mutex_unlock(&bnode->mutex);
61304 +}
61305 +
61306 +/* This function does all block allocation work but only for one bitmap
61307 +   block.*/
61308 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
61309 +   block responsibility zone boundaries. This had no sense in v3.6 but may
61310 +   have it in v4.x */
61311 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
61312 +static int
61313 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
61314 +                         bmap_off_t max_offset, int min_len, int max_len)
61315 +{
61316 +       struct super_block *super = get_current_context()->super;
61317 +       struct bitmap_node *bnode = get_bnode(super, bmap);
61318 +
61319 +       char *data;
61320 +
61321 +       bmap_off_t search_end;
61322 +       bmap_off_t start;
61323 +       bmap_off_t end;
61324 +
61325 +       int set_first_zero_bit = 0;
61326 +
61327 +       int ret;
61328 +
61329 +       assert("zam-364", min_len > 0);
61330 +       assert("zam-365", max_len >= min_len);
61331 +       assert("zam-366", *offset <= max_offset);
61332 +
61333 +       ret = load_and_lock_bnode(bnode);
61334 +
61335 +       if (ret)
61336 +               return ret;
61337 +
61338 +       data = bnode_working_data(bnode);
61339 +
61340 +       start = *offset;
61341 +
61342 +       if (bnode->first_zero_bit >= start) {
61343 +               start = bnode->first_zero_bit;
61344 +               set_first_zero_bit = 1;
61345 +       }
61346 +
61347 +       while (start + min_len < max_offset) {
61348 +
61349 +               start =
61350 +                   reiser4_find_next_zero_bit((long *)data, max_offset, start);
61351 +               if (set_first_zero_bit) {
61352 +                       bnode->first_zero_bit = start;
61353 +                       set_first_zero_bit = 0;
61354 +               }
61355 +               if (start >= max_offset)
61356 +                       break;
61357 +
61358 +               search_end = LIMIT(start + max_len, max_offset);
61359 +               end =
61360 +                   reiser4_find_next_set_bit((long *)data, search_end, start);
61361 +               if (end >= start + min_len) {
61362 +                       /* we can't trust find_next_set_bit result if set bit
61363 +                          was not found, result may be bigger than
61364 +                          max_offset */
61365 +                       if (end > search_end)
61366 +                               end = search_end;
61367 +
61368 +                       ret = end - start;
61369 +                       *offset = start;
61370 +
61371 +                       reiser4_set_bits(data, start, end);
61372 +
61373 +                       /* FIXME: we may advance first_zero_bit if [start,
61374 +                          end] region overlaps the first_zero_bit point */
61375 +
61376 +                       break;
61377 +               }
61378 +
61379 +               start = end + 1;
61380 +       }
61381 +
61382 +       release_and_unlock_bnode(bnode);
61383 +
61384 +       return ret;
61385 +}
61386 +
61387 +static int
61388 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
61389 +                          bmap_off_t end_offset, int min_len, int max_len)
61390 +{
61391 +       struct super_block *super = get_current_context()->super;
61392 +       struct bitmap_node *bnode = get_bnode(super, bmap);
61393 +       char *data;
61394 +       bmap_off_t start;
61395 +       int ret;
61396 +
61397 +       assert("zam-958", min_len > 0);
61398 +       assert("zam-959", max_len >= min_len);
61399 +       assert("zam-960", *start_offset >= end_offset);
61400 +
61401 +       ret = load_and_lock_bnode(bnode);
61402 +       if (ret)
61403 +               return ret;
61404 +
61405 +       data = bnode_working_data(bnode);
61406 +       start = *start_offset;
61407 +
61408 +       while (1) {
61409 +               bmap_off_t end, search_end;
61410 +
61411 +               /* Find the beginning of the zero filled region */
61412 +               if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
61413 +                       break;
61414 +               /* Are there at least `min_len' bits from `start' to
61415 +                * `end_offset'?  */
61416 +               if (start < end_offset + min_len - 1)
61417 +                       break;
61418 +
61419 +               /* Do not search to `end_offset' if we need to find less than
61420 +                * `max_len' zero bits. */
61421 +               if (end_offset + max_len - 1 < start)
61422 +                       search_end = start - max_len + 1;
61423 +               else
61424 +                       search_end = end_offset;
61425 +
61426 +               if (reiser4_find_last_set_bit(&end, data, search_end, start))
61427 +                       end = search_end;
61428 +               else
61429 +                       end++;
61430 +
61431 +               if (end + min_len <= start + 1) {
61432 +                       if (end < search_end)
61433 +                               end = search_end;
61434 +                       ret = start - end + 1;
61435 +                       *start_offset = end;    /* `end' is lowest offset */
61436 +                       assert("zam-987",
61437 +                              reiser4_find_next_set_bit(data, start + 1,
61438 +                                                        end) >= start + 1);
61439 +                       reiser4_set_bits(data, end, start + 1);
61440 +                       break;
61441 +               }
61442 +
61443 +               if (end <= end_offset)
61444 +                       /* left search boundary reached. */
61445 +                       break;
61446 +               start = end - 1;
61447 +       }
61448 +
61449 +       release_and_unlock_bnode(bnode);
61450 +       return ret;
61451 +}
61452 +
61453 +/* Scan bitmaps forward for a run of free blocks, from block *@start up to
61454 +   (not including) block *@end.  @min_len/@max_len go to the per-bitmap search
61455 +   unchanged.  Returns its first non-zero result, else 0; updates *@start. */
61456 +static int bitmap_alloc_forward(reiser4_block_nr * start,
61457 +                               const reiser4_block_nr * end, int min_len,
61458 +                               int max_len)
61459 +{
61460 +       struct super_block *sb = get_current_context()->super;
61461 +       const bmap_off_t bits_per_bmap = bmap_bit_count(sb->s_blocksize);
61462 +       bmap_nr_t bmap, end_bmap;
61463 +       bmap_off_t offset, end_offset;
61464 +       reiser4_block_nr last_blk;
61465 +       int found = 0;
61466 +
61467 +       parse_blocknr(start, &bmap, &offset);
61468 +
61469 +       /* end_offset is one bit past the last block of the range */
61470 +       last_blk = *end - 1;
61471 +       parse_blocknr(&last_blk, &end_bmap, &end_offset);
61472 +       ++end_offset;
61473 +
61474 +       assert("zam-358", end_bmap >= bmap);
61475 +       assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
61476 +
61477 +       while (bmap < end_bmap) {
61478 +               found = search_one_bitmap_forward(bmap, &offset, bits_per_bmap,
61479 +                                                 min_len, max_len);
61480 +               if (found != 0)
61481 +                       break;
61482 +               bmap++;
61483 +               offset = 0;
61484 +       }
61485 +       if (found == 0)         /* the last bitmap stops at end_offset */
61486 +               found = search_one_bitmap_forward(bmap, &offset, end_offset,
61487 +                                                 min_len, max_len);
61488 +       *start = bmap * bits_per_bmap + offset;
61489 +       return found;
61490 +}
61491 +
61492 +/* Backward twin of bitmap_alloc_forward(): scan bitmaps from block *@start
61493 +   down to block *@end.  Returns the first non-zero per-bitmap search result,
61494 +   else 0, and rewrites *@start from the final bitmap number and offset. */
61495 +static int bitmap_alloc_backward(reiser4_block_nr * start,
61496 +                                const reiser4_block_nr * end, int min_len,
61497 +                                int max_len)
61498 +{
61499 +       struct super_block *sb = get_current_context()->super;
61500 +       const bmap_off_t bits_per_bmap = bmap_bit_count(sb->s_blocksize);
61501 +       bmap_nr_t bmap, end_bmap;
61502 +       bmap_off_t offset, end_offset;
61503 +       int found = 0;
61504 +
61505 +       parse_blocknr(start, &bmap, &offset);
61506 +       parse_blocknr(end, &end_bmap, &end_offset);
61507 +
61508 +       assert("zam-961", end_bmap <= bmap);
61509 +       assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
61510 +
61511 +       while (bmap > end_bmap) {
61512 +               found = search_one_bitmap_backward(bmap, &offset, 0, min_len,
61513 +                                                  max_len);
61514 +               if (found != 0)
61515 +                       break;
61516 +               bmap--;
61517 +               offset = bits_per_bmap - 1;
61518 +       }
61519 +       if (found == 0)         /* the lowest bitmap stops at end_offset */
61520 +               found = search_one_bitmap_backward(bmap, &offset, end_offset,
61521 +                                                  min_len, max_len);
61522 +       *start = bmap * bits_per_bmap + offset;
61523 +       return found;
61524 +}
61525 +
61526 +/* Forward allocation for reiser4_alloc_blocks_bitmap(): look for up to @needed
61527 +   free blocks starting at @hint->blk.  Returns 0 with the extent found in
61528 +   *@start/*@len, -ENOSPC if the search found nothing, or the negative value
61529 +   reported by the bitmap search. */
61530 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
61531 +                               reiser4_block_nr *start, reiser4_block_nr *len)
61532 +{
61533 +       struct super_block *super = get_current_context()->super;
61534 +       reiser4_block_nr from;
61535 +       reiser4_block_nr upto;
61536 +       int found;
61537 +
61538 +       assert("zam-398", super != NULL);
61539 +       assert("zam-412", hint != NULL);
61540 +       assert("zam-397", hint->blk <= reiser4_block_count(super));
61541 +
61542 +       /* The search starts at @hint -> blk and runs to the end of the disk,
61543 +          or only across the given region if @hint -> max_dist is not zero */
61544 +       from = hint->blk;
61545 +       if (hint->max_dist != 0)
61546 +               upto =
61547 +                   LIMIT(hint->blk + hint->max_dist,
61548 +                         reiser4_block_count(super));
61549 +       else
61550 +               upto = reiser4_block_count(super);
61551 +
61552 +       found = bitmap_alloc_forward(&from, &upto, 1, needed);
61553 +
61554 +       /* A second pass, over the blocks below @from, is made only if the
61555 +          first one found nothing, no max_dist was given and @from, as left
61556 +          by the first pass, is not block 0. */
61557 +       if (found == 0 && hint->max_dist == 0 && from != 0) {
61558 +               upto = from;
61559 +               from = 0;
61560 +               found = bitmap_alloc_forward(&from, &upto, 1, needed);
61561 +       }
61562 +
61563 +       if (found < 0)
61564 +               return RETERR(found);
61565 +       if (found == 0)
61566 +               return RETERR(-ENOSPC);
61567 +
61568 +       *start = from;
61569 +       *len = found;
61570 +       return 0;
61571 +}
61572 +
61573 +/* Backward allocation for reiser4_alloc_blocks_bitmap(): search down from
61574 +   @hint->blk, over at most @hint->max_dist blocks if that is non-zero.
61575 +   Returns 0 with the extent in *@start/*@len, -ENOSPC, or a negative error. */
61576 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
61577 +                                reiser4_block_nr * start,
61578 +                                reiser4_block_nr * len)
61579 +{
61580 +       reiser4_block_nr from;
61581 +       reiser4_block_nr lowest;
61582 +       int found;
61583 +       ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
61584 +
61585 +       assert("zam-969", super != NULL);
61586 +       assert("zam-970", hint != NULL);
61587 +       assert("zam-971", hint->blk <= reiser4_block_count(super));
61588 +
61589 +       from = hint->blk;
61590 +       lowest = 0;
61591 +       if (hint->max_dist != 0 && from > hint->max_dist)
61592 +               lowest = from - hint->max_dist;
61593 +
61594 +       found = bitmap_alloc_backward(&from, &lowest, 1, needed);
61595 +       if (found < 0)
61596 +               return RETERR(found);
61597 +       if (found == 0)
61598 +               return RETERR(-ENOSPC);
61599 +       *start = from;
61600 +       *len = found;
61601 +       return 0;
61602 +}
61603 +
61604 +/* plugin->u.space_allocator.alloc_blocks(): picks the search direction */
61605 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
61606 +                               reiser4_blocknr_hint * hint, int needed,
61607 +                               reiser4_block_nr * start, reiser4_block_nr * len)
61608 +{
61609 +       return hint->backward ?
61610 +           alloc_blocks_backward(hint, needed, start, len) :
61611 +           alloc_blocks_forward(hint, needed, start, len);
61612 +}
61613 +
61614 +/* plugin->u.space_allocator.dealloc_blocks(). */
61615 +/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
61616 +   nodes deletion is deferred until transaction commit.  However, deallocation
61617 +   of temporary objects like wandered blocks and transaction commit records
61618 +   requires immediate node deletion from WORKING BITMAP.*/
61619 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
61620 +                                  reiser4_block_nr start, reiser4_block_nr len)
61621 +{
61622 +       struct super_block *super = reiser4_get_current_sb();
61623 +
61624 +       bmap_nr_t bmap;
61625 +       bmap_off_t offset;
61626 +
61627 +       struct bitmap_node *bnode;
61628 +       int ret;
61629 +
61630 +       assert("zam-468", len != 0);
61631 +       check_block_range(&start, &len);
61632 +
61633 +       parse_blocknr(&start, &bmap, &offset);
61634 +       /* the whole range has to fit into the one bitmap block found above */
61635 +       assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
61636 +
61637 +       bnode = get_bnode(super, bmap);
61638 +
61639 +       assert("zam-470", bnode != NULL);
61640 +       /* NOTE(review): a load failure is only caught by the assertion below */
61641 +       ret = load_and_lock_bnode(bnode);
61642 +       assert("zam-481", ret == 0);
61643 +       /* clear @len bits starting at @offset in the working bitmap */
61644 +       reiser4_clear_bits(bnode_working_data(bnode), offset,
61645 +                          (bmap_off_t) (offset + len));
61646 +
61647 +       adjust_first_zero_bit(bnode, offset);
61648 +
61649 +       release_and_unlock_bnode(bnode);
61650 +}
61651 +
61652 +/* plugin->u.space_allocator.check_blocks(); empty unless REISER4_DEBUG is on */
61653 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
61654 +                                const reiser4_block_nr * len, int desired)
61655 +{
61656 +#if REISER4_DEBUG
61657 +       struct super_block *super = reiser4_get_current_sb();
61658 +
61659 +       bmap_nr_t bmap;
61660 +       bmap_off_t start_offset;
61661 +       bmap_off_t end_offset;
61662 +
61663 +       struct bitmap_node *bnode;
61664 +       int ret;
61665 +
61666 +       assert("zam-622", len != NULL);
61667 +       check_block_range(start, len);
61668 +       parse_blocknr(start, &bmap, &start_offset);
61669 +       /* the range may not run past the end of its bitmap block */
61670 +       end_offset = start_offset + *len;
61671 +       assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
61672 +
61673 +       bnode = get_bnode(super, bmap);
61674 +
61675 +       assert("nikita-2215", bnode != NULL);
61676 +
61677 +       ret = load_and_lock_bnode(bnode);
61678 +       assert("zam-626", ret == 0);
61679 +
61680 +       assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
61681 +       /* @desired != 0: no zero bit in the range; otherwise no set bit in it */
61682 +       if (desired) {
61683 +               assert("zam-623",
61684 +                      reiser4_find_next_zero_bit(bnode_working_data(bnode),
61685 +                                                 end_offset, start_offset)
61686 +                      >= end_offset);
61687 +       } else {
61688 +               assert("zam-624",
61689 +                      reiser4_find_next_set_bit(bnode_working_data(bnode),
61690 +                                                end_offset, start_offset)
61691 +                      >= end_offset);
61692 +       }
61693 +
61694 +       release_and_unlock_bnode(bnode);
61695 +#endif
61696 +}
61697 +
61698 +/* put @node into @atom's overwrite set unless the node already has an atom */
61699 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
61700 +{
61701 +       assert("zam-546", atom != NULL);
61702 +       assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
61703 +       assert("zam-548", node != NULL);
61704 +
61705 +       spin_lock_atom(atom);
61706 +       spin_lock_jnode(node);
61707 +
61708 +       if (node->atom != NULL) {
61709 +               assert("zam-549", node->atom == atom);
61710 +       } else {
61711 +               JF_SET(node, JNODE_OVRWR);
61712 +               insert_into_atom_ovrwr_list(atom, node);
61713 +       }
61714 +
61715 +       spin_unlock_jnode(node);
61716 +       spin_unlock_atom(atom);
61717 +}
61718 +
61719 +/* An actor for blocknr_set_iterator(): applies one delete set entry to the
61720 +   COMMIT bitmap.  @start is the first freed block and @len the extent length
61721 +   (NULL means a single block); @data points to a long long counter advanced
61722 +   by the number of bits cleared.  The bitmap's commit jnode is put into the
61723 +   overwrite set of @atom.  Returns 0, or the non-zero result of
61724 +   load_and_lock_bnode() or bnode_check_crc(). */
61725 +static int
61726 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
61727 +                         const reiser4_block_nr * len, void *data)
61728 +{
61729 +       bmap_nr_t bmap;
61730 +       bmap_off_t offset;
61731 +       int ret;
61732 +       long long *blocks_freed_p = data;
61733 +       struct bitmap_node *bnode;
61734 +       struct super_block *sb = reiser4_get_current_sb();
61735 +
61736 +       check_block_range(start, len);
61737 +       parse_blocknr(start, &bmap, &offset);
61738 +
61739 +       /* FIXME-ZAM: we assume that all block ranges are allocated by this
61740 +          bitmap-based allocator and each block range can't go over a zone of
61741 +          responsibility of one bitmap block; same assumption is used in
61742 +          other journal hooks in bitmap code. */
61743 +       bnode = get_bnode(sb, bmap);
61744 +       assert("zam-448", bnode != NULL);
61745 +
61746 +       /* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
61747 +       assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
61748 +       ret = load_and_lock_bnode(bnode);
61749 +       if (ret)
61750 +               return ret;
61751 +
61752 +       /* put bnode into atom's overwrite set */
61753 +       cond_add_to_overwrite_set(atom, bnode->cjnode);
61754 +
61755 +       /* @data now points at the commit bitmap, not at the counter */
61756 +       data = bnode_commit_data(bnode);
61757 +
61758 +       ret = bnode_check_crc(bnode);
61759 +       if (ret != 0) {
61760 +               /* do not leave the bnode loaded and locked on failure */
61761 +               release_and_unlock_bnode(bnode);
61762 +               return ret;
61763 +       }
61764 +
61765 +       if (len != NULL) {
61766 +               /* FIXME-ZAM: a check that all bits are set should be there */
61767 +               assert("zam-443",
61768 +                      offset + *len <= bmap_bit_count(sb->s_blocksize));
61769 +               reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
61770 +               (*blocks_freed_p) += *len;
61771 +       } else {
61772 +               reiser4_clear_bit(offset, data);
61773 +               (*blocks_freed_p)++;
61774 +       }
61775 +
61776 +       bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
61777 +       release_and_unlock_bnode(bnode);
61778 +       return 0;
61779 +}
61780 +
61781 +/* plugin->u.space_allocator.pre_commit_hook(). */
61782 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
61783 +   rest is done by transaction manager (allocate wandered locations for COMMIT
61784 +   BITMAP blocks, copy COMMIT BITMAP blocks data). */
61785 +/* Only one instance of this function can be running at one given time, because
61786 +   only one transaction can be committed at a time, therefore it is safe to
61787 +   access some global variables without any locking.  Returns 0 on success or
61788 +   the error met while loading or CRC-checking a bitmap block. */
61789 +int reiser4_pre_commit_hook_bitmap(void)
61790 +{
61791 +       struct super_block *super = reiser4_get_current_sb();
61792 +       txn_atom *atom;
61793 +       long long blocks_freed = 0;
61794 +
61795 +       atom = get_current_atom_locked();
61796 +       assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
61797 +       spin_unlock_atom(atom);
61798 +
61799 +       {                       /* scan atom's captured list and find all freshly allocated nodes,
61800 +                                * mark corresponded bits in COMMIT BITMAP as used */
61801 +               struct list_head *head = ATOM_CLEAN_LIST(atom);
61802 +               jnode *node = list_entry(head->next, jnode, capture_link);
61803 +
61804 +               while (head != &node->capture_link) {
61805 +                       /* we detect freshly allocated jnodes */
61806 +                       if (JF_ISSET(node, JNODE_RELOC)) {
61807 +                               int ret;
61808 +                               bmap_nr_t bmap;
61809 +                               bmap_off_t offset;
61810 +                               bmap_off_t index;
61811 +                               struct bitmap_node *bn;
61812 +                               __u32 size = bmap_size(super->s_blocksize);
61813 +                               __u32 crc;
61814 +                               char byte;
61815 +
61816 +                               assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
61817 +                               assert("zam-460",
61818 +                                      !reiser4_blocknr_is_fake(&node->blocknr));
61819 +
61820 +                               parse_blocknr(&node->blocknr, &bmap, &offset);
61821 +                               bn = get_bnode(super, bmap);
61822 +
61823 +                               index = offset >> 3;
61824 +                               assert("vpf-276", index < size);
61825 +
61826 +                               ret = bnode_check_crc(bn);
61827 +                               if (ret != 0)
61828 +                                       return ret;
61829 +
61830 +                               check_bnode_loaded(bn);
61831 +                               ret = load_and_lock_bnode(bn);
61832 +                               if (ret != 0)
61833 +                                       return ret;
61834 +
61835 +                               /* set the bit; adjust the crc for its byte */
61836 +                               byte = *(bnode_commit_data(bn) + index);
61837 +                               reiser4_set_bit(offset, bnode_commit_data(bn));
61838 +
61839 +                               crc = adler32_recalc(bnode_commit_crc(bn), byte,
61840 +                                                    *(bnode_commit_data(bn) +
61841 +                                                      index),
61842 +                                                    size - index);
61843 +                               bnode_set_commit_crc(bn, crc);
61844 +
61845 +                               release_and_unlock_bnode(bn);
61846 +
61847 +                               ret = bnode_check_crc(bn);
61848 +                               if (ret != 0)
61849 +                                       return ret;
61850 +
61851 +                               /* working of this depends on how it inserts
61852 +                                  new j-node into clean list, because we are
61853 +                                  scanning the same list now. It is OK, if
61854 +                                  insertion is done to the list front */
61855 +                               cond_add_to_overwrite_set(atom, bn->cjnode);
61856 +                       }
61857 +
61858 +                       node = list_entry(node->capture_link.next, jnode, capture_link);
61859 +               }
61860 +       }
61861 +
61862 +       /* NOTE(review): actor errors are not looked at here -- confirm */
61863 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
61864 +                            &blocks_freed, 0);
61865 +
61866 +       blocks_freed -= atom->nr_blocks_allocated;
61867 +
61868 +       {
61869 +               reiser4_super_info_data *sbinfo = get_super_private(super);
61870 +
61871 +               spin_lock_reiser4_super(sbinfo);
61872 +               sbinfo->blocks_free_committed += blocks_freed;
61873 +               spin_unlock_reiser4_super(sbinfo);
61874 +       }
61875 +
61876 +       return 0;
61877 +}
61878 +
61879 +/* plugin->u.space_allocator.init_allocator: constructor of the space allocator
61880 +   object, called on fs mount.  Returns 0, -ENOMEM or a bitmap loading error. */
61881 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
61882 +                                 struct super_block *super, void *arg)
61883 +{
61884 +       struct bitmap_allocator_data *data = NULL;
61885 +       bmap_nr_t bitmap_blocks_nr;
61886 +       bmap_nr_t i;
61887 +
61888 +       assert("nikita-3039", reiser4_schedulable());
61889 +
61890 +       /* getting memory for bitmap allocator private data holder */
61891 +       data =
61892 +               kmalloc(sizeof(struct bitmap_allocator_data),
61893 +                       reiser4_ctx_gfp_mask_get());
61894 +
61895 +       if (data == NULL)
61896 +               return RETERR(-ENOMEM);
61897 +
61898 +       /* allocation and initialization for the array of bnodes */
61899 +       bitmap_blocks_nr = get_nr_bmap(super);
61900 +
61901 +       /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
61902 +          which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
61903 +          may I never meet someone who still uses the ia32 architecture when
61904 +          storage devices of that size enter the market, and wants to use ia32
61905 +          with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
61906 +          probably, another dynamic data structure should replace a static
61907 +          array of bnodes. */
61908 +       /* the array is therefore obtained from reiser4_vmalloc() rather than kmalloc */
61909 +       data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
61910 +       if (data->bitmap == NULL) {
61911 +               kfree(data);
61912 +               return RETERR(-ENOMEM);
61913 +       }
61914 +
61915 +       for (i = 0; i < bitmap_blocks_nr; i++)
61916 +               init_bnode(data->bitmap + i, super, i);
61917 +
61918 +       allocator->u.generic = data;
61919 +
61920 +#if REISER4_DEBUG
61921 +       get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
61922 +#endif
61923 +
61924 +       /* Load all bitmap blocks at mount time, unless the fs flag forbids it. */
61925 +       if (!test_bit
61926 +           (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
61927 +               __u64 start_time, elapsed_time;
61928 +               struct bitmap_node *bnode;
61929 +               int ret;
61930 +
61931 +               if (REISER4_DEBUG)
61932 +                       printk(KERN_INFO "loading reiser4 bitmap...");
61933 +               start_time = jiffies;
61934 +
61935 +               for (i = 0; i < bitmap_blocks_nr; i++) {
61936 +                       bnode = data->bitmap + i;
61937 +                       ret = load_and_lock_bnode(bnode);
61938 +                       if (ret) {      /* tear down everything set up above */
61939 +                               reiser4_destroy_allocator_bitmap(allocator,
61940 +                                                                super);
61941 +                               return ret;
61942 +                       }
61943 +                       release_and_unlock_bnode(bnode);
61944 +               }
61945 +
61946 +               elapsed_time = jiffies - start_time;
61947 +               if (REISER4_DEBUG)
61948 +                       printk("...done (%llu jiffies)\n",
61949 +                              (unsigned long long)elapsed_time);
61950 +       }
61951 +
61952 +       return 0;
61953 +}
61954 +
61955 +/* plugin->u.space_allocator.destroy_allocator
61956 +   destructor. It is called on fs unmount */
61957 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
61958 +                                    struct super_block *super)
61959 +{
61960 +       bmap_nr_t bitmap_blocks_nr;
61961 +       bmap_nr_t i;
61962 +
61963 +       struct bitmap_allocator_data *data = allocator->u.generic;
61964 +
61965 +       assert("zam-414", data != NULL);
61966 +       assert("zam-376", data->bitmap != NULL);
61967 +
61968 +       bitmap_blocks_nr = get_nr_bmap(super);
61969 +
61970 +       for (i = 0; i < bitmap_blocks_nr; i++) {
61971 +               struct bitmap_node *bnode = data->bitmap + i;
61972 +
61973 +               mutex_lock(&bnode->mutex);
61974 +
61975 +#if REISER4_DEBUG
61976 +               if (atomic_read(&bnode->loaded)) {
61977 +                       jnode *wj = bnode->wjnode;
61978 +                       jnode *cj = bnode->cjnode;
61979 +
61980 +                       assert("zam-480", jnode_page(cj) != NULL);
61981 +                       assert("zam-633", jnode_page(wj) != NULL);
61982 +
61983 +                       assert("zam-634",
61984 +                              memcmp(jdata(wj), jdata(wj),
61985 +                                     bmap_size(super->s_blocksize)) == 0);
61986 +
61987 +               }
61988 +#endif
61989 +               done_bnode(bnode);
61990 +               mutex_unlock(&bnode->mutex);
61991 +       }
61992 +
61993 +       vfree(data->bitmap);
61994 +       kfree(data);
61995 +
61996 +       allocator->u.generic = NULL;
61997 +
61998 +       return 0;
61999 +}
62000 +
62001 +/*
62002 + * Local variables:
62003 + * c-indentation-style: "K&R"
62004 + * mode-name: "LC"
62005 + * c-basic-offset: 8
62006 + * tab-width: 8
62007 + * fill-column: 79
62008 + * scroll-step: 1
62009 + * End:
62010 + */
62011 diff -puN /dev/null fs/reiser4/plugin/space/bitmap.h
62012 --- /dev/null
62013 +++ a/fs/reiser4/plugin/space/bitmap.h
62014 @@ -0,0 +1,47 @@
62015 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62016 +
62017 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
62018 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
62019 +
62020 +#include "../../dformat.h"
62021 +#include "../../block_alloc.h"
62022 +
62023 +#include <linux/types.h>       /* for __u??  */
62024 +#include <linux/fs.h>          /* for struct super_block  */
62025 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
62026 +/* declarations of functions implementing methods of space allocator plugin for
62027 +   bitmap based allocator. The functions themselves are in bitmap.c */
62028 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
62029 +                                        struct super_block *, void *);
62030 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
62031 +                                           struct super_block *);
62032 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
62033 +                                      reiser4_blocknr_hint *, int needed,
62034 +                                      reiser4_block_nr * start,
62035 +                                      reiser4_block_nr * len);
62036 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
62037 +                                       const reiser4_block_nr *, int);
62038 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
62039 +                                         reiser4_block_nr,
62040 +                                         reiser4_block_nr);
62041 +extern int reiser4_pre_commit_hook_bitmap(void);
62042 +/* the bitmap allocator has nothing to do for the following methods */
62043 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
62044 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
62045 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
62046 +/* number of a bitmap block, and offset of a bit within one bitmap block */
62047 +typedef __u64 bmap_nr_t;
62048 +typedef __u32 bmap_off_t;
62049 +
62050 +#endif                         /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
62051 +
62052 +/* Make Linus happy.
62053 +   Local variables:
62054 +   c-indentation-style: "K&R"
62055 +   mode-name: "LC"
62056 +   c-basic-offset: 8
62057 +   tab-width: 8
62058 +   fill-column: 120
62059 +   scroll-step: 1
62060 +   End:
62061 +*/
62062 diff -puN /dev/null fs/reiser4/plugin/space/space_allocator.h
62063 --- /dev/null
62064 +++ a/fs/reiser4/plugin/space/space_allocator.h
62065 @@ -0,0 +1,80 @@
62066 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62067 +
62068 +#ifndef __SPACE_ALLOCATOR_H__
62069 +#define __SPACE_ALLOCATOR_H__
62070 +
62071 +#include "../../forward.h"
62072 +#include "bitmap.h"
62073 +/* DEF_SPACE_ALLOCATOR(allocator) defines the static inline sa_*() wrappers; each one forwards its arguments to the
62074 + * matching reiser4_*_##allocator function or macro.  It is expanded once below, for the bitmap allocator. */
62075 +#define DEF_SPACE_ALLOCATOR(allocator)                                                                                 \
62076 +                                                                                                                       \
62077 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque)               \
62078 +{                                                                                                                      \
62079 +       return reiser4_init_allocator_##allocator (al, s, opaque);                                                      \
62080 +}                                                                                                                      \
62081 +                                                                                                                       \
62082 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s)                           \
62083 +{                                                                                                                      \
62084 +       reiser4_destroy_allocator_##allocator (al, s);                                                                  \
62085 +}                                                                                                                      \
62086 +                                                                                                                       \
62087 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint,                           \
62088 +                                  int needed, reiser4_block_nr * start, reiser4_block_nr * len)                        \
62089 +{                                                                                                                      \
62090 +       return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len);                                         \
62091 +}                                                                                                                      \
62092 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len)      \
62093 +{                                                                                                                      \
62094 +       reiser4_dealloc_blocks_##allocator (al, start, len);                                                            \
62095 +}                                                                                                                      \
62096 +                                                                                                                       \
62097 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired)                 \
62098 +{                                                                                                                      \
62099 +       reiser4_check_blocks_##allocator (start, end, desired);                                                         \
62100 +}                                                                                                                      \
62101 +                                                                                                                       \
62102 +static inline void sa_pre_commit_hook (void)                                                                           \
62103 +{                                                                                                                      \
62104 +       reiser4_pre_commit_hook_##allocator ();                                                                         \
62105 +}                                                                                                                      \
62106 +                                                                                                                       \
62107 +static inline void sa_post_commit_hook (void)                                                                          \
62108 +{                                                                                                                      \
62109 +       reiser4_post_commit_hook_##allocator ();                                                                        \
62110 +}                                                                                                                      \
62111 +                                                                                                                       \
62112 +static inline void sa_post_write_back_hook (void)                                                                      \
62113 +{                                                                                                                      \
62114 +       reiser4_post_write_back_hook_##allocator();                                                                     \
62115 +}                                                                                                                      \
62116 +                                                                                                                       \
62117 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al)                                    \
62118 +{                                                                                                                      \
62119 +       reiser4_print_info_##allocator (prefix, al);                                                                    \
62120 +}
62121 +/* Note: sa_destroy_allocator() and sa_pre_commit_hook() are void and drop what their targets return. */
62122 +DEF_SPACE_ALLOCATOR(bitmap)
62123 +
62124 +/* this object is part of reiser4 private in-core super block */
62125 +struct reiser4_space_allocator {
62126 +       union {
62127 +               /* space allocators might use this pointer to reference their
62128 +                * data. */
62129 +               void *generic;
62130 +       } u;
62131 +};
62132 +
62133 +/* __SPACE_ALLOCATOR_H__ */
62134 +#endif
62135 +
62136 +/* Make Linus happy.
62137 +   Local variables:
62138 +   c-indentation-style: "K&R"
62139 +   mode-name: "LC"
62140 +   c-basic-offset: 8
62141 +   tab-width: 8
62142 +   fill-column: 120
62143 +   scroll-step: 1
62144 +   End:
62145 +*/
62146 diff -puN /dev/null fs/reiser4/plugin/tail_policy.c
62147 --- /dev/null
62148 +++ a/fs/reiser4/plugin/tail_policy.c
62149 @@ -0,0 +1,113 @@
62150 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62151 + * reiser4/README */
62152 +
62153 +/* Formatting policy plugins */
62154 +
62155 +/*
62156 + * Formatting policy plugin is used by object plugin (of regular file) to
62157 + * convert file between two representations.
62158 + *
62159 + * Currently following policies are implemented:
62160 + *  never store file in formatted nodes
62161 + *  always store file in formatted nodes
62162 + *  store file in formatted nodes if file is smaller than 4 blocks (default)
62163 + */
62164 +
62165 +#include "../tree.h"
62166 +#include "../inode.h"
62167 +#include "../super.h"
62168 +#include "object.h"
62169 +#include "plugin.h"
62170 +#include "node/node.h"
62171 +#include "plugin_header.h"
62172 +
62173 +#include <linux/pagemap.h>
62174 +#include <linux/fs.h>          /* For struct inode */
62175 +
62176 +/**
62177 + * have_formatting_never - "never" formatting policy check
62178 + * @inode: inode to operate on (unused)
62179 + * @size: new object size (unused)
62180 + *
62181 + * Returns 0 unconditionally, whatever @inode and @size are.
62182 + */
62183 +/* Never store file's tail as direct item */
62184 +/* Audited by: green(2002.06.12) */
62185 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
62186 +                     /* inode to operate on */ ,
62187 +                     loff_t size UNUSED_ARG/* new object size */)
62188 +{
62189 +       return 0;
62190 +}
62191 +
62192 +/* Always store file's tail as direct item */
62193 +/* Audited by: green(2002.06.12) */
62194 +static int
62195 +have_formatting_always(const struct inode *inode UNUSED_ARG
62196 +                      /* inode to operate on */ ,
62197 +                      loff_t size UNUSED_ARG/* new object size */)
62198 +{
62199 +       return 1;
62200 +}
62201 +
62202 +/* Default policy: return 1 (store the file in tail items) unless new @size
62203 +   exceeds four blocks of @inode's super block, in which case return 0. */
62204 +static int
62205 +have_formatting_default(const struct inode *inode UNUSED_ARG
62206 +                       /* inode to operate on */ ,
62207 +                       loff_t size/* new object size */)
62208 +{
62209 +       assert("umka-1253", inode != NULL);
62210 +
62211 +       if (size > inode->i_sb->s_blocksize * 4)
62212 +               return 0;
62213 +
62214 +       return 1;
62215 +}
62216 +
62217 +/* tail plugins */
62218 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
62219 +       [NEVER_TAILS_FORMATTING_ID] = {
62220 +               .h = {
62221 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
62222 +                       .id = NEVER_TAILS_FORMATTING_ID,
62223 +                       .pops = NULL,
62224 +                       .label = "never",
62225 +                       .desc = "Never store file's tail",
62226 +                       .linkage = {NULL, NULL}
62227 +               },
62228 +               .have_tail = have_formatting_never
62229 +       },
62230 +       [ALWAYS_TAILS_FORMATTING_ID] = {
62231 +               .h = {
62232 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
62233 +                       .id = ALWAYS_TAILS_FORMATTING_ID,
62234 +                       .pops = NULL,
62235 +                       .label = "always",
62236 +                       .desc = "Always store file's tail",
62237 +                       .linkage = {NULL, NULL}
62238 +               },
62239 +               .have_tail = have_formatting_always
62240 +       },
62241 +       [SMALL_FILE_FORMATTING_ID] = {
62242 +               .h = {
62243 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
62244 +                       .id = SMALL_FILE_FORMATTING_ID,
62245 +                       .pops = NULL,
62246 +                       .label = "4blocks",
62247 +                       .desc = "store files shorter than 4 blocks in tail items",
62248 +                       .linkage = {NULL, NULL}
62249 +               },
62250 +               .have_tail = have_formatting_default
62251 +       }
62252 +};
62253 +
62254 +/*
62255 + * Local variables:
62256 + * c-indentation-style: "K&R"
62257 + * mode-name: "LC"
62258 + * c-basic-offset: 8
62259 + * tab-width: 8
62260 + * fill-column: 79
62261 + * End:
62262 + */
62263 diff -puN /dev/null fs/reiser4/pool.c
62264 --- /dev/null
62265 +++ a/fs/reiser4/pool.c
62266 @@ -0,0 +1,231 @@
62267 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62268 + * reiser4/README */
62269 +
62270 +/* Fast pool allocation.
62271 +
62272 +   There are situations when some sub-system normally asks memory allocator
62273 +   for only few objects, but under some circumstances could require much
62274 +   more. Typical and actually motivating example is tree balancing. It needs
62275 +   to keep track of nodes that were involved into it, and it is well-known
62276 +   that in reasonably packed balanced tree most (92.938121%) of all
62277 +   balancings end up after working with only few nodes (3.141592 on
62278 +   average). But in rare cases balancing can involve much more nodes
62279 +   (3*tree_height+1 in extremal situation).
62280 +
62281 +   On the one hand, we don't want to resort to dynamic allocation (slab,
62282 +    malloc(), etc.) to allocate data structures required to keep track of
62283 +   nodes during balancing. On the other hand, we cannot statically allocate
62284 +   required amount of space on the stack, because first: it is useless wastage
62285 +   of precious resource, and second: this amount is unknown in advance (tree
62286 +   height can change).
62287 +
62288 +   Pools, implemented in this file are solution for this problem:
62289 +
62290 +    - some configurable amount of objects is statically preallocated on the
62291 +    stack
62292 +
62293 +    - if this preallocated pool is exhausted and more objects are requested
62294 +    they are allocated dynamically.
62295 +
62296 +   Pools encapsulate distinction between statically and dynamically allocated
62297 +   objects. Both allocation and recycling look exactly the same.
62298 +
62299 +   To keep track of dynamically allocated objects, pool adds its own linkage
62300 +   to each object.
62301 +
62302 +   NOTE-NIKITA This linkage also contains some balancing-specific data. This
62303 +   is not perfect. On the other hand, balancing is currently the only client
62304 +   of pool code.
62305 +
62306 +   NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
62307 +   functions in the style of tslist/tshash, i.e., make them unreadable, but
62308 +   type-safe.
62309 +
62310 +*/
62311 +
62312 +#include "debug.h"
62313 +#include "pool.h"
62314 +#include "super.h"
62315 +
62316 +#include <linux/types.h>
62317 +#include <linux/err.h>
62318 +
62319 +/* initialize new pool object @h */
62320 +static void reiser4_init_pool_obj(struct reiser4_pool_header *h)
62321 +{
62322 +       INIT_LIST_HEAD(&h->usage_linkage);
62323 +       INIT_LIST_HEAD(&h->level_linkage);
62324 +       INIT_LIST_HEAD(&h->extra_linkage);
62325 +}
62326 +
62327 +/* initialize new pool */
62328 +void reiser4_init_pool(struct reiser4_pool *pool /* pool to initialize */ ,
62329 +                      size_t obj_size /* size of objects in @pool */ ,
62330 +                      int num_of_objs /* number of preallocated objects */ ,
62331 +                      char *data/* area for preallocated objects */)
62332 +{
62333 +       struct reiser4_pool_header *h;
62334 +       int i;
62335 +
62336 +       assert("nikita-955", pool != NULL);
62337 +       assert("nikita-1044", obj_size > 0);
62338 +       assert("nikita-956", num_of_objs >= 0);
62339 +       assert("nikita-957", data != NULL);
62340 +
62341 +       memset(pool, 0, sizeof *pool);
62342 +       pool->obj_size = obj_size;
62343 +       pool->data = data;
62344 +       INIT_LIST_HEAD(&pool->free);
62345 +       INIT_LIST_HEAD(&pool->used);
62346 +       INIT_LIST_HEAD(&pool->extra);
62347 +       memset(data, 0, obj_size * num_of_objs);
62348 +       for (i = 0; i < num_of_objs; ++i) {
62349 +               h = (struct reiser4_pool_header *) (data + i * obj_size);
62350 +               reiser4_init_pool_obj(h);
62351 +               /* add pool header to the end of pool's free list */
62352 +               list_add_tail(&h->usage_linkage, &pool->free);
62353 +       }
62354 +}
62355 +
62356 +/* release pool resources
62357 +
62358 +   Release all resources acquired by this pool, specifically, dynamically
62359 +   allocated objects.
62360 +
62361 +*/
62362 +void reiser4_done_pool(struct reiser4_pool *pool UNUSED_ARG)
62363 +{
62364 +}
62365 +
62366 +/* allocate carry object from @pool
62367 +
62368 +   First, try to get preallocated object. If this fails, resort to dynamic
62369 +   allocation.
62370 +
62371 +*/
62372 +static void *reiser4_pool_alloc(struct reiser4_pool *pool)
62373 +{
62374 +       struct reiser4_pool_header *result;
62375 +
62376 +       assert("nikita-959", pool != NULL);
62377 +
62378 +       if (!list_empty(&pool->free)) {
62379 +               struct list_head *linkage;
62380 +
62381 +               linkage = pool->free.next;
62382 +               list_del(linkage);
62383 +               INIT_LIST_HEAD(linkage);
62384 +               result = list_entry(linkage, struct reiser4_pool_header,
62385 +                                   usage_linkage);
62386 +               BUG_ON(!list_empty(&result->level_linkage) ||
62387 +                      !list_empty(&result->extra_linkage));
62388 +       } else {
62389 +               /* pool is empty. Extra allocations don't deserve dedicated
62390 +                  slab to be served from, as they are expected to be rare. */
62391 +               result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
62392 +               if (result != 0) {
62393 +                       reiser4_init_pool_obj(result);
62394 +                       list_add(&result->extra_linkage, &pool->extra);
62395 +               } else
62396 +                       return ERR_PTR(RETERR(-ENOMEM));
62397 +               BUG_ON(!list_empty(&result->usage_linkage) ||
62398 +                      !list_empty(&result->level_linkage));
62399 +       }
62400 +       ++pool->objs;
62401 +       list_add(&result->usage_linkage, &pool->used);
62402 +       memset(result + 1, 0, pool->obj_size - sizeof *result);
62403 +       return result;
62404 +}
62405 +
62406 +/* return object back to the pool */
62407 +void reiser4_pool_free(struct reiser4_pool *pool,
62408 +                      struct reiser4_pool_header *h)
62409 +{
62410 +       assert("nikita-961", h != NULL);
62411 +       assert("nikita-962", pool != NULL);
62412 +
62413 +       --pool->objs;
62414 +       assert("nikita-963", pool->objs >= 0);
62415 +
62416 +       list_del_init(&h->usage_linkage);
62417 +       list_del_init(&h->level_linkage);
62418 +
62419 +       if (list_empty(&h->extra_linkage))
62420 +               /*
62421 +                * pool header is not an extra one. Push it onto free list
62422 +                * using usage_linkage
62423 +                */
62424 +               list_add(&h->usage_linkage, &pool->free);
62425 +       else {
62426 +               /* remove pool header from pool's extra list and kfree it */
62427 +               list_del(&h->extra_linkage);
62428 +               kfree(h);
62429 +       }
62430 +}
62431 +
62432 +/* add new object to the carry level list
62433 +
62434 +   Carry level is FIFO most of the time, but not always. Complications arise
62435 +   when make_space() function tries to go to the left neighbor and thus adds
62436 +   carry node before existing nodes, and also, when updating delimiting keys
62437 +   after moving data between two nodes, we want left node to be locked before
62438 +   right node.
62439 +
62440 +   Latter case is confusing at the first glance. Problem is that COP_UPDATE
62441 +   operation that updates delimiting keys is sometimes called with two nodes
62442 +   (when data are moved between two nodes) and sometimes with only one node
62443 +   (when leftmost item is deleted in a node). In any case operation is
62444 +   supplied with at least node whose left delimiting key is to be updated
62445 +   (that is "right" node).
62446 +
62447 +   @pool - from which to allocate new object;
62448 +   @list - where to add object; @order - placement, one of pool_ordering;
62449 +   @reference - after (or before) which existing object to add
62450 +*/
62451 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
62452 +                                        struct list_head *list,
62453 +                                        pool_ordering order,
62454 +                                        struct reiser4_pool_header *reference)
62455 +{
62456 +       struct reiser4_pool_header *result;
62457 +
62458 +       assert("nikita-972", pool != NULL);
62459 +
62460 +       result = reiser4_pool_alloc(pool);
62461 +       if (IS_ERR(result))
62462 +               return result;
62463 +
62464 +       assert("nikita-973", result != NULL);
62465 +
62466 +       switch (order) {
62467 +       case POOLO_BEFORE:
62468 +               __list_add(&result->level_linkage,
62469 +                          reference->level_linkage.prev,
62470 +                          &reference->level_linkage);
62471 +               break;
62472 +       case POOLO_AFTER:
62473 +               __list_add(&result->level_linkage,
62474 +                          &reference->level_linkage,
62475 +                          reference->level_linkage.next);
62476 +               break;
62477 +       case POOLO_LAST:
62478 +               list_add_tail(&result->level_linkage, list);
62479 +               break;
62480 +       case POOLO_FIRST:
62481 +               list_add(&result->level_linkage, list);
62482 +               break;
62483 +       default:
62484 +               wrong_return_value("nikita-927", "order");
62485 +       }
62486 +       return result;
62487 +}
62488 +
62489 +/* Make Linus happy.
62490 +   Local variables:
62491 +   c-indentation-style: "K&R"
62492 +   mode-name: "LC"
62493 +   c-basic-offset: 8
62494 +   tab-width: 8
62495 +   fill-column: 120
62496 +   End:
62497 +*/
62498 diff -puN /dev/null fs/reiser4/pool.h
62499 --- /dev/null
62500 +++ a/fs/reiser4/pool.h
62501 @@ -0,0 +1,57 @@
62502 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62503 + * reiser4/README */
62504 +
62505 +/* Fast pool allocation */
62506 +
62507 +#ifndef __REISER4_POOL_H__
62508 +#define __REISER4_POOL_H__
62509 +
62510 +#include <linux/types.h>
62511 +
62512 +struct reiser4_pool {
62513 +       size_t obj_size;
62514 +       int objs;
62515 +       char *data;
62516 +       struct list_head free;
62517 +       struct list_head used;
62518 +       struct list_head extra;
62519 +};
62520 +
62521 +struct reiser4_pool_header {
62522 +       /* object is either on free or "used" lists */
62523 +       struct list_head usage_linkage;
62524 +       struct list_head level_linkage;
62525 +       struct list_head extra_linkage;
62526 +};
62527 +
62528 +typedef enum {
62529 +       POOLO_BEFORE,
62530 +       POOLO_AFTER,
62531 +       POOLO_LAST,
62532 +       POOLO_FIRST
62533 +} pool_ordering;
62534 +
62535 +/* pool manipulation functions */
62536 +
62537 +extern void reiser4_init_pool(struct reiser4_pool *pool, size_t obj_size,
62538 +                             int num_of_objs, char *data);
62539 +extern void reiser4_done_pool(struct reiser4_pool *pool);
62540 +extern void reiser4_pool_free(struct reiser4_pool *pool,
62541 +                             struct reiser4_pool_header *h);
62542 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
62543 +                                        struct list_head *list,
62544 +                                        pool_ordering order,
62545 +                                        struct reiser4_pool_header *reference);
62546 +
62547 +/* __REISER4_POOL_H__ */
62548 +#endif
62549 +
62550 +/* Make Linus happy.
62551 +   Local variables:
62552 +   c-indentation-style: "K&R"
62553 +   mode-name: "LC"
62554 +   c-basic-offset: 8
62555 +   tab-width: 8
62556 +   fill-column: 120
62557 +   End:
62558 +*/
62559 diff -puN /dev/null fs/reiser4/readahead.c
62560 --- /dev/null
62561 +++ a/fs/reiser4/readahead.c
62562 @@ -0,0 +1,140 @@
62563 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62564 + * reiser4/README */
62565 +
62566 +#include "forward.h"
62567 +#include "tree.h"
62568 +#include "tree_walk.h"
62569 +#include "super.h"
62570 +#include "inode.h"
62571 +#include "key.h"
62572 +#include "znode.h"
62573 +
62574 +#include <linux/swap.h>                /* for totalram_pages */
62575 +
62576 +void reiser4_init_ra_info(ra_info_t *rai)
62577 +{
62578 +       rai->key_to_stop = *reiser4_min_key();
62579 +}
62580 +
62581 +/* global formatted node readahead parameter. It can be set by mount option
62582 + * -o readahead:NUM:1 */
62583 +static inline int ra_adjacent_only(int flags)
62584 +{
62585 +       return flags & RA_ADJACENT_ONLY;
62586 +}
62587 +
62588 +/* this is used by formatted_readahead to decide whether read for right neighbor
62589 + * of node is to be issued. It returns 1 if right neighbor's first key is less
62590 + * or equal to readahead's stop key */
62591 +static int should_readahead_neighbor(znode * node, ra_info_t *info)
62592 +{
62593 +       int result;
62594 +
62595 +       read_lock_dk(znode_get_tree(node));
62596 +       result = keyle(znode_get_rd_key(node), &info->key_to_stop);
62597 +       read_unlock_dk(znode_get_tree(node));
62598 +       return result;
62599 +}
62600 +
62601 +#define LOW_MEM_PERCENTAGE (5)
62602 +/* non-zero when fewer than LOW_MEM_PERCENTAGE percent of totalram_pages are free */
62603 +static int low_on_memory(void)
62604 +{
62605 +       /* unsigned long: an unsigned int could truncate a large page count */
62606 +       unsigned long freepages = nr_free_pages();
62607 +
62608 +       return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
62609 +}
62610 +
62611 +/* start read for @node and for a few of its right neighbors */
62612 +void formatted_readahead(znode * node, ra_info_t *info)
62613 +{
62614 +       struct formatted_ra_params *ra_params;
62615 +       znode *cur;
62616 +       int i;
62617 +       int grn_flags;
62618 +       lock_handle next_lh;
62619 +
62620 +       /* do nothing if node block number has not been assigned to node (which
62621 +        * means it is still in cache). */
62622 +       if (reiser4_blocknr_is_fake(znode_get_block(node)))
62623 +               return;
62624 +
62625 +       ra_params = get_current_super_ra_params();
62626 +
62627 +       if (znode_page(node) == NULL)
62628 +               jstartio(ZJNODE(node));
62629 +
62630 +       if (znode_get_level(node) != LEAF_LEVEL)
62631 +               return;
62632 +
62633 +       /* don't waste memory for read-ahead when low on memory */
62634 +       if (low_on_memory())
62635 +               return;
62636 +
62637 +       /* We can have locked nodes on upper tree levels, in this situation lock
62638 +          priorities do not help to resolve deadlocks, we have to use TRY_LOCK
62639 +          here. */
62640 +       grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
62641 +
62642 +       i = 0;
62643 +       cur = zref(node);
62644 +       init_lh(&next_lh);
62645 +       while (i < ra_params->max) {
62646 +               const reiser4_block_nr * nextblk;
62647 +
62648 +               if (!should_readahead_neighbor(cur, info))
62649 +                       break;
62650 +
62651 +               if (reiser4_get_right_neighbor
62652 +                   (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
62653 +                       break;
62654 +
62655 +               nextblk = znode_get_block(next_lh.node);
62656 +               if (reiser4_blocknr_is_fake(nextblk) ||
62657 +                   (ra_adjacent_only(ra_params->flags)
62658 +                    && *nextblk != *znode_get_block(cur) + 1))
62659 +                       break;
62660 +
62661 +               zput(cur);
62662 +               cur = zref(next_lh.node);
62663 +               done_lh(&next_lh);
62664 +               if (znode_page(cur) == NULL)
62665 +                       jstartio(ZJNODE(cur));
62666 +               else
62667 +                       /* Do not scan read-ahead window if pages already
62668 +                        * allocated (and i/o already started). */
62669 +                       break;
62670 +
62671 +               i++;
62672 +       }
62673 +       zput(cur);
62674 +       done_lh(&next_lh);
62675 +}
62676 +
62677 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap)
62678 +{
62679 +       reiser4_key *stop_key;
62680 +
62681 +       assert("nikita-3542", dir != NULL);
62682 +       assert("nikita-3543", tap != NULL);
62683 +
62684 +       stop_key = &tap->ra_info.key_to_stop;
62685 +       /* initialize readdir readahead information: include into readahead
62686 +        * stat data of all files of the directory */
62687 +       set_key_locality(stop_key, get_inode_oid(dir));
62688 +       set_key_type(stop_key, KEY_SD_MINOR);
62689 +       set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
62690 +       set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
62691 +       set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
62692 +}
62693 +
62694 +/*
62695 +   Local variables:
62696 +   c-indentation-style: "K&R"
62697 +   mode-name: "LC"
62698 +   c-basic-offset: 8
62699 +   tab-width: 8
62700 +   fill-column: 80
62701 +   End:
62702 +*/
62703 diff -puN /dev/null fs/reiser4/readahead.h
62704 --- /dev/null
62705 +++ a/fs/reiser4/readahead.h
62706 @@ -0,0 +1,52 @@
62707 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62708 + * reiser4/README */
62709 +
62710 +#ifndef __READAHEAD_H__
62711 +#define __READAHEAD_H__
62712 +
62713 +#include "key.h"
62714 +
62715 +typedef enum {
62716 +       RA_ADJACENT_ONLY = 1,   /* only requests nodes which are adjacent.
62717 +                                  Default is NO (not only adjacent) */
62718 +} ra_global_flags;
62719 +
62720 +/* reiser4 super block has a field of this type.
62721 +   It controls readahead during tree traversals */
62722 +struct formatted_ra_params {
62723 +       unsigned long max;      /* request not more than this amount of nodes.
62724 +                                  Default is totalram_pages / 4 */
62725 +       int flags;
62726 +};
62727 +
62728 +typedef struct {
62729 +       reiser4_key key_to_stop;
62730 +} ra_info_t;
62731 +
62732 +void formatted_readahead(znode * , ra_info_t *);
62733 +void reiser4_init_ra_info(ra_info_t *rai);
62734 +
62735 +struct reiser4_file_ra_state {
62736 +       loff_t start;           /* Current window */
62737 +       loff_t size;
62738 +       loff_t next_size;       /* Next window size */
62739 +       loff_t ahead_start;     /* Ahead window */
62740 +       loff_t ahead_size;
62741 +       loff_t max_window_size; /* Maximum readahead window */
62742 +       loff_t slow_start;      /* enlarging r/a size algorithm. */
62743 +};
62744 +
62745 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap);
62746 +
62747 +/* __READAHEAD_H__ */
62748 +#endif
62749 +
62750 +/*
62751 +   Local variables:
62752 +   c-indentation-style: "K&R"
62753 +   mode-name: "LC"
62754 +   c-basic-offset: 8
62755 +   tab-width: 8
62756 +   fill-column: 120
62757 +   End:
62758 +*/
62759 diff -puN /dev/null fs/reiser4/reiser4.h
62760 --- /dev/null
62761 +++ a/fs/reiser4/reiser4.h
62762 @@ -0,0 +1,270 @@
62763 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62764 + * reiser4/README */
62765 +
62766 +/* definitions of common constants used by reiser4 */
62767 +
62768 +#if !defined(__REISER4_H__)
62769 +#define __REISER4_H__
62770 +
62771 +#include <asm/param.h>         /* for HZ */
62772 +#include <linux/errno.h>
62773 +#include <linux/types.h>
62774 +#include <linux/fs.h>
62775 +#include <linux/hardirq.h>
62776 +#include <linux/sched.h>
62777 +
62778 +/*
62779 + * reiser4 compilation options.
62780 + */
62781 +
62782 +#if defined(CONFIG_REISER4_DEBUG)
62783 +/* turn on assertion checks */
62784 +#define REISER4_DEBUG (1)
62785 +#else
62786 +#define REISER4_DEBUG (0)
62787 +#endif
62788 +
62789 +#if defined(CONFIG_ZLIB_INFLATE)
62790 +/* turn on zlib */
62791 +#define REISER4_ZLIB (1)
62792 +#else
62793 +#define REISER4_ZLIB (0)
62794 +#endif
62795 +
62796 +#if defined(CONFIG_CRYPTO_SHA256)
62797 +#define REISER4_SHA256 (1)
62798 +#else
62799 +#define REISER4_SHA256 (0)
62800 +#endif
62801 +
62802 +/*
62803 + * Turn on large keys mode. In this mode (which is default), reiser4 key has 4
62804 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
62805 + * components. Additional component, referred to as "ordering" is used to
62806 + * order items from which given object is composed of. As such, ordering is
62807 + * placed between locality and objectid. For directory item ordering contains
62808 + * initial prefix of the file name this item is for. This sorts all directory
62809 + * items within given directory lexicographically (but see
62810 + * fibration.[ch]). For file body and stat-data, ordering contains initial
62811 + * prefix of the name file was initially created with. In the common case
62812 + * (files with single name) this allows to order file bodies and stat-datas in
62813 + * the same order as their respective directory entries, thus speeding up
62814 + * readdir.
62815 + *
62816 + * Note, that kernel can only mount file system with the same key size as one
62817 + * it is compiled for, so flipping this option may render your data
62818 + * inaccessible.
62819 + */
62820 +#define REISER4_LARGE_KEY (1)
62821 +/*#define REISER4_LARGE_KEY (0)*/
62822 +
62823 +/*#define GUESS_EXISTS 1*/
62824 +
62825 +/*
62826 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
62827 + * option
62828 + */
62829 +
62830 +extern const char *REISER4_SUPER_MAGIC_STRING;
62831 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
62832 +                                        * beginning of device */
62833 +
62834 +/* here go tunable parameters that are not worth special entry in kernel
62835 +   configuration */
62836 +
62837 +/* default number of slots in coord-by-key caches */
62838 +#define CBK_CACHE_SLOTS    (16)
62839 +/* how many elementary tree operation to carry on the next level */
62840 +#define CARRIES_POOL_SIZE        (5)
62841 +/* size of pool of preallocated nodes for carry process. */
62842 +#define NODES_LOCKED_POOL_SIZE   (5)
62843 +
62844 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
62845 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
62846 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
62847 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
62848 +
62849 +/* we are supporting reservation of disk space on uid basis */
62850 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
62851 +/* we are supporting reservation of disk space for groups */
62852 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
62853 +/* we are supporting reservation of disk space for root */
62854 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
62855 +/* we use rapid flush mode, see flush.c for comments.  */
62856 +#define REISER4_USE_RAPID_FLUSH (1)
62857 +
62858 +/*
62859 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
62860 + */
62861 +#define REISER4_USE_ENTD (1)
62862 +
62863 +/* key allocation is Plan-A */
62864 +#define REISER4_PLANA_KEY_ALLOCATION (1)
62865 +/* key allocation follows good old 3.x scheme */
62866 +#define REISER4_3_5_KEY_ALLOCATION (0)
62867 +
62868 +/* size of hash-table for znodes */
62869 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
62870 +
62871 +/* number of buckets in lnode hash-table */
62872 +#define LNODE_HTABLE_BUCKETS (1024)
62873 +
62874 +/* some ridiculously high maximal limit on height of znode tree. This
62875 +    is used in declaration of various per level arrays and
62876 +    to allocate statistics gathering array for per-level stats. */
62877 +#define REISER4_MAX_ZTREE_HEIGHT     (8)
62878 +
62879 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
62880 +
62881 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
62882 +   sequential search is on average faster than binary. This is because
62883 +   of better optimization and because sequential search is more CPU
62884 +   cache friendly. This number (25) was found by experiments on dual AMD
62885 +   Athlon(tm), 1400MHz.
62886 +
62887 +   NOTE: testing in kernel has shown that binary search is more effective than
62888 +   implied by results of the user level benchmarking. Probably because in the
62889 +   node keys are separated by other data. So value was adjusted after few
62890 +   tests. More thorough tuning is needed.
62891 +*/
62892 +#define REISER4_SEQ_SEARCH_BREAK      (3)
62893 +
62894 +/* don't allow tree to be lower than this */
62895 +#define REISER4_MIN_TREE_HEIGHT       (TWIG_LEVEL)
62896 +
62897 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
62898 + * available memory. */
62899 +/* Default value of maximal atom size. Can be overwritten by
62900 +   tmgr.atom_max_size mount option. By default infinity. */
62901 +#define REISER4_ATOM_MAX_SIZE         ((unsigned)(~0))
62902 +
62903 +/* Default value of maximal atom age (in jiffies). After reaching this age
62904 +   atom will be forced to commit, either synchronously or asynchronously. Can
62905 +   be overwritten by tmgr.atom_max_age mount option. */
62906 +#define REISER4_ATOM_MAX_AGE          (600 * HZ)
62907 +
62908 +/* sleeping period for ktxnmrgd */
62909 +#define REISER4_TXNMGR_TIMEOUT  (5 * HZ)
62910 +
62911 +/* timeout to wait for ent thread in writepage: 3 ms, 0 jiffies if HZ < 334 */
62912 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
62913 +
62914 +/* start complaining after that many restarts in coord_by_key().
62915 +
62916 +   This either means incredibly heavy contention for this part of a tree, or
62917 +   some corruption or bug.
62918 +*/
62919 +#define REISER4_CBK_ITERATIONS_LIMIT  (100)
62920 +
62921 +/* return -EIO after that many iterations in coord_by_key().
62922 +
62923 +   I have witnessed more than 800 iterations (in 30 thread test) before cbk
62924 +   finished. --nikita
62925 +*/
62926 +#define REISER4_MAX_CBK_ITERATIONS    500000
62927 +
62928 +/* put a per-inode limit on maximal number of directory entries with identical
62929 +   keys in hashed directory.
62930 +
62931 +   Disable this until inheritance interfaces stabilize: we need some way to
62932 +   set per directory limit.
62933 +*/
62934 +#define REISER4_USE_COLLISION_LIMIT    (0)
62935 +
62936 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
62937 +   blocks it will force them to be relocated. */
62938 +#define FLUSH_RELOCATE_THRESHOLD 64
62939 +/* If flush can find a block allocation no farther than
62940 +   FLUSH_RELOCATE_DISTANCE from the preceder, it will relocate to that position.
62941 + */
62942 +#define FLUSH_RELOCATE_DISTANCE  64
62943 +
62944 +/* If we have written this much or more blocks before encountering busy jnode
62945 +   in flush list - abort flushing hoping that next time we get called
62946 +   this jnode will be clean already, and we will save some seeks. */
62947 +#define FLUSH_WRITTEN_THRESHOLD 50
62948 +
62949 +/* The maximum number of nodes to scan left on a level during flush. */
62950 +#define FLUSH_SCAN_MAXNODES 10000
62951 +
62952 +/* per-atom limit of flushers */
62953 +#define ATOM_MAX_FLUSHERS (1)
62954 +
62955 +/* default tracing buffer size */
62956 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
62957 +
62958 +/* what size units of IO we would like cp, etc., to use, in writing to
62959 +   reiser4. In bytes.
62960 +
62961 +   Can be overwritten by optimal_io_size mount option.
62962 +*/
62963 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
62964 +
62965 +/* see comments in inode.c:oid_to_uino() */
62966 +#define REISER4_UINO_SHIFT (1 << 30)
62967 +
62968 +/* Mark function argument as unused to avoid compiler warnings. */
62969 +#define UNUSED_ARG __attribute__((unused))
62970 +
62971 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
62972 +#define NONNULL __attribute__((nonnull))
62973 +#else
62974 +#define NONNULL
62975 +#endif
62976 +
62977 +/* master super block offset in bytes.*/
62978 +#define REISER4_MASTER_OFFSET 65536
62979 +
62980 +/* size of VFS block */
62981 +#define VFS_BLKSIZE 512
62982 +/* number of bits in size of VFS block (512==2^9) */
62983 +#define VFS_BLKSIZE_BITS 9
62984 +
62985 +#define REISER4_I reiser4_inode_data
62986 +
62987 +/* implication */
62988 +#define ergo(antecedent, consequent) (!(antecedent) || (consequent))
62989 +/* logical equivalence */
62990 +#define equi(p1, p2) (ergo((p1), (p2)) && ergo((p2), (p1)))
62991 +
62992 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
62993 +
62994 +#define NOT_YET                       (0)
62995 +
62996 +/** Reiser4 specific error codes **/
62997 +
62998 +#define REISER4_ERROR_CODE_BASE 10000
62999 +
63000 +/* Neighbor is not available (side neighbor or parent) */
63001 +#define E_NO_NEIGHBOR  (REISER4_ERROR_CODE_BASE)
63002 +
63003 +/* Node was not found in cache */
63004 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
63005 +
63006 +/* node does not have enough free space to complete balancing operation */
63007 +#define E_NODE_FULL    (REISER4_ERROR_CODE_BASE + 2)
63008 +
63009 +/* repeat operation */
63010 +#define E_REPEAT       (REISER4_ERROR_CODE_BASE + 3)
63011 +
63012 +/* deadlock happens */
63013 +#define E_DEADLOCK     (REISER4_ERROR_CODE_BASE + 4)
63014 +
63015 +/* operation cannot be performed, because it would block and non-blocking mode
63016 + * was requested. */
63017 +#define E_BLOCK        (REISER4_ERROR_CODE_BASE + 5)
63018 +
63019 +/* wait some event (depends on context), then repeat */
63020 +#define E_WAIT         (REISER4_ERROR_CODE_BASE + 6)
63021 +
63022 +#endif                         /* __REISER4_H__ */
63023 +
63024 +/* Make Linus happy.
63025 +   Local variables:
63026 +   c-indentation-style: "K&R"
63027 +   mode-name: "LC"
63028 +   c-basic-offset: 8
63029 +   tab-width: 8
63030 +   fill-column: 120
63031 +   End:
63032 +*/
63033 diff -puN /dev/null fs/reiser4/safe_link.c
63034 --- /dev/null
63035 +++ a/fs/reiser4/safe_link.c
63036 @@ -0,0 +1,354 @@
63037 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
63038 + * reiser4/README */
63039 +
63040 +/* Safe-links. */
63041 +
63042 +/*
63043 + * Safe-links are used to maintain file system consistency during operations
63044 + * that span multiple transactions. For example:
63045 + *
63046 + *     1. Unlink. UNIX supports "open-but-unlinked" files, that is files
63047 + *     without user-visible names in the file system, but still opened by some
63048 + *     active process. What happens here is that unlink proper (i.e., removal
63049 + *     of the last file name) and file deletion (truncate of file body to zero
63050 + *     and deletion of stat-data, that happens when last file descriptor is
63051 + *     closed), may belong to different transactions T1 and T2. If a crash
63052 + *     happens after T1 commit, but before T2 commit, on-disk file system has
63053 + *     a file without name, that is, disk space leak.
63054 + *
63055 + *     2. Truncate. Truncate of a large file may span multiple transactions. If
63056 + *     the system crashes while truncate is in progress, the file is left partially
63057 + *     truncated, which violates the "atomicity guarantees" of reiser4, viz. that
63058 + *     every system call is atomic.
63059 + *
63060 + * Safe-links address both of the above cases. Basically, a safe-link is a way
63061 + * to post some operation to be executed during commit of a transaction other
63062 + * than the current one. (Another way to look at a safe-link is to interpret it
63063 + * as a form of logical logging.)
63064 + *
63065 + * Specifically, at the beginning of unlink a safe-link is inserted in the
63066 + * tree. This safe-link is normally removed by file deletion code (during
63067 + * transaction T2 in the above terms). Truncate also inserts a safe-link that
63068 + * is normally removed when the truncate operation is finished.
63069 + *
63070 + * This means that in the case of "clean umount" there are no safe-links in
63071 + * the tree. If safe-links are observed during mount, it means that (a) the
63072 + * system was terminated abnormally, and (b) the safe-links correspond to
63073 + * "pending" (i.e., not finished) operations that were in progress during system
63074 + * termination. Each safe-link records enough information to complete the
63075 + * corresponding operation, and mount simply "replays" them (hence the
63076 + * analogy with logical logging).
63077 + *
63078 + * Safe-links are implemented as blackbox items (see
63079 + * plugin/item/blackbox.[ch]).
63080 + *
63081 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
63082 + * list" there.
63083 + */
63084 +
63085 +#include "safe_link.h"
63086 +#include "debug.h"
63087 +#include "inode.h"
63088 +
63089 +#include "plugin/item/blackbox.h"
63090 +
63091 +#include <linux/fs.h>
63092 +
63093 +/*
63094 + * On-disk format of safe-link.
63095 + */
63096 +typedef struct safelink {
63097 +       reiser4_key sdkey;      /* key of the stat-data of the file this
63098 +                                * safe-link protects */
63099 +       d64 size;               /* final file size, little-endian; written only for SAFE_TRUNCATE links */
63100 +} safelink_t;
63101 +
63102 +/*
63103 + * Locality under which safe-link items are stored: root directory objectid + 1.
63104 + */
63105 +static oid_t safe_link_locality(reiser4_tree * tree)
63106 +{
63107 +       struct super_block *sb = tree->super;
63108 +
63109 +       return get_key_objectid(get_super_private(sb)->df_plug->root_dir_key(sb)) + 1;
63110 +}
63111 +
63112 +/*
63113 +  Construct a key for the safe-link. Key has the following format:
63114 +
63115 +|        60     | 4 |        64        | 4 |      60       |         64       |
63116 ++---------------+---+------------------+---+---------------+------------------+
63117 +|   locality    | 0 |        0         | 0 |   objectid    |     link type    |
63118 ++---------------+---+------------------+---+---------------+------------------+
63119 +|                   |                  |                   |                  |
63120 +|     8 bytes       |     8 bytes      |      8 bytes      |      8 bytes     |
63121 +
63122 +   This is in large keys format. In small keys format second 8 byte chunk is
63123 +   out. Locality is a constant returned by safe_link_locality(). objectid is
63124 +   an oid of a file on which operation protected by this safe-link is
63125 +   performed. link-type is used to distinguish safe-links for different
63126 +   operations.
63127 +
63128 + */
63129 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
63130 +                                  reiser4_safe_link_t link, reiser4_key * key)
63131 +{
63132 +       reiser4_key_init(key);
63133 +       set_key_locality(key, safe_link_locality(tree));        /* same locality for every safe-link of @tree */
63134 +       set_key_objectid(key, oid);     /* object the protected operation is performed on */
63135 +       set_key_offset(key, link);      /* link type is kept in the offset field */
63136 +       return key;     /* @key itself, so the call can be used directly as an argument */
63137 +}
63138 +
63139 +/*
63140 + * how much disk space is necessary to insert and remove (in the
63141 + * error-handling path) safe-link.
63142 + */
63143 +static __u64 safe_link_tograb(reiser4_tree * tree)
63144 +{
63145 +       return
63146 +           /* insertion of the safe-link item */
63147 +           estimate_one_insert_item(tree) +
63148 +           /* removal of the safe-link item */
63149 +           estimate_one_item_removal(tree) +
63150 +           /* drill to the leaf level during insertion */
63151 +           1 + estimate_one_insert_item(tree) +
63152 +           /*
63153 +            * possible update of an existing safe-link. Actually, if the
63154 +            * safe-link existed already (we failed to remove it), then no
63155 +            * insertion is necessary, so this term is already "covered",
63156 +            * but for simplicity let's leave it.
63157 +            */
63158 +           1;
63159 +}
63160 +
63161 +/*
63162 + * grab enough disk space to insert and remove (in the error-handling path)
63163 + * safe-link.
63164 + */
63165 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
63166 +{
63167 +       int result;
63168 +
63169 +       grab_space_enable();
63170 +       /* The sbinfo->delete_mutex can be taken here, so
63171 +        * safe_link_release() should be called before leaving reiser4
63172 +        * context. The amount reserved is safe_link_tograb(). */
63173 +       result =
63174 +           reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
63175 +       grab_space_enable();    /* NOTE(review): presumably re-arms grabbing for the caller's next grab - confirm */
63176 +       return result;  /* result of reiser4_grab_reserved(), passed through unchanged */
63177 +}
63178 +
63179 +/* release unused disk space reserved by safe_link_grab(). Forwards to
63180 + * reiser4_release_reserved() on the super block of @tree.
63181 + */
63182 +void safe_link_release(reiser4_tree * tree)
63183 +{
63184 +       reiser4_release_reserved(tree->super);
63185 +}
63186 +
63187 +/*
63188 + * insert into tree safe-link for operation @link on inode @inode.
63189 + */
63190 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
63191 +{
63192 +       reiser4_tree *tree;
63193 +       reiser4_key link_key;
63194 +       safelink_t body;
63195 +       int nbytes;
63196 +       int ret;
63197 +
63198 +       /* every safe-link body starts with the stat-data key of its file */
63199 +       build_sd_key(inode, &body.sdkey);
63200 +       nbytes = sizeof body.sdkey;
63201 +       if (link == SAFE_TRUNCATE) {
63202 +               /* truncate also stores the final file length after the key */
63203 +               put_unaligned(cpu_to_le64(inode->i_size), &body.size);
63204 +               nbytes += sizeof(body.size);
63205 +       }
63206 +
63207 +       tree = reiser4_tree_by_inode(inode);
63208 +       build_link_key(tree, get_inode_oid(inode), link, &link_key);
63209 +
63210 +       ret = store_black_box(tree, &link_key, &body, nbytes);
63211 +       if (ret != -EEXIST)
63212 +               return ret;
63213 +
63214 +       /* an item with this key is already in the tree: overwrite its body */
63215 +       return update_black_box(tree, &link_key, &body, nbytes);
63216 +}
63217 +
63218 +/*
63219 + * remove from @tree the safe-link for operation @link on the object @oid.
63220 + */
63221 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
63222 +{
63223 +       reiser4_key link_key;
63224 +
63225 +       build_link_key(tree, oid, link, &link_key);
63226 +       return kill_black_box(tree, &link_key);
63227 +}
63228 +
63229 +/*
63230 + * in-memory structure to keep information extracted from safe-link. This is
63231 + * used to iterate over all safe-links.
63232 + */
63233 +struct safe_link_context {
63234 +       reiser4_tree *tree;     /* tree being iterated */
63235 +       reiser4_key key;        /* safe-link key: search position handed to load_black_box() on each step */
63236 +       reiser4_key sdkey;      /* stat-data key of the object, copied from the safe-link body */
63237 +       reiser4_safe_link_t link;       /* safe-link type, taken from the offset of ->key */
63238 +       oid_t oid;              /* object oid, taken from the objectid of ->key */
63239 +       __u64 size;             /* final size for truncate; meaningful for SAFE_TRUNCATE links only */
63240 +};
63241 +
63242 +/*
63243 + * start iterating over all safe-links.
63244 + */
63245 +static void safe_link_iter_begin(reiser4_tree * tree,
63246 +                                struct safe_link_context *ctx)
63247 +{
63248 +       ctx->tree = tree;
63249 +       reiser4_key_init(&ctx->key);
63250 +       set_key_locality(&ctx->key, safe_link_locality(tree));  /* search within the safe-link locality */
63251 +       set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));       /* objectid and offset come from the */
63252 +       set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));   /* maximal key: start at the top of the locality */
63253 +}
63254 +
63255 +/* load next safe-link into @ctx and return the result of load_black_box()
63256 + * (0 on success). ->size is set to 0 for links that store no size.
63257 + */
63258 +static int safe_link_iter_next(struct safe_link_context *ctx)
63259 +{
63260 +       int result;
63261 +       safelink_t sl;
63262 +
63263 +       result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
63264 +       if (result == 0) {
63265 +               ctx->oid = get_key_objectid(&ctx->key);
63266 +               ctx->link = get_key_offset(&ctx->key);
63267 +               ctx->sdkey = sl.sdkey;
63268 +               /* never leave ->size indeterminate: process_safelinks() reads it for every link */
63269 +               ctx->size = (ctx->link == SAFE_TRUNCATE) ? le64_to_cpu(get_unaligned(&sl.size)) : 0;
63270 +       }
63271 +       return result;
63272 +}
63273 +
63274 +/* check whether iteration is over: true once ->key no longer belongs to the
63275 + * safe-link locality, i.e., there are no more safe-links left in the tree.
63276 + */
63277 +static int safe_link_iter_finished(struct safe_link_context *ctx)
63278 +{
63279 +       return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
63280 +}
63281 +
63282 +/*
63283 + * finish safe-link iteration. Counterpart of safe_link_iter_begin().
63284 + */
63285 +static void safe_link_iter_end(struct safe_link_context *ctx)
63286 +{
63287 +       /* nothing to do: safe_link_iter_begin() only fills in fields of @ctx */
63288 +}
63289 +
63290 +/*
63291 + * process single safe-link.
63292 + */
63293 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
63294 +                           reiser4_key * sdkey, oid_t oid, __u64 size)
63295 +{
63296 +       struct inode *inode;
63297 +       int result;
63298 +
63299 +       /*
63300 +        * obtain object inode by reiser4_iget(), then call object plugin
63301 +        * ->safelink() method to do actual work, then delete safe-link on
63302 +        * success. Returns 0, or the first error encountered on the way.
63303 +        */
63304 +       inode = reiser4_iget(super, sdkey, 1);
63305 +       if (!IS_ERR(inode)) {
63306 +               file_plugin *fplug;
63307 +
63308 +               fplug = inode_file_plugin(inode);
63309 +               assert("nikita-3428", fplug != NULL);
63310 +               assert("", oid == get_inode_oid(inode));
63311 +               if (fplug->safelink != NULL) {
63312 +                       /* reiser4_txn_restart_current is not necessary because
63313 +                        * mounting is single-threaded. However, without it
63314 +                        * deadlock detection code will complain (see
63315 +                        * nikita-3361). */
63316 +                       reiser4_txn_restart_current();
63317 +                       result = fplug->safelink(inode, link, size);
63318 +               } else {
63319 +                       warning("nikita-3430",
63320 +                               "Cannot handle safelink for %llu",
63321 +                               (unsigned long long)oid);
63322 +                       reiser4_print_key("key", sdkey);
63323 +                       result = 0;     /* nothing to replay: the safe-link is still deleted below */
63324 +               }
63325 +               if (result != 0) {
63326 +                       warning("nikita-3431",
63327 +                               "Error processing safelink for %llu: %i",
63328 +                               (unsigned long long)oid, result);
63329 +               }
63330 +               reiser4_iget_complete(inode);
63331 +               iput(inode);
63332 +               if (result == 0) {
63333 +                       result = safe_link_grab(reiser4_get_tree(super),
63334 +                                               BA_CAN_COMMIT);
63335 +                       if (result == 0)
63336 +                               result =
63337 +                                   safe_link_del(reiser4_get_tree(super), oid,
63338 +                                                 link);
63339 +                       safe_link_release(reiser4_get_tree(super));     /* called whether or not the grab succeeded */
63340 +                       /*
63341 +                        * restart transaction: if there was large number of
63342 +                        * safe-links, their processing may fail to fit into
63343 +                        * single transaction.
63344 +                        */
63345 +                       if (result == 0)
63346 +                               reiser4_txn_restart_current();
63347 +               }
63348 +       } else
63349 +               result = PTR_ERR(inode);
63350 +       return result;
63351 +}
63352 +
63353 +/*
63354 + * iterate over all safe-links in the file-system processing them one by one.
63355 + */
63356 +int process_safelinks(struct super_block *super)
63357 +{
63358 +       struct safe_link_context ctx;
63359 +       int ret = 0;
63360 +
63361 +       /* nothing is replayed on a read-only file system */
63362 +       if (rofs_super(super))
63363 +               return 0;
63364 +       safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
63365 +       while (ret == 0) {
63366 +               ret = safe_link_iter_next(&ctx);
63367 +               if (safe_link_iter_finished(&ctx) || ret == -ENOENT) {
63368 +                       /* ran out of safe-links: this is the normal exit */
63369 +                       ret = 0;
63370 +                       break;
63371 +               }
63372 +               if (ret != 0)
63373 +                       break;
63374 +               ret = process_safelink(super, ctx.link, &ctx.sdkey, ctx.oid,
63375 +                                      ctx.size);
63376 +       }
63377 +       safe_link_iter_end(&ctx);
63378 +       return ret;
63379 +}
63380 +
63381 +/* Make Linus happy.
63382 +   Local variables:
63383 +   c-indentation-style: "K&R"
63384 +   mode-name: "LC"
63385 +   c-basic-offset: 8
63386 +   tab-width: 8
63387 +   fill-column: 120
63388 +   scroll-step: 1
63389 +   End:
63390 +*/
63391 diff -puN /dev/null fs/reiser4/safe_link.h
63392 --- /dev/null
63393 +++ a/fs/reiser4/safe_link.h
63394 @@ -0,0 +1,29 @@
63395 +/* Copyright 2003 by Hans Reiser, licensing governed by
63396 + * reiser4/README */
63397 +
63398 +/* Safe-links. See safe_link.c for details. */
63399 +
63400 +#if !defined(__FS_SAFE_LINK_H__)
63401 +#define __FS_SAFE_LINK_H__
63402 +
63403 +#include "tree.h"
63404 +
63405 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);      /* reserve space for a safe-link */
63406 +void safe_link_release(reiser4_tree * tree);    /* release what safe_link_grab() reserved */
63407 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
63408 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link);
63409 +/* process every safe-link left in the tree; does nothing on a read-only file system */
63410 +int process_safelinks(struct super_block *super);
63411 +
63412 +/* __FS_SAFE_LINK_H__ */
63413 +#endif
63414 +
63415 +/* Make Linus happy.
63416 +   Local variables:
63417 +   c-indentation-style: "K&R"
63418 +   mode-name: "LC"
63419 +   c-basic-offset: 8
63420 +   tab-width: 8
63421 +   fill-column: 120
63422 +   End:
63423 +*/
63424 diff -puN /dev/null fs/reiser4/seal.c
63425 --- /dev/null
63426 +++ a/fs/reiser4/seal.c
63427 @@ -0,0 +1,218 @@
63428 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63429 +/* Seals implementation. */
63430 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
63431 +   allowing to bypass tree traversal. But normal usage of coords implies that
63432 +   node pointed to by coord is locked, whereas seals don't keep a lock (or
63433 +   even a reference) to znode. Instead, each znode contains a version number,
63434 +   increased on each znode modification. This version number is copied into a
63435 +   seal when seal is created. Later, one can "validate" seal by calling
63436 +   reiser4_seal_validate(). If znode is in cache and its version number is
63437 +   still the same, seal is "pristine" and coord associated with it can be
63438 +   re-used immediately.
63439 +
63440 +   If, on the other hand, znode is out of cache, or it is obviously different
63441 +   one from the znode seal was initially attached to (for example, it is on
63442 +   the different level, or is being removed from the tree), seal is
63443 +   irreparably invalid ("burned") and tree traversal has to be repeated.
63444 +
63445 +   Otherwise, there is some hope, that while znode was modified (and seal was
63446 +   "broken" as a result), key attached to the seal is still in the node. This
63447 +   is checked by first comparing this key with delimiting keys of node and, if
63448 +   key is ok, doing intra-node lookup.
63449 +
63450 +   Znode version is maintained in the following way:
63451 +
63452 +   there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
63453 +   znode_epoch is incremented and its new value is stored in ->version field
63454 +   of new znode. Whenever znode is dirtied (which means it was probably
63455 +   modified), znode_epoch is also incremented and its new value is stored in
63456 +   znode->version. This is done because just incrementing znode->version
63457 +   on each update is not enough: it may happen that a znode gets deleted, a new
63458 +   znode is allocated for the same disk block and gets the same version
63459 +   counter, tricking seal code into a false positive.
63460 +*/
63461 +
63462 +#include "forward.h"
63463 +#include "debug.h"
63464 +#include "key.h"
63465 +#include "coord.h"
63466 +#include "seal.h"
63467 +#include "plugin/item/item.h"
63468 +#include "plugin/node/node.h"
63469 +#include "jnode.h"
63470 +#include "znode.h"
63471 +#include "super.h"
63472 +
63473 +static znode *seal_node(const seal_t *seal);
63474 +static int seal_matches(const seal_t *seal, znode * node);
63475 +
63476 +/* initialise seal. This can be called several times on the same seal. @coord
63477 +   and @key can be NULL.  */
63478 +void reiser4_seal_init(seal_t *seal /* seal to initialise */ ,
63479 +                      const coord_t *coord /* coord @seal will be
63480 +                                             * attached to */ ,
63481 +                      const reiser4_key * key UNUSED_ARG /* key @seal will be
63482 +                                                          * attached to */ )
63483 +{
63484 +       assert("nikita-1886", seal != NULL);
63485 +       memset(seal, 0, sizeof *seal);
63486 +       if (coord != NULL) {
63487 +               znode *node;
63488 +
63489 +               node = coord->node;
63490 +               assert("nikita-1987", node != NULL);
63491 +               spin_lock_znode(node);
63492 +               seal->version = node->version;
63493 +               assert("nikita-1988", seal->version != 0);
63494 +               seal->block = *znode_get_block(node);
63495 +#if REISER4_DEBUG
63496 +               seal->coord1 = *coord;
63497 +               if (key != NULL)
63498 +                       seal->key = *key;
63499 +#endif
63500 +               spin_unlock_znode(node);
63501 +       }
63502 +}
63503 +
63504 +/* finish with seal: a zero version makes reiser4_seal_is_set() report it as unset */
63505 +void reiser4_seal_done(seal_t *seal/* seal to clear */)
63506 +{
63507 +       assert("nikita-1887", seal != NULL);
63508 +       seal->version = 0;
63509 +}
63510 +
63511 +/* true if seal was initialised with a coord, i.e. it carries a non-zero znode version */
63512 +int reiser4_seal_is_set(const seal_t *seal/* seal to query */)
63513 +{
63514 +       assert("nikita-1890", seal != NULL);
63515 +       return seal->version != 0;
63516 +}
63517 +
63518 +#if REISER4_DEBUG
63519 +/* helper function for reiser4_seal_validate(). It checks that the item at
63520 + * @coord has the expected key. This is to detect cases where a node was
63521 + * modified but wasn't marked dirty. Only used inside an assertion. */
63522 +static inline int check_seal_match(const coord_t *coord /* coord to check */ ,
63523 +                                  const reiser4_key * k/* expected key */)
63524 +{
63525 +       reiser4_key ukey;
63526 +
63527 +       return (coord->between != AT_UNIT) ||   /* coord is not on a unit: nothing to compare */
63528 +           /* FIXME-VS: we can only compare keys for items whose units
63529 +              represent exactly one key, so extents pass unconditionally */
63530 +           ((coord_is_existing_unit(coord))
63531 +            && (item_is_extent(coord)
63532 +                || keyeq(k, unit_key_by_coord(coord, &ukey))))
63533 +           || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))       /* ctail: keyge() instead of keyeq() */
63534 +               && keyge(k, unit_key_by_coord(coord, &ukey)));
63535 +}
63536 +#endif
63537 +
63538 +/* this is used by reiser4_seal_validate. It accepts the return value of
63539 + * longterm_lock_znode and returns non-zero if it can be interpreted as a seal
63540 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
63541 + * reiser4_seal_validate returns -E_REPEAT and the caller will redo tree search.
63542 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
63543 + * distinguish between -EINVAL and -E_REPEAT. */
63544 +static int should_repeat(int return_code)
63545 +{
63546 +       return return_code == -EINVAL;  /* the only code treated this way */
63547 +}
63548 +
63549 +/* (re-)validate seal.
63550 +
63551 +   Checks whether seal is pristine, and try to revalidate it if possible.
63552 +
63553 +   If seal was burned, or broken irreparably, return -E_REPEAT.
63554 +
63555 +   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are
63556 +   looking for is in range of keys covered by the sealed node, but item wasn't
63557 +   found by node ->lookup() method. Alternative is to return -ENOENT in this
63558 +   case, but this would complicate callers logic.
63559 +
63560 +*/
63561 +int reiser4_seal_validate(seal_t *seal /* seal to validate */,
63562 +                         coord_t *coord /* coord to validate against */,
63563 +                         const reiser4_key * key /* key to validate against */,
63564 +                         lock_handle * lh /* resulting lock handle */,
63565 +                         znode_lock_mode mode /* mode to lock node in */,
63566 +                         znode_lock_request request/* locking priority */)
63567 +{
63568 +       znode *node;
63569 +       int result;
63570 +
63571 +       assert("nikita-1889", seal != NULL);
63572 +       assert("nikita-1881", reiser4_seal_is_set(seal));
63573 +       assert("nikita-1882", key != NULL);
63574 +       assert("nikita-1883", coord != NULL);
63575 +       assert("nikita-1884", lh != NULL);
63576 +       assert("nikita-1885", keyeq(&seal->key, key));
63577 +       assert("nikita-1989", coords_equal(&seal->coord1, coord));
63578 +
63579 +       /* obtain znode by the block number stored in the seal */
63580 +       node = seal_node(seal);
63581 +       if (node != NULL) {
63582 +               /* znode was in cache, lock it */
63583 +               result = longterm_lock_znode(lh, node, mode, request);
63584 +               zput(node);     /* NOTE(review): presumably drops the reference obtained by seal_node() - confirm */
63585 +               if (result == 0) {
63586 +                       if (seal_matches(seal, node)) {
63587 +                               /* seal is pristine: seal version and znode
63588 +                                  version coincide, @coord is still valid */
63589 +                               ON_DEBUG(coord_update_v(coord));
63590 +                               assert("nikita-1990",
63591 +                                      node == seal->coord1.node);
63592 +                               assert("nikita-1898",
63593 +                                      WITH_DATA_RET(coord->node, 1,
63594 +                                                    check_seal_match(coord,
63595 +                                                                     key)));
63596 +                       } else  /* version changed: no in-node re-lookup is done here */
63597 +                               result = RETERR(-E_REPEAT);
63598 +               }
63599 +               if (result != 0) {
63600 +                       if (should_repeat(result))      /* lock error that counts as a broken seal */
63601 +                               result = RETERR(-E_REPEAT);
63602 +                       /* unlock node on failure; other lock errors are returned as is */
63603 +                       done_lh(lh);
63604 +               }
63605 +       } else {
63606 +               /* znode wasn't in cache: seal is burned */
63607 +               result = RETERR(-E_REPEAT);
63608 +       }
63609 +       return result;
63610 +}
63611 +
63612 +/* helpers functions */
63613 +
63614 +/* obtain reference to znode seal points to, if in cache; NULL when it is not (see reiser4_seal_validate()) */
63615 +static znode *seal_node(const seal_t *seal/* seal to query */)
63616 +{
63617 +       assert("nikita-1891", seal != NULL);
63618 +       return zlook(current_tree, &seal->block);       /* looked up by block number in current_tree */
63619 +}
63620 +
63621 +/* true if @seal version and @node version coincide */
63622 +static int seal_matches(const seal_t *seal /* seal to check */ ,
63623 +                       znode * node/* node to check */)
63624 +{
63625 +       int same;
63626 +
63627 +       assert("nikita-1991", seal != NULL);
63628 +       assert("nikita-1993", node != NULL);
63629 +
63630 +       spin_lock_znode(node);  /* ->version is read under the znode spin lock */
63631 +       same = (node->version == seal->version);
63632 +       spin_unlock_znode(node);
63633 +       return same;
63634 +}
63635 +
63636 +/* Make Linus happy.
63637 +   Local variables:
63638 +   c-indentation-style: "K&R"
63639 +   mode-name: "LC"
63640 +   c-basic-offset: 8
63641 +   tab-width: 8
63642 +   fill-column: 120
63643 +   scroll-step: 1
63644 +   End:
63645 +*/
63646 diff -puN /dev/null fs/reiser4/seal.h
63647 --- /dev/null
63648 +++ a/fs/reiser4/seal.h
63649 @@ -0,0 +1,49 @@
63650 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63651 +
63652 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
63653 +
63654 +#ifndef __SEAL_H__
63655 +#define __SEAL_H__
63656 +
63657 +#include "forward.h"
63658 +#include "debug.h"
63659 +#include "dformat.h"
63660 +#include "key.h"
63661 +#include "coord.h"
63662 +
63663 +/* for __u?? types */
63664 +/*#include <linux/types.h>*/
63665 +
63666 +/* seal. See comment at the top of seal.c */
63667 +typedef struct seal_s {
63668 +       /* version of znode recorded at the time of seal creation; 0 means the seal is not set */
63669 +       __u64 version;
63670 +       /* block number of znode attached to this seal; used to look the znode up again */
63671 +       reiser4_block_nr block;
63672 +#if REISER4_DEBUG
63673 +       /* coord this seal is attached to. For debugging. */
63674 +       coord_t coord1;
63675 +       /* key this seal is attached to. For debugging. */
63676 +       reiser4_key key;
63677 +#endif
63678 +} seal_t;
63679 +
63680 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);  /* coord and key may be NULL */
63681 +extern void reiser4_seal_done(seal_t *);        /* mark seal as not set */
63682 +extern int reiser4_seal_is_set(const seal_t *); /* non-zero if seal was initialised with a coord */
63683 +extern int reiser4_seal_validate(seal_t *, coord_t *,   /* 0 with node locked, or an error such as -E_REPEAT */
63684 +                        const reiser4_key *, lock_handle * ,
63685 +                        znode_lock_mode mode, znode_lock_request request);
63686 +
63687 +/* __SEAL_H__ */
63688 +#endif
63689 +
63690 +/* Make Linus happy.
63691 +   Local variables:
63692 +   c-indentation-style: "K&R"
63693 +   mode-name: "LC"
63694 +   c-basic-offset: 8
63695 +   tab-width: 8
63696 +   fill-column: 120
63697 +   End:
63698 +*/
63699 diff -puN /dev/null fs/reiser4/search.c
63700 --- /dev/null
63701 +++ a/fs/reiser4/search.c
63702 @@ -0,0 +1,1612 @@
63703 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63704 + * reiser4/README */
63705 +
63706 +#include "forward.h"
63707 +#include "debug.h"
63708 +#include "dformat.h"
63709 +#include "key.h"
63710 +#include "coord.h"
63711 +#include "seal.h"
63712 +#include "plugin/item/item.h"
63713 +#include "plugin/node/node.h"
63714 +#include "plugin/plugin.h"
63715 +#include "jnode.h"
63716 +#include "znode.h"
63717 +#include "block_alloc.h"
63718 +#include "tree_walk.h"
63719 +#include "tree.h"
63720 +#include "reiser4.h"
63721 +#include "super.h"
63722 +#include "inode.h"
63723 +
63724 +#include <linux/slab.h>
63725 +
63726 +static const char *bias_name(lookup_bias bias);
63727 +
63728 +/* tree searching algorithm, intranode searching algorithms are in
63729 +   plugin/node/ */
63730 +
63731 +/* tree lookup cache
63732 + *
63733 + * The coord by key cache consists of small list of recently accessed nodes
63734 + * maintained according to the LRU discipline. Before doing real top-to-down
63735 + * tree traversal this cache is scanned for nodes that can contain key
63736 + * requested.
63737 + *
63738 + * The efficiency of coord cache depends heavily on locality of reference for
63739 + * tree accesses. Our user level simulations show reasonably good hit ratios
63740 + * for coord cache under most loads so far.
63741 + */
63742 +
63743 +/* Initialise coord cache slot: empty ->lru linkage, no cached node */
63744 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
63745 +{
63746 +       assert("nikita-345", slot != NULL);
63747 +       /* slot with ->node == NULL is what cbk_cache_invariant() calls unused */
63748 +       INIT_LIST_HEAD(&slot->lru);
63749 +       slot->node = NULL;
63750 +}
63751 +
63752 +/* Initialize coord cache: allocate ->nr_slots slots and put them all on LRU */
63753 +int cbk_cache_init(cbk_cache * cache/* cache to init */)
63754 +{
63755 +       int i;
63756 +
63757 +       assert("nikita-346", cache != NULL);
63758 +
63759 +       /* kcalloc() fails cleanly if nr_slots * slot size would overflow */
63760 +       cache->slot = kcalloc(cache->nr_slots, sizeof(cbk_cache_slot),
63761 +                             reiser4_ctx_gfp_mask_get());
63762 +       if (cache->slot == NULL)
63763 +               return RETERR(-ENOMEM);
63764 +
63765 +       INIT_LIST_HEAD(&cache->lru);
63766 +       for (i = 0; i < cache->nr_slots; ++i) {
63767 +               cbk_cache_init_slot(cache->slot + i);
63768 +               list_add_tail(&((cache->slot + i)->lru), &cache->lru);
63769 +       }
63770 +       rwlock_init(&cache->guard);
63771 +       return 0;
63772 +}
63773 +
63774 +/* free slot array of cbk cache; ->slot is reset so a repeated call is a no-op */
63775 +void cbk_cache_done(cbk_cache * cache/* cache to release */)
63776 +{
63777 +       assert("nikita-2493", cache != NULL);
63778 +       if (cache->slot != NULL) {
63779 +               kfree(cache->slot);
63780 +               cache->slot = NULL;
63781 +       }
63782 +}
63783 +
63784 +/* iterate @slot over all cbk cache slots, walking (cache)->lru from its head */
63785 +#define for_all_slots(cache, slot)                                       \
63786 +       for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
63787 +            &(cache)->lru != &(slot)->lru;                               \
63788 +            (slot) = list_entry((slot)->lru.next, cbk_cache_slot, lru))
63789 +
63790 +#if REISER4_DEBUG
63791 +/* check [cbk-cache-invariant]; returns 1 if it holds and 0 otherwise */
63792 +static int cbk_cache_invariant(const cbk_cache * cache)
63793 +{
63794 +       cbk_cache_slot *slot;
63795 +       int result;
63796 +       int unused;
63797 +       /* check the pointer before it is dereferenced below */
63798 +       assert("nikita-2469", cache != NULL);
63799 +       if (cache->nr_slots == 0)
63800 +               return 1;
63801 +
63802 +       unused = 0;
63803 +       result = 1;
63804 +       read_lock(&((cbk_cache *)cache)->guard);
63805 +       for_all_slots(cache, slot) {
63806 +               /* in LRU first go all `used' slots followed by `unused' */
63807 +               if (unused && (slot->node != NULL))
63808 +                       result = 0;
63809 +               if (slot->node == NULL)
63810 +                       unused = 1;
63811 +               else {
63812 +                       cbk_cache_slot *scan;
63813 +
63814 +                       /* all cached nodes are different */
63815 +                       scan = slot;
63816 +                       while (result) {
63817 +                               scan = list_entry(scan->lru.next,
63818 +                                                 cbk_cache_slot, lru);
63819 +                               if (&cache->lru == &scan->lru)
63820 +                                       break;
63821 +                               if (slot->node == scan->node)
63822 +                                       result = 0;
63823 +                       }
63824 +               }
63825 +               if (!result)
63826 +                       break;
63827 +       }
63828 +       read_unlock(&((cbk_cache *)cache)->guard);
63829 +       return result;
63830 +}
63831 +
63832 +#endif
63833 +
63834 +/* Remove reference, if any, to @node from coord cache of @tree */
63835 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
63836 +                         reiser4_tree * tree/* tree to remove node from */)
63837 +{
63838 +       cbk_cache_slot *slot;
63839 +       cbk_cache *cache;
63840 +       int i;
63841 +
63842 +       assert("nikita-350", node != NULL);
63843 +       assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
63844 +
63845 +       cache = &tree->cbk_cache;
63846 +       assert("nikita-2470", cbk_cache_invariant(cache));
63847 +       /* freed slot goes to LRU tail, keeping used slots ahead of unused ones */
63848 +       write_lock(&(cache->guard));
63849 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
63850 +               if (slot->node == node) {
63851 +                       list_move_tail(&slot->lru, &cache->lru);
63852 +                       slot->node = NULL;
63853 +                       break;
63854 +               }
63855 +       }
63856 +       write_unlock(&(cache->guard));
63857 +       assert("nikita-2471", cbk_cache_invariant(cache));
63858 +}
63859 +
63860 +/* add @node to the cbk-cache of its tree as the most recently used entry.
63861 +    If @node is already cached, its slot is only moved to the LRU head. */
63862 +static void cbk_cache_add(const znode * node/* node to add to the cache */)
63863 +{
63864 +       cbk_cache *cache;
63865 +
63866 +       cbk_cache_slot *slot;
63867 +       int i;
63868 +
63869 +       assert("nikita-352", node != NULL);
63870 +
63871 +       cache = &znode_get_tree(node)->cbk_cache;
63872 +       assert("nikita-2472", cbk_cache_invariant(cache));
63873 +       /* cache without slots caches nothing */
63874 +       if (cache->nr_slots == 0)
63875 +               return;
63876 +
63877 +       write_lock(&(cache->guard));
63878 +       /* find slot to update/add */
63879 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
63880 +               /* this node is already in the cache, reuse its slot */
63881 +               if (slot->node == node)
63882 +                       break;
63883 +       }
63884 +       /* @node is not cached yet: take slot at LRU tail (unused or oldest) */
63885 +       if (i == cache->nr_slots) {
63886 +               slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
63887 +               slot->node = (znode *) node;
63888 +       }
63889 +       list_move(&slot->lru, &cache->lru);
63890 +       write_unlock(&(cache->guard));
63891 +       assert("nikita-2473", cbk_cache_invariant(cache));
63892 +}
63893 +
63894 +static int setup_delimiting_keys(cbk_handle * h);
63895 +static lookup_result coord_by_handle(cbk_handle * handle);
63896 +static lookup_result traverse_tree(cbk_handle * h);
63897 +static int cbk_cache_search(cbk_handle * h);
63898 +
63899 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
63900 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
63901 +
63902 +/* helper functions */
63903 +
63904 +static void update_stale_dk(reiser4_tree * tree, znode * node);
63905 +
63906 +/* release parent node during traversal */
63907 +static void put_parent(cbk_handle * h);
63908 +/* check consistency of fields */
63909 +static int sanity_check(cbk_handle * h);
63910 +/* release resources in handle */
63911 +static void hput(cbk_handle * h);
63912 +
63913 +static level_lookup_result search_to_left(cbk_handle * h);
63914 +
63915 +/* pack arguments of coord_by_key() into zeroed @handle (CBK_TRUST_DK and
63916 + * CBK_USE_CRABLOCK are always added to @flags). Returns @handle. */
63917 +static cbk_handle *cbk_pack(cbk_handle * handle,
63918 +                           reiser4_tree * tree,
63919 +                           const reiser4_key * key,
63920 +                           coord_t *coord,
63921 +                           lock_handle * active_lh,
63922 +                           lock_handle * parent_lh,
63923 +                           znode_lock_mode lock_mode,
63924 +                           lookup_bias bias,
63925 +                           tree_level lock_level,
63926 +                           tree_level stop_level,
63927 +                           __u32 flags, ra_info_t *info)
63928 +{
63929 +       memset(handle, 0, sizeof *handle);
63930 +
63931 +       handle->tree = tree;
63932 +       handle->key = key;
63933 +       handle->lock_mode = lock_mode;
63934 +       handle->bias = bias;
63935 +       handle->lock_level = lock_level;
63936 +       handle->stop_level = stop_level;
63937 +       handle->coord = coord;
63938 +       /* set flags. See comment in tree.h:cbk_flags */
63939 +       handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
63940 +
63941 +       handle->active_lh = active_lh;
63942 +       handle->parent_lh = parent_lh;
63943 +       handle->ra_info = info;
63944 +       return handle;
63945 +}
63946 +
63947 +/* main tree lookup procedure
63948 +
63949 +   Check coord cache. If key we are looking for is not found there, call
63950 +   traverse_tree() to do real tree traversal.
63951 +
63952 +   As we have extents on the twig level, @lock_level and @stop_level can
63953 +   be different from LEAF_LEVEL and each other.
63954 +
63955 +   Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
63956 +   long term locks) while calling this.
63957 +*/
63958 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
63959 +                                                * in. Usually this tree is
63960 +                                                * part of file-system
63961 +                                                * super-block */ ,
63962 +                          const reiser4_key * key /* key to look for */ ,
63963 +                          coord_t *coord       /* where to store found
63964 +                                                * position in a tree. Fields
63965 +                                                * in "coord" are only valid if
63966 +                                                * coord_by_key() returned
63967 +                                                * "CBK_COORD_FOUND" */ ,
63968 +                          lock_handle * lh,    /* resulting lock handle */
63969 +                          znode_lock_mode lock_mode    /* type of lock we
63970 +                                                        * want on node. Pass
63971 +                                                        * ZNODE_READ_LOCK here
63972 +                                                        * if you only want to
63973 +                                                        * read item found and
63974 +                                                        * ZNODE_WRITE_LOCK if
63975 +                                                        * you want to modify
63976 +                                                        * it */ ,
63977 +                          lookup_bias bias     /* what to return if coord
63978 +                                                * with exactly the @key is
63979 +                                                * not in the tree */ ,
63980 +                          tree_level lock_level/* tree level where to start
63981 +                                                * taking @lock type of
63982 +                                                * locks */ ,
63983 +                          tree_level stop_level/* tree level to stop. Pass
63984 +                                                * LEAF_LEVEL or TWIG_LEVEL
63985 +                                                * here. Item being looked
63986 +                                                * for has to be between
63987 +                                                * @lock_level and
63988 +                                                * @stop_level, inclusive */ ,
63989 +                          __u32 flags /* search flags */ ,
63990 +                          ra_info_t *
63991 +                          info
63992 +                          /* information about desired tree traversal
63993 +                           * readahead */
63994 +                          )
63995 +{
63996 +       cbk_handle handle;
63997 +       lock_handle parent_lh;
63998 +       lookup_result result;
63999 +
64000 +       init_lh(lh);
64001 +       init_lh(&parent_lh);
64002 +
64003 +       assert("nikita-3023", reiser4_schedulable());
64004 +
64005 +       assert("nikita-353", tree != NULL);
64006 +       assert("nikita-354", key != NULL);
64007 +       assert("nikita-355", coord != NULL);
64008 +       assert("nikita-356", (bias == FIND_EXACT)
64009 +              || (bias == FIND_MAX_NOT_MORE_THAN));
64010 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
64011 +       /* no locks can be held during tree traversal */
64012 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
64013 +
64014 +       cbk_pack(&handle,
64015 +                tree,
64016 +                key,
64017 +                coord,
64018 +                lh,
64019 +                &parent_lh,
64020 +                lock_mode, bias, lock_level, stop_level, flags, info);
64021 +
64022 +       result = coord_by_handle(&handle);
64023 +       assert("nikita-3247",
64024 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
64025 +       return result;
64026 +}
64027 +
64028 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
64029 + * from tree root. With NULL @object, current_tree is searched. */
64030 +lookup_result reiser4_object_lookup(struct inode *object,
64031 +                                   const reiser4_key * key,
64032 +                                   coord_t *coord,
64033 +                                   lock_handle * lh,
64034 +                                   znode_lock_mode lock_mode,
64035 +                                   lookup_bias bias,
64036 +                                   tree_level lock_level,
64037 +                                   tree_level stop_level, __u32 flags,
64038 +                                   ra_info_t *info)
64039 +{
64040 +       cbk_handle handle;
64041 +       lock_handle parent_lh;
64042 +       lookup_result result;
64043 +
64044 +       init_lh(lh);
64045 +       init_lh(&parent_lh);
64046 +
64047 +       assert("nikita-3023", reiser4_schedulable());
64048 +
64049 +       assert("nikita-354", key != NULL);
64050 +       assert("nikita-355", coord != NULL);
64051 +       assert("nikita-356", (bias == FIND_EXACT)
64052 +              || (bias == FIND_MAX_NOT_MORE_THAN));
64053 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
64054 +       /* no locks can be held during tree search by key */
64055 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
64056 +
64057 +       cbk_pack(&handle,
64058 +                object != NULL ? reiser4_tree_by_inode(object) : current_tree,
64059 +                key,
64060 +                coord,
64061 +                lh,
64062 +                &parent_lh,
64063 +                lock_mode, bias, lock_level, stop_level, flags, info);
64064 +       handle.object = object;
64065 +       /* non-NULL ->object makes traverse_tree() try object's vroot first */
64066 +       result = coord_by_handle(&handle);
64067 +       assert("nikita-3247",
64068 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
64069 +       return result;
64070 +}
64071 +
64072 +/* lookup by cbk_handle. Common part of coord_by_key() and
64073 +   reiser4_object_lookup(). */
64074 +static lookup_result coord_by_handle(cbk_handle * handle)
64075 +{
64076 +       /*
64077 +        * first check cbk_cache (which is look-aside cache for our tree) and
64078 +        * if this fails, start traversal.
64079 +        */
64080 +       /* on cache hit (0 returned) the answer is already in ->result */
64081 +       if (cbk_cache_search(handle) == 0)
64082 +               return handle->result;
64083 +       else
64084 +               return traverse_tree(handle);
64085 +}
64086 +
64087 +/* Execute actor for each item (or unit, depending on @through_units_p),
64088 +   starting from @coord, right-ward, until either:
64089 +
64090 +   - end of the tree is reached
64091 +   - unformatted node is met
64092 +   - error occurred
64093 +   - @actor returns 0 or less
64094 +
64095 +   Error code, or last actor return value is returned; -ENOENT if @coord is
64096 +   not at an existing unit on entry.
64097 +   This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move through
64098 +   sequence of entries with identical keys and the like.
64099 +*/
64100 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
64101 +                        coord_t *coord /* coord to start from */ ,
64102 +                        lock_handle * lh /* lock handle to start with and to
64103 +                                          * update along the way */ ,
64104 +                        tree_iterate_actor_t actor /* function to call on each
64105 +                                                    * item/unit */ ,
64106 +                        void *arg /* argument to pass to @actor */ ,
64107 +                        znode_lock_mode mode /* lock mode on scanned nodes */ ,
64108 +                        int through_units_p /* call @actor on each item or on
64109 +                                             * each unit */ )
64110 +{
64111 +       int result;
64112 +
64113 +       assert("nikita-1143", tree != NULL);
64114 +       assert("nikita-1145", coord != NULL);
64115 +       assert("nikita-1146", lh != NULL);
64116 +       assert("nikita-1147", actor != NULL);
64117 +
64118 +       result = zload(coord->node);
64119 +       coord_clear_iplug(coord);
64120 +       if (result != 0)
64121 +               return result;
64122 +       if (!coord_is_existing_unit(coord)) {
64123 +               zrelse(coord->node);
64124 +               return -ENOENT;
64125 +       }
64126 +       while ((result = actor(tree, coord, lh, arg)) > 0) {
64127 +               /* move further  */
64128 +               if ((through_units_p && coord_next_unit(coord)) ||
64129 +                   (!through_units_p && coord_next_item(coord))) {
64130 +                       do {
64131 +                               lock_handle couple;
64132 +
64133 +                               /* move to the next node  */
64134 +                               init_lh(&couple);
64135 +                               result =
64136 +                                   reiser4_get_right_neighbor(&couple,
64137 +                                                              coord->node,
64138 +                                                              (int)mode,
64139 +                                                              GN_CAN_USE_UPPER_LEVELS);
64140 +                               zrelse(coord->node);
64141 +                               if (result == 0) {
64142 +
64143 +                                       result = zload(couple.node);
64144 +                                       if (result != 0) {
64145 +                                               done_lh(&couple);
64146 +                                               return result;
64147 +                                       }
64148 +                                       /* switch @coord and @lh to neighbor */
64149 +                                       coord_init_first_unit(coord,
64150 +                                                             couple.node);
64151 +                                       done_lh(lh);
64152 +                                       move_lh(lh, &couple);
64153 +                               } else
64154 +                                       return result;
64155 +                       } while (node_is_empty(coord->node));
64156 +               }
64157 +
64158 +               assert("nikita-1149", coord_is_existing_unit(coord));
64159 +       }
64160 +       zrelse(coord->node);
64161 +       return result;
64162 +}
64163 +
64164 +/* lock uber znode of @tree into @lh; returns longterm_lock_znode() result */
64165 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
64166 +                  znode_lock_request pri, lock_handle * lh)
64167 +{
64168 +       int result;
64169 +
64170 +       result = longterm_lock_znode(lh, tree->uber, mode, pri);
64171 +       return result;
64172 +}
64173 +
64174 +/* true if @key is strictly within @node
64175 +
64176 +   @isunique: ld_key <= key < rd_key. Otherwise ld_key < key < rd_key, as item
64177 +   with non-unique key at the left edge of @node may be in the neighbor.
64178 +*/
64179 +static int znode_contains_key_strict(znode * node      /* node to check key
64180 +                                                        * against */ ,
64181 +                                    const reiser4_key *
64182 +                                    key /* key to check */ ,
64183 +                                    int isunique)
64184 +{
64185 +       int answer;
64186 +
64187 +       assert("nikita-1760", node != NULL);
64188 +       assert("nikita-1722", key != NULL);
64189 +       /* right delimiting key is never inside */
64190 +       if (keyge(key, &node->rd_key))
64191 +               return 0;
64192 +
64193 +       answer = keycmp(&node->ld_key, key);
64194 +
64195 +       if (isunique)
64196 +               return answer != GREATER_THAN;
64197 +       else
64198 +               return answer == LESS_THAN;
64199 +}
64200 +
64201 +/*
64202 + * Virtual Root (vroot) code.
64203 + *
64204 + *     For given file system object (e.g., regular file or directory) let's
64205 + *     define its "virtual root" as lowest in the tree (that is, furthest
64206 + *     from the tree root) node such that all body items of said object are
64207 + *     located in a tree rooted at this node.
64208 + *
64209 + *     Once vroot of object is found all tree lookups for items within body of
64210 + *     this object ("object lookups") can be started from its vroot rather
64211 + *     than from real root. This has following advantages:
64212 + *
64213 + *         1. amount of nodes traversed during lookup (and, hence, amount of
64214 + *         key comparisons made) decreases, and
64215 + *
64216 + *         2. contention on tree root is decreased. This latter was actually
64217 + *         motivating reason behind vroot, because spin lock of root node,
64218 + *         which is taken when acquiring long-term lock on root node is the
64219 + *         hottest lock in the reiser4.
64220 + *
64221 + * How to find vroot.
64222 + *
64223 + *     When vroot of object F is not yet determined, all object lookups start
64224 + *     from the root of the tree. At each tree level during traversal we have
64225 + *     a node N such that a key we are looking for (which is the key inside
64226 + *     object's body) is located within N. In function handle_vroot() called
64227 + *     from cbk_level_lookup() we check whether N is possible vroot for
64228 + *     F. Check is trivial---if neither leftmost nor rightmost item of N
64229 + *     belongs to F (and we already have helpful ->owns_item() method of
64230 + *     object plugin for this), then N is possible vroot of F. This, of
64231 + *     course, relies on the assumption that each object occupies contiguous
64232 + *     range of keys in the tree.
64233 + *
64234 + *     Thus, traversing tree downward and checking each node as we go, we can
64235 + *     find lowest such node, which, by definition, is vroot.
64236 + *
64237 + * How to track vroot.
64238 + *
64239 + *     Nohow. If actual vroot changes, next object lookup will just restart
64240 + *     from the actual tree root, refreshing object's vroot along the way.
64241 + *
64242 + */
64243 +
64244 +/*
64245 + * Check whether @node is possible vroot of @object and remember it if so.
64246 + */
64247 +static void handle_vroot(struct inode *object, znode * node)
64248 +{
64249 +       file_plugin *fplug;
64250 +       coord_t coord;
64251 +
64252 +       fplug = inode_file_plugin(object);
64253 +       assert("nikita-3353", fplug != NULL);
64254 +       assert("nikita-3354", fplug->owns_item != NULL);
64255 +       /* empty node has no edge items to test */
64256 +       if (unlikely(node_is_empty(node)))
64257 +               return;
64258 +
64259 +       coord_init_first_unit(&coord, node);
64260 +       /*
64261 +        * if leftmost item of @node belongs to @object, we cannot be sure
64262 +        * that @node is vroot of @object, because, some items of @object are
64263 +        * probably in the sub-tree rooted at the left neighbor of @node.
64264 +        */
64265 +       if (fplug->owns_item(object, &coord))
64266 +               return;
64267 +       coord_init_last_unit(&coord, node);
64268 +       /* mutatis mutandis for the rightmost item */
64269 +       if (fplug->owns_item(object, &coord))
64270 +               return;
64271 +       /* otherwise, @node is possible vroot of @object */
64272 +       inode_set_vroot(object, node);
64273 +}
64274 +
64275 +/* helper function used by traverse tree to start tree traversal not from the
64276 + * tree root, but from @h->object's vroot, if possible. Returns LOOKUP_CONT
64277 + * when there is no vroot or descent can continue below it, LOOKUP_REST when
64278 + * traversal has to be restarted, or other result of cbk_node_lookup(). */
64279 +static int prepare_object_lookup(cbk_handle * h)
64280 +{
64281 +       znode *vroot;
64282 +       int result;
64283 +
64284 +       vroot = inode_get_vroot(h->object);
64285 +       if (vroot == NULL) {
64286 +               /*
64287 +                * object doesn't have known vroot, start from real tree root.
64288 +                */
64289 +               return LOOKUP_CONT;
64290 +       }
64291 +
64292 +       h->level = znode_get_level(vroot);
64293 +       /* take a long-term lock on vroot */
64294 +       h->result = longterm_lock_znode(h->active_lh, vroot,
64295 +                                       cbk_lock_mode(h->level, h),
64296 +                                       ZNODE_LOCK_LOPRI);
64297 +       result = LOOKUP_REST;
64298 +       if (h->result == 0) {
64299 +               int isunique;
64300 +               int inside;
64301 +
64302 +               isunique = h->flags & CBK_UNIQUE;
64303 +               /* check that key is inside vroot */
64304 +               read_lock_dk(h->tree);
64305 +               inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
64306 +                         !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
64307 +               read_unlock_dk(h->tree);
64308 +               if (inside) {
64309 +                       h->result = zload(vroot);
64310 +                       if (h->result == 0) {
64311 +                               /* search for key in vroot. */
64312 +                               result = cbk_node_lookup(h);
64313 +                               zrelse(vroot);  /*h->active_lh->node); */
64314 +                               if (h->active_lh->node != vroot) {
64315 +                                       result = LOOKUP_REST;
64316 +                               } else if (result == LOOKUP_CONT) {
64317 +                                       move_lh(h->parent_lh, h->active_lh);
64318 +                                       h->flags &= ~CBK_DKSET;
64319 +                               }
64320 +                       }
64321 +               }
64322 +       }
64323 +       /* drop reference on vroot (presumably taken by inode_get_vroot()) */
64324 +       zput(vroot);
64325 +       /* on error or restart release resources accumulated in @h */
64326 +       if (IS_CBKERR(h->result) || result == LOOKUP_REST)
64327 +               hput(h);
64328 +       return result;
64329 +}
64330 +
64331 +/* main function that handles common parts of tree traversal: starting
64332 +    (fake znode handling), restarts, error handling, completion */
64333 +static lookup_result traverse_tree(cbk_handle * h/* search handle */)
64334 +{
64335 +       int done;
64336 +       int iterations;
64337 +       int vroot_used;
64338 +
64339 +       assert("nikita-365", h != NULL);
64340 +       assert("nikita-366", h->tree != NULL);
64341 +       assert("nikita-367", h->key != NULL);
64342 +       assert("nikita-368", h->coord != NULL);
64343 +       assert("nikita-369", (h->bias == FIND_EXACT)
64344 +              || (h->bias == FIND_MAX_NOT_MORE_THAN));
64345 +       assert("nikita-370", h->stop_level >= LEAF_LEVEL);
64346 +       assert("nikita-2949", !(h->flags & CBK_DKSET));
64347 +       assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
64348 +
64349 +       done = 0;
64350 +       iterations = 0;
64351 +       vroot_used = 0;
64352 +
64353 +       /* loop for restarts */
64354 +restart:
64355 +
64356 +       assert("nikita-3024", reiser4_schedulable());
64357 +
64358 +       h->result = CBK_COORD_FOUND;
64359 +       /* connect_znode() needs it */
64360 +       h->ld_key = *reiser4_min_key();
64361 +       h->rd_key = *reiser4_max_key();
64362 +       h->flags |= CBK_DKSET;
64363 +       h->error = NULL;
64364 +       /* vroot shortcut is tried only once, on the first pass */
64365 +       if (!vroot_used && h->object != NULL) {
64366 +               vroot_used = 1;
64367 +               done = prepare_object_lookup(h);
64368 +               if (done == LOOKUP_REST)
64369 +                       goto restart;
64370 +               else if (done == LOOKUP_DONE)
64371 +                       return h->result;
64372 +       }
64373 +       if (h->parent_lh->node == NULL) {
64374 +               done =
64375 +                   get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
64376 +                                  h->parent_lh);
64377 +
64378 +               assert("nikita-1637", done != -E_DEADLOCK);
64379 +
64380 +               h->block = h->tree->root_block;
64381 +               h->level = h->tree->height;
64382 +               h->coord->node = h->parent_lh->node;
64383 +
64384 +               if (done != 0)
64385 +                       return done;
64386 +       }
64387 +
64388 +       /* loop descending a tree */
64389 +       while (!done) {
64390 +               /* warn at powers of two past the limit, fail past the maximum */
64391 +               if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
64392 +                            IS_POW(iterations))) {
64393 +                       warning("nikita-1481", "Too many iterations: %i",
64394 +                               iterations);
64395 +                       reiser4_print_key("key", h->key);
64396 +                       ++iterations;
64397 +               } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
64398 +                       h->error =
64399 +                           "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
64400 +                       h->result = RETERR(-EIO);
64401 +                       break;
64402 +               }
64403 +               switch (cbk_level_lookup(h)) {
64404 +               case LOOKUP_CONT:
64405 +                       move_lh(h->parent_lh, h->active_lh);
64406 +                       continue;
64407 +               default:
64408 +                       wrong_return_value("nikita-372", "cbk_level");
64409 +               case LOOKUP_DONE: /* default case falls through to here */
64410 +                       done = 1;
64411 +                       break;
64412 +               case LOOKUP_REST:
64413 +                       hput(h);
64414 +                       /* deadlock avoidance is normal case. */
64415 +                       if (h->result != -E_DEADLOCK)
64416 +                               ++iterations;
64417 +                       reiser4_preempt_point();
64418 +                       goto restart;
64419 +               }
64420 +       }
64421 +       /* that's all. The rest is error handling */
64422 +       if (unlikely(h->error != NULL)) {
64423 +               warning("nikita-373", "%s: level: %i, "
64424 +                       "lock_level: %i, stop_level: %i "
64425 +                       "lock_mode: %s, bias: %s",
64426 +                       h->error, h->level, h->lock_level, h->stop_level,
64427 +                       lock_mode_name(h->lock_mode), bias_name(h->bias));
64428 +               reiser4_print_address("block", &h->block);
64429 +               reiser4_print_key("key", h->key);
64430 +               print_coord_content("coord", h->coord);
64431 +       }
64432 +       /* `unlikely' error case */
64433 +       if (unlikely(IS_CBKERR(h->result))) {
64434 +               /* failure. do cleanup */
64435 +               hput(h);
64436 +       } else {
64437 +               assert("nikita-1605", WITH_DATA_RET
64438 +                      (h->coord->node, 1,
64439 +                       ergo((h->result == CBK_COORD_FOUND) &&
64440 +                            (h->bias == FIND_EXACT) &&
64441 +                            (!node_is_empty(h->coord->node)),
64442 +                            coord_is_existing_item(h->coord))));
64443 +       }
64444 +       return h->result;
64445 +}
64446 +
64447 +/* find delimiting keys of child
64448 +
64449 +   Determine left and right delimiting keys for child pointed to by
64450 +   @parent_coord. @ld and @rd are read from the units that coord_set_to_left()
64451 +   and coord_set_to_right() select; on failure parent's own key is used instead.
64452 +*/
64453 +static void find_child_delimiting_keys(znode * parent  /* parent znode, passed
64454 +                                                        * locked */ ,
64455 +                                      const coord_t *parent_coord
64456 +                                                       /* coord where pointer
64457 +                                                        * to child is stored
64458 +                                                        */ ,
64459 +                                      reiser4_key * ld /* where to store left
64460 +                                                        * delimiting key */ ,
64461 +                                      reiser4_key * rd /* where to store right
64462 +                                                        * delimiting key */ )
64463 +{
64464 +       coord_t neighbor;
64465 +
64466 +       assert("nikita-1484", parent != NULL);
64467 +       assert_rw_locked(&(znode_get_tree(parent)->dk_lock)); /* caller must hold dk lock */
64468 +
64469 +       coord_dup(&neighbor, parent_coord); /* work on a copy, @parent_coord is const */
64470 +
64471 +       if (neighbor.between == AT_UNIT)
64472 +               /* imitate item ->lookup() behavior. */
64473 +               neighbor.between = AFTER_UNIT;
64474 +
64475 +       if (coord_set_to_left(&neighbor) == 0)
64476 +               unit_key_by_coord(&neighbor, ld);
64477 +       else {
64478 +               assert("nikita-14851", 0); /* always fires: this branch is not expected */
64479 +               *ld = *znode_get_ld_key(parent);
64480 +       }
64481 +
64482 +       coord_dup(&neighbor, parent_coord); /* start over from the original position */
64483 +       if (neighbor.between == AT_UNIT)
64484 +               neighbor.between = AFTER_UNIT;
64485 +       if (coord_set_to_right(&neighbor) == 0)
64486 +               unit_key_by_coord(&neighbor, rd);
64487 +       else
64488 +               *rd = *znode_get_rd_key(parent); /* fall back to parent's right delimiting key */
64489 +}
64490 +
64491 +/*
64492 + * setup delimiting keys for a child
64493 + * (returns 1 if JNODE_DKSET was clear at the unlocked check, 0 if already set)
64494 + * @parent parent node
64495 + *
64496 + * @coord location in @parent where pointer to @child is
64497 + *
64498 + * @child child node
64499 + */
64500 +int
64501 +set_child_delimiting_keys(znode * parent, const coord_t *coord, znode * child)
64502 +{
64503 +       reiser4_tree *tree;
64504 +
64505 +       assert("nikita-2952",
64506 +              znode_get_level(parent) == znode_get_level(coord->node));
64507 +
64508 +       /* fast check without taking dk lock. This is safe, because
64509 +        * JNODE_DKSET is never cleared once set. */
64510 +       if (!ZF_ISSET(child, JNODE_DKSET)) {
64511 +               tree = znode_get_tree(parent);
64512 +               write_lock_dk(tree);
64513 +               if (likely(!ZF_ISSET(child, JNODE_DKSET))) { /* recheck under the lock */
64514 +                       find_child_delimiting_keys(parent, coord,
64515 +                                                  &child->ld_key,
64516 +                                                  &child->rd_key);
64517 +                       ON_DEBUG(child->ld_key_version =
64518 +                                atomic_inc_return(&delim_key_version);
64519 +                                child->rd_key_version =
64520 +                                atomic_inc_return(&delim_key_version););
64521 +                       ZF_SET(child, JNODE_DKSET);
64522 +               }
64523 +               write_unlock_dk(tree);
64524 +               return 1; /* also returned when the flag got set while we waited for the lock */
64525 +       }
64526 +       return 0;
64527 +}
64528 +
64529 +/* Perform tree lookup at one level. This is called from cbk_traverse()
64530 +   function that drives lookup through tree and calls cbk_node_lookup() to
64531 +   perform lookup within one node.
64532 +   Returns result of cbk_node_lookup(), or LOOKUP_REST if h->result is
64533 +   -E_DEADLOCK or -E_REPEAT, or LOOKUP_DONE on other errors. See comments in the code.
64534 +*/
64535 +static level_lookup_result cbk_level_lookup(cbk_handle * h/* search handle */)
64536 +{
64537 +       int ret;
64538 +       int setdk;
64539 +       int ldkeyset = 0;
64540 +       reiser4_key ldkey;
64541 +       reiser4_key key;
64542 +       znode *active;
64543 +
64544 +       assert("nikita-3025", reiser4_schedulable());
64545 +
64546 +       /* acquire reference to @active node */
64547 +       active =
64548 +           zget(h->tree, &h->block, h->parent_lh->node, h->level,
64549 +                reiser4_ctx_gfp_mask_get());
64550 +
64551 +       if (IS_ERR(active)) {
64552 +               h->result = PTR_ERR(active);
64553 +               return LOOKUP_DONE;
64554 +       }
64555 +
64556 +       /* lock @active */
64557 +       h->result = longterm_lock_znode(h->active_lh,
64558 +                                       active,
64559 +                                       cbk_lock_mode(h->level, h),
64560 +                                       ZNODE_LOCK_LOPRI);
64561 +       /* longterm_lock_znode() acquires additional reference to znode (which
64562 +          will be later released by longterm_unlock_znode()). Release
64563 +          reference acquired by zget().
64564 +        */
64565 +       zput(active);
64566 +       if (unlikely(h->result != 0))
64567 +               goto fail_or_restart;
64568 +
64569 +       setdk = 0;
64570 +       /* if @active is accessed for the first time, setup delimiting keys on
64571 +          it. Delimiting keys are taken from the parent node. See
64572 +          setup_delimiting_keys() for details.
64573 +        */
64574 +       if (h->flags & CBK_DKSET) {
64575 +               setdk = setup_delimiting_keys(h);
64576 +               h->flags &= ~CBK_DKSET;
64577 +       } else {
64578 +               znode *parent;
64579 +
64580 +               parent = h->parent_lh->node;
64581 +               h->result = zload(parent);
64582 +               if (unlikely(h->result != 0))
64583 +                       goto fail_or_restart;
64584 +
64585 +               if (!ZF_ISSET(active, JNODE_DKSET))
64586 +                       setdk = set_child_delimiting_keys(parent,
64587 +                                                         h->coord, active);
64588 +               else {
64589 +                       read_lock_dk(h->tree);
64590 +                       find_child_delimiting_keys(parent, h->coord, &ldkey,
64591 +                                                  &key); /* only @ldkey is used, see "vs-3533" check below */
64592 +                       read_unlock_dk(h->tree);
64593 +                       ldkeyset = 1;
64594 +               }
64595 +               zrelse(parent);
64596 +       }
64597 +
64598 +       /* this is an ugly kludge. Reminder: this is necessary, because
64599 +          ->lookup() method returns coord with ->between field probably set
64600 +          to something different from AT_UNIT.
64601 +        */
64602 +       h->coord->between = AT_UNIT;
64603 +
64604 +       if (znode_just_created(active) && (h->coord->node != NULL)) {
64605 +               write_lock_tree(h->tree);
64606 +               /* if we are going to load znode right now, setup
64607 +                  ->in_parent: coord where pointer to this node is stored in
64608 +                  parent.
64609 +                */
64610 +               coord_to_parent_coord(h->coord, &active->in_parent);
64611 +               write_unlock_tree(h->tree);
64612 +       }
64613 +
64614 +       /* check connectedness without holding tree lock---false negatives
64615 +        * will be re-checked by connect_znode(), and false positives are
64616 +        * impossible---@active cannot suddenly turn into unconnected
64617 +        * state. */
64618 +       if (!znode_is_connected(active)) {
64619 +               h->result = connect_znode(h->coord, active);
64620 +               if (unlikely(h->result != 0)) {
64621 +                       put_parent(h);
64622 +                       goto fail_or_restart;
64623 +               }
64624 +       }
64625 +
64626 +       jload_prefetch(ZJNODE(active));
64627 +
64628 +       if (setdk)
64629 +               update_stale_dk(h->tree, active);
64630 +
64631 +       /* put_parent() cannot be called earlier, because connect_znode()
64632 +          assumes parent node is referenced; */
64633 +       put_parent(h);
64634 +
64635 +       if ((!znode_contains_key_lock(active, h->key) &&
64636 +            (h->flags & CBK_TRUST_DK))
64637 +           || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
64638 +               /* 1. key was moved out of this node while this thread was
64639 +                  waiting for the lock. Restart. More elaborate solution is
64640 +                  to determine where key moved (to the left, or to the right)
64641 +                  and try to follow it through sibling pointers.
64642 +
64643 +                  2. or, node itself is going to be removed from the
64644 +                  tree. Release lock and restart.
64645 +                */
64646 +               h->result = -E_REPEAT;
64647 +       }
64648 +       if (h->result == -E_REPEAT)
64649 +               return LOOKUP_REST;
64650 +
64651 +       h->result = zload_ra(active, h->ra_info);
64652 +       if (h->result)
64653 +               return LOOKUP_DONE;
64654 +
64655 +       /* sanity checks */
64656 +       if (sanity_check(h)) {
64657 +               zrelse(active);
64658 +               return LOOKUP_DONE;
64659 +       }
64660 +
64661 +       /* check that key of leftmost item in the @active is the same as in
64662 +        * its parent */
64663 +       if (ldkeyset && !node_is_empty(active) &&
64664 +           !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
64665 +               warning("vs-3533", "Keys are inconsistent. Fsck?");
64666 +               reiser4_print_key("inparent", &ldkey);
64667 +               reiser4_print_key("inchild", &key);
64668 +               h->result = RETERR(-EIO);
64669 +               zrelse(active);
64670 +               return LOOKUP_DONE;
64671 +       }
64672 +
64673 +       if (h->object != NULL)
64674 +               handle_vroot(h->object, active);
64675 +
64676 +       ret = cbk_node_lookup(h);
64677 +
64678 +       /* h->active_lh->node might change, but active is yet to be zrelsed */
64679 +       zrelse(active);
64680 +
64681 +       return ret;
64682 +
64683 +fail_or_restart:
64684 +       if (h->result == -E_DEADLOCK) /* only deadlock avoidance restarts the traversal */
64685 +               return LOOKUP_REST;
64686 +       return LOOKUP_DONE; /* any other failure ends the lookup with h->result set */
64687 +}
64688 +
64689 +#if REISER4_DEBUG
64690 +/* debugging aid: assert that delimiting keys of @node are ordered and agree with those of its connected neighbors */
64691 +void check_dkeys(znode * node)
64692 +{
64693 +       znode *left;
64694 +       znode *right;
64695 +
64696 +       read_lock_tree(current_tree);
64697 +       read_lock_dk(current_tree);
64698 +
64699 +       assert("vs-1710", znode_is_any_locked(node));
64700 +       assert("vs-1197",
64701 +              !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
64702 +
64703 +       left = node->left;
64704 +       right = node->right;
64705 +
64706 +       if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
64707 +           && left != NULL && ZF_ISSET(left, JNODE_DKSET))
64708 +               /* check left neighbor. Note that left neighbor is not locked,
64709 +                  so a mismatch is tolerated if it has JNODE_HEARD_BANSHEE set */
64710 +               assert("vs-1198",
64711 +                      (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
64712 +                       || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
64713 +
64714 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
64715 +           && right != NULL && ZF_ISSET(right, JNODE_DKSET))
64716 +               /* check right neighbor. Note that right neighbor is not
64717 +                  locked, so a mismatch is tolerated if it has JNODE_HEARD_BANSHEE set */
64718 +               assert("vs-1199",
64719 +                      (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
64720 +                       || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
64721 +
64722 +       read_unlock_dk(current_tree);
64723 +       read_unlock_tree(current_tree);
64724 +}
64725 +#endif
64726 +
64727 +/* true if @key is left delimiting key of @node; compared under dk read lock */
64728 +static int key_is_ld(znode * node, const reiser4_key * key)
64729 +{
64730 +       int ld;
64731 +
64732 +       assert("nikita-1716", node != NULL);
64733 +       assert("nikita-1758", key != NULL);
64734 +
64735 +       read_lock_dk(znode_get_tree(node));
64736 +       assert("nikita-1759", znode_contains_key(node, key)); /* callers pass a key that lies in @node */
64737 +       ld = keyeq(znode_get_ld_key(node), key);
64738 +       read_unlock_dk(znode_get_tree(node));
64739 +       return ld;
64740 +}
64741 +
64742 +/* Process one node during tree traversal.
64743 +
64744 +   This is called by cbk_level_lookup() and cbk_cache_scan_slots(). */
64745 +static level_lookup_result cbk_node_lookup(cbk_handle * h/* search handle */)
64746 +{
64747 +       /* node plugin of @active */
64748 +       node_plugin *nplug;
64749 +       /* item plugin of item that was found */
64750 +       item_plugin *iplug;
64751 +       /* search bias */
64752 +       lookup_bias node_bias;
64753 +       /* node we are operating upon */
64754 +       znode *active;
64755 +       /* tree we are searching in */
64756 +       reiser4_tree *tree;
64757 +       /* result */
64758 +       int result;
64759 +
64760 +       assert("nikita-379", h != NULL);
64761 +
64762 +       active = h->active_lh->node;
64763 +       tree = h->tree;
64764 +
64765 +       nplug = active->nplug;
64766 +       assert("nikita-380", nplug != NULL);
64767 +
64768 +       ON_DEBUG(check_dkeys(active));
64769 +
64770 +       /* return item from "active" node with maximal key not greater than
64771 +          "key"  */
64772 +       node_bias = h->bias;
64773 +       result = nplug->lookup(active, h->key, node_bias, h->coord);
64774 +       if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
64775 +               /* error occurred */
64776 +               h->result = result;
64777 +               return LOOKUP_DONE;
64778 +       }
64779 +       if (h->level == h->stop_level) {
64780 +               /* welcome to the stop level */
64781 +               assert("nikita-381", h->coord->node == active);
64782 +               if (result == NS_FOUND) {
64783 +                       /* success of tree lookup */
64784 +                       if (!(h->flags & CBK_UNIQUE)
64785 +                           && key_is_ld(active, h->key))
64786 +                               return search_to_left(h); /* non-unique key at left edge: check left neighbor */
64787 +                       else
64788 +                               h->result = CBK_COORD_FOUND;
64789 +               } else {
64790 +                       h->result = CBK_COORD_NOTFOUND;
64791 +               }
64792 +               if (!(h->flags & CBK_IN_CACHE)) /* skip when called from the cbk cache scan */
64793 +                       cbk_cache_add(active);
64794 +               return LOOKUP_DONE;
64795 +       }
64796 +
64797 +       if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
64798 +               h->error = "not found on internal node";
64799 +               h->result = result;
64800 +               return LOOKUP_DONE;
64801 +       }
64802 +
64803 +       assert("vs-361", h->level > h->stop_level);
64804 +
64805 +       if (handle_eottl(h, &result)) {
64806 +               assert("vs-1674", (result == LOOKUP_DONE ||
64807 +                                  result == LOOKUP_REST));
64808 +               return result;
64809 +       }
64810 +
64811 +       /* go down to next level */
64812 +       check_me("vs-12", zload(h->coord->node) == 0);
64813 +       assert("nikita-2116", item_is_internal(h->coord));
64814 +       iplug = item_plugin_by_coord(h->coord);
64815 +       iplug->s.internal.down_link(h->coord, h->key, &h->block); /* stores child's block number in h->block */
64816 +       zrelse(h->coord->node);
64817 +       --h->level;
64818 +       return LOOKUP_CONT;     /* continue */
64819 +}
64820 +
64821 +/* scan cbk_cache slots looking for a match for @h. Returns 0 when lookup in a cached node gave a definite answer, error code otherwise */
64822 +static int cbk_cache_scan_slots(cbk_handle * h/* cbk handle */)
64823 +{
64824 +       level_lookup_result llr;
64825 +       znode *node;
64826 +       reiser4_tree *tree;
64827 +       cbk_cache_slot *slot;
64828 +       cbk_cache *cache;
64829 +       tree_level level;
64830 +       int isunique;
64831 +       const reiser4_key *key;
64832 +       int result;
64833 +
64834 +       assert("nikita-1317", h != NULL);
64835 +       assert("nikita-1315", h->tree != NULL);
64836 +       assert("nikita-1316", h->key != NULL);
64837 +
64838 +       tree = h->tree;
64839 +       cache = &tree->cbk_cache;
64840 +       if (cache->nr_slots == 0)
64841 +               /* size of cbk cache was set to 0 by mount time option. */
64842 +               return RETERR(-ENOENT);
64843 +
64844 +       assert("nikita-2474", cbk_cache_invariant(cache));
64845 +       node = NULL;            /* to keep gcc happy */
64846 +       level = h->level;
64847 +       key = h->key;
64848 +       isunique = h->flags & CBK_UNIQUE;
64849 +       result = RETERR(-ENOENT);
64850 +
64851 +       /*
64852 +        * this is a time-critical function, hence dragons have settled
64853 +        * here.
64854 +        *
64855 +        * Loop below scans cbk cache slots trying to find matching node with
64856 +        * suitable range of delimiting keys and located at the h->level.
64857 +        *
64858 +        * Scan is done under cbk cache spin lock that protects slot->node
64859 +        * pointers. If suitable node is found we want to pin it in
64860 +        * memory. But slot->node can point to the node with x_count 0
64861 +        * (unreferenced). Such node can be recycled at any moment, or can
64862 +        * already be in the process of being recycled (within jput()).
64863 +        *
64864 +        * As we found node in the cbk cache, it means that jput() hasn't yet
64865 +        * called cbk_cache_invalidate().
64866 +        *
64867 +        * We acquire reference to the node without holding tree lock, and
64868 +        * later, check node's RIP bit. This avoids races with jput().
64869 +        */
64870 +
64871 +       rcu_read_lock();
64872 +       read_lock(&((cbk_cache *)cache)->guard);
64873 +
64874 +       slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
64875 +       slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
64876 +       BUG_ON(&slot->lru != &cache->lru);/* next, then prev: must be back at the list head */
64877 +       while (1) {
64878 +
64879 +               slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
64880 +
64881 +               if (&cache->lru != &slot->lru)
64882 +                       node = slot->node;
64883 +               else
64884 +                       node = NULL;
64885 +
64886 +               if (unlikely(node == NULL)) /* end of list, or a slot without a node */
64887 +                       break;
64888 +
64889 +               /*
64890 +                * this is (hopefully) the only place in the code where we are
64891 +                * working with delimiting keys without holding dk lock. This
64892 +                * is fine here, because this is only "guess" anyway---keys
64893 +                * are rechecked under dk lock below.
64894 +                */
64895 +               if (znode_get_level(node) == level &&
64896 +                   /* reiser4_min_key < key < reiser4_max_key */
64897 +                   znode_contains_key_strict(node, key, isunique)) {
64898 +                       zref(node);
64899 +                       result = 0;
64900 +                       spin_lock_prefetch(&tree->tree_lock);
64901 +                       break;
64902 +               }
64903 +       }
64904 +       read_unlock(&((cbk_cache *)cache)->guard);
64905 +
64906 +       assert("nikita-2475", cbk_cache_invariant(cache));
64907 +
64908 +       if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
64909 +               result = -ENOENT;
64910 +
64911 +       rcu_read_unlock();
64912 +
64913 +       if (result != 0) {
64914 +               h->result = CBK_COORD_NOTFOUND;
64915 +               return RETERR(-ENOENT);
64916 +       }
64917 +
64918 +       result =
64919 +           longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
64920 +                               ZNODE_LOCK_LOPRI);
64921 +       zput(node); /* drop the reference taken by zref() in the scan loop */
64922 +       if (result != 0)
64923 +               return result;
64924 +       result = zload(node);
64925 +       if (result != 0)
64926 +               return result; /* lock handle is released by cbk_cache_search() via done_lh() */
64927 +
64928 +       /* recheck keys */
64929 +       read_lock_dk(tree);
64930 +       result = (znode_contains_key_strict(node, key, isunique) &&
64931 +               !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
64932 +       read_unlock_dk(tree);
64933 +       if (result) {
64934 +               /* do lookup inside node */
64935 +               llr = cbk_node_lookup(h);
64936 +               /* if cbk_node_lookup() wandered to another node (due to eottl
64937 +                  or non-unique keys), adjust @node */
64938 +               /*node = h->active_lh->node; */
64939 +
64940 +               if (llr != LOOKUP_DONE) {
64941 +                       /* restart or continue on the next level */
64942 +                       result = RETERR(-ENOENT);
64943 +               } else if (IS_CBKERR(h->result))
64944 +                       /* io or oom */
64945 +                       result = RETERR(-ENOENT);
64946 +               else {
64947 +                       /* good. Either item found or definitely not found. */
64948 +                       result = 0;
64949 +
64950 +                       write_lock(&(cache->guard));
64951 +                       if (slot->node == h->active_lh->node) {
64952 +                               /* if this node is still in cbk cache---move
64953 +                                  its slot to the head of the LRU list. */
64954 +                               list_move(&slot->lru, &cache->lru);
64955 +                       }
64956 +                       write_unlock(&(cache->guard));
64957 +               }
64958 +       } else {
64959 +               /* race. While this thread was waiting for the lock, node was
64960 +                  rebalanced and item we are looking for, shifted out of it
64961 +                  (if it ever was here).
64962 +
64963 +                  Continuing scanning is almost hopeless: node key range was
64964 +                  moved to, is almost certainly at the beginning of the LRU
64965 +                  list at this time, because it's hot, but restarting
64966 +                  scanning from the very beginning is complex. Just return,
64967 +                  so that cbk() will be performed. This is not that
64968 +                  important, because such races should be rare. Are they?
64969 +                */
64970 +               result = RETERR(-ENOENT);       /* -ERAUGHT */
64971 +       }
64972 +       zrelse(node);
64973 +       assert("nikita-2476", cbk_cache_invariant(cache));
64974 +       return result;
64975 +}
64976 +
64977 +/* look for item with given key in the coord cache
64978 +
64979 +   This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
64980 +   which is a small LRU list of znodes accessed lately. For each
64981 +   znode in this list, it checks whether key we are looking for fits into key
64982 +   range covered by this node. If so, and in addition, node lies at allowed
64983 +   level (this is to handle extents on a twig level), node is locked, and
64984 +   lookup inside it is performed.
64985 +
64986 +   we need a measurement of the cost of this cache search compared to the cost
64987 +   of coord_by_key.
64988 +
64989 +*/
64990 +static int cbk_cache_search(cbk_handle * h/* cbk handle */)
64991 +{
64992 +       int result = 0;
64993 +       tree_level level;
64994 +
64995 +       /* add CBK_IN_CACHE to the handle flags. This tells
64996 +        * cbk_node_lookup() that cbk_cache is being scanned, so it does not
64997 +        * add the found node to the cache. */
64998 +       h->flags |= CBK_IN_CACHE;
64999 +       for (level = h->stop_level; level <= h->lock_level; ++level) {
65000 +               h->level = level;
65001 +               result = cbk_cache_scan_slots(h);
65002 +               if (result != 0) {
65003 +                       done_lh(h->active_lh); /* release whatever the failed scan left in the handles */
65004 +                       done_lh(h->parent_lh);
65005 +               } else {
65006 +                       assert("nikita-1319", !IS_CBKERR(h->result));
65007 +                       break;
65008 +               }
65009 +       }
65010 +       h->flags &= ~CBK_IN_CACHE;
65011 +       return result; /* 0 on a cache hit, otherwise result of the last scan */
65012 +}
65013 +
65014 +/* type of lock we want to obtain during tree traversal. At h->lock_level and
65015 +    below we want type of lock user asked for, on upper levels: read lock. */
65016 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
65017 +{
65018 +       assert("nikita-382", h != NULL);
65019 +
65020 +       return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
65021 +}
65022 +
65023 +/* update outdated delimiting keys: make right delimiting key of @node equal to left delimiting key of its connected right neighbor */
65024 +static void stale_dk(reiser4_tree * tree, znode * node)
65025 +{
65026 +       znode *right;
65027 +
65028 +       read_lock_tree(tree);
65029 +       write_lock_dk(tree);
65030 +       right = node->right;
65031 +
65032 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
65033 +           right && ZF_ISSET(right, JNODE_DKSET) &&
65034 +           !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
65035 +               znode_set_rd_key(node, znode_get_ld_key(right)); /* neighbor's key wins */
65036 +
65037 +       write_unlock_dk(tree);
65038 +       read_unlock_tree(tree);
65039 +}
65040 +
65041 +/* check for possibly outdated delimiting keys, and update them if
65042 + * necessary. The check runs under read locks; stale_dk() rechecks under write dk lock. */
65043 +static void update_stale_dk(reiser4_tree * tree, znode * node)
65044 +{
65045 +       znode *right;
65046 +       reiser4_key rd;
65047 +
65048 +       read_lock_tree(tree);
65049 +       read_lock_dk(tree);
65050 +       rd = *znode_get_rd_key(node);
65051 +       right = node->right;
65052 +       if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
65053 +                    right && ZF_ISSET(right, JNODE_DKSET) &&
65054 +                    !keyeq(&rd, znode_get_ld_key(right)))) {
65055 +               assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
65056 +               read_unlock_dk(tree); /* drop read locks: stale_dk() takes dk lock for writing */
65057 +               read_unlock_tree(tree);
65058 +               stale_dk(tree, node);
65059 +               return;
65060 +       }
65061 +       read_unlock_dk(tree);
65062 +       read_unlock_tree(tree);
65063 +}
65064 +
65065 +/*
65066 + * handle searches for a non-unique key.
65067 + *
65068 + * Suppose that we are looking for an item with possibly non-unique key 100.
65069 + *
65070 + * Root node contains two pointers: one to a node with left delimiting key 0,
65071 + * and another to a node with left delimiting key 100. Item we interested in
65072 + * may well happen in the sub-tree rooted at the first pointer.
65073 + *
65074 + * To handle this search_to_left() is called when search reaches stop
65075 + * level. This function checks whether it is _possible_ that item we are looking for
65076 + * is in the left neighbor (this can be done by comparing delimiting keys) and
65077 + * if so, tries to lock left neighbor (this is low priority lock, so it can
65078 + * deadlock, tree traversal is just restarted if it did) and then checks
65079 + * whether left neighbor actually contains items with our key.
65080 + *
65081 + * Note that this is done on the stop level only. It is possible to try such
65082 + * left-check on each level, but as duplicate keys are supposed to be rare
65083 + * (very unlikely that more than one node is completely filled with items with
65084 + * duplicate keys), it is cheaper to scan to the left on the stop level once.
65085 + *
65086 + */
65087 +static level_lookup_result search_to_left(cbk_handle * h/* search handle */)
65088 +{
65089 +       level_lookup_result result;
65090 +       coord_t *coord;
65091 +       znode *node;
65092 +       znode *neighbor;
65093 +
65094 +       lock_handle lh;
65095 +
65096 +       assert("nikita-1761", h != NULL);
65097 +       assert("nikita-1762", h->level == h->stop_level);
65098 +
65099 +       init_lh(&lh);
65100 +       coord = h->coord;
65101 +       node = h->active_lh->node;
65102 +       assert("nikita-1763", coord_is_leftmost_unit(coord));
65103 +
65104 +       h->result =
65105 +           reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
65106 +                                     GN_CAN_USE_UPPER_LEVELS);
65107 +       neighbor = NULL;
65108 +       switch (h->result) {
65109 +       case -E_DEADLOCK:
65110 +               result = LOOKUP_REST;
65111 +               break;
65112 +       case 0:{
65113 +                       node_plugin *nplug;
65114 +                       coord_t crd;
65115 +                       lookup_bias bias;
65116 +
65117 +                       neighbor = lh.node;
65118 +                       h->result = zload(neighbor);
65119 +                       if (h->result != 0) {
65120 +                               result = LOOKUP_DONE;
65121 +                               break; /* leaves the switch; zrelse() below is skipped */
65122 +                       }
65123 +
65124 +                       nplug = neighbor->nplug;
65125 +
65126 +                       coord_init_zero(&crd);
65127 +                       bias = h->bias; /* saved: exact lookup is forced for the neighbor */
65128 +                       h->bias = FIND_EXACT;
65129 +                       h->result =
65130 +                           nplug->lookup(neighbor, h->key, h->bias, &crd);
65131 +                       h->bias = bias;
65132 +
65133 +                       if (h->result == NS_NOT_FOUND) {
65134 +       case -E_NO_NEIGHBOR: /* NB: label inside the if-body; @neighbor is still NULL on this path */
65135 +                               h->result = CBK_COORD_FOUND;
65136 +                               if (!(h->flags & CBK_IN_CACHE))
65137 +                                       cbk_cache_add(node);
65138 +       default:                /* some other error; also inside the if-body, h->result is left as is */
65139 +                               result = LOOKUP_DONE;
65140 +                       } else if (h->result == NS_FOUND) {
65141 +                               read_lock_dk(znode_get_tree(neighbor));
65142 +                               h->rd_key = *znode_get_ld_key(node);
65143 +                               leftmost_key_in_node(neighbor, &h->ld_key);
65144 +                               read_unlock_dk(znode_get_tree(neighbor));
65145 +                               h->flags |= CBK_DKSET;
65146 +
65147 +                               h->block = *znode_get_block(neighbor);
65148 +                               /* clear coord->node so that cbk_level_lookup()
65149 +                                  wouldn't overwrite parent hint in neighbor.
65150 +
65151 +                                  Parent hint was set up by
65152 +                                  reiser4_get_left_neighbor()
65153 +                                */
65154 +                               /* FIXME: why do we have to spinlock here? */
65155 +                               write_lock_tree(znode_get_tree(neighbor));
65156 +                               h->coord->node = NULL;
65157 +                               write_unlock_tree(znode_get_tree(neighbor));
65158 +                               result = LOOKUP_CONT;
65159 +                       } else {
65160 +                               result = LOOKUP_DONE;
65161 +                       }
65162 +                       if (neighbor != NULL) /* NULL when entered through the case labels above */
65163 +                               zrelse(neighbor);
65164 +               }
65165 +       }
65166 +       done_lh(&lh);
65167 +       return result;
65168 +}
65169 +
65170 +/* debugging aid: return symbolic name of search bias; unknown values are formatted into a function-static buffer */
65171 +static const char *bias_name(lookup_bias bias/* bias to get name of */)
65172 +{
65173 +       if (bias == FIND_EXACT)
65174 +               return "exact";
65175 +       else if (bias == FIND_MAX_NOT_MORE_THAN)
65176 +               return "left-slant";
65177 +/*     else if( bias == RIGHT_SLANT_BIAS ) */
65178 +/*             return "right-bias"; */
65179 +       else {
65180 +               static char buf[30]; /* shared between calls: result is overwritten by the next unknown bias */
65181 +
65182 +               sprintf(buf, "unknown: %i", bias);
65183 +               return buf;
65184 +       }
65185 +}
65186 +
65187 +#if REISER4_DEBUG
65188 +/* Debugging aid: print human readable information about coord @p, each
65189 +   line tagged with @prefix. Handles NULL @p and a coord without a node. */
65190 +void print_coord_content(const char *prefix, coord_t *p)
65191 +{
65192 +       reiser4_key key;
65193 +
65194 +       if (p == NULL) {
65195 +               printk("%s: null\n", prefix);
65196 +               return;
65197 +       }
65198 +       /* nothing can be read through @p until its node is set and loaded */
65199 +       if (p->node == NULL || !znode_is_loaded(p->node))
65200 +               return;
65201 +       if (coord_is_existing_item(p))
65202 +               printk("%s: data: %p, length: %i\n", prefix,
65203 +                      item_body_by_coord(p), item_length_by_coord(p));
65204 +       item_key_by_coord(p, &key);
65205 +       reiser4_print_key(prefix, &key);
65206 +}
65207 +
65208 +/* debugging aid: print "@prefix: " followed by sprint_address(@block) */
65209 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
65210 +                  const reiser4_block_nr * block/* block number to print */)
65211 +{
65212 +       printk("%s: %s\n", prefix, sprint_address(block));
65213 +}
65214 +#endif
65215 +
65216 +/* Format @block into a static buffer: "null", hex if fake, else decimal. */
65217 +char *sprint_address(const reiser4_block_nr *block)
65218 +{
65219 +       static char address[30];
65220 +       const char *fmt;
65221 +
65222 +       if (block == NULL) {
65223 +               sprintf(address, "null");
65224 +               return address;
65225 +       }
65226 +       fmt = reiser4_blocknr_is_fake(block) ? "%llx" : "%llu";
65227 +       sprintf(address, fmt, (unsigned long long)(*block));
65228 +       return address;
65229 +}
65230 +
65231 +/* release the long-term lock on the parent node of @h, if any is attached */
65232 +static void put_parent(cbk_handle * h/* search handle */)
65233 +{
65234 +       assert("nikita-383", h != NULL);
65235 +       if (h->parent_lh->node != NULL)
65236 +               longterm_unlock_znode(h->parent_lh);
65237 +}
65238 +
65239 +/* helper function used by coord_by_key(): finish both lock handles kept in the
65240 +   search handle (parent_lh and active_lh) by calling done_lh() on each. */
65241 +static void hput(cbk_handle * h/* search handle */)
65242 +{
65243 +       assert("nikita-385", h != NULL);
65244 +       done_lh(h->parent_lh);
65245 +       done_lh(h->active_lh);
65246 +}
65247 +
65248 +/* Helper used by cbk(): set delimiting keys of the child (h->active_lh->node)
65249 +   from h->ld_key/h->rd_key unless set. Returns 1 iff the dk lock was taken. */
65250 +static int setup_delimiting_keys(cbk_handle * h/* search handle */)
65251 +{
65252 +       znode *active;
65253 +       reiser4_tree *tree;
65254 +
65255 +       assert("nikita-1088", h != NULL);
65256 +
65257 +       active = h->active_lh->node;
65258 +
65259 +       /* fast check without taking dk lock. This is safe, because
65260 +        * JNODE_DKSET is never cleared once set. */
65261 +       if (!ZF_ISSET(active, JNODE_DKSET)) {
65262 +               tree = znode_get_tree(active);
65263 +               write_lock_dk(tree);
65264 +               if (!ZF_ISSET(active, JNODE_DKSET)) {
65265 +                       znode_set_ld_key(active, &h->ld_key);
65266 +                       znode_set_rd_key(active, &h->rd_key);
65267 +                       ZF_SET(active, JNODE_DKSET);
65268 +               }
65269 +               write_unlock_dk(tree);
65270 +               return 1;
65271 +       }
65272 +       return 0;
65273 +}
65274 +
65275 +/* true if @block makes sense for the @tree, as judged by
65276 + * reiser4_blocknr_is_sane_for(). Used to detect corrupted node pointers */
65277 +static int
65278 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
65279 +                   reiser4_tree * tree/* tree to check against */)
65280 +{
65281 +       assert("nikita-757", block != NULL);
65282 +       assert("nikita-758", tree != NULL);
65283 +
65284 +       /* check to see if it exceeds the size of the device. */
65285 +       return reiser4_blocknr_is_sane_for(tree->super, block);
65286 +}
65287 +
65288 +/* Check consistency of @h. Returns 0 if fine; otherwise stores a message in
65289 +   h->error, -EIO in h->result and returns LOOKUP_DONE. */
65290 +static int sanity_check(cbk_handle *h)
65291 +{
65292 +       assert("nikita-384", h != NULL);
65293 +
65294 +       if (h->level >= h->stop_level) {
65295 +               if (block_nr_is_correct(&h->block, h->tree))
65296 +                       return 0;
65297 +               h->error = "bad block number";
65298 +       } else {
65299 +               h->error = "Buried under leaves";
65300 +       }
65301 +       h->result = RETERR(-EIO);
65302 +       return LOOKUP_DONE;
65303 +}
65304 +
65305 +/* Make Linus happy.
65306 +   Local variables:
65307 +   c-indentation-style: "K&R"
65308 +   mode-name: "LC"
65309 +   c-basic-offset: 8
65310 +   tab-width: 8
65311 +   fill-column: 120
65312 +   scroll-step: 1
65313 +   End:
65314 +*/
65315 diff -puN /dev/null fs/reiser4/status_flags.c
65316 --- /dev/null
65317 +++ a/fs/reiser4/status_flags.c
65318 @@ -0,0 +1,174 @@
65319 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65320 + * reiser4/README */
65321 +
65322 +/* Functions that deal with reiser4 status block, query status and update it,
65323 + * if needed */
65324 +
65325 +#include <linux/bio.h>
65326 +#include <linux/highmem.h>
65327 +#include <linux/fs.h>
65328 +#include <linux/blkdev.h>
65329 +#include "debug.h"
65330 +#include "dformat.h"
65331 +#include "status_flags.h"
65332 +#include "super.h"
65333 +
65334 +/* End-of-I/O handler for the status bio: record success or failure in the page
65335 +   flags and always unlock the page. The bio is not freed: it gets reused. */
65336 +static void reiser4_status_endio(struct bio *bio, int err)
65337 +{
65338 +       struct page *page = bio->bi_io_vec->bv_page;
65339 +
65340 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
65341 +               ClearPageUptodate(page);
65342 +               SetPageError(page);
65343 +       } else
65344 +               SetPageUptodate(page);
65345 +       unlock_page(page);
65346 +}
65347 +
65348 +/* Initialise status code: read the status block at @block and check its magic.
65349 +   Called from the disk format code. On failure page and bio are released. */
65350 +int reiser4_status_init(reiser4_block_nr block)
65351 +{
65352 +       struct super_block *sb = reiser4_get_current_sb();
65353 +       struct reiser4_status *statuspage;
65354 +       struct bio *bio;
65355 +       struct page *page;
65356 +
65357 +       get_super_private(sb)->status_page = NULL;
65358 +       get_super_private(sb)->status_bio = NULL;
65359 +
65360 +       page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
65361 +       if (!page)
65362 +               return -ENOMEM;
65363 +
65364 +       bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
65365 +       if (bio == NULL) {
65366 +               __free_pages(page, 0);
65367 +               return -ENOMEM;
65368 +       }
65369 +       bio->bi_sector = block * (sb->s_blocksize >> 9);
65370 +       bio->bi_bdev = sb->s_bdev;
65371 +       bio->bi_io_vec[0].bv_page = page;
65372 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
65373 +       bio->bi_io_vec[0].bv_offset = 0;
65374 +       bio->bi_vcnt = 1;
65375 +       bio->bi_size = sb->s_blocksize;
65376 +       bio->bi_end_io = reiser4_status_endio;
65377 +       lock_page(page);
65378 +       submit_bio(READ, bio);
65379 +       blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
65380 +       wait_on_page_locked(page);
65381 +       if (!PageUptodate(page)) {
65382 +               warning("green-2007",
65383 +                       "I/O error while tried to read status page\n");
65384 +               __free_pages(page, 0);  /* endio has run: safe to free */
65385 +               bio_put(bio);
65386 +               return -EIO;
65387 +       }
65388 +
65389 +       statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
65390 +       if (memcmp(statuspage->magic, REISER4_STATUS_MAGIC,
65391 +                  sizeof(REISER4_STATUS_MAGIC))) {
65392 +               /* Magic does not match. */
65393 +               kunmap_atomic((char *)statuspage, KM_USER0);
65394 +               warning("green-2008", "Wrong magic in status block\n");
65395 +               __free_pages(page, 0);
65396 +               bio_put(bio);
65397 +               return -EINVAL;
65398 +       }
65399 +       kunmap_atomic((char *)statuspage, KM_USER0);
65400 +
65401 +       get_super_private(sb)->status_page = page;
65402 +       get_super_private(sb)->status_bio = bio;
65403 +       return 0;
65404 +}
65405 +
65406 +/* Map the on-disk status to a REISER4_STATUS_MOUNT_* verdict for the current
65407 +   super block. If @status and/or @extended are not NULL, the raw status words
65408 +   from the status page are stored there. */
65409 +int reiser4_status_query(u64 *status, u64 *extended)
65410 +{
65411 +       struct page *page;
65412 +       struct reiser4_status *sp;
65413 +       int verdict;
65414 +
65415 +       page = get_super_private(reiser4_get_current_sb())->status_page;
65416 +       if (page == NULL)
65417 +               /* No status page? */
65418 +               return REISER4_STATUS_MOUNT_UNKNOWN;
65419 +       sp = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
65420 +       /* FIXME: this cast is a hack for 32 bit arches to work. */
65421 +       switch ((long)le64_to_cpu(get_unaligned(&sp->status))) {
65422 +       case REISER4_STATUS_OK:
65423 +               verdict = REISER4_STATUS_MOUNT_OK;
65424 +               break;
65425 +       case REISER4_STATUS_CORRUPTED:
65426 +               verdict = REISER4_STATUS_MOUNT_WARN;
65427 +               break;
65428 +       case REISER4_STATUS_DAMAGED:
65429 +       case REISER4_STATUS_DESTROYED:
65430 +       case REISER4_STATUS_IOERROR:
65431 +               verdict = REISER4_STATUS_MOUNT_RO;
65432 +               break;
65433 +       default:
65434 +               verdict = REISER4_STATUS_MOUNT_UNKNOWN;
65435 +               break;
65436 +       }
65437 +
65438 +       if (status != NULL)
65439 +               *status = le64_to_cpu(get_unaligned(&sp->status));
65440 +       if (extended != NULL)
65441 +               *extended = le64_to_cpu(get_unaligned(&sp->extended_status));
65442 +
65443 +       kunmap_atomic((char *)sp, KM_USER0);
65444 +       return verdict;
65445 +}
65446 +
65447 +/* Record a failure (e.g. from reiser4_panic): store @status, @extended_status
65448 +   and @message in the status page and submit it for writing without waiting.
65449 +   Returns -1 if there is no status page, 0 once the write is submitted. */
65450 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
65451 +{
65452 +       struct super_block *sb = reiser4_get_current_sb();
65453 +       reiser4_super_info_data *sbinfo = get_super_private(sb);
65454 +       struct bio *bio = sbinfo->status_bio;
65455 +       struct page *page = sbinfo->status_page;
65456 +       struct reiser4_status *sp;
65457 +
65458 +       if (page == NULL)
65459 +               return -1;
65460 +       sp = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
65461 +       put_unaligned(cpu_to_le64(status), &sp->status);
65462 +       put_unaligned(cpu_to_le64(extended_status), &sp->extended_status);
65463 +       strncpy(sp->texterror, message, REISER4_TEXTERROR_LEN);
65464 +       kunmap_atomic((char *)sp, KM_USER0);
65465 +
65466 +       bio->bi_bdev = sb->s_bdev;
65467 +       bio->bi_io_vec[0].bv_page = page;
65468 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
65469 +       bio->bi_io_vec[0].bv_offset = 0;
65470 +       bio->bi_vcnt = 1;
65471 +       bio->bi_size = sb->s_blocksize;
65472 +       bio->bi_end_io = reiser4_status_endio;
65473 +       /* Nobody else should touch our page, so taking its lock is safe. We
65474 +          can block now, but we have no other choice anyway. */
65475 +       lock_page(page);
65476 +       submit_bio(WRITE, bio);
65477 +       blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
65478 +       return 0;               /* We do not wait for io to finish. */
65479 +}
65480 +
65481 +/* Frees the status page and bio structure; called by disk format at umount
65482 + * time. NOTE(review): both are passed on unchecked - confirm never NULL here */
65483 +int reiser4_status_finish(void)
65484 +{
65485 +       struct super_block *sb = reiser4_get_current_sb();
65486 +
65487 +       __free_pages(get_super_private(sb)->status_page, 0);
65488 +       get_super_private(sb)->status_page = NULL;
65489 +       bio_put(get_super_private(sb)->status_bio);
65490 +       get_super_private(sb)->status_bio = NULL;
65491 +       return 0;
65492 +}
65493 diff -puN /dev/null fs/reiser4/status_flags.h
65494 --- /dev/null
65495 +++ a/fs/reiser4/status_flags.h
65496 @@ -0,0 +1,47 @@
65497 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65498 + * reiser4/README */
65499 +
65500 +/* Here we declare structures and flags that store reiser4 status on disk.
65501 +   The status helps us to find out whether the filesystem is valid or whether
65502 +   it contains some critical, or not so critical, errors. */
65503 +
65504 +#if !defined(__REISER4_STATUS_FLAGS_H__)
65505 +#define __REISER4_STATUS_FLAGS_H__
65506 +
65507 +#include "dformat.h"
65508 +/* Major status flags, compared against the status field of the status block */
65509 +#define REISER4_STATUS_OK 0
65510 +#define REISER4_STATUS_CORRUPTED 0x1
65511 +#define REISER4_STATUS_DAMAGED 0x2
65512 +#define REISER4_STATUS_DESTROYED 0x4
65513 +#define REISER4_STATUS_IOERROR 0x8
65514 +
65515 +/* Return values for reiser4_status_query() */
65516 +#define REISER4_STATUS_MOUNT_OK 0
65517 +#define REISER4_STATUS_MOUNT_WARN 1
65518 +#define REISER4_STATUS_MOUNT_RO 2
65519 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
65520 +
65521 +#define REISER4_TEXTERROR_LEN 256
65522 +
65523 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
65524 +/* On-disk status block. We probably need to keep its size under 512 bytes */
65525 +struct reiser4_status {
65526 +       char magic[16];         /* compared against REISER4_STATUS_MAGIC */
65527 +       d64 status;             /* Current FS state */
65528 +       d64 extended_status;    /* Any additional info that might make sense in
65529 +                                * addition to "status". E.g. last sector where
65530 +                                * io error happened if status is
65531 +                                * "io error encountered" */
65532 +       d64 stacktrace[10];     /* Last ten function calls made (addresses) */
65533 +       char texterror[REISER4_TEXTERROR_LEN];  /* Any error message if
65534 +                                                * appropriate, otherwise filled
65535 +                                                * with zeroes */
65536 +};
65537 +
65538 +int reiser4_status_init(reiser4_block_nr block);
65539 +int reiser4_status_query(u64 *status, u64 *extended);
65540 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
65541 +int reiser4_status_finish(void);
65542 +
65543 +#endif
65544 diff -puN /dev/null fs/reiser4/super.c
65545 --- /dev/null
65546 +++ a/fs/reiser4/super.c
65547 @@ -0,0 +1,306 @@
65548 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
65549 + * reiser4/README */
65550 +
65551 +/* Super-block manipulations. */
65552 +
65553 +#include "debug.h"
65554 +#include "dformat.h"
65555 +#include "key.h"
65556 +#include "plugin/security/perm.h"
65557 +#include "plugin/space/space_allocator.h"
65558 +#include "plugin/plugin.h"
65559 +#include "tree.h"
65560 +#include "vfs_ops.h"
65561 +#include "super.h"
65562 +#include "reiser4.h"
65563 +
65564 +#include <linux/types.h>       /* for __u??  */
65565 +#include <linux/fs.h>          /* for struct super_block  */
65566 +
65567 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
65568 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
65569 +static __u64 reserved_for_root(const struct super_block *super);
65570 +
65571 +/* Return reiser4-specific part of super block (->s_fs_info), unchecked */
65572 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super)
65573 +{
65574 +       return (reiser4_super_info_data *) super->s_fs_info;
65575 +}
65576 +
65577 +/* Return reiser4 fstype (REISER4_SUPER_MAGIC): the value that statfs()
65578 + * reports in its ->f_type field */
65579 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
65580 +{
65581 +       assert("nikita-448", super != NULL);
65582 +       assert("nikita-449", is_reiser4_super(super));
65583 +       return (long)REISER4_SUPER_MAGIC;
65584 +}
65585 +
65586 +/* functions to read/modify fields of reiser4_super_info_data */
65587 +
65588 +/* return number of blocks in file system (->block_count of @super) */
65589 +__u64 reiser4_block_count(const struct super_block *super      /* super block
65590 +                                                                  queried */ )
65591 +{
65592 +       assert("vs-494", super != NULL);
65593 +       assert("vs-495", is_reiser4_super(super));
65594 +       return get_super_private(super)->block_count;
65595 +}
65596 +
65597 +#if REISER4_DEBUG
65598 +/*
65599 + * debugging aid: number of blocks (->block_count) in the current file system
65600 + */
65601 +__u64 reiser4_current_block_count(void)
65602 +{
65603 +       return get_current_super_private()->block_count;
65604 +}
65605 +#endif  /*  REISER4_DEBUG  */
65606 +
65607 +/* set number of blocks in filesystem and derive ->blocks_reserved from it */
65608 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
65609 +{
65610 +       assert("vs-501", super != NULL);
65611 +       assert("vs-502", is_reiser4_super(super));
65612 +       get_super_private(super)->block_count = nr;
65613 +       /*
65614 +        * For the proper calculation of the reserved space counter (5% of
65615 +        * device block counter) we need a 64 bit division, which is missing in
65616 +        * Linux on i386. Because we do not need a precise calculation here, we
65617 +        * can replace the div64 operation by this combination of
65618 +        * multiplication and shift: 51 / (2^10) == .0498 .
65619 +        * FIXME: this is a bug. It comes up only for very small filesystems
65620 +        * which probably are never used. Nevertheless, it is a bug. Number of
65621 +        * reserved blocks must be not less than maximal number of blocks which
65622 +        * get grabbed with BA_RESERVED.
65623 +        */
65624 +       get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
65625 +}
65626 +
65627 +/* amount of blocks used in file system: returns ->blocks_used of @super */
65628 +__u64 reiser4_data_blocks(const struct super_block *super      /* super block
65629 +                                                                  queried */ )
65630 +{
65631 +       assert("nikita-452", super != NULL);
65632 +       assert("nikita-453", is_reiser4_super(super));
65633 +       return get_super_private(super)->blocks_used;
65634 +}
65635 +
65636 +/* set number of blocks used in filesystem (->blocks_used) to @nr */
65637 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
65638 +{
65639 +       assert("vs-503", super != NULL);
65640 +       assert("vs-504", is_reiser4_super(super));
65641 +       get_super_private(super)->blocks_used = nr;
65642 +}
65643 +
65644 +/* amount of free blocks in file system: returns ->blocks_free of @super */
65645 +__u64 reiser4_free_blocks(const struct super_block *super      /* super block
65646 +                                                                  queried */ )
65647 +{
65648 +       assert("nikita-454", super != NULL);
65649 +       assert("nikita-455", is_reiser4_super(super));
65650 +       return get_super_private(super)->blocks_free;
65651 +}
65652 +
65653 +/* set number of free blocks in filesystem (->blocks_free) to @nr */
65654 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
65655 +{
65656 +       assert("vs-505", super != NULL);
65657 +       assert("vs-506", is_reiser4_super(super));
65658 +       get_super_private(super)->blocks_free = nr;
65659 +}
65660 +
65661 +/* get mkfs unique identifier (->mkfs_id of @super) */
65662 +__u32 reiser4_mkfs_id(const struct super_block *super  /* super block
65663 +                                                          queried */ )
65664 +{
65665 +       assert("vpf-221", super != NULL);
65666 +       assert("vpf-222", is_reiser4_super(super));
65667 +       return get_super_private(super)->mkfs_id;
65668 +}
65669 +
65670 +/* amount of free blocks in the committed state (->blocks_free_committed) */
65671 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
65672 +{
65673 +       assert("vs-497", super != NULL);
65674 +       assert("vs-498", is_reiser4_super(super));
65675 +       return get_super_private(super)->blocks_free_committed;
65676 +}
65677 +
65678 +/* Amount of blocks in the file system at @super reserved for user @uid and
65679 +   group @gid. Each kind of reservation is counted only when the matching
65680 +   REISER4_SUPPORT_*_SPACE_RESERVATION switch is on; the root reserve also
65681 +   requires CAP_SYS_RESOURCE. */
65682 +long reiser4_reserved_blocks(const struct super_block *super,
65683 +                            uid_t uid, gid_t gid)
65684 +{
65685 +       long total = 0;
65686 +
65687 +       assert("nikita-456", super != NULL);
65688 +       assert("nikita-457", is_reiser4_super(super));
65689 +
65690 +       if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
65691 +               total += reserved_for_gid(super, gid);
65692 +       if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
65693 +               total += reserved_for_uid(super, uid);
65694 +       if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && capable(CAP_SYS_RESOURCE))
65695 +               total += reserved_for_root(super);
65696 +       return total;
65697 +}
65698 +
65699 +/* get value of grabbed blocks counter (->blocks_grabbed of @super) */
65700 +__u64 reiser4_grabbed_blocks(const struct super_block *super)
65701 +{
65702 +       assert("zam-512", super != NULL);
65703 +       assert("zam-513", is_reiser4_super(super));
65704 +
65705 +       return get_super_private(super)->blocks_grabbed;
65706 +}
65707 +
65708 +/* get value of the ->blocks_flush_reserved counter of @super */
65709 +__u64 reiser4_flush_reserved(const struct super_block *super)
65710 +{
65711 +       assert("vpf-285", super != NULL);
65712 +       assert("vpf-286", is_reiser4_super(super));
65713 +       return get_super_private(super)->blocks_flush_reserved;
65714 +}
65715 +
65716 +/* get value of counter of fake allocated formatted blocks */
65717 +__u64 reiser4_fake_allocated(const struct super_block *super)
65718 +{
65719 +       assert("zam-516", super != NULL);
65720 +       assert("zam-517", is_reiser4_super(super));
65721 +
65722 +       return get_super_private(super)->blocks_fake_allocated;
65723 +}
65724 +
65725 +/* get value of counter of fake allocated unformatted blocks */
65726 +__u64 reiser4_fake_allocated_unformatted(const struct super_block *super)
65727 +{
65728 +       assert("zam-516", super != NULL);
65729 +       assert("zam-517", is_reiser4_super(super));
65730 +
65731 +       return get_super_private(super)->blocks_fake_allocated_unformatted;
65732 +}
65733 +
65734 +/* get value of counter of clustered blocks (->blocks_clustered) */
65735 +__u64 reiser4_clustered_blocks(const struct super_block *super)
65736 +{
65737 +       assert("edward-601", super != NULL);
65738 +       assert("edward-602", is_reiser4_super(super));
65739 +
65740 +       return get_super_private(super)->blocks_clustered;
65741 +}
65742 +
65743 +/* return pointer to the space allocator embedded in @super's private data */
65744 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
65745 +                                                     *super)
65746 +{
65747 +       assert("nikita-1965", super != NULL);
65748 +       assert("nikita-1966", is_reiser4_super(super));
65749 +       return &get_super_private(super)->space_allocator;
65750 +}
65751 +
65752 +/* return fake inode (->fake) used to bind formatted nodes in the page cache */
65753 +struct inode *reiser4_get_super_fake(const struct super_block *super)
65754 +{
65755 +       assert("nikita-1757", super != NULL);
65756 +       return get_super_private(super)->fake;
65757 +}
65758 +
65759 +/* return fake inode (->cc) used to bind copied on capture nodes in page cache */
65760 +struct inode *reiser4_get_cc_fake(const struct super_block *super)
65761 +{
65762 +       assert("nikita-1757", super != NULL);
65763 +       return get_super_private(super)->cc;
65764 +}
65765 +
65766 +/* return fake inode (->bitmap) used to bind bitmaps and journal heads */
65767 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
65768 +{
65769 +       assert("nikita-17571", super != NULL);
65770 +       return get_super_private(super)->bitmap;
65771 +}
65772 +
65773 +/* return pointer to the tree (->tree) used by this file system */
65774 +reiser4_tree *reiser4_get_tree(const struct super_block *super)
65775 +{
65776 +       assert("nikita-460", super != NULL);
65777 +       assert("nikita-461", is_reiser4_super(super));
65778 +       return &get_super_private(super)->tree;
65779 +}
65780 +
65781 +/* Check that @super looks like a reiser4 super block: it is not NULL, has
65782 +   reiser4 private data, and its ->s_op points at the super_operations kept in
65783 +   that private data. This is mainly for use in assertions. */
65784 +int is_reiser4_super(const struct super_block *super)
65785 +{
65786 +       if (super == NULL || get_super_private(super) == NULL)
65787 +               return 0;
65788 +       return super->s_op == &(get_super_private(super)->ops.super);
65789 +}
65790 +
65791 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
65792 +{      /* non-zero iff bit @f is set in ->fs_flags of @super */
65793 +       return test_bit((int)f, &get_super_private(super)->fs_flags);
65794 +}
65795 +
65796 +/* amount of blocks reserved for given group in file system: always 0 here */
65797 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG,
65798 +                             gid_t gid UNUSED_ARG/* group id */)
65799 +{
65800 +       return 0;
65801 +}
65802 +
65803 +/* amount of blocks reserved for given user in file system: always 0 here */
65804 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG,
65805 +                             uid_t uid UNUSED_ARG/* user id */)
65806 +{
65807 +       return 0;
65808 +}
65809 +
65810 +/* amount of blocks reserved for super user in file system: always 0 here */
65811 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG)
65812 +{
65813 +       return 0;
65814 +}
65815 +
65816 +/*
65817 + * True if block number @blk makes sense for the file system at @super: it is
65818 + * either a fake block number or lies below the file system's block count.
65819 + */
65820 +int reiser4_blocknr_is_sane_for(const struct super_block *super,
65821 +                               const reiser4_block_nr *blk)
65822 +{
65823 +       int sane;
65824 +
65825 +       assert("nikita-2957", super != NULL);
65826 +       assert("nikita-2958", blk != NULL);
65827 +
65828 +       if (reiser4_blocknr_is_fake(blk))
65829 +               sane = 1;
65830 +       else
65831 +               sane = *blk < get_super_private(super)->block_count;
65832 +       return sane;
65833 +}
65834 +
65835 +#if REISER4_DEBUG
65836 +/*
65837 + * debugging aid: true if block number @blk is sane for the current super block
65838 + */
65839 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
65840 +{
65841 +       return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
65842 +}
65843 +#endif  /*  REISER4_DEBUG  */
65844 +
65845 +/* Make Linus happy.
65846 +   Local variables:
65847 +   c-indentation-style: "K&R"
65848 +   mode-name: "LC"
65849 +   c-basic-offset: 8
65850 +   tab-width: 8
65851 +   fill-column: 120
65852 +   End:
65853 +*/
65854 diff -puN /dev/null fs/reiser4/super.h
65855 --- /dev/null
65856 +++ a/fs/reiser4/super.h
65857 @@ -0,0 +1,466 @@
65858 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
65859 + * reiser4/README */
65860 +
65861 +/* Super-block functions. See super.c for details. */
65862 +
65863 +#if !defined(__REISER4_SUPER_H__)
65864 +#define __REISER4_SUPER_H__
65865 +
65866 +#include <linux/exportfs.h>
65867 +
65868 +#include "tree.h"
65869 +#include "entd.h"
65870 +#include "wander.h"
65871 +#include "fsdata.h"
65872 +#include "plugin/object.h"
65873 +#include "plugin/space/space_allocator.h"
65874 +
65875 +/*
65876 + * Flush algorithm parameters.
65877 + */
65878 +struct flush_params {
65879 +       unsigned relocate_threshold;
65880 +       unsigned relocate_distance;
65881 +       unsigned written_threshold;
65882 +       unsigned scan_maxnodes;
65883 +};
65884 +
65885 +typedef enum {
65886 +       /*
65887 +        * True if this file system doesn't support hard-links (multiple names)
65888 +        * for directories: this is default UNIX behavior.
65889 +        *
65890 +        * If hard-links on directories are not allowed, file system is Acyclic
65891 +        * Directed Graph (modulo dot, and dotdot, of course).
65892 +        *
65893 +        * This is used by reiser4_link().
65894 +        */
65895 +       REISER4_ADG = 0,
65896 +       /*
65897 +        * set if all nodes in internal tree have the same node layout plugin.
65898 +        * If so, znode_guess_plugin() will return tree->node_plugin instead
65899 +        * of guessing plugin by plugin id stored in the node.
65900 +        */
65901 +       REISER4_ONE_NODE_PLUGIN = 1,
65902 +       /* if set, bsd gid assignment is supported. */
65903 +       REISER4_BSD_GID = 2,
65904 +       /* [mac]_time are 32 bit in inode */
65905 +       REISER4_32_BIT_TIMES = 3,
65906 +       /* don't load all bitmap blocks at mount time (going by the flag name) */
65907 +       REISER4_DONT_LOAD_BITMAP = 5,
65908 +       /* enforce atomicity during write(2) */
65909 +       REISER4_ATOMIC_WRITE = 6,
65910 +       /* don't use write barriers in the log writer code. */
65911 +       REISER4_NO_WRITE_BARRIER = 7
65912 +} reiser4_fs_flag;
65913 +
65914 +/*
65915 + * VFS related operation vectors: super block, dentry and export operations.
65916 + */
65917 +struct object_ops {
65918 +       struct super_operations super;
65919 +       struct dentry_operations dentry;
65920 +       struct export_operations export;
65921 +};
65922 +
65923 +/* reiser4-specific part of super block
65924 +
65925 +   Locking
65926 +
65927 +   Fields immutable after mount:
65928 +
65929 +    ->oid*
65930 +    ->space*
65931 +    ->default_[ug]id
65932 +    ->mkfs_id
65933 +    ->trace_flags
65934 +    ->debug_flags
65935 +    ->fs_flags
65936 +    ->df_plug
65937 +    ->optimal_io_size
65938 +    ->plug
65939 +    ->flush
65940 +    ->u (bad name)
65941 +    ->txnmgr
65942 +    ->ra_params
65943 +    ->fsuid
65944 +    ->journal_header
65945 +    ->journal_footer
65946 +
65947 +   Fields protected by ->lnode_guard
65948 +
65949 +    ->lnode_htable
65950 +
65951 +   Fields protected by per-super block spin lock
65952 +
65953 +    ->block_count
65954 +    ->blocks_used
65955 +    ->blocks_free
65956 +    ->blocks_free_committed
65957 +    ->blocks_grabbed
65958 +    ->blocks_fake_allocated_unformatted
65959 +    ->blocks_fake_allocated
65960 +    ->blocks_flush_reserved
65961 +    ->eflushed
65962 +    ->blocknr_hint_default
65963 +
65964 +   After journal replaying during mount,
65965 +
65966 +    ->last_committed_tx
65967 +
65968 +   is protected by ->tmgr.commit_mutex
65969 +
65970 +   Invariants involving this data-type:
65971 +
65972 +      [sb-block-counts]
65973 +      [sb-grabbed]
65974 +      [sb-fake-allocated]
65975 +*/
65976 +struct reiser4_super_info_data {
65977 +       /*
65978 +        * guard spinlock which protects reiser4 super block fields (currently
65979 +        * blocks_free, blocks_free_committed)
65980 +        */
65981 +       spinlock_t guard;
65982 +
65983 +       /* next oid that will be returned by oid_allocate() */
65984 +       oid_t next_to_use;
65985 +       /* total number of used oids */
65986 +       oid_t oids_in_use;
65987 +
65988 +       /* space manager plugin */
65989 +       reiser4_space_allocator space_allocator;
65990 +
65991 +       /* reiser4 internal tree */
65992 +       reiser4_tree tree;
65993 +
65994 +       /*
65995 +        * default user id used for light-weight files without their own
65996 +        * stat-data.
65997 +        */
65998 +       uid_t default_uid;
65999 +
66000 +       /*
66001 +        * default group id used for light-weight files without their own
66002 +        * stat-data.
66003 +        */
66004 +       gid_t default_gid;
66005 +
66006 +       /* mkfs identifier generated at mkfs time. */
66007 +       __u32 mkfs_id;
66008 +       /* amount of blocks in a file system */
66009 +       __u64 block_count;
66010 +
66011 +       /* inviolable reserve */
66012 +       __u64 blocks_reserved;
66013 +
66014 +       /* amount of blocks used by file system data and meta-data. */
66015 +       __u64 blocks_used;
66016 +
66017 +       /*
66018 +        * amount of free blocks. This is "working" free blocks counter. It is
66019 +        * like "working" bitmap, please see block_alloc.c for description.
66020 +        */
66021 +       __u64 blocks_free;
66022 +
66023 +       /*
66024 +        * free block count for fs committed state. This is "commit" version of
66025 +        * free block counter.
66026 +        */
66027 +       __u64 blocks_free_committed;
66028 +
66029 +       /*
66030 +        * number of blocks reserved for further allocation, for all
66031 +        * threads.
66032 +        */
66033 +       __u64 blocks_grabbed;
66034 +
66035 +       /* number of fake allocated unformatted blocks in tree. */
66036 +       __u64 blocks_fake_allocated_unformatted;
66037 +
66038 +       /* number of fake allocated formatted blocks in tree. */
66039 +       __u64 blocks_fake_allocated;
66040 +
66041 +       /* number of blocks reserved for flush operations. */
66042 +       __u64 blocks_flush_reserved;
66043 +
66044 +       /* number of blocks reserved for cluster operations. */
66045 +       __u64 blocks_clustered;
66046 +
66047 +       /* unique file-system identifier */
66048 +       __u32 fsuid;
66049 +
66050 +       /* On-disk format version. If it differs from the disk_format
66051 +          plugin version, some format updates (e.g. enlarging the plugin
66052 +          set) may take place on mount. */
66053 +       int version;
66054 +
66055 +       /* file-system wide flags. See reiser4_fs_flag enum */
66056 +       unsigned long fs_flags;
66057 +
66058 +       /* transaction manager */
66059 +       txn_mgr tmgr;
66060 +
66061 +       /* ent thread */
66062 +       entd_context entd;
66063 +
66064 +       /* fake inode used to bind formatted nodes */
66065 +       struct inode *fake;
66066 +       /* inode used to bind bitmaps (and journal heads) */
66067 +       struct inode *bitmap;
66068 +       /* inode used to bind copied on capture nodes */
66069 +       struct inode *cc;
66070 +
66071 +       /* disk layout plugin */
66072 +       disk_format_plugin *df_plug;
66073 +
66074 +       /* disk layout specific part of reiser4 super info data */
66075 +       union {
66076 +               format40_super_info format40;
66077 +       } u;
66078 +
66079 +       /* value we return in st_blksize on stat(2) */
66080 +       unsigned long optimal_io_size;
66081 +
66082 +       /* parameters for the flush algorithm */
66083 +       struct flush_params flush;
66084 +
66085 +       /* pointers to jnodes for journal header and footer */
66086 +       jnode *journal_header;
66087 +       jnode *journal_footer;
66088 +
66089 +       journal_location jloc;
66090 +
66091 +       /* head block number of last committed transaction */
66092 +       __u64 last_committed_tx;
66093 +
66094 +       /*
66095 +        * we remember last written location for using as a hint for new block
66096 +        * allocation
66097 +        */
66098 +       __u64 blocknr_hint_default;
66099 +
66100 +       /* committed number of files (oid allocator state variable) */
66101 +       __u64 nr_files_committed;
66102 +
66103 +       struct formatted_ra_params ra_params;
66104 +
66105 +       /*
66106 +        * A mutex for serializing cut tree operations when out of free space:
66107 +        * only one cut_tree thread at a time is allowed to grab space from the
66108 +        * reserved area (it is 5% of disk space)
66109 +        */
66110 +       struct mutex delete_mutex;
66111 +       /* task owning ->delete_mutex */
66112 +       struct task_struct *delete_mutex_owner;
66113 +
66114 +       /* Diskmap's blocknumber */
66115 +       __u64 diskmap_block;
66116 +
66117 +       /* What to do in case of error */
66118 +       int onerror;
66119 +
66120 +       /* operations for objects on this file system */
66121 +       struct object_ops ops;
66122 +
66123 +       /*
66124 +        * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
66125 +        * more details
66126 +        */
66127 +       struct d_cursor_info d_info;
66128 +
66129 +#ifdef CONFIG_REISER4_BADBLOCKS
66130 +       /* Alternative master superblock offset (in bytes) */
66131 +       unsigned long altsuper;
66132 +#endif
66133 +       struct repacker *repacker;
66134 +       struct page *status_page;
66135 +       struct bio *status_bio;
66136 +
66137 +#if REISER4_DEBUG
66138 +       /*
66139 +        * minimum used blocks value (includes super blocks, bitmap blocks and
66140 +        * other fs reserved areas), depends on fs format and fs size.
66141 +        */
66142 +       __u64 min_blocks_used;
66143 +
66144 +       /*
66145 +        * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
66146 +        * are kept on a list anchored at sbinfo->all_jnodes. This list is
66147 +        * protected by sbinfo->all_guard spin lock. This lock should be taken
66148 +        * with _irq modifier, because it is also modified from interrupt
66149 +        * contexts (by RCU).
66150 +        */
66151 +       spinlock_t all_guard;
66152 +       /* list of all jnodes */
66153 +       struct list_head all_jnodes;
66154 +#endif
66155 +       struct dentry *debugfs_root;
66156 +};
66157 +
66158 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
66159 +                                                         super_block * super);
66160 +
66161 +/* Return reiser4-specific part of super block */
66162 +static inline reiser4_super_info_data *get_super_private(const struct
66163 +                                                        super_block * super)
66164 +{
66165 +       assert("nikita-447", super != NULL);
66166 +
66167 +       return (reiser4_super_info_data *) super->s_fs_info;
66168 +}
66169 +
66170 +/* get ent context for the @super */
66171 +static inline entd_context *get_entd_context(struct super_block *super)
66172 +{
66173 +       return &get_super_private(super)->entd;
66174 +}
66175 +
66176 +/* "Current" super-block: main super block used during current system
66177 +   call. Reference to this super block is stored in reiser4_context. */
66178 +static inline struct super_block *reiser4_get_current_sb(void)
66179 +{
66180 +       return get_current_context()->super;
66181 +}
66182 +
66183 +/* Reiser4-specific part of "current" super-block: main super block used
66184 +   during current system call. Reference to this super block is stored in
66185 +   reiser4_context. */
66186 +static inline reiser4_super_info_data *get_current_super_private(void)
66187 +{
66188 +       return get_super_private(reiser4_get_current_sb());
66189 +}
66190 +
66191 +static inline struct formatted_ra_params *get_current_super_ra_params(void)
66192 +{
66193 +       return &(get_current_super_private()->ra_params);
66194 +}
66195 +
66196 +/*
66197 + * true, if file system on @super is read-only
66198 + */
66199 +static inline int rofs_super(struct super_block *super)
66200 +{
66201 +       return super->s_flags & MS_RDONLY;
66202 +}
66203 +
66204 +/*
66205 + * true, if @tree represents read-only file system
66206 + */
66207 +static inline int rofs_tree(reiser4_tree * tree)
66208 +{
66209 +       return rofs_super(tree->super);
66210 +}
66211 +
66212 +/*
66213 + * true, if file system where @inode lives on, is read-only
66214 + */
66215 +static inline int rofs_inode(struct inode *inode)
66216 +{
66217 +       return rofs_super(inode->i_sb);
66218 +}
66219 +
66220 +/*
66221 + * true, if file system where @node lives on, is read-only
66222 + */
66223 +static inline int rofs_jnode(jnode * node)
66224 +{
66225 +       return rofs_tree(jnode_get_tree(node));
66226 +}
66227 +
66228 +extern __u64 reiser4_current_block_count(void);
66229 +
66230 +extern void build_object_ops(struct super_block *super, struct object_ops *ops);
66231 +
66232 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
66233 +
66234 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
66235 +{
66236 +       spin_lock(&(sbinfo->guard));
66237 +}
66238 +
66239 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
66240 +{
66241 +       assert_spin_locked(&(sbinfo->guard));
66242 +       spin_unlock(&(sbinfo->guard));
66243 +}
66244 +
66245 +extern __u64 reiser4_flush_reserved(const struct super_block *);
66246 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
66247 +extern long reiser4_statfs_type(const struct super_block *super);
66248 +extern __u64 reiser4_block_count(const struct super_block *super);
66249 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
66250 +extern __u64 reiser4_data_blocks(const struct super_block *super);
66251 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
66252 +extern __u64 reiser4_free_blocks(const struct super_block *super);
66253 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
66254 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
66255 +
66256 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
66257 +
66258 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
66259 +extern __u64 reiser4_fake_allocated(const struct super_block *);
66260 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
66261 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
66262 +
66263 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
66264 +                                   gid_t gid);
66265 +
66266 +extern reiser4_space_allocator *
66267 +reiser4_get_space_allocator(const struct super_block *super);
66268 +extern reiser4_oid_allocator *
66269 +reiser4_get_oid_allocator(const struct super_block *super);
66270 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
66271 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
66272 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
66273 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
66274 +extern int is_reiser4_super(const struct super_block *super);
66275 +
66276 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
66277 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
66278 +                                      const reiser4_block_nr * blk);
66279 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
66280 +extern int reiser4_done_super(struct super_block *s);
66281 +
66282 +/* steps of fill super */
66283 +extern int reiser4_init_fs_info(struct super_block *);
66284 +extern void reiser4_done_fs_info(struct super_block *);
66285 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
66286 +extern int reiser4_init_read_super(struct super_block *, int silent);
66287 +extern int reiser4_init_root_inode(struct super_block *);
66288 +extern reiser4_plugin *get_default_plugin(pset_member memb);
66289 +
66290 +/* Maximal possible object id. */
66291 +#define  ABSOLUTE_MAX_OID ((oid_t)~0)
66292 +
66293 +#define OIDS_RESERVED  (1 << 16)
66294 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
66295 +oid_t oid_allocate(struct super_block *);
66296 +int oid_release(struct super_block *, oid_t);
66297 +oid_t oid_next(const struct super_block *);
66298 +void oid_count_allocated(void);
66299 +void oid_count_released(void);
66300 +long oids_used(const struct super_block *);
66301 +
66302 +#if REISER4_DEBUG
66303 +void print_fs_info(const char *prefix, const struct super_block *);
66304 +#endif
66305 +
66306 +extern void destroy_reiser4_cache(struct kmem_cache **);
66307 +
66308 +extern struct super_operations reiser4_super_operations;
66309 +extern struct export_operations reiser4_export_operations;
66310 +extern struct dentry_operations reiser4_dentry_operations;
66311 +
66312 +/* __REISER4_SUPER_H__ */
66313 +#endif
66314 +
66315 +/*
66316 + * Local variables:
66317 + * c-indentation-style: "K&R"
66318 + * mode-name: "LC"
66319 + * c-basic-offset: 8
66320 + * tab-width: 8
66321 + * fill-column: 120
66322 + * End:
66323 + */
66324 diff -puN /dev/null fs/reiser4/super_ops.c
66325 --- /dev/null
66326 +++ a/fs/reiser4/super_ops.c
66327 @@ -0,0 +1,725 @@
66328 +/* Copyright 2005 by Hans Reiser, licensing governed by
66329 + * reiser4/README */
66330 +
66331 +#include "inode.h"
66332 +#include "page_cache.h"
66333 +#include "ktxnmgrd.h"
66334 +#include "flush.h"
66335 +#include "safe_link.h"
66336 +
66337 +#include <linux/vfs.h>
66338 +#include <linux/writeback.h>
66339 +#include <linux/mount.h>
66340 +#include <linux/seq_file.h>
66341 +#include <linux/debugfs.h>
66342 +
66343 +/* slab cache for inodes */
66344 +static struct kmem_cache *inode_cache;
66345 +
66346 +static struct dentry *reiser4_debugfs_root = NULL;
66347 +
66348 +/**
66349 + * init_once - constructor for reiser4 inodes
66350 + * @obj: object from the reiser4 inode cache (struct reiser4_inode_object)
66351 + *       to be initialized
66352 + *
66353 + * Initialization function to be called when new page is allocated by reiser4
66354 + * inode cache. It is set on inode cache creation.
66355 + */
66356 +static void init_once(void *obj)
66357 +{
66358 +       struct reiser4_inode_object *info;
66359 +
66360 +       info = obj;
66361 +
66362 +       /* initialize vfs inode */
66363 +       inode_init_once(&info->vfs_inode);
66364 +
66365 +       /*
66366 +        * initialize reiser4 specific part of inode.
66367 +        * NOTE-NIKITA add here initializations for locks, list heads,
66368 +        * etc. that will be added to our private inode part.
66369 +        */
66370 +       INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
66371 +       init_rwsem(&info->p.conv_sem);
66372 +       /* init semaphore which is used during inode loading */
66373 +       loading_init_once(&info->p);
66374 +       INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
66375 +                       GFP_ATOMIC);
66376 +#if REISER4_DEBUG
66377 +       info->p.nr_jnodes = 0;
66378 +#endif
66379 +}
66380 +
66381 +/**
66382 + * init_inodes - create inode cache
66383 + *
66384 + * Initializes slab cache of inodes. It is part of reiser4 module initialization
66385 + */
66386 +static int init_inodes(void)
66387 +{
66388 +       inode_cache = kmem_cache_create("reiser4_inode",
66389 +                                       sizeof(struct reiser4_inode_object),
66390 +                                       0,
66391 +                                       SLAB_HWCACHE_ALIGN |
66392 +                                       SLAB_RECLAIM_ACCOUNT, init_once);
66393 +       if (inode_cache == NULL)
66394 +               return RETERR(-ENOMEM);
66395 +       return 0;
66396 +}
66397 +
66398 +/**
66399 + * done_inodes - delete inode cache
66400 + *
66401 + * This is called on reiser4 module unloading or system shutdown.
66402 + */
66403 +static void done_inodes(void)
66404 +{
66405 +       destroy_reiser4_cache(&inode_cache);
66406 +}
66407 +
66408 +/**
66409 + * reiser4_alloc_inode - alloc_inode of super operations
66410 + * @super: super block new inode is allocated for
66411 + *
66412 + * Allocates new inode, initializes reiser4 specific part of it.
66413 + */
66414 +static struct inode *reiser4_alloc_inode(struct super_block *super)
66415 +{
66416 +       struct reiser4_inode_object *obj;
66417 +
66418 +       assert("nikita-1696", super != NULL);
66419 +       obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
66420 +       if (obj != NULL) {
66421 +               reiser4_inode *info;
66422 +
66423 +               info = &obj->p;
66424 +
66425 +               info->pset = plugin_set_get_empty();
66426 +               info->hset = plugin_set_get_empty();
66427 +               info->extmask = 0;
66428 +               info->locality_id = 0ull;
66429 +               info->plugin_mask = 0;
66430 +               info->heir_mask = 0;
66431 +#if !REISER4_INO_IS_OID
66432 +               info->oid_hi = 0;
66433 +#endif
66434 +               reiser4_seal_init(&info->sd_seal, NULL, NULL);
66435 +               coord_init_invalid(&info->sd_coord, NULL);
66436 +               info->flags = 0;
66437 +               spin_lock_init(&info->guard);
66438 +               /* this deals with info's loading semaphore */
66439 +               loading_alloc(info);
66440 +               info->vroot = UBER_TREE_ADDR;
66441 +               return &obj->vfs_inode;
66442 +       } else
66443 +               return NULL;
66444 +}
66445 +
66446 +/**
66447 + * reiser4_destroy_inode - destroy_inode of super operations
66448 + * @inode: inode being destroyed
66449 + *
66450 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
66451 + */
66452 +static void reiser4_destroy_inode(struct inode *inode)
66453 +{
66454 +       reiser4_inode *info;
66455 +
66456 +       info = reiser4_inode_data(inode);
66457 +
66458 +       assert("vs-1220", inode_has_no_jnodes(info));
66459 +
66460 +       if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
66461 +               file_plugin *fplug = inode_file_plugin(inode);
66462 +               if (fplug->destroy_inode != NULL)
66463 +                       fplug->destroy_inode(inode);
66464 +       }
66465 +       reiser4_dispose_cursors(inode);
66466 +       if (info->pset)
66467 +               plugin_set_put(info->pset);
66468 +       if (info->hset)
66469 +               plugin_set_put(info->hset);
66470 +
66471 +       /*
66472 +        * cannot add similar assertion about ->i_list as prune_icache returns
66473 +        * inode into slab with dangling ->list.{next,prev}. This is safe,
66474 +        * because they are re-initialized in new_inode().
66475 +        */
66476 +       assert("nikita-2895", list_empty(&inode->i_dentry));
66477 +       assert("nikita-2896", hlist_unhashed(&inode->i_hash));
66478 +       assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
66479 +
66480 +       /* this deals with info's loading semaphore */
66481 +       loading_destroy(info);
66482 +
66483 +       kmem_cache_free(inode_cache,
66484 +                       container_of(info, struct reiser4_inode_object, p));
66485 +}
66486 +
66487 +/**
66488 + * reiser4_dirty_inode - dirty_inode of super operations
66489 + * @inode: inode being dirtied
66490 + *
66491 + * Updates stat data.
66492 + */
66493 +static void reiser4_dirty_inode(struct inode *inode)
66494 +{
66495 +       int result;
66496 +
66497 +       if (!is_in_reiser4_context())
66498 +               return;
66499 +       assert("", !IS_RDONLY(inode));
66500 +       assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
66501 +                   get_current_context()->grabbed_blocks));
66502 +
66503 +       result = reiser4_update_sd(inode);
66504 +       if (result)
66505 +               warning("", "failed to dirty inode for %llu: %d",
66506 +                       get_inode_oid(inode), result);
66507 +}
66508 +
66509 +/**
66510 + * reiser4_delete_inode - delete_inode of super operations
66511 + * @inode: inode to delete
66512 + *
66513 + * Calls file plugin's delete_object method to delete object items from
66514 + * filesystem tree and calls clear_inode.
66515 + */
66516 +static void reiser4_delete_inode(struct inode *inode)
66517 +{
66518 +       reiser4_context *ctx;
66519 +       file_plugin *fplug;
66520 +
66521 +       ctx = reiser4_init_context(inode->i_sb);
66522 +       if (IS_ERR(ctx)) {
66523 +               warning("vs-15", "failed to init context");
66524 +               return;
66525 +       }
66526 +
66527 +       if (is_inode_loaded(inode)) {
66528 +               fplug = inode_file_plugin(inode);
66529 +               if (fplug != NULL && fplug->delete_object != NULL)
66530 +                       fplug->delete_object(inode);
66531 +       }
66532 +
66533 +       truncate_inode_pages(&inode->i_data, 0);
66534 +       inode->i_blocks = 0;
66535 +       clear_inode(inode);
66536 +       reiser4_exit_context(ctx);
66537 +}
66538 +
66539 +/**
66540 + * reiser4_put_super - put_super of super operations
66541 + * @super: super block to free
66542 + *
66543 + * Stops daemons and releases resources; in short, unmounts.
66544 + */
66545 +static void reiser4_put_super(struct super_block *super)
66546 +{
66547 +       reiser4_super_info_data *sbinfo;
66548 +       reiser4_context *ctx;
66549 +
66550 +       sbinfo = get_super_private(super);
66551 +       assert("vs-1699", sbinfo);
66552 +
66553 +       debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
66554 +       debugfs_remove(sbinfo->tmgr.debugfs_id_count);
66555 +       debugfs_remove(sbinfo->debugfs_root);
66556 +
66557 +       ctx = reiser4_init_context(super);
66558 +       if (IS_ERR(ctx)) {
66559 +               warning("vs-17", "failed to init context");
66560 +               return;
66561 +       }
66562 +
66563 +       /* have disk format plugin to free its resources */
66564 +       if (get_super_private(super)->df_plug->release)
66565 +               get_super_private(super)->df_plug->release(super);
66566 +
66567 +       reiser4_done_formatted_fake(super);
66568 +
66569 +       /* stop daemons: ktxnmgr and entd */
66570 +       reiser4_done_entd(super);
66571 +       reiser4_done_ktxnmgrd(super);
66572 +       reiser4_done_txnmgr(&sbinfo->tmgr);
66573 +
66574 +       reiser4_done_fs_info(super);
66575 +       reiser4_exit_context(ctx);
66576 +}
66577 +
66578 +/**
66579 + * reiser4_write_super - write_super of super operations
66580 + * @super: super block to write
66581 + *
66582 + * Captures znode associated with super block, commits all transactions.
66583 + */
66584 +static void reiser4_write_super(struct super_block *super)
66585 +{
66586 +       int ret;
66587 +       reiser4_context *ctx;
66588 +
66589 +       assert("vs-1700", !rofs_super(super));
66590 +
66591 +       ctx = reiser4_init_context(super);
66592 +       if (IS_ERR(ctx)) {
66593 +               warning("vs-16", "failed to init context");
66594 +               return;
66595 +       }
66596 +
66597 +       ret = reiser4_capture_super_block(super);
66598 +       if (ret != 0)
66599 +               warning("vs-1701",
66600 +                       "reiser4_capture_super_block failed in write_super: %d",
66601 +                       ret);
66602 +       ret = txnmgr_force_commit_all(super, 0);
66603 +       if (ret != 0)
66604 +               warning("jmacd-77113",
66605 +                       "txn_force failed in write_super: %d", ret);
66606 +
66607 +       super->s_dirt = 0;
66608 +
66609 +       reiser4_exit_context(ctx);
66610 +}
66611 +
66612 +/**
66613 + * reiser4_statfs - statfs of super operations
66614 + * @dentry: dentry whose super block (->d_sb) is queried
66615 + * @statfs: buffer to fill with statistics
66616 + *
66617 + * Returns information about filesystem.
66618 + */
66619 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
66620 +{
66621 +       sector_t total;
66622 +       sector_t reserved;
66623 +       sector_t free;
66624 +       sector_t forroot;
66625 +       sector_t deleted;
66626 +       reiser4_context *ctx;
66627 +       struct super_block *super = dentry->d_sb;
66628 +
66629 +       assert("nikita-408", super != NULL);
66630 +       assert("nikita-409", statfs != NULL);
66631 +
66632 +       ctx = reiser4_init_context(super);
66633 +       if (IS_ERR(ctx))
66634 +               return PTR_ERR(ctx);
66635 +
66636 +       statfs->f_type = reiser4_statfs_type(super);
66637 +       statfs->f_bsize = super->s_blocksize;
66638 +
66639 +       /*
66640 +        * 5% of total block space is reserved. This is needed for flush and
66641 +        * for truncates (so that we are able to perform truncate/unlink even
66642 +        * on the otherwise completely full file system). If this reservation
66643 +        * is hidden from statfs(2), users will mistakenly guess that they
66644 +        * have enough free space to complete some operation, which is
66645 +        * frustrating.
66646 +        *
66647 +        * Another possible solution is to subtract ->blocks_reserved from
66648 +        * ->f_bfree, but changing available space seems less intrusive than
66649 +        * letting the user see 5% of disk space as used directly after
66650 +        * mkfs.
66651 +        */
66652 +       total = reiser4_block_count(super);
66653 +       reserved = get_super_private(super)->blocks_reserved;
66654 +       deleted = txnmgr_count_deleted_blocks();
66655 +       free = reiser4_free_blocks(super) + deleted;
66656 +       forroot = reiser4_reserved_blocks(super, 0, 0);
66657 +
66658 +       /*
66659 +        * These counters may be in inconsistent state because we take the
66660 +        * values without keeping any global spinlock.  Here we do a sanity
66661 +        * check that free block counter does not exceed the number of all
66662 +        * blocks.
66663 +        */
66664 +       if (free > total)
66665 +               free = total;
66666 +       statfs->f_blocks = total - reserved;
66667 +       /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
66668 +       if (free > reserved)
66669 +               free -= reserved;
66670 +       else
66671 +               free = 0;
66672 +       statfs->f_bfree = free;
66673 +
66674 +       if (free > forroot)
66675 +               free -= forroot;
66676 +       else
66677 +               free = 0;
66678 +       statfs->f_bavail = free;
66679 +
66680 +       statfs->f_files = 0;
66681 +       statfs->f_ffree = 0;
66682 +
66683 +       /* maximal acceptable name length depends on directory plugin. */
66684 +       assert("nikita-3351", super->s_root->d_inode != NULL);
66685 +       statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
66686 +       reiser4_exit_context(ctx);
66687 +       return 0;
66688 +}
66689 +
66690 +/**
66691 + * reiser4_clear_inode - clear_inode of super operation
66692 + * @inode: inode about to destroy
66693 + *
66694 + * Does sanity checks: inode being destroyed should have all jnodes detached.
66695 + */
66696 +static void reiser4_clear_inode(struct inode *inode)
66697 +{
66698 +#if REISER4_DEBUG
66699 +       reiser4_inode *r4_inode;
66700 +
66701 +       r4_inode = reiser4_inode_data(inode);
66702 +       if (!inode_has_no_jnodes(r4_inode))
66703 +               warning("vs-1732", "reiser4 inode has %ld jnodes\n",
66704 +                       r4_inode->nr_jnodes);
66705 +#endif
66706 +}
66707 +
66708 +/**
66709 + * reiser4_sync_inodes - sync_inodes of super operations
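66710 + * @super: super block whose dirty inodes are to be written
66711 + * @wbc: writeback control; dereferenced unconditionally, so must not be NULL
66712 + *
66713 + * This method is called by background and non-background writeback. Reiser4's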
66714 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
66715 + * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
66716 + * mapping - dirty pages get into atoms. Writeout is called to flush some
66717 + * atoms.
66718 + */
66719 +static void reiser4_sync_inodes(struct super_block *super,
66720 +                              struct writeback_control *wbc)
66721 +{
66722 +       reiser4_context *ctx;
66723 +       long to_write;
66724 +
66725 +       if (wbc->for_kupdate)
66726 +               /* reiser4 has its own means of periodical write-out */
66727 +               return;
66728 +
66729 +       to_write = wbc->nr_to_write;
66730 +       assert("vs-49", wbc->older_than_this == NULL);
66731 +
66732 +       ctx = reiser4_init_context(super);
66733 +       if (IS_ERR(ctx)) {
66734 +               warning("vs-13", "failed to init context");
66735 +               return;
66736 +       }
66737 +
66738 +       /*
66739 +        * call reiser4_writepages for each of dirty inodes to turn dirty pages
66740 +        * into transactions if they were not yet.
66741 +        */
66742 +       generic_sync_sb_inodes(super, wbc);
66743 +
66744 +       /* flush goes here */
66745 +       wbc->nr_to_write = to_write;
66746 +       reiser4_writeout(super, wbc);
66747 +
66748 +       /* avoid recursive calls to ->sync_inodes */
66749 +       context_set_commit_async(ctx);
66750 +       reiser4_exit_context(ctx);
66751 +}
66752 +
66753 +/**
66754 + * reiser4_show_options - show_options of super operations
66755 + * @m: file where to write information
66756 + * @mnt: mount structure
66757 + *
66758 + * Makes reiser4 mount options visible in /proc/mounts.
66759 + */
66760 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
66761 +{
66762 +       struct super_block *super;
66763 +       reiser4_super_info_data *sbinfo;
66764 +
66765 +       super = mnt->mnt_sb;
66766 +       sbinfo = get_super_private(super);
66767 +
66768 +       seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
66769 +       seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
66770 +       seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
66771 +       seq_printf(m, ",atom_max_flushers=0x%x",
66772 +                  sbinfo->tmgr.atom_max_flushers);
66773 +       seq_printf(m, ",cbk_cache_slots=0x%x",
66774 +                  sbinfo->tree.cbk_cache.nr_slots);
66775 +
66776 +       return 0;
66777 +}
66778 +
66779 +struct super_operations reiser4_super_operations = {
66780 +       .alloc_inode = reiser4_alloc_inode,
66781 +       .destroy_inode = reiser4_destroy_inode,
66782 +       .dirty_inode = reiser4_dirty_inode,
66783 +       .delete_inode = reiser4_delete_inode,
66784 +       .put_super = reiser4_put_super,
66785 +       .write_super = reiser4_write_super,
66786 +       .statfs = reiser4_statfs,
66787 +       .clear_inode = reiser4_clear_inode,
66788 +       .sync_inodes = reiser4_sync_inodes,
66789 +       .show_options = reiser4_show_options
66790 +};
66791 +
66792 +/**
66793 + * fill_super - initialize super block on mount
66794 + * @super: super block to fill
66795 + * @data: reiser4 specific mount options
66796 + * @silent: passed through to reiser4_init_read_super()
66797 + *
66798 + * Passed to get_sb_bdev() by reiser4_get_sb(). Mounts the filesystem.
66799 + */
66800 +static int fill_super(struct super_block *super, void *data, int silent)
66801 +{
66802 +       reiser4_context ctx;
66803 +       int result;
66804 +       reiser4_super_info_data *sbinfo;
66805 +
66806 +       assert("zam-989", super != NULL);
66807 +
66808 +       super->s_op = NULL;
66809 +       init_stack_context(&ctx, super);
66810 +
66811 +       /* allocate reiser4 specific super block */
66812 +       if ((result = reiser4_init_fs_info(super)) != 0)
66813 +               goto failed_init_sinfo;
66814 +
66815 +       sbinfo = get_super_private(super);
66816 +       /* initialize various reiser4 parameters, parse mount options */
66817 +       if ((result = reiser4_init_super_data(super, data)) != 0)
66818 +               goto failed_init_super_data;
66819 +
66820 +       /* read reiser4 master super block, initialize disk format plugin */
66821 +       if ((result = reiser4_init_read_super(super, silent)) != 0)
66822 +               goto failed_init_read_super;
66823 +
66824 +       /* initialize transaction manager */
66825 +       reiser4_init_txnmgr(&sbinfo->tmgr);
66826 +
66827 +       /* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
66828 +       if ((result = reiser4_init_ktxnmgrd(super)) != 0)
66829 +               goto failed_init_ktxnmgrd;
66830 +
66831 +       /* initialize entd context and start kernel thread entd */
66832 +       if ((result = reiser4_init_entd(super)) != 0)
66833 +               goto failed_init_entd;
66834 +
66835 +       /* initialize address spaces for formatted nodes and bitmaps */
66836 +       if ((result = reiser4_init_formatted_fake(super)) != 0)
66837 +               goto failed_init_formatted_fake;
66838 +
66839 +       /* call ->init_format() of the disk format plugin */
66840 +       if ((result = get_super_private(super)->df_plug->init_format(super,
66841 +                                                                   data)) != 0)
66842 +               goto failed_init_disk_format;
66843 +
66844 +       /*
66845 +        * There are some 'committed' versions of reiser4 super block counters,
66846 +        * which correspond to reiser4 on-disk state. These counters are
66847 +        * initialized here
66848 +        */
66849 +       sbinfo->blocks_free_committed = sbinfo->blocks_free;
66850 +       sbinfo->nr_files_committed = oids_used(super);
66851 +
66852 +       /* get inode of root directory */
66853 +       if ((result = reiser4_init_root_inode(super)) != 0)
66854 +               goto failed_init_root_inode;
66855 +
66856 +       if ((result = get_super_private(super)->df_plug->version_update(super)) != 0)
66857 +               goto failed_update_format_version;
66858 +
66859 +       process_safelinks(super);
66860 +       reiser4_exit_context(&ctx);
66861 +
66862 +       sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
66863 +                                                 reiser4_debugfs_root);
66864 +       if (sbinfo->debugfs_root) {
66865 +               sbinfo->tmgr.debugfs_atom_count =
66866 +                       debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
66867 +                                          sbinfo->debugfs_root,
66868 +                                          &sbinfo->tmgr.atom_count);
66869 +               sbinfo->tmgr.debugfs_id_count =
66870 +                       debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
66871 +                                          sbinfo->debugfs_root,
66872 +                                          &sbinfo->tmgr.id_count);
66873 +       }
66874 +       return 0;
66875 +       /* error unwinding: undo the completed steps in reverse order */
66876 + failed_update_format_version:
66877 + failed_init_root_inode:
66878 +       if (sbinfo->df_plug->release)
66879 +               sbinfo->df_plug->release(super);
66880 + failed_init_disk_format:
66881 +       reiser4_done_formatted_fake(super);
66882 + failed_init_formatted_fake:
66883 +       reiser4_done_entd(super);
66884 + failed_init_entd:
66885 +       reiser4_done_ktxnmgrd(super);
66886 + failed_init_ktxnmgrd:
66887 +       reiser4_done_txnmgr(&sbinfo->tmgr);
66888 + failed_init_read_super:
66889 + failed_init_super_data:
66890 +       reiser4_done_fs_info(super);
66891 + failed_init_sinfo:
66892 +       reiser4_exit_context(&ctx);
66893 +       return result;
66894 +}
66895 +
66896 +/**
66897 + * reiser4_get_sb - get_sb of file_system_type operations
66898 + * @fs_type: file system type, handed to get_sb_bdev()
66899 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
66900 + * @dev_name: block device file name
66901 + * @data: specific mount options
66902 + *
66903 + * Reiser4 mount entry. All arguments, @mnt included, go to get_sb_bdev().
66904 + */
66905 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
66906 +                       const char *dev_name, void *data, struct vfsmount *mnt)
66907 +{
66908 +       return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
66909 +}
66910 +
66911 +/* reiser4 filesystem type; requires a block device (FS_REQUIRES_DEV) */
66912 +static struct file_system_type reiser4_fs_type = {
66913 +       .owner = THIS_MODULE,
66914 +       .name = "reiser4",
66915 +       .fs_flags = FS_REQUIRES_DEV,
66916 +       .get_sb = reiser4_get_sb,
66917 +       .kill_sb = kill_block_super,
66918 +       .next = NULL
66919 +};
66920 +
66921 +void destroy_reiser4_cache(struct kmem_cache **cachep)
66922 +{
66923 +       BUG_ON(*cachep == NULL);        /* already destroyed, or never set */
66924 +       kmem_cache_destroy(*cachep);
66925 +       *cachep = NULL;                 /* so a repeated call trips BUG_ON */
66926 +}
66927 +
66928 +/**
66929 + * init_reiser4 - reiser4 initialization entry point
66930 + *
66931 + * Initializes reiser4 caches and plugins, then registers the filesystem type.
66932 + * Called at boot or on module load; on failure undoes the completed steps.
66933 + */
66934 +static int __init init_reiser4(void)
66935 +{
66936 +       int result;
66937 +
66938 +       printk(KERN_INFO
66939 +              "Loading Reiser4. "
66940 +              "See www.namesys.com for a description of Reiser4.\n");
66941 +
66942 +       /* initialize slab cache of inodes */
66943 +       if ((result = init_inodes()) != 0)
66944 +               goto failed_inode_cache;
66945 +
66946 +       /* initialize cache of znodes */
66947 +       if ((result = init_znodes()) != 0)
66948 +               goto failed_init_znodes;
66949 +
66950 +       /* initialize all plugins */
66951 +       if ((result = init_plugins()) != 0)
66952 +               goto failed_init_plugins;
66953 +
66954 +       /* initialize cache of plugin_set-s and plugin_set's hash table */
66955 +       if ((result = init_plugin_set()) != 0)
66956 +               goto failed_init_plugin_set;
66957 +
66958 +       /* initialize caches of txn_atom-s and txn_handle-s */
66959 +       if ((result = init_txnmgr_static()) != 0)
66960 +               goto failed_init_txnmgr_static;
66961 +
66962 +       /* initialize cache of jnodes */
66963 +       if ((result = init_jnodes()) != 0)
66964 +               goto failed_init_jnodes;
66965 +
66966 +       /* initialize cache of flush queues */
66967 +       if ((result = reiser4_init_fqs()) != 0)
66968 +               goto failed_init_fqs;
66969 +
66970 +       /* initialize cache of structures attached to dentry->d_fsdata */
66971 +       if ((result = reiser4_init_dentry_fsdata()) != 0)
66972 +               goto failed_init_dentry_fsdata;
66973 +
66974 +       /* initialize cache of structures attached to file->private_data */
66975 +       if ((result = reiser4_init_file_fsdata()) != 0)
66976 +               goto failed_init_file_fsdata;
66977 +
66978 +       /*
66979 +        * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
66980 +        * more details
66981 +        */
66982 +       if ((result = reiser4_init_d_cursor()) != 0)
66983 +               goto failed_init_d_cursor;
66984 +
66985 +       if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
66986 +               reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
66987 +               return 0;
66988 +       }
66989 +       /* registration failed: fall through all teardown steps */
66990 +       reiser4_done_d_cursor();
66991 + failed_init_d_cursor:
66992 +       reiser4_done_file_fsdata();
66993 + failed_init_file_fsdata:
66994 +       reiser4_done_dentry_fsdata();
66995 + failed_init_dentry_fsdata:
66996 +       reiser4_done_fqs();
66997 + failed_init_fqs:
66998 +       done_jnodes();
66999 + failed_init_jnodes:
67000 +       done_txnmgr_static();
67001 + failed_init_txnmgr_static:
67002 +       done_plugin_set();
67003 + failed_init_plugin_set:
67004 + failed_init_plugins:   /* no plugin teardown is called here */
67005 +       done_znodes();
67006 + failed_init_znodes:
67007 +       done_inodes();
67008 + failed_inode_cache:
67009 +       return result;
67010 +}
67011 +
67012 +/**
67013 + * done_reiser4 - reiser4 exit entry point
67014 + *
67015 + * Unregisters the reiser4 filesystem type and destroys the caches. It is
67016 + * called on shutdown or at module unload.
67017 + */
67018 +static void __exit done_reiser4(void)
67019 +{
67020 +       int result;
67021 +
67022 +       debugfs_remove(reiser4_debugfs_root);
67023 +       result = unregister_filesystem(&reiser4_fs_type);
67024 +       BUG_ON(result != 0);
67025 +       reiser4_done_d_cursor();
67026 +       reiser4_done_file_fsdata();
67027 +       reiser4_done_dentry_fsdata();
67028 +       reiser4_done_fqs();
67029 +       done_jnodes();
67030 +       done_txnmgr_static();
67031 +       done_plugin_set();
67032 +       done_znodes();
67033 +       destroy_reiser4_cache(&inode_cache); /* init uses done_inodes() */
67034 +}
67035 +
67036 +module_init(init_reiser4);
67037 +module_exit(done_reiser4);
67038 +
67039 +MODULE_DESCRIPTION("Reiser4 filesystem");
67040 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
67041 +
67042 +MODULE_LICENSE("GPL");
67043 +
67044 +/*
67045 + * Local variables:
67046 + * c-indentation-style: "K&R"
67047 + * mode-name: "LC"
67048 + * c-basic-offset: 8
67049 + * tab-width: 8
67050 + * fill-column: 79
67051 + * End:
67052 + */
67053 diff -puN /dev/null fs/reiser4/tap.c
67054 --- /dev/null
67055 +++ a/fs/reiser4/tap.c
67056 @@ -0,0 +1,376 @@
67057 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67058 + * reiser4/README */
67059 +
67060 +/*
67061 +   Tree Access Pointer (tap).
67062 +
67063 +   A tap is a data structure combining coord and lock handle (mostly). It is
67064 +   useful when one has to scan tree nodes (for example, in readdir, or flush),
67065 +   because tap functions can move a tap in either direction, transparently
67066 +   crossing unit/item/node borders.
67067 +
67068 +   Tap doesn't provide automatic synchronization of its fields as it is
67069 +   supposed to be per-thread object.
67070 +*/
67071 +
67072 +#include "forward.h"
67073 +#include "debug.h"
67074 +#include "coord.h"
67075 +#include "tree.h"
67076 +#include "context.h"
67077 +#include "tap.h"
67078 +#include "znode.h"
67079 +#include "tree_walk.h"
67080 +
67081 +#if REISER4_DEBUG
67082 +static int tap_invariant(const tap_t *tap);
67083 +static void tap_check(const tap_t *tap);
67084 +#else
67085 +#define tap_check(tap) noop
67086 +#endif
67087 +
67088 +/** load the node @tap points to unless already loaded; counts nested loads */
67089 +int reiser4_tap_load(tap_t *tap)
67090 +{
67091 +       tap_check(tap);
67092 +       if (tap->loaded == 0) {
67093 +               int result;
67094 +
67095 +               result = zload_ra(tap->coord->node, &tap->ra_info);
67096 +               if (result != 0)
67097 +                       return result;
67098 +               coord_clear_iplug(tap->coord);
67099 +       }
67100 +       ++tap->loaded;
67101 +       tap_check(tap);
67102 +       return 0;
67103 +}
67104 +
67105 +/** undo one reiser4_tap_load(); the node is zrelse()d on the last undo */
67106 +void reiser4_tap_relse(tap_t *tap)
67107 +{
67108 +       tap_check(tap);
67109 +       if (tap->loaded > 0) {
67110 +               --tap->loaded;
67111 +               if (tap->loaded == 0)
67112 +                       zrelse(tap->coord->node);
67113 +       }
67114 +       tap_check(tap);
67115 +}
67116 +
67117 +/**
67118 + * init @tap to consist of @coord and @lh, not loaded and not on any list.
67119 + * Locks on nodes will be acquired with @mode
67120 + */
67121 +void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
67122 +                     znode_lock_mode mode)
67123 +{
67124 +       tap->coord = coord;
67125 +       tap->lh = lh;
67126 +       tap->mode = mode;
67127 +       tap->loaded = 0;
67128 +       INIT_LIST_HEAD(&tap->linkage);
67129 +       reiser4_init_ra_info(&tap->ra_info);
67130 +}
67131 +
67132 +/** add @tap to the list of taps kept in the current reiser4 context */
67133 +void reiser4_tap_monitor(tap_t *tap)
67134 +{
67135 +       assert("nikita-2623", tap != NULL);
67136 +       tap_check(tap);
67137 +       list_add(&tap->linkage, reiser4_taps_list());
67138 +       tap_check(tap);
67139 +}
67140 +
67141 +/* duplicate @src into @dst: coord, lock handle (if @src holds one), mode and
67142 + * ra_info. @dst starts not loaded and off any list. */
67143 +void reiser4_tap_copy(tap_t *dst, tap_t *src)
67144 +{
67145 +       assert("nikita-3193", src != NULL);
67146 +       assert("nikita-3194", dst != NULL);
67147 +
67148 +       *dst->coord = *src->coord;
67149 +       if (src->lh->node)
67150 +               copy_lh(dst->lh, src->lh);
67151 +       dst->mode = src->mode;
67152 +       dst->loaded = 0;
67153 +       INIT_LIST_HEAD(&dst->linkage);
67154 +       dst->ra_info = src->ra_info;
67155 +}
67156 +
67157 +/** finish with @tap: release node and lock, unlink from the taps list */
67158 +void reiser4_tap_done(tap_t *tap)
67159 +{
67160 +       assert("nikita-2565", tap != NULL);
67161 +       tap_check(tap);
67162 +       if (tap->loaded > 0)
67163 +               zrelse(tap->coord->node);
67164 +       done_lh(tap->lh);
67165 +       tap->loaded = 0;
67166 +       list_del_init(&tap->linkage);
67167 +       tap->coord->node = NULL;
67168 +}
67169 +
67170 +/**
67171 + * move @tap to the node locked by @target, copying that lock into @tap->lh.
67172 + * The new node is loaded first if @tap was loaded; on error @tap is intact.
67173 + */
67174 +int reiser4_tap_move(tap_t *tap, lock_handle * target)
67175 +{
67176 +       int result = 0;
67177 +
67178 +       assert("nikita-2567", tap != NULL);
67179 +       assert("nikita-2568", target != NULL);
67180 +       assert("nikita-2570", target->node != NULL);
67181 +       assert("nikita-2569", tap->coord->node == tap->lh->node);
67182 +
67183 +       tap_check(tap);
67184 +       if (tap->loaded > 0)
67185 +               result = zload_ra(target->node, &tap->ra_info);
67186 +
67187 +       if (result == 0) {
67188 +               if (tap->loaded > 0)
67189 +                       zrelse(tap->coord->node);
67190 +               done_lh(tap->lh);
67191 +               copy_lh(tap->lh, target);
67192 +               tap->coord->node = target->node;
67193 +               coord_clear_iplug(tap->coord);
67194 +       }
67195 +       tap_check(tap);
67196 +       return result;
67197 +}
67198 +
67199 +/**
67200 + * move @tap to @target. Acquire lock on @target, unless @tap is already
67201 + * positioned on it.
67202 + */
67203 +static int tap_to(tap_t *tap, znode * target)
67204 +{
67205 +       int result;
67206 +
67207 +       assert("nikita-2624", tap != NULL);
67208 +       assert("nikita-2625", target != NULL);
67209 +
67210 +       tap_check(tap);
67211 +       result = 0;
67212 +       if (tap->coord->node != target) {
67213 +               lock_handle here;
67214 +
67215 +               init_lh(&here);
67216 +               result = longterm_lock_znode(&here, target,
67217 +                                            tap->mode, ZNODE_LOCK_HIPRI);
67218 +               if (result == 0) {
67219 +                       result = reiser4_tap_move(tap, &here);
67220 +                       done_lh(&here);
67221 +               }
67222 +       }
67223 +       tap_check(tap);
67224 +       return result;
67225 +}
67226 +
67227 +/**
67228 + * move @tap to given @target, loading and locking @target->node if
67229 + * necessary. Returns 0 on success, else the error from tap_to()
67230 + */
67231 +int tap_to_coord(tap_t *tap, coord_t *target)
67232 +{
67233 +       int result;
67234 +
67235 +       tap_check(tap);
67236 +       result = tap_to(tap, target->node);
67237 +       if (result == 0)
67238 +               coord_dup(tap->coord, target);
67239 +       tap_check(tap);
67240 +       return result;
67241 +}
67242 +
67243 +/** return list of all taps of the current reiser4 context */
67244 +struct list_head *reiser4_taps_list(void)
67245 +{
67246 +       return &get_current_context()->taps;
67247 +}
67248 +
67249 +/** helper for go_{next,prev}_{item,unit,node}(): one step towards @dir */
67250 +int go_dir_el(tap_t *tap, sideof dir, int units_p)
67251 +{
67252 +       coord_t dup;
67253 +       coord_t *coord;
67254 +       int result;
67255 +
67256 +       int (*coord_dir) (coord_t *);
67257 +       int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
67258 +       void (*coord_init) (coord_t *, const znode *);
67259 +       ON_DEBUG(int (*coord_check) (const coord_t *));
67260 +
67261 +       assert("nikita-2556", tap != NULL);
67262 +       assert("nikita-2557", tap->coord != NULL);
67263 +       assert("nikita-2558", tap->lh != NULL);
67264 +       assert("nikita-2559", tap->coord->node != NULL);
67265 +
67266 +       tap_check(tap);
67267 +       if (dir == LEFT_SIDE) {
67268 +               coord_dir = units_p ? coord_prev_unit : coord_prev_item;
67269 +               get_dir_neighbor = reiser4_get_left_neighbor;
67270 +               coord_init = coord_init_last_unit;
67271 +       } else {
67272 +               coord_dir = units_p ? coord_next_unit : coord_next_item;
67273 +               get_dir_neighbor = reiser4_get_right_neighbor;
67274 +               coord_init = coord_init_first_unit;
67275 +       }
67276 +       ON_DEBUG(coord_check =
67277 +                units_p ? coord_is_existing_unit : coord_is_existing_item);
67278 +       assert("nikita-2560", coord_check(tap->coord));
67279 +
67280 +       coord = tap->coord;
67281 +       coord_dup(&dup, coord);
67282 +       if (coord_dir(&dup) != 0) {
67283 +               do {
67284 +                       /* move to the neighboring node on the @dir side */
67285 +                       lock_handle nbr;
67286 +
67287 +                       init_lh(&nbr);
67288 +                       result =
67289 +                           get_dir_neighbor(&nbr, coord->node, (int)tap->mode,
67290 +                                            GN_CAN_USE_UPPER_LEVELS);
67291 +                       if (result == 0) {
67292 +                               result = reiser4_tap_move(tap, &nbr);
67293 +                               if (result == 0)
67294 +                                       coord_init(tap->coord, nbr.node);
67295 +                               done_lh(&nbr);
67296 +                       }
67297 +                       /* skip empty nodes */
67298 +               } while ((result == 0) && node_is_empty(coord->node));
67299 +       } else {
67300 +               result = 0;
67301 +               coord_dup(coord, &dup);
67302 +       }
67303 +       assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
67304 +       tap_check(tap);
67305 +       return result;
67306 +}
67307 +
67308 +/**
67309 + * move @tap to the next unit, transparently crossing item and node
67310 + * boundaries
67311 + */
67312 +int go_next_unit(tap_t *tap)
67313 +{
67314 +       return go_dir_el(tap, RIGHT_SIDE, 1);   /* 1: by units, not items */
67315 +}
67316 +
67317 +/**
67318 + * move @tap to the previous unit, transparently crossing item and node
67319 + * boundaries
67320 + */
67321 +int go_prev_unit(tap_t *tap)
67322 +{
67323 +       return go_dir_el(tap, LEFT_SIDE, 1);    /* 1: by units, not items */
67324 +}
67325 +
67326 +/**
67327 + * apply @actor to the @tap @shift times, stopping at the first error. This
67328 + * moves @tap by @shift units (or items, or nodes) in either direction.
67329 + */
67330 +static int rewind_to(tap_t *tap, go_actor_t actor, int shift)
67331 +{
67332 +       int result;
67333 +
67334 +       assert("nikita-2555", shift >= 0);
67335 +       assert("nikita-2562", tap->coord->node == tap->lh->node);
67336 +
67337 +       tap_check(tap);
67338 +       result = reiser4_tap_load(tap);
67339 +       if (result != 0)
67340 +               return result;
67341 +
67342 +       for (; shift > 0; --shift) {
67343 +               result = actor(tap);
67344 +               assert("nikita-2563", tap->coord->node == tap->lh->node);
67345 +               if (result != 0)
67346 +                       break;
67347 +       }
67348 +       reiser4_tap_relse(tap);
67349 +       tap_check(tap);
67350 +       return result;
67351 +}
67352 +
67353 +/** move @tap @shift (>= 0) units rightward, see rewind_to() */
67354 +int rewind_right(tap_t *tap, int shift)
67355 +{
67356 +       return rewind_to(tap, go_next_unit, shift);
67357 +}
67358 +
67359 +/** move @tap @shift (>= 0) units leftward, see rewind_to() */
67360 +int rewind_left(tap_t *tap, int shift)
67361 +{
67362 +       return rewind_to(tap, go_prev_unit, shift);
67363 +}
67364 +
67365 +#if REISER4_DEBUG
67366 +/** debugging function: print @tap (may be NULL) in human readable form */
67367 +static void print_tap(const char *prefix, const tap_t *tap)
67368 +{
67369 +       if (tap == NULL) {
67370 +               printk("%s: null tap\n", prefix);
67371 +               return;
67372 +       }
67373 +       printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
67374 +              tap->loaded,
67375 +              !list_empty(&tap->linkage), /* self-linked == on no list */
67376 +              tap->lh != NULL ? tap->lh->node : NULL, /* lh may be broken */
67377 +              lock_mode_name(tap->mode));
67378 +       print_coord("\tcoord", tap->coord, 0);
67379 +}
67380 +
67381 +/** check [tap-sane] invariant: 0 if it holds, else number of failed check */
67382 +static int tap_invariant(const tap_t *tap)
67383 +{
67384 +       /* [tap-sane] invariant; a NULL @tap counts as violation 1 */
67385 +
67386 +       if (tap == NULL)
67387 +               return 1;
67388 +       /* tap->mode is one of
67389 +        *
67390 +        * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
67391 +        */
67392 +       if (tap->mode != ZNODE_NO_LOCK &&
67393 +           tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
67394 +               return 2;
67395 +       /* tap->coord != NULL, and */
67396 +       if (tap->coord == NULL)
67397 +               return 3;
67398 +       /* tap->lh != NULL, and */
67399 +       if (tap->lh == NULL)
67400 +               return 4;
67401 +       /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
67402 +       if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
67403 +               return 5;
67404 +       /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
67405 +       if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
67406 +               return 6;
67407 +       return 0;
67408 +}
67409 +
67410 +/** debugging function: panic unless @tap satisfies tap_invariant() */
67411 +static void tap_check(const tap_t *tap)
67412 +{
67413 +       int result;
67414 +
67415 +       result = tap_invariant(tap);
67416 +       if (result != 0) {
67417 +               print_tap("broken", tap);
67418 +               reiser4_panic("nikita-2831", "tap broken: %i\n", result);
67419 +       }
67420 +}
67421 +#endif
67422 +
67423 +/* Make Linus happy.
67424 +   Local variables:
67425 +   c-indentation-style: "K&R"
67426 +   mode-name: "LC"
67427 +   c-basic-offset: 8
67428 +   tab-width: 8
67429 +   fill-column: 120
67430 +   scroll-step: 1
67431 +   End:
67432 +*/
67433 diff -puN /dev/null fs/reiser4/tap.h
67434 --- /dev/null
67435 +++ a/fs/reiser4/tap.h
67436 @@ -0,0 +1,70 @@
67437 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
67438 +
67439 +/* Tree Access Pointers. See tap.c for more details. */
67440 +
67441 +#if !defined(__REISER4_TAP_H__)
67442 +#define __REISER4_TAP_H__
67443 +
67444 +#include "forward.h"
67445 +#include "readahead.h"
67446 +
67447 +/**
67448 +    tree_access_pointer aka tap. Data structure combining coord_t and lock
67449 +    handle.
67450 +    Invariants involving this data-type, see doc/lock-ordering for details:
67451 +
67452 +      [tap-sane] (checked by tap_invariant() in tap.c under REISER4_DEBUG)
67453 + */
67454 +struct tree_access_pointer {
67455 +       /* coord tap is at (caller-provided, see reiser4_tap_init()) */
67456 +       coord_t *coord;
67457 +       /* lock handle on ->coord->node (caller-provided as well) */
67458 +       lock_handle *lh;
67459 +       /* mode of lock acquired by this tap */
67460 +       znode_lock_mode mode;
67461 +       /* incremented by reiser4_tap_load().
67462 +          Decremented by reiser4_tap_relse(). */
67463 +       int loaded;
67464 +       /* linkage into reiser4_taps_list(), see reiser4_tap_monitor() */
67465 +       struct list_head linkage;
67466 +       /* read-ahead hint */
67467 +       ra_info_t ra_info;
67468 +};
67469 +
67470 +typedef int (*go_actor_t) (tap_t *tap); /* e.g. go_next_unit() */
67471 +
67472 +extern int reiser4_tap_load(tap_t *tap);
67473 +extern void reiser4_tap_relse(tap_t *tap);
67474 +extern void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
67475 +                    znode_lock_mode mode);
67476 +extern void reiser4_tap_monitor(tap_t *tap);
67477 +extern void reiser4_tap_copy(tap_t *dst, tap_t *src);
67478 +extern void reiser4_tap_done(tap_t *tap);
67479 +extern int reiser4_tap_move(tap_t *tap, lock_handle * target);
67480 +extern int tap_to_coord(tap_t *tap, coord_t *target);
67481 +
67482 +extern int go_dir_el(tap_t *tap, sideof dir, int units_p);
67483 +extern int go_next_unit(tap_t *tap);
67484 +extern int go_prev_unit(tap_t *tap);
67485 +extern int rewind_right(tap_t *tap, int shift);
67486 +extern int rewind_left(tap_t *tap, int shift);
67487 +
67488 +extern struct list_head *reiser4_taps_list(void);
67489 +/* iterate @tap over all taps of the current context; not removal-safe */
67490 +#define for_all_taps(tap)                                                     \
67491 +       for ((tap) = list_entry(reiser4_taps_list()->next, tap_t, linkage);    \
67492 +            reiser4_taps_list() != &(tap)->linkage;                           \
67493 +            (tap) = list_entry((tap)->linkage.next, tap_t, linkage))
67494 +
67495 +/* __REISER4_TAP_H__ */
67496 +#endif
67497 +/* Make Linus happy.
67498 +   Local variables:
67499 +   c-indentation-style: "K&R"
67500 +   mode-name: "LC"
67501 +   c-basic-offset: 8
67502 +   tab-width: 8
67503 +   fill-column: 120
67504 +   scroll-step: 1
67505 +   End:
67506 +*/
67507 diff -puN /dev/null fs/reiser4/tree.c
67508 --- /dev/null
67509 +++ a/fs/reiser4/tree.c
67510 @@ -0,0 +1,1878 @@
67511 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67512 + * reiser4/README */
67513 +
67514 +/*
67515 + * KEYS IN A TREE.
67516 + *
67517 + * The tree consists of nodes located on the disk. Node in the tree is either
67518 + * formatted or unformatted. Formatted node is one that has structure
67519 + * understood by the tree balancing and traversal code. Formatted nodes are
67520 + * further classified into leaf and internal nodes. The latter distinction is
67521 + * (almost) of only historical importance: general structure of leaves and
67522 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
67523 + * that are part of bodies of ordinary files and attributes.
67524 + *
67525 + * Each node in the tree spans some interval in the key space. Key ranges for
67526 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
67527 + * sense, because of the non-unique keys: intersection of key ranges for
67528 + * different nodes is either empty, or consists of exactly one key.
67529 + *
67530 + * Formatted node consists of a sequence of items. Each item spans some
67531 + * interval in key space. Key ranges for all items in a tree are disjoint,
67532 + * modulo non-unique keys again. Items within nodes are ordered in the key
67533 + * order of the smallest key in a item.
67534 + *
67535 + * Particular type of item can be further split into units. Unit is piece of
67536 + * item that can be cut from item and moved into another item of the same
67537 + * type. Units are used by balancing code to repack data during balancing.
67538 + *
67539 + * Unit can be further split into smaller entities (for example, extent unit
67540 + * represents several pages, and it is natural for extent code to operate on
67541 + * particular pages and even bytes within one unit), but this is of no
67542 + * relevance to the generic balancing and lookup code.
67543 + *
67544 + * Although item is said to "span" range or interval of keys, it is not
67545 + * necessary that item contains piece of data addressable by each and every
67546 + * key in this range. For example, compound directory item, consisting of
67547 + * units corresponding to directory entries and keyed by hashes of file names,
67548 + * looks more as having "discrete spectrum": only some disjoint keys inside
67549 + * range occupied by this item really address data.
67550 + *
67551 + * Nonetheless, each item always has well-defined least (minimal) key, that
67552 + * is recorded in item header, stored in the node this item is in. Also, item
67553 + * plugin can optionally define method ->max_key_inside() returning maximal
67554 + * key that can _possibly_ be located within this item. This method is used
67555 + * (mainly) to determine when given piece of data should be merged into
67556 + * existing item, instead of creating new one. Because of this, even though
67557 + * ->max_key_inside() can be larger than any key actually located in the item,
67558 + * intervals
67559 + *
67560 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
67561 + *
67562 + * are still disjoint for all items within the _same_ node.
67563 + *
67564 + * In memory node is represented by znode. It plays several roles:
67565 + *
67566 + *  . something locks are taken on
67567 + *
67568 + *  . something tracked by transaction manager (this is going to change)
67569 + *
67570 + *  . something used to access node data
67571 + *
67572 + *  . something used to maintain tree structure in memory: sibling and
67573 + *  parental linkage.
67574 + *
67575 + *  . something used to organize nodes into "slums"
67576 + *
67577 + * For more on znodes, see znode.[ch]
67578 + *
67579 + * DELIMITING KEYS
67580 + *
67581 + *   To simplify balancing, allow some flexibility in locking and speed up
67582 + *   important coord cache optimization, we keep delimiting keys of nodes in
67583 + *   memory. Depending on disk format (implemented by appropriate node plugin)
67584 + *   node on disk can record both left and right delimiting key, only one of
67585 + *   them, or none. Still, our balancing and tree traversal code keep both
67586 + *   delimiting keys for a node that is in memory stored in the znode. When
67587 + *   node is first brought into memory during tree traversal, its left
67588 + *   delimiting key is taken from its parent, and its right delimiting key is
67589 + *   either next key in its parent, or is right delimiting key of parent if
67590 + *   node is the rightmost child of parent.
67591 + *
67592 + *   Physical consistency of delimiting key is protected by special dk
67593 + *   read-write lock. That is, delimiting keys can only be inspected or
67594 + *   modified under this lock. But dk lock is only sufficient for fast
67595 + *   "pessimistic" check, because to simplify code and to decrease lock
67596 + *   contention, balancing (carry) only updates delimiting keys right before
67597 + *   unlocking all locked nodes on the given tree level. For example,
67598 + *   coord-by-key cache scans LRU list of recently accessed znodes. For each
67599 + *   node it first does fast check under dk spin lock. If key looked for is
67600 + *   not between delimiting keys for this node, next node is inspected and so
67601 + *   on. If key is inside of the key range, long term lock is taken on node
67602 + *   and key range is rechecked.
67603 + *
67604 + * COORDINATES
67605 + *
67606 + *   To find something in the tree, you supply a key, and the key is resolved
67607 + *   by coord_by_key() into a coord (coordinate) that is valid as long as the
67608 + *   node the coord points to remains locked.  As mentioned above trees
67609 + *   consist of nodes that consist of items that consist of units. A unit is
67610 + *   the smallest and indivisible piece of tree as far as balancing and tree
67611 + *   search are concerned. Each node, item, and unit can be addressed by
67612 + *   giving its level in the tree and the key occupied by this entity.  A node
67613 + *   knows what the key ranges are of the items within it, and how to find its
67614 + *   items and invoke their item handlers, but it does not know how to access
67615 + *   individual units within its items except through the item handlers.
67616 + *   coord is a structure containing a pointer to the node, the ordinal number
67617 + *   of the item within this node (a sort of item offset), and the ordinal
67618 + *   number of the unit within this item.
67619 + *
67620 + * TREE LOOKUP
67621 + *
67622 + *   There are two types of access to the tree: lookup and modification.
67623 + *
67624 + *   Lookup is a search for the key in the tree. Search can look for either
67625 + *   exactly the key given to it, or for the largest key that is not greater
67626 + *   than the key given to it. This distinction is determined by "bias"
67627 + *   parameter of search routine (coord_by_key()). coord_by_key() either
67628 + *   returns error (key is not in the tree, or some kind of external error
67629 + *   occurred), or successfully resolves key into coord.
67630 + *
67631 + *   This resolution is done by traversing tree top-to-bottom from root level
67632 + *   to the desired level. On levels above twig level (level one above the
67633 + *   leaf level) nodes consist exclusively of internal items. Internal item is
67634 + *   nothing more than pointer to the tree node on the child level. On twig
67635 + *   level nodes consist of internal items intermixed with extent
67636 + *   items. Internal items form normal search tree structure used by traversal
67637 + *   to descend through the tree.
67638 + *
67639 + * TREE LOOKUP OPTIMIZATIONS
67640 + *
67641 + * Tree lookup described above is expensive even if all nodes traversed are
67642 + * already in the memory: for each node binary search within it has to be
67643 + * performed and binary searches are CPU consuming and tend to destroy CPU
67644 + * caches.
67645 + *
67646 + * Several optimizations are used to work around this:
67647 + *
67648 + *   . cbk_cache (look-aside cache for tree traversals, see search.c for
67649 + *   details)
67650 + *
67651 + *   . seals (see seal.[ch])
67652 + *
67653 + *   . vroot (see search.c)
67654 + *
67655 + * General search-by-key is layered thusly:
67656 + *
67657 + *                   [check seal, if any]   --ok--> done
67658 + *                           |
67659 + *                         failed
67660 + *                           |
67661 + *                           V
67662 + *                     [vroot defined] --no--> node = tree_root
67663 + *                           |                   |
67664 + *                          yes                  |
67665 + *                           |                   |
67666 + *                           V                   |
67667 + *                       node = vroot            |
67668 + *                                 |             |
67669 + *                                 |             |
67670 + *                                 |             |
67671 + *                                 V             V
67672 + *                            [check cbk_cache for key]  --ok--> done
67673 + *                                        |
67674 + *                                      failed
67675 + *                                        |
67676 + *                                        V
67677 + *                       [start tree traversal from node]
67678 + *
67679 + */
67680 +
67681 +#include "forward.h"
67682 +#include "debug.h"
67683 +#include "dformat.h"
67684 +#include "key.h"
67685 +#include "coord.h"
67686 +#include "plugin/item/static_stat.h"
67687 +#include "plugin/item/item.h"
67688 +#include "plugin/node/node.h"
67689 +#include "plugin/plugin.h"
67690 +#include "txnmgr.h"
67691 +#include "jnode.h"
67692 +#include "znode.h"
67693 +#include "block_alloc.h"
67694 +#include "tree_walk.h"
67695 +#include "carry.h"
67696 +#include "carry_ops.h"
67697 +#include "tap.h"
67698 +#include "tree.h"
67699 +#include "vfs_ops.h"
67700 +#include "page_cache.h"
67701 +#include "super.h"
67702 +#include "reiser4.h"
67703 +#include "inode.h"
67704 +
67705 +#include <linux/fs.h>          /* for struct super_block  */
67706 +#include <linux/spinlock.h>
67707 +
67708 +/* Disk address (block number) that is never used for any real tree node; it
67709 +   serves as the block number of the "uber" znode.
67710 +
67711 +   Invalid block addresses are 0 by tradition, hence the value below.
67712 +
67713 +*/
67714 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
67715 +
67716 +#define CUT_TREE_MIN_ITERATIONS 64
67717 +/* forward declaration of a file-local (static) helper */
67718 +static int find_child_by_addr(znode * parent, znode * child, coord_t *result);
67719 +
67720 +/* Return the node plugin of coord->node, i.e. coord->node->nplug. */
67721 +node_plugin *node_plugin_by_coord(const coord_t *coord)
67722 +{
67723 +       assert("vs-1", coord != NULL);
67724 +       assert("vs-2", coord->node != NULL);
67725 +       /* no other validation: callers must pass a coord with its node set */
67726 +       return coord->node->nplug;
67727 +}
67728 +
67729 +/* Insert item into tree: look @key up and, if it is absent, insert at the
67730 + * coord found. @coord is updated so that a subsequent insert can use it. */
67731 +insert_result insert_by_key(reiser4_tree * tree        /* tree to insert new item
67732 +                                                * into */ ,
67733 +                           const reiser4_key * key /* key of new item */ ,
67734 +                           reiser4_item_data * data    /* parameters for item
67735 +                                                        * creation */ ,
67736 +                           coord_t *coord /* resulting insertion coord */ ,
67737 +                           lock_handle * lh    /* resulting lock
67738 +                                                * handle */ ,
67739 +                           tree_level stop_level /* level where to insert */ ,
67740 +                           __u32 flags/* insertion flags */)
67741 +{
67742 +       int result;
67743 +
67744 +       assert("nikita-358", tree != NULL);
67745 +       assert("nikita-360", coord != NULL);
67746 +       /* FIND_EXACT lookup, write lock; @stop_level is passed for both levels */
67747 +       result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
67748 +                             FIND_EXACT, stop_level, stop_level,
67749 +                             flags | CBK_FOR_INSERT, NULL/*ra_info */);
67750 +       switch (result) {
67751 +       default:        /* any other lookup result is returned unchanged */
67752 +               break;
67753 +       case CBK_COORD_FOUND:   /* key already present: nothing is inserted */
67754 +               result = IBK_ALREADY_EXISTS;
67755 +               break;
67756 +       case CBK_COORD_NOTFOUND: /* key absent: insert; note flags 0, not @flags */
67757 +               assert("nikita-2017", coord->node != NULL);
67758 +               result = insert_by_coord(coord, data, key, lh, 0/*flags */);
67759 +               break;
67760 +       }
67761 +       return result;
67762 +}
67763 +
67764 +/* Insert item by calling carry: post one @cop operation for coord->node and run
67765 +   reiser4_carry(). Used by insert_by_coord() and insert_extent_by_coord() */
67766 +static insert_result insert_with_carry_by_coord(coord_t *coord,
67767 +                                       /* coord where to insert */
67768 +                                               lock_handle * lh,
67769 +                                       /* lock handle of insertion node */
67770 +                                               reiser4_item_data * data,
67771 +                                       /* parameters of new item */
67772 +                                               const reiser4_key * key,
67773 +                                       /* key of new item */
67774 +                                               carry_opcode cop,
67775 +                                       /* carry operation to perform */
67776 +                                               cop_insert_flag flags
67777 +                                       /* carry flags */ )
67778 +{
67779 +       int result;
67780 +       carry_pool *pool;
67781 +       carry_level *lowest_level;
67782 +       carry_insert_data *cdata;
67783 +       carry_op *op;
67784 +
67785 +       assert("umka-314", coord != NULL);
67786 +
67787 +       /* one allocation: carry_pool, 3 carry_level-s, then carry_insert_data */
67788 +       pool =
67789 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67790 +                           sizeof(*cdata));
67791 +       if (IS_ERR(pool))
67792 +               return PTR_ERR(pool);
67793 +       lowest_level = (carry_level *) (pool + 1);
67794 +       init_carry_level(lowest_level, pool);
67795 +
67796 +       op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
67797 +       if (IS_ERR(op) || (op == NULL)) {       /* NULL is reported as -EIO */
67798 +               done_carry_pool(pool);
67799 +               return RETERR(op ? PTR_ERR(op) : -EIO);
67800 +       }
67801 +       cdata = (carry_insert_data *) (lowest_level + 3);
67802 +       cdata->coord = coord;
67803 +       cdata->data = data;
67804 +       cdata->key = key;
67805 +       op->u.insert.d = cdata;
67806 +       if (flags == 0) /* 0 means: use the tree's carry.insert_flags */
67807 +               flags = znode_get_tree(coord->node)->carry.insert_flags;
67808 +       op->u.insert.flags = flags;
67809 +       op->u.insert.type = COPT_ITEM_DATA;
67810 +       op->u.insert.child = NULL;
67811 +       if (lh != NULL) {       /* ask carry to track @lh (CARRY_TRACK_CHANGE) */
67812 +               assert("nikita-3245", lh->node == coord->node);
67813 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
67814 +               lowest_level->tracked = lh;
67815 +       }
67816 +
67817 +       result = reiser4_carry(lowest_level, NULL);
67818 +       done_carry_pool(pool);
67819 +
67820 +       return result;
67821 +}
67822 +
67823 +/* Form a carry queue performing paste (COP_PASTE) of @data with @key at @coord,
67824 +   and launch its execution by calling reiser4_carry().
67825 +
67826 +   Instruct carry to update @lh if, after balancing, the insertion coord moves
67827 +   into a different block. Returns the result of reiser4_carry(), or an error
67828 +   code if the carry pool or the carry operation could not be set up.
67829 +*/
67830 +static int paste_with_carry(coord_t *coord,    /* coord of paste */
67831 +                           lock_handle * lh,   /* lock handle of node
67832 +                                                * where item is
67833 +                                                * pasted */
67834 +                           reiser4_item_data * data,   /* parameters of new
67835 +                                                        * item */
67836 +                           const reiser4_key * key,    /* key of new item */
67837 +                           unsigned flags/* paste flags */)
67838 +{
67839 +       int result;
67840 +       carry_pool *pool;
67841 +       carry_level *lowest_level;
67842 +       carry_insert_data *cdata;
67843 +       carry_op *op;
67844 +
67845 +       assert("umka-315", coord != NULL);
67846 +       assert("umka-316", key != NULL);
67847 +       /* one allocation: carry_pool, 3 carry_level-s, then carry_insert_data */
67848 +       pool =
67849 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67850 +                           sizeof(*cdata));
67851 +       if (IS_ERR(pool))
67852 +               return PTR_ERR(pool);
67853 +       lowest_level = (carry_level *) (pool + 1);
67854 +       init_carry_level(lowest_level, pool);
67855 +
67856 +       op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
67857 +       if (IS_ERR(op) || (op == NULL)) {       /* NULL is reported as -EIO */
67858 +               done_carry_pool(pool);
67859 +               return RETERR(op ? PTR_ERR(op) : -EIO);
67860 +       }
67861 +       cdata = (carry_insert_data *) (lowest_level + 3);
67862 +       cdata->coord = coord;
67863 +       cdata->data = data;
67864 +       cdata->key = key;
67865 +       op->u.paste.d = cdata;
67866 +       if (flags == 0) /* 0 means: use the tree's carry.paste_flags */
67867 +               flags = znode_get_tree(coord->node)->carry.paste_flags;
67868 +       op->u.paste.flags = flags;
67869 +       op->u.paste.type = COPT_ITEM_DATA;
67870 +       if (lh != NULL) {       /* tracking is requested only when @lh is given */
67871 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
67872 +               lowest_level->tracked = lh;
67873 +       }
67874 +
67875 +       result = reiser4_carry(lowest_level, NULL);
67876 +       done_carry_pool(pool);
67877 +
67878 +       return result;
67879 +}
67880 +
67881 +/* Insert item at the given coord.
67882 +
67883 +   First try to skip carry by directly calling ->create_item() method of node
67884 +   plugin. If this is impossible (there is not enough free space in the node,
67885 +   or leftmost item in the node is created), call insert_with_carry_by_coord()
67886 +   that will do full carry().
67887 +   Returns -E_NODE_FULL if the item does not fit and @flags forbid making room.
67888 +*/
67889 +insert_result insert_by_coord(coord_t *coord   /* coord where to
67890 +                                                * insert. coord->node has
67891 +                                                * to be write locked by
67892 +                                                * caller */ ,
67893 +                             reiser4_item_data * data  /* data to be
67894 +                                                        * inserted */ ,
67895 +                             const reiser4_key * key /* key of new item */ ,
67896 +                             lock_handle * lh  /* lock handle of write
67897 +                                                * lock on node */ ,
67898 +                             __u32 flags/* insertion flags */)
67899 +{
67900 +       unsigned item_size;
67901 +       int result;
67902 +       znode *node;
67903 +
67904 +       assert("vs-247", coord != NULL);
67905 +       assert("vs-248", data != NULL);
67906 +       assert("vs-249", data->length >= 0);
67907 +       assert("nikita-1191", znode_is_write_locked(coord->node));
67908 +
67909 +       node = coord->node;
67910 +       coord_clear_iplug(coord);
67911 +       result = zload(node);
67912 +       if (result != 0)
67913 +               return result;
67914 +       /* from here on every path reaches the zrelse() that pairs with zload() */
67915 +       item_size = space_needed(node, NULL, data, 1);
67916 +       if (item_size > znode_free_space(node) &&
67917 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
67918 +           && (flags & COPI_DONT_ALLOCATE)) {
67919 +               /* we are forced to use free space of coord->node and new item
67920 +                  does not fit into it.
67921 +
67922 +                  Currently we get here only when we allocate and copy units
67923 +                  of extent item from a node to its left neighbor during
67924 +                  "squalloc"-ing.  If @node (this is left neighbor) does not
67925 +                  have enough free space - we do not want to attempt any
67926 +                  shifting and allocations because we are in squeezing and
67927 +                  everything to the left of @node is tightly packed.
67928 +                */
67929 +               result = -E_NODE_FULL;
67930 +       } else if ((item_size <= znode_free_space(node)) &&
67931 +                  !coord_is_before_leftmost(coord) &&
67932 +                  (node_plugin_by_node(node)->fast_insert != NULL)
67933 +                  && node_plugin_by_node(node)->fast_insert(coord)) {
67934 +               /* shortcut insertion without carry() overhead.
67935 +
67936 +                  Only possible if:
67937 +
67938 +                  - there is enough free space
67939 +
67940 +                  - insertion is not into the leftmost position in a node
67941 +                  (otherwise it would require updating of delimiting key in a
67942 +                  parent)
67943 +
67944 +                  - node plugin agrees with this (->fast_insert() is set and
67945 +                  returns non-zero)
67946 +                */
67947 +               result =
67948 +                   node_plugin_by_node(node)->create_item(coord, key, data,
67949 +                                                          NULL);
67950 +               znode_make_dirty(node);
67951 +       } else {
67952 +               /* otherwise do full-fledged carry(). */
67953 +               result =
67954 +                   insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
67955 +                                              flags);
67956 +       }
67957 +       zrelse(node);
67958 +       return result;
67959 +}
67960 +
67961 +/* @coord is set to leaf level and @data is to be inserted to twig level */
67962 +insert_result
67963 +insert_extent_by_coord(coord_t *coord,         /* coord where to insert.
67964 +                                               * coord->node has to be write
67965 +                                               * locked by caller */
67966 +                      reiser4_item_data *data,/* data to be inserted */
67967 +                      const reiser4_key *key, /* key of new item */
67968 +                      lock_handle *lh         /* lock handle of write lock
67969 +                                                 on node */)
67970 +{
67971 +       assert("vs-405", coord != NULL);
67972 +       assert("vs-406", data != NULL);
67973 +       assert("vs-407", data->length > 0);
67974 +       assert("vs-408", znode_is_write_locked(coord->node));
67975 +       assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
67976 +       /* no short-cut here: always carry (COP_EXTENT); flags 0 = tree's flags */
67977 +       return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
67978 +                                         0 /*flags */ );
67979 +}
67980 +
67981 +/* Insert into the item at the given coord.
67982 +
67983 +   First try to skip carry by directly calling ->paste() method of item
67984 +   plugin. If this is impossible (there is not enough free space in the node,
67985 +   or we are pasting into leftmost position in the node), call
67986 +   paste_with_carry() that will do full carry().
67987 +   Returns -E_NODE_FULL if data does not fit and @flags forbid making room.
67988 +*/
67989 +/* paste_into_item */
67990 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
67991 +                    lock_handle * lh /* lock handle on node involved */ ,
67992 +                    const reiser4_key * key /* key of unit being pasted */ ,
67993 +                    reiser4_item_data * data /* parameters for new unit */ ,
67994 +                    unsigned flags /* insert/paste flags */ )
67995 +{
67996 +       int result;
67997 +       int size_change;
67998 +       node_plugin *nplug;
67999 +       item_plugin *iplug;
68000 +
68001 +       assert("umka-317", coord != NULL);
68002 +       assert("umka-318", key != NULL);
68003 +
68004 +       iplug = item_plugin_by_coord(coord);
68005 +       nplug = node_plugin_by_coord(coord);
68006 +
68007 +       assert("nikita-1480", iplug == data->iplug);
68008 +
68009 +       size_change = space_needed(coord->node, coord, data, 0);
68010 +       if (size_change > (int)znode_free_space(coord->node) &&
68011 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
68012 +           && (flags & COPI_DONT_ALLOCATE)) {
68013 +               /* we are forced to use free space of coord->node and new data
68014 +                  does not fit into it. */
68015 +               return -E_NODE_FULL;
68016 +       }
68017 +
68018 +       /* shortcut paste without carry() overhead.
68019 +
68020 +          Only possible if:
68021 +
68022 +          - there is enough free space
68023 +
68024 +          - paste is not into the leftmost unit in a node (otherwise
68025 +          it would require updating of delimiting key in a parent).
68026 +          NOTE(review): the "coord->unit_pos != 0" conjunct makes the
68027 +          parenthesized test before it redundant - confirm intent
68028 +
68029 +          - node plugin and item plugin both agree (->fast_paste)
68030 +        */
68031 +       if (size_change <= (int)znode_free_space(coord->node) &&
68032 +           (coord->item_pos != 0 ||
68033 +            coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
68034 +           coord->unit_pos != 0 && nplug->fast_paste != NULL &&
68035 +           nplug->fast_paste(coord) &&
68036 +           iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
68037 +               if (size_change > 0)    /* grow the item before pasting */
68038 +                       nplug->change_item_size(coord, size_change);
68039 +               /* NOTE-NIKITA: huh? where @key is used? */
68040 +               result = iplug->b.paste(coord, data, NULL);
68041 +               if (size_change < 0)    /* shrink only after the paste is done */
68042 +                       nplug->change_item_size(coord, size_change);
68043 +               znode_make_dirty(coord->node);
68044 +       } else
68045 +               /* otherwise do full-fledged carry(). */
68046 +               result = paste_with_carry(coord, lh, data, key, flags);
68047 +       return result;
68048 +}
68049 +
68050 +/* This either appends to or truncates item @coord, by the sign of data->length */
68051 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
68052 +                       reiser4_item_data * data /* parameters of resize */ ,
68053 +                       reiser4_key * key /* key of new unit */ ,
68054 +                       lock_handle * lh        /* lock handle of node
68055 +                                                * being modified */ ,
68056 +                       cop_insert_flag flags /* carry flags */ )
68057 +{
68058 +       int result;
68059 +       znode *node;
68060 +
68061 +       assert("nikita-362", coord != NULL);
68062 +       assert("nikita-363", data != NULL);
68063 +       assert("vs-245", data->length != 0);
68064 +
68065 +       node = coord->node;
68066 +       coord_clear_iplug(coord);
68067 +       result = zload(node);
68068 +       if (result != 0)
68069 +               return result;
68070 +       /* negative length: shrink by -length; positive: insert_into_item() */
68071 +       if (data->length < 0)
68072 +               result = node_plugin_by_coord(coord)->shrink_item(coord,
68073 +                                                                 -data->length);
68074 +       else
68075 +               result = insert_into_item(coord, lh, key, data, flags);
68076 +
68077 +       zrelse(node);
68078 +       return result;
68079 +}
68080 +
68081 +/* Insert flow @f at @coord by running one COP_INSERT_FLOW carry operation */
68082 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
68083 +{
68084 +       int result;
68085 +       carry_pool *pool;
68086 +       carry_level *lowest_level;
68087 +       reiser4_item_data *data;
68088 +       carry_op *op;
68089 +       /* one allocation: carry_pool, 3 carry_level-s, then reiser4_item_data */
68090 +       pool =
68091 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68092 +                           sizeof(*data));
68093 +       if (IS_ERR(pool))
68094 +               return PTR_ERR(pool);
68095 +       lowest_level = (carry_level *) (pool + 1);
68096 +       init_carry_level(lowest_level, pool);
68097 +
68098 +       op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
68099 +                       0 /* operate directly on coord -> node */ );
68100 +       if (IS_ERR(op) || (op == NULL)) {       /* NULL is reported as -EIO */
68101 +               done_carry_pool(pool);
68102 +               return RETERR(op ? PTR_ERR(op) : -EIO);
68103 +       }
68104 +
68105 +       /* these are permanent during insert_flow */
68106 +       data = (reiser4_item_data *) (lowest_level + 3);
68107 +       data->user = 1;
68108 +       data->iplug = item_plugin_by_id(FORMATTING_ID);
68109 +       data->arg = NULL;
68110 +       /* data.length and data.data will be set before calling paste or
68111 +          insert */
68112 +       data->length = 0;
68113 +       data->data = NULL;
68114 +
68115 +       op->u.insert_flow.flags = 0;
68116 +       op->u.insert_flow.insert_point = coord;
68117 +       op->u.insert_flow.flow = f;
68118 +       op->u.insert_flow.data = data;
68119 +       op->u.insert_flow.new_nodes = 0;
68120 +       /* @lh is tracked unconditionally here (no NULL check on @lh) */
68121 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
68122 +       lowest_level->tracked = lh;
68123 +
68124 +       result = reiser4_carry(lowest_level, NULL);
68125 +       done_carry_pool(pool);
68126 +
68127 +       return result;
68128 +}
68129 +
68130 +/* Given a coord in parent node, obtain a znode for the corresponding child */
68131 +znode *child_znode(const coord_t * parent_coord        /* coord of pointer to
68132 +                                                * child */ ,
68133 +                  znode * parent /* parent of child */ ,
68134 +                  int incore_p /* if !0 only return child if already in
68135 +                                * memory */ ,
68136 +                  int setup_dkeys_p    /* if !0 update delimiting keys of
68137 +                                        * child */ )
68138 +{
68139 +       znode *child;
68140 +       /* returns what zlook()/zget() gave, or ERR_PTR(-EIO) on bad parent/item */
68141 +       assert("nikita-1374", parent_coord != NULL);
68142 +       assert("nikita-1482", parent != NULL);
68143 +#if REISER4_DEBUG
68144 +       if (setup_dkeys_p)
68145 +               assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
68146 +#endif
68147 +       assert("nikita-2947", znode_is_any_locked(parent));
68148 +
68149 +       if (znode_get_level(parent) <= LEAF_LEVEL) {
68150 +               /* trying to get child of leaf node */
68151 +               warning("nikita-1217", "Child of maize?");
68152 +               return ERR_PTR(RETERR(-EIO));
68153 +       }
68154 +       if (item_is_internal(parent_coord)) {
68155 +               reiser4_block_nr addr;
68156 +               item_plugin *iplug;
68157 +               reiser4_tree *tree;
68158 +
68159 +               iplug = item_plugin_by_coord(parent_coord);
68160 +               assert("vs-512", iplug->s.internal.down_link);
68161 +               iplug->s.internal.down_link(parent_coord, NULL, &addr);
68162 +               /* @addr, filled in by ->down_link(), identifies the child block */
68163 +               tree = znode_get_tree(parent);
68164 +               if (incore_p)   /* lookup only, zget() is not called */
68165 +                       child = zlook(tree, &addr);
68166 +               else
68167 +                       child =
68168 +                           zget(tree, &addr, parent,
68169 +                                znode_get_level(parent) - 1,
68170 +                                reiser4_ctx_gfp_mask_get());
68171 +               if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
68172 +                       set_child_delimiting_keys(parent, parent_coord, child);
68173 +       } else {
68174 +               warning("nikita-1483", "Internal item expected");
68175 +               child = ERR_PTR(RETERR(-EIO));
68176 +       }
68177 +       return child;
68178 +}
68179 +
68180 +/* Remove znode from transaction: release its block, then uncapture it */
68181 +static void uncapture_znode(znode * node)
68182 +{
68183 +       struct page *page;
68184 +
68185 +       assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
68186 +
68187 +       if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
68188 +               int ret;
68189 +
68190 +               /* An already allocated block goes right to the atom's delete set. */
68191 +               ret =
68192 +                   reiser4_dealloc_block(znode_get_block(node), 0,
68193 +                                         BA_DEFER | BA_FORMATTED);
68194 +               if (ret)
68195 +                       warning("zam-942",
68196 +                               "can\'t add a block (%llu) number to atom's delete set\n",
68197 +                               (unsigned long long)(*znode_get_block(node)));
68198 +
68199 +               spin_lock_znode(node);
68200 +               /* Here we return flush reserved block which was reserved at the
68201 +                * moment when this allocated node was marked dirty and still
68202 +                * not used by flush in node relocation procedure.  */
68203 +               if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
68204 +                       txn_atom *atom;
68205 +                       /* NOTE(review): jnode_get_atom() apparently returns atom locked */
68206 +                       atom = jnode_get_atom(ZJNODE(node));
68207 +                       assert("zam-939", atom != NULL);
68208 +                       spin_unlock_znode(node);
68209 +                       flush_reserved2grabbed(atom, (__u64) 1);
68210 +                       spin_unlock_atom(atom);
68211 +               } else
68212 +                       spin_unlock_znode(node);
68213 +       } else {
68214 +               /* znode has assigned block which is counted as "fake
68215 +                  allocated". Return it back to "free blocks") */
68216 +               fake_allocated2free((__u64) 1, BA_FORMATTED);
68217 +       }
68218 +
68219 +       /*
68220 +        * uncapture page from transaction. There is a possibility of a race
68221 +        * with ->releasepage(): reiser4_releasepage() detaches page from this
68222 +        * jnode and we have nothing to uncapture. To avoid this, get
68223 +        * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
68224 +        * will deal with released page itself.
68225 +        */
68226 +       spin_lock_znode(node);
68227 +       page = znode_page(node);
68228 +       if (likely(page != NULL)) {
68229 +               /*
68230 +                * reiser4_uncapture_page() can only be called when we are sure
68231 +                * that znode is pinned in memory, which we are, because
68232 +                * forget_znode() is only called from longterm_unlock_znode().
68233 +                */
68234 +               page_cache_get(page);
68235 +               spin_unlock_znode(node);
68236 +               lock_page(page);
68237 +               reiser4_uncapture_page(page);
68238 +               unlock_page(page);
68239 +               page_cache_release(page);
68240 +       } else {
68241 +               txn_atom *atom;
68242 +               /* no page attached: uncapture the jnode itself, then zput() the znode */
68243 +               /* handle "flush queued" znodes */
68244 +               while (1) {
68245 +                       atom = jnode_get_atom(ZJNODE(node));
68246 +                       assert("zam-943", atom != NULL);
68247 +
68248 +                       if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
68249 +                           || !atom->nr_running_queues)
68250 +                               break;
68251 +
68252 +                       spin_unlock_znode(node);
68253 +                       reiser4_atom_wait_event(atom);
68254 +                       spin_lock_znode(node);
68255 +               }
68256 +
68257 +               reiser4_uncapture_block(ZJNODE(node));
68258 +               spin_unlock_atom(atom);
68259 +               zput(node);
68260 +       }
68261 +}
68262 +
68263 +/* This is called from longterm_unlock_znode() when last lock is released from
68264 +   the node that has been removed from the tree. Here the node is taken off the
68265 +   sibling list, its lock is invalidated and the node is uncaptured. */
68266 +void forget_znode(lock_handle * handle)
68267 +{
68268 +       znode *node;
68269 +       reiser4_tree *tree;
68270 +
68271 +       assert("umka-319", handle != NULL);
68272 +
68273 +       node = handle->node;
68274 +       tree = znode_get_tree(node);
68275 +
68276 +       assert("vs-164", znode_is_write_locked(node));
68277 +       assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
68278 +       assert_rw_locked(&(node->lock.guard));
68279 +
68280 +       /* We assume that this node was detached from its parent before
68281 +        * unlocking, it gives no way to reach this node from parent through a
68282 +        * down link.  The node should have no children and, thereby, can't be
68283 +        * reached from them by their parent pointers.  The only way to obtain a
68284 +        * reference to the node is to use sibling pointers from its left and
68285 +        * right neighbors.  In the next several lines we remove the node from
68286 +        * the sibling list. */
68287 +
68288 +       write_lock_tree(tree);
68289 +       sibling_list_remove(node);
68290 +       znode_remove(node, tree);
68291 +       write_unlock_tree(tree);
68292 +
68293 +       /* Here we set JNODE_DYING and cancel all pending lock requests.  It
68294 +        * forces all lock requestor threads to repeat iterations of getting
68295 +        * lock on a child, neighbor or parent node.  But, those threads can't
68296 +        * come to this node again, because this node is no longer a child,
68297 +        * neighbor or parent of any other node.  This order of znode
68298 +        * invalidation does not allow other threads to waste cpu time in a busy
68299 +        * loop, trying to lock dying object.  The exception is in the flush
68300 +        * code when we take node directly from atom's capture list.*/
68301 +       reiser4_invalidate_lock(handle);
68302 +       uncapture_znode(node);
68303 +}
68304 +
68305 +/* Check that internal item at @pointer really points to @child (NS_FOUND if so) */
68306 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
68307 +                                                * @child */ ,
68308 +                      const znode * child /* child znode */ )
68309 +{
68310 +       assert("nikita-1016", pointer != NULL);
68311 +       assert("nikita-1017", child != NULL);
68312 +       assert("nikita-1018", pointer->node != NULL);
68313 +
68314 +       assert("nikita-1325", znode_is_any_locked(pointer->node));
68315 +
68316 +       assert("nikita-2985",
68317 +              znode_get_level(pointer->node) == znode_get_level(child) + 1);
68318 +       /* const is cast away so that coord_clear_iplug() can modify @pointer */
68319 +       coord_clear_iplug((coord_t *) pointer);
68320 +
68321 +       if (coord_is_existing_unit(pointer)) {
68322 +               item_plugin *iplug;
68323 +               reiser4_block_nr addr;
68324 +
68325 +               if (item_is_internal(pointer)) {
68326 +                       iplug = item_plugin_by_coord(pointer);
68327 +                       assert("vs-513", iplug->s.internal.down_link);
68328 +                       iplug->s.internal.down_link(pointer, NULL, &addr);
68329 +                       /* check that cached value is correct */
68330 +                       if (disk_addr_eq(&addr, znode_get_block(child))) {
68331 +                               return NS_FOUND;
68332 +                       }
68333 +               }
68334 +       }
68335 +       /* warning ("jmacd-1002", "tree pointer incorrect"); */
68336 +       return NS_NOT_FOUND;
68337 +}
68338 +
68339 +/* find coord of pointer to new @child in @parent.
68340 +
68341 +   Find the &coord_t in the @parent where pointer to a given @child will
68342 +   be in: right after (AFTER_UNIT) the pointer to its left brother @left.
68343 +   Returns RETERR(NS_NOT_FOUND) with @result set up that way, or RETERR(-EIO)
68344 +   when pointer to @left cannot be found in @parent. */
68345 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
68346 +                      znode *
68347 +                      child UNUSED_ARG /* child znode, passed locked */ ,
68348 +                      znode * left /* left brother of new node */ ,
68349 +                      coord_t * result /* where result is stored in */ )
68350 +{
68351 +       int ret;
68352 +
68353 +       assert("nikita-1486", parent != NULL);
68354 +       assert("nikita-1487", child != NULL);
68355 +       assert("nikita-1488", result != NULL);
68356 +       /* locate pointer to @left: new pointer is to be placed next to it */
68357 +       ret = find_child_ptr(parent, left, result);
68358 +       if (ret != NS_FOUND) {
68359 +               warning("nikita-1489", "Cannot find brother position: %i", ret);
68360 +               return RETERR(-EIO);
68361 +       } else {
68362 +               result->between = AFTER_UNIT;
68363 +               return RETERR(NS_NOT_FOUND);
68364 +       }
68365 +}
68366 +
68367 +/* find coord of pointer to @child in @parent.
68368 +
68369 +   Find the &coord_t in the @parent where pointer to a given @child is in.
68370 +   Cached ->in_parent is tried first, then lookup by left delimiting key of
68371 +   @child, then scan of @parent by block number (find_child_by_addr()). */
68372 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
68373 +                  znode * child /* child znode, passed locked */ ,
68374 +                  coord_t * result /* where result is stored in */ )
68375 +{
68376 +       int lookup_res;
68377 +       node_plugin *nplug;
68378 +       /* left delimiting key of a child */
68379 +       reiser4_key ld;
68380 +       reiser4_tree *tree;
68381 +
68382 +       assert("nikita-934", parent != NULL);
68383 +       assert("nikita-935", child != NULL);
68384 +       assert("nikita-936", result != NULL);
68385 +       assert("zam-356", znode_is_loaded(parent));
68386 +
68387 +       coord_init_zero(result);
68388 +       result->node = parent;
68389 +
68390 +       nplug = parent->nplug;
68391 +       assert("nikita-939", nplug != NULL);
68392 +
68393 +       tree = znode_get_tree(parent);
68394 +       /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
68395 +        * not aliased to ->in_parent of some znode. Otherwise,
68396 +        * parent_coord_to_coord() below would modify data protected by tree
68397 +        * lock. */
68398 +       read_lock_tree(tree);
68399 +       /* fast path. Try to use cached value. Lock tree to keep
68400 +          node->pos_in_parent and pos->*_blocknr consistent. */
68401 +       if (child->in_parent.item_pos + 1 != 0) {
68402 +               parent_coord_to_coord(&child->in_parent, result);
68403 +               if (check_tree_pointer(result, child) == NS_FOUND) {
68404 +                       read_unlock_tree(tree);
68405 +                       return NS_FOUND;
68406 +               }
68407 +               /* cached position is stale: mark it unknown (~0). NOTE(review): stored under read lock only -- confirm */
68408 +               child->in_parent.item_pos = (unsigned short)~0;
68409 +       }
68410 +       read_unlock_tree(tree);
68411 +
68412 +       /* if above failed, find some key from @child. We are looking for the
68413 +          least key in a child. */
68414 +       read_lock_dk(tree);
68415 +       ld = *znode_get_ld_key(child);
68416 +       read_unlock_dk(tree);
68417 +       /*
68418 +        * now, lookup parent with key just found. Note, that left delimiting
68419 +        * key doesn't identify node uniquely, because (in extremely rare
68420 +        * case) two nodes can have equal left delimiting keys, if one of them
68421 +        * is completely filled with directory entries that all happened to be
68422 +        * hash collision. But, we check block number in check_tree_pointer()
68423 +        * and, so, are safe.
68424 +        */
68425 +       lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
68426 +       /* update cached position of @child in @parent (->in_parent) */
68427 +       if (lookup_res == NS_FOUND) {
68428 +               write_lock_tree(tree);
68429 +               coord_to_parent_coord(result, &child->in_parent);
68430 +               write_unlock_tree(tree);
68431 +               lookup_res = check_tree_pointer(result, child);
68432 +       }
68433 +       if (lookup_res == NS_NOT_FOUND)
68434 +               lookup_res = find_child_by_addr(parent, child, result);
68435 +       return lookup_res;
68436 +}
68437 +
68438 +/* find coord of pointer to @child in @parent by scanning
68439 +
68440 +   Find the &coord_t in the @parent where pointer to a given @child
68441 +   is in by scanning all internal items in @parent and comparing block
68442 +   numbers in them with that of @child.
68443 +   On success ->in_parent of @child is updated and NS_FOUND is returned,
68444 +   otherwise NS_NOT_FOUND is returned. */
68445 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
68446 +                             znode * child /* child znode, passed locked */ ,
68447 +                             coord_t * result /* where result is stored in */ )
68448 +{
68449 +       int ret;
68450 +
68451 +       assert("nikita-1320", parent != NULL);
68452 +       assert("nikita-1321", child != NULL);
68453 +       assert("nikita-1322", result != NULL);
68454 +
68455 +       ret = NS_NOT_FOUND;
68456 +       /* @result is used as iterator over units of @parent */
68457 +       for_all_units(result, parent) {
68458 +               if (check_tree_pointer(result, child) == NS_FOUND) {
68459 +                       write_lock_tree(znode_get_tree(parent));
68460 +                       coord_to_parent_coord(result, &child->in_parent);
68461 +                       write_unlock_tree(znode_get_tree(parent));
68462 +                       ret = NS_FOUND;
68463 +                       break;
68464 +               }
68465 +       }
68466 +       return ret;
68467 +}
68468 +
68469 +/* true, if @addr is "unallocated block number", which is just address, with
68470 +   highest bit set, i.e. status bits equal REISER4_UNALLOCATED_STATUS_VALUE. */
68471 +int is_disk_addr_unallocated(const reiser4_block_nr * addr     /* address to
68472 +                                                                * check */ )
68473 +{
68474 +       assert("nikita-1766", addr != NULL);
68475 +       cassert(sizeof(reiser4_block_nr) == 8);
68476 +       return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
68477 +           REISER4_UNALLOCATED_STATUS_VALUE;
68478 +}
68479 +
68480 +/* returns true (1) if removing bytes of given range of key [from_key, to_key]
68481 +   causes removing of whole item @from, 0 otherwise. @from must be an extent */
68482 +static int
68483 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
68484 +                       const reiser4_key * to_key)
68485 +{
68486 +       item_plugin *iplug;
68487 +       reiser4_key key_in_item;
68488 +
68489 +       assert("umka-325", from != NULL);
68490 +       assert("", item_is_extent(from));
68491 +
68492 +       /* check first key just in case: head of item must be within the range */
68493 +       item_key_by_coord(from, &key_in_item);
68494 +       if (keygt(from_key, &key_in_item))
68495 +               return 0;
68496 +
68497 +       /* check last key */
68498 +       iplug = item_plugin_by_coord(from);
68499 +       assert("vs-611", iplug && iplug->s.file.append_key);
68500 +       /* presumably append_key() yields key right after the item; step one byte back -- TODO confirm */
68501 +       iplug->s.file.append_key(from, &key_in_item);
68502 +       set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
68503 +
68504 +       if (keylt(to_key, &key_in_item))
68505 +               /* last byte is not removed */
68506 +               return 0;
68507 +       return 1;
68508 +}
68509 +
68510 +/* helper function for prepare_twig_kill(): @left and @right are formatted
68511 + * neighbors of extent item being completely removed. Load and lock neighbors
68512 + * and store lock handles into @kdata for later use by kill_hook_extent() */
68513 +static int
68514 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
68515 +{
68516 +       int result;
68517 +       int left_loaded;
68518 +       int right_loaded;
68519 +
68520 +       result = 0;
68521 +       left_loaded = right_loaded = 0;
68522 +       /* zload and read-lock each supplied child; right one is locked with ZNODE_LOCK_NONBLOCK */
68523 +       if (left != NULL) {
68524 +               result = zload(left);
68525 +               if (result == 0) {
68526 +                       left_loaded = 1;
68527 +                       result = longterm_lock_znode(kdata->left, left,
68528 +                                                    ZNODE_READ_LOCK,
68529 +                                                    ZNODE_LOCK_LOPRI);
68530 +               }
68531 +       }
68532 +       if (result == 0 && right != NULL) {
68533 +               result = zload(right);
68534 +               if (result == 0) {
68535 +                       right_loaded = 1;
68536 +                       result = longterm_lock_znode(kdata->right, right,
68537 +                                                    ZNODE_READ_LOCK,
68538 +                                                    ZNODE_LOCK_HIPRI |
68539 +                                                    ZNODE_LOCK_NONBLOCK);
68540 +               }
68541 +       }
68542 +       if (result != 0) {
68543 +               done_lh(kdata->left);
68544 +               done_lh(kdata->right);
68545 +               if (left_loaded != 0)
68546 +                       zrelse(left);
68547 +               if (right_loaded != 0)
68548 +                       zrelse(right);
68549 +       }
68550 +       return result;
68551 +}
68552 +/* counterpart of prepare_children(): zrelse nodes and release lock handles kept in @kdata */
68553 +static void done_children(carry_kill_data * kdata)
68554 +{
68555 +       if (kdata->left != NULL && kdata->left->node != NULL) {
68556 +               zrelse(kdata->left->node);
68557 +               done_lh(kdata->left);
68558 +       }
68559 +       if (kdata->right != NULL && kdata->right->node != NULL) {
68560 +               zrelse(kdata->right->node);
68561 +               done_lh(kdata->right);
68562 +       }
68563 +}
68564 +
68565 +/* part of cut_node. It is called when cut_node is called to remove or cut part
68566 +   of extent item. When head of that item is removed - we have to update right
68567 +   delimiting key of left neighbor of extent. When item is removed completely -
68568 +   we have to set sibling link between left and right neighbor of removed
68569 +   extent. This may return -E_DEADLOCK because of trying to get left neighbor
68570 +   locked. So, caller should repeat an attempt
68571 +*/
68572 +/* Audited by: umka (2002.06.16) */
68573 +static int
68574 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
68575 +{
68576 +       int result;
68577 +       reiser4_key key;
68578 +       lock_handle left_lh;
68579 +       lock_handle right_lh;
68580 +       coord_t left_coord;
68581 +       coord_t *from;
68582 +       znode *left_child;
68583 +       znode *right_child;
68584 +       reiser4_tree *tree;
68585 +       int left_zloaded_here, right_zloaded_here;
68586 +
68587 +       from = kdata->params.from;
68588 +       assert("umka-326", from != NULL);
68589 +       assert("umka-327", kdata->params.to != NULL);
68590 +
68591 +       /* for one extent item only yet */
68592 +       assert("vs-591", item_is_extent(from));
68593 +       assert("vs-592", from->item_pos == kdata->params.to->item_pos);
68594 +
68595 +       if ((kdata->params.from_key
68596 +            && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
68597 +           || from->unit_pos != 0) {
68598 +               /* head of item @from is not removed, there is nothing to
68599 +                  worry about */
68600 +               return 0;
68601 +       }
68602 +
68603 +       result = 0;
68604 +       left_zloaded_here = 0;
68605 +       right_zloaded_here = 0;
68606 +
68607 +       left_child = right_child = NULL;
68608 +       /* find the unit to the left of @from; it may live in the left neighbor */
68609 +       coord_dup(&left_coord, from);
68610 +       init_lh(&left_lh);
68611 +       init_lh(&right_lh);
68612 +       if (coord_prev_unit(&left_coord)) {
68613 +               /* @from is leftmost item in its node */
68614 +               if (!locked_left_neighbor) {
68615 +                       result =
68616 +                           reiser4_get_left_neighbor(&left_lh, from->node,
68617 +                                                     ZNODE_READ_LOCK,
68618 +                                                     GN_CAN_USE_UPPER_LEVELS);
68619 +                       switch (result) {
68620 +                       case 0:
68621 +                               break;
68622 +                       case -E_NO_NEIGHBOR:
68623 +                               /* there is no formatted node to the left of
68624 +                                  from->node */
68625 +                               warning("vs-605",
68626 +                                       "extent item has smallest key in "
68627 +                                       "the tree and it is about to be removed");
68628 +                               return 0;
68629 +                       case -E_DEADLOCK:
68630 +                               /* need to restart */
68631 +                       default:
68632 +                               return result;
68633 +                       }
68634 +
68635 +                       /* we have acquired left neighbor of from->node */
68636 +                       result = zload(left_lh.node);
68637 +                       if (result)
68638 +                               goto done;
68639 +
68640 +                       locked_left_neighbor = left_lh.node;
68641 +               } else {
68642 +                       /* squalloc_right_twig_cut should have supplied locked
68643 +                        * left neighbor */
68644 +                       assert("vs-834",
68645 +                              znode_is_write_locked(locked_left_neighbor));
68646 +                       result = zload(locked_left_neighbor);
68647 +                       if (result)
68648 +                               return result;
68649 +               }
68650 +
68651 +               left_zloaded_here = 1;
68652 +               coord_init_last_unit(&left_coord, locked_left_neighbor);
68653 +       }
68654 +
68655 +       if (!item_is_internal(&left_coord)) {
68656 +               /* what else but extent can be on twig level */
68657 +               assert("vs-606", item_is_extent(&left_coord));
68658 +
68659 +               /* there is no left formatted child */
68660 +               if (left_zloaded_here)
68661 +                       zrelse(locked_left_neighbor);
68662 +               done_lh(&left_lh);
68663 +               return 0;
68664 +       }
68665 +       /* get znode of the child @left_coord points to; it is zput() at done: */
68666 +       tree = znode_get_tree(left_coord.node);
68667 +       left_child = child_znode(&left_coord, left_coord.node, 1, 0);
68668 +
68669 +       if (IS_ERR(left_child)) {
68670 +               result = PTR_ERR(left_child);
68671 +               goto done;
68672 +       }
68673 +
68674 +       /* left child is acquired, calculate new right delimiting key for it
68675 +          and get right child if it is necessary */
68676 +       if (item_removed_completely
68677 +           (from, kdata->params.from_key, kdata->params.to_key)) {
68678 +               /* try to get right child of removed item */
68679 +               coord_t right_coord;
68680 +
68681 +               assert("vs-607",
68682 +                      kdata->params.to->unit_pos ==
68683 +                      coord_last_unit_pos(kdata->params.to));
68684 +               coord_dup(&right_coord, kdata->params.to);
68685 +               if (coord_next_unit(&right_coord)) {
68686 +                       /* @to is rightmost unit in the node */
68687 +                       result =
68688 +                           reiser4_get_right_neighbor(&right_lh, from->node,
68689 +                                                      ZNODE_READ_LOCK,
68690 +                                                      GN_CAN_USE_UPPER_LEVELS);
68691 +                       switch (result) {
68692 +                       case 0:
68693 +                               result = zload(right_lh.node);
68694 +                               if (result)
68695 +                                       goto done;
68696 +
68697 +                               right_zloaded_here = 1;
68698 +                               coord_init_first_unit(&right_coord,
68699 +                                                     right_lh.node);
68700 +                               item_key_by_coord(&right_coord, &key);
68701 +                               break;
68702 +
68703 +                       case -E_NO_NEIGHBOR:
68704 +                               /* there is no formatted node to the right of
68705 +                                  from->node */
68706 +                               read_lock_dk(tree);
68707 +                               key = *znode_get_rd_key(from->node);
68708 +                               read_unlock_dk(tree);
68709 +                               right_coord.node = NULL;
68710 +                               result = 0;
68711 +                               break;
68712 +                       default:
68713 +                               /* real error */
68714 +                               goto done;
68715 +                       }
68716 +               } else {
68717 +                       /* there is an item to the right of @from - take its key */
68718 +                       item_key_by_coord(&right_coord, &key);
68719 +               }
68720 +
68721 +               /* try to get right child of @from */
68722 +               if (right_coord.node && /* there is right neighbor of @from */
68723 +                   item_is_internal(&right_coord)) {   /* it is internal item */
68724 +                       right_child = child_znode(&right_coord,
68725 +                                                 right_coord.node, 1, 0);
68726 +
68727 +                       if (IS_ERR(right_child)) {
68728 +                               result = PTR_ERR(right_child);
68729 +                               goto done;
68730 +                       }
68731 +
68732 +               }
68733 +               /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
68734 +                  update of right delimiting key of left_child */
68735 +               result = prepare_children(left_child, right_child, kdata);
68736 +       } else {
68737 +               /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
68738 +               result = prepare_children(left_child, NULL, kdata);
68739 +       }
68740 +       /* common exit: drop child references, data loads and lock handles taken in this function */
68741 +      done:
68742 +       if (right_child)
68743 +               zput(right_child);
68744 +       if (right_zloaded_here)
68745 +               zrelse(right_lh.node);
68746 +       done_lh(&right_lh);
68747 +
68748 +       if (left_child)
68749 +               zput(left_child);
68750 +       if (left_zloaded_here)
68751 +               zrelse(locked_left_neighbor);
68752 +       done_lh(&left_lh);
68753 +       return result;
68754 +}
68755 +
68756 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
68757 +   are to be cut completely. Returns 0 or error code (from pool allocation, posting of carry operation or carry) */
68758 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
68759 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,       /* first key to be removed */
68760 +                    const reiser4_key * to_key,        /* last key to be removed */
68761 +                    reiser4_key *
68762 +                    smallest_removed /* smallest key actually removed */ )
68763 +{
68764 +       int result;
68765 +       carry_pool *pool;
68766 +       carry_level *lowest_level;
68767 +       carry_cut_data *cut_data;
68768 +       carry_op *op;
68769 +
68770 +       assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
68771 +       /* one allocation laid out as: carry_pool, 3 carry_level-s, carry_cut_data */
68772 +       pool =
68773 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68774 +                           sizeof(*cut_data));
68775 +       if (IS_ERR(pool))
68776 +               return PTR_ERR(pool);
68777 +       lowest_level = (carry_level *) (pool + 1);
68778 +       init_carry_level(lowest_level, pool);
68779 +
68780 +       op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
68781 +       assert("vs-1509", op != 0);
68782 +       if (IS_ERR(op)) {
68783 +               done_carry_pool(pool);
68784 +               return PTR_ERR(op);
68785 +       }
68786 +       /* cut_data lives right after the 3 carry levels (see allocation above) */
68787 +       cut_data = (carry_cut_data *) (lowest_level + 3);
68788 +       cut_data->params.from = from;
68789 +       cut_data->params.to = to;
68790 +       cut_data->params.from_key = from_key;
68791 +       cut_data->params.to_key = to_key;
68792 +       cut_data->params.smallest_removed = smallest_removed;
68793 +       /* is_cut == 1 selects "cut" flavour of COP_CUT (kill_node_content() uses 0) */
68794 +       op->u.cut_or_kill.is_cut = 1;
68795 +       op->u.cut_or_kill.u.cut = cut_data;
68796 +
68797 +       result = reiser4_carry(lowest_level, NULL);
68798 +       done_carry_pool(pool);
68799 +
68800 +       return result;
68801 +}
68802 +
68803 +/* cut part of the node
68804 +
68805 +   Cut part or whole content of node.
68806 +
68807 +   cut data between @from and @to of @from->node and call carry() to make
68808 +   corresponding changes in the tree. @from->node may become empty. If so -
68809 +   pointer to it will be removed. Neighboring nodes are not changed. Smallest
68810 +   removed key is stored in @smallest_removed
68811 +   Returns 0 on success, error code otherwise.
68812 +*/
68813 +int kill_node_content(coord_t * from,  /* coord of the first unit/item that will be eliminated */
68814 +                     coord_t * to,     /* coord of the last unit/item that will be eliminated */
68815 +                     const reiser4_key * from_key,     /* first key to be removed */
68816 +                     const reiser4_key * to_key,       /* last key to be removed */
68817 +                     reiser4_key * smallest_removed,   /* smallest key actually removed */
68818 +                     znode * locked_left_neighbor,     /* this is set when kill_node_content is called with left neighbor
68819 +                                                        * locked (in squalloc_right_twig_cut, namely) */
68820 +                     struct inode *inode,      /* inode of file whose item (or its part) is to be killed. This is necessary to
68821 +                                                  invalidate pages together with item pointing to them */
68822 +                     int truncate)
68823 +{                              /* (@truncate: this call is made for file truncate) */
68824 +       int result;
68825 +       carry_pool *pool;
68826 +       carry_level *lowest_level;
68827 +       carry_kill_data *kdata;
68828 +       lock_handle *left_child;
68829 +       lock_handle *right_child;
68830 +       carry_op *op;
68831 +
68832 +       assert("umka-328", from != NULL);
68833 +       assert("vs-316", !node_is_empty(from->node));
68834 +       assert("nikita-1812", coord_is_existing_unit(from)
68835 +              && coord_is_existing_unit(to));
68836 +
68837 +       /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
68838 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
68839 +                              sizeof(carry_kill_data) +
68840 +                              2 * sizeof(lock_handle) +
68841 +                              5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
68842 +       if (IS_ERR(pool))
68843 +               return PTR_ERR(pool);
68844 +
68845 +       lowest_level = (carry_level *) (pool + 1);
68846 +       init_carry_level(lowest_level, pool);
68847 +       /* carve kdata and two lock handles out of the pool, in allocation order */
68848 +       kdata = (carry_kill_data *) (lowest_level + 3);
68849 +       left_child = (lock_handle *) (kdata + 1);
68850 +       right_child = left_child + 1;
68851 +
68852 +       init_lh(left_child);
68853 +       init_lh(right_child);
68854 +
68855 +       kdata->params.from = from;
68856 +       kdata->params.to = to;
68857 +       kdata->params.from_key = from_key;
68858 +       kdata->params.to_key = to_key;
68859 +       kdata->params.smallest_removed = smallest_removed;
68860 +       kdata->params.truncate = truncate;
68861 +       kdata->flags = 0;
68862 +       kdata->inode = inode;
68863 +       kdata->left = left_child;
68864 +       kdata->right = right_child;
68865 +       /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
68866 +       kdata->buf = (char *)(right_child + 1);
68867 +
68868 +       if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
68869 +               /* left child of extent item may have to get updated right
68870 +                  delimiting key and to get linked with right child of extent
68871 +                  @from if it will be removed completely */
68872 +               result = prepare_twig_kill(kdata, locked_left_neighbor);
68873 +               if (result) {
68874 +                       done_children(kdata);
68875 +                       done_carry_pool(pool);
68876 +                       return result;
68877 +               }
68878 +       }
68879 +       /* kill is posted as COP_CUT; is_cut == 0 below tells it from plain cut */
68880 +       op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
68881 +       if (IS_ERR(op) || (op == NULL)) {
68882 +               done_children(kdata);
68883 +               done_carry_pool(pool);
68884 +               return RETERR(op ? PTR_ERR(op) : -EIO);
68885 +       }
68886 +
68887 +       op->u.cut_or_kill.is_cut = 0;
68888 +       op->u.cut_or_kill.u.kill = kdata;
68889 +
68890 +       result = reiser4_carry(lowest_level, NULL);
68891 +       /* children prepared by prepare_twig_kill() are released on every exit path */
68892 +       done_children(kdata);
68893 +       done_carry_pool(pool);
68894 +       return result;
68895 +}
68896 +/* for inode with REISER4_HAS_MMAP flag invalidate page(s) covering bytes @start..@end-1; always subtract @end - @start from inode bytes. Used by reiser4_delete_node() */
68897 +void
68898 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
68899 +{
68900 +       if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
68901 +               pgoff_t start_pg, end_pg;
68902 +
68903 +               start_pg = start >> PAGE_CACHE_SHIFT;
68904 +               end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
68905 +
68906 +               if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
68907 +                       /*
68908 +                        * kill up to the page boundary.
68909 +                        */
68910 +                       assert("vs-123456", start_pg == end_pg);
68911 +                       reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
68912 +                                                truncate);
68913 +               } else if (start_pg != end_pg) {
68914 +                       /* page boundary is within killed portion of node.
68915 +                        * NOTE(review): literal 1 is passed below instead of
68916 +                        * @truncate, unlike the branch above -- confirm. */
68917 +                       assert("vs-654321", end_pg - start_pg == 1);
68918 +                       reiser4_invalidate_pages(inode->i_mapping, end_pg,
68919 +                                                end_pg - start_pg, 1);
68920 +               }
68921 +       }
68922 +       inode_sub_bytes(inode, end - start);
68923 +}
68924 +
68925 +/**
68926 + * Delete whole @node from the reiser4 tree without loading it.
68927 + *
68928 + * @node: node to be deleted, must be write-locked by the caller,
68929 + * @smallest_removed: in: smallest key removed so far (used if @object is set);
68930 + *                    out: left delimiting key of the deleted node,
68931 + * @object: inode pointer, if we truncate a file body.
68932 + * @truncate: true if called for file truncate.
68933 + *
68934 + * @return: 0 if success, error code otherwise.
68935 + *
68936 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
68937 + * contains the right value of the smallest removed key from the previous
68938 + * cut_worker() iteration.  This is needed for proper accounting of
68939 + * "i_blocks" and "i_bytes" fields of the @object.
68940 + */
68941 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
68942 +                       struct inode *object, int truncate)
68943 +{
68944 +       lock_handle parent_lock;
68945 +       coord_t cut_from;
68946 +       coord_t cut_to;
68947 +       reiser4_tree *tree;
68948 +       int ret;
68949 +
68950 +       assert("zam-937", node != NULL);
68951 +       assert("zam-933", znode_is_write_locked(node));
68952 +       assert("zam-999", smallest_removed != NULL);
68953 +
68954 +       init_lh(&parent_lock);
68955 +
68956 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
68957 +       if (ret)
68958 +               return ret;
68959 +
68960 +       assert("zam-934", !znode_above_root(parent_lock.node));
68961 +
68962 +       ret = zload(parent_lock.node);
68963 +       if (ret)
68964 +               goto failed_nozrelse;
68965 +       /* @cut_from: coord of pointer to @node in its parent */
68966 +       ret = find_child_ptr(parent_lock.node, node, &cut_from);
68967 +       if (ret)
68968 +               goto failed;
68969 +
68970 +       /* decrement child counter and set parent pointer to NULL before
68971 +          deleting the list from parent node because of checks in
68972 +          internal_kill_item_hook (we can delete the last item from the parent
68973 +          node, the parent node is going to be deleted and its c_count should
68974 +          be zero). */
68975 +
68976 +       tree = znode_get_tree(node);
68977 +       write_lock_tree(tree);
68978 +       init_parent_coord(&node->in_parent, NULL);
68979 +       --parent_lock.node->c_count;
68980 +       write_unlock_tree(tree);
68981 +
68982 +       assert("zam-989", item_is_internal(&cut_from));
68983 +
68984 +       /* @node should be deleted after unlocking. */
68985 +       ZF_SET(node, JNODE_HEARD_BANSHEE);
68986 +
68987 +       /* remove a pointer from the parent node to the node being deleted. */
68988 +       coord_dup(&cut_to, &cut_from);
68989 +       /* FIXME: shouldn't this be kill_node_content */
68990 +       ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
68991 +       if (ret)
68992 +               /* FIXME(Zam): Should we re-connect the node to its parent if
68993 +                * cut_node fails? */
68994 +               goto failed;
68995 +       /* fix delimiting keys and account removed bytes (inner @tree shadows the outer one) */
68996 +       {
68997 +               reiser4_tree *tree = current_tree;
68998 +               __u64 start_offset = 0, end_offset = 0;
68999 +
69000 +               read_lock_tree(tree);
69001 +               write_lock_dk(tree);
69002 +               if (object) {
69003 +                       /* We use @smallest_removed and the left delimiting of
69004 +                        * the current node for @object->i_blocks, i_bytes
69005 +                        * calculation.  We assume that the items after the
69006 +                        * *@smallest_removed key have been deleted from the
69007 +                        * file body. */
69008 +                       start_offset = get_key_offset(znode_get_ld_key(node));
69009 +                       end_offset = get_key_offset(smallest_removed);
69010 +               }
69011 +
69012 +               assert("zam-1021", znode_is_connected(node));
69013 +               if (node->left)
69014 +                       znode_set_rd_key(node->left, znode_get_rd_key(node));
69015 +               /* report left delimiting key of deleted node to the caller */
69016 +               *smallest_removed = *znode_get_ld_key(node);
69017 +
69018 +               write_unlock_dk(tree);
69019 +               read_unlock_tree(tree);
69020 +
69021 +               if (object) {
69022 +                       /* we used to perform actions which are to be performed on items on their removal from tree in
69023 +                          special item method - kill_hook. Here for optimization reasons we avoid reading node
69024 +                          containing item we remove and can not call item's kill hook. Instead we call function which
69025 +                          does exactly the same things as tail kill hook in assumption that node we avoid reading
69026 +                          contains only one item and that item is a tail one. */
69027 +                       fake_kill_hook_tail(object, start_offset, end_offset,
69028 +                                           truncate);
69029 +               }
69030 +       }
69031 +      failed:
69032 +       zrelse(parent_lock.node);
69033 +      failed_nozrelse:
69034 +       done_lh(&parent_lock);
69035 +
69036 +       return ret;
69037 +}
69038 +
69039 +static int can_delete(const reiser4_key *key, znode *node)
69040 +{
69041 +       int result;
69042 +
69043 +       read_lock_dk(current_tree);     /* ld key is read under the dk lock */
69044 +       result = keyle(key, znode_get_ld_key(node));
69045 +       read_unlock_dk(current_tree);
69046 +       return result;  /* presumably non-zero iff @key <= ld key of @node */
69047 +}
69048 +
69049 +/**
69050 + * This subroutine is not optimal, but its implementation is simple.
69051 + *
69052 + * @tap: the point deletion process begins from,
69053 + * @from_key: the beginning of the deleted key range,
69054 + * @to_key: the end of the deleted key range,
69055 + * @smallest_removed: the smallest removed key,
69056 + * @object: inode owning the items being cut, may be NULL,
69057 + * @truncate: true if called for file truncate.
69058 + * @progress: set to the number of completed cut steps; if it is non-zero,
69059 + *            @smallest_removed value is actual.
69060 + *
69061 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long
69062 + * reiser4_cut_tree operation was interrupted for allowing atom commit.
69063 + */
69064 +int
69065 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
69066 +                      const reiser4_key * to_key,
69067 +                      reiser4_key * smallest_removed, struct inode *object,
69068 +                      int truncate, int *progress)
69069 +{
69070 +       lock_handle next_node_lock;
69071 +       coord_t left_coord;
69072 +       int result;
69073 +
69074 +       assert("zam-931", tap->coord->node != NULL);
69075 +       assert("zam-932", znode_is_write_locked(tap->coord->node));
69076 +
69077 +       *progress = 0;
69078 +       init_lh(&next_node_lock);
69079 +
69080 +       while (1) {
69081 +               znode *node;    /* node from which items are cut */
69082 +               node_plugin *nplug;     /* node plugin for @node */
69083 +
69084 +               node = tap->coord->node;
69085 +
69086 +               /* Move next_node_lock to the next node on the left. */
69087 +               result =
69088 +                   reiser4_get_left_neighbor(&next_node_lock, node,
69089 +                                             ZNODE_WRITE_LOCK,
69090 +                                             GN_CAN_USE_UPPER_LEVELS);
69091 +               if (result != 0 && result != -E_NO_NEIGHBOR)
69092 +                       break;
69093 +               /* Check can we delete the node as a whole. */
69094 +               if (*progress && znode_get_level(node) == LEAF_LEVEL &&
69095 +                   can_delete(from_key, node)) {
69096 +                       result = reiser4_delete_node(node, smallest_removed,
69097 +                                                    object, truncate);
69098 +               } else {
69099 +                       result = reiser4_tap_load(tap);
69100 +                       if (result)
69101 +                               break;  /* next_node_lock is released after the loop */
69102 +
69103 +                       /* Prepare the second (right) point for cut_node() */
69104 +                       if (*progress)
69105 +                               coord_init_last_unit(tap->coord, node);
69106 +
69107 +                       else if (item_plugin_by_coord(tap->coord)->b.lookup ==
69108 +                                NULL)
69109 +                               /* set rightmost unit for the items without lookup method */
69110 +                               tap->coord->unit_pos =
69111 +                                   coord_last_unit_pos(tap->coord);
69112 +
69113 +                       nplug = node->nplug;
69114 +
69115 +                       assert("vs-686", nplug);
69116 +                       assert("vs-687", nplug->lookup);
69117 +
69118 +                       /* left_coord is leftmost unit cut from @node */
69119 +                       result = nplug->lookup(node, from_key,
69120 +                                              FIND_MAX_NOT_MORE_THAN,
69121 +                                              &left_coord);
69122 +
69123 +                       if (IS_CBKERR(result))
69124 +                               break;
69125 +
69126 +                       /* adjust coordinates so that they are set to existing units */
69127 +                       if (coord_set_to_right(&left_coord)
69128 +                           || coord_set_to_left(tap->coord)) {
69129 +                               result = 0;
69130 +                               break;
69131 +                       }
69132 +
69133 +                       if (coord_compare(&left_coord, tap->coord) ==
69134 +                           COORD_CMP_ON_RIGHT) {
69135 +                               /* keys from @from_key to @to_key are not in the tree */
69136 +                               result = 0;
69137 +                               break;
69138 +                       }
69139 +
69140 +                       if (left_coord.item_pos != tap->coord->item_pos) {
69141 +                               /* do not allow to cut more than one item. It is added to solve problem of truncating
69142 +                                  partially converted files. If file is partially converted there may exist a twig node
69143 +                                  containing both internal item or items pointing to leaf nodes with formatting items
69144 +                                  and extent item. We do not want to kill internal items being at twig node here
69145 +                                  because cut_tree_worker assumes killing them from level level */
69146 +                               coord_dup(&left_coord, tap->coord);
69147 +                               assert("vs-1652",
69148 +                                      coord_is_existing_unit(&left_coord));
69149 +                               left_coord.unit_pos = 0;
69150 +                       }
69151 +
69152 +                       /* cut data from one node */
69153 +                       /* *smallest_removed = *reiser4_min_key(); */
69154 +                       result =
69155 +                           kill_node_content(&left_coord, tap->coord, from_key,
69156 +                                             to_key, smallest_removed,
69157 +                                             next_node_lock.node, object,
69158 +                                             truncate);
69159 +                       reiser4_tap_relse(tap);
69160 +               }
69161 +               if (result)
69162 +                       break;
69163 +
69164 +               ++(*progress);
69165 +
69166 +               /* Check whether all items with keys >= from_key were removed
69167 +                * from the tree. */
69168 +               if (keyle(smallest_removed, from_key))
69169 +                       /* result = 0; */
69170 +                       break;
69171 +
69172 +               if (next_node_lock.node == NULL)
69173 +                       break;
69174 +
69175 +               result = reiser4_tap_move(tap, &next_node_lock);
69176 +               done_lh(&next_node_lock);
69177 +               if (result)
69178 +                       break;
69179 +
69180 +               /* Break long reiser4_cut_tree operation (deletion of a large
69181 +                  file) if atom requires commit. */
69182 +               if (*progress > CUT_TREE_MIN_ITERATIONS
69183 +                   && current_atom_should_commit()) {
69184 +                       result = -E_REPEAT;
69185 +                       break;
69186 +               }
69187 +       }
69188 +       done_lh(&next_node_lock);
69189 +       /* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */
69190 +       return result;
69191 +}
69192 +
69193 +/* there is a fundamental problem with optimizing deletes: VFS does it
69194 +   one file at a time.  Another problem is that if an item can be
69195 +   anything, then deleting items must be done one at a time.  It just
69196 +   seems clean to write this to specify a from and a to key, and cut
69197 +   everything between them though.  */
69198 +
69199 +/* use this function with care if deleting more than what is part of a single file. */
69200 +/* do not use this when cutting a single item, it is suboptimal for that */
69201 +
69202 +/* You are encouraged to write plugin specific versions of this.  It
69203 +   cannot be optimal for all plugins because it works item at a time,
69204 +   and some plugins could sometimes work node at a time. Regular files
69205 +   however are not optimizable to work node at a time because of
69206 +   extents needing to free the blocks they point to.
69207 +
69208 +   Optimizations compared to v3 code:
69209 +
69210 +   It does not balance (that task is left to memory pressure code).
69211 +
69212 +   Nodes are deleted only if empty.
69213 +
69214 +   Uses extents.
69215 +
69216 +   Performs read-ahead of formatted nodes whose contents are part of
69217 +   the deletion.
69218 +*/
69219 +
69220 +/**
69221 + * Delete everything from the reiser4 tree between two keys: @from_key and
69222 + * @to_key.
69223 + *
69224 + * @tree: the tree to cut from (only asserted to be non-NULL here),
69225 + * @from_key: the beginning of the deleted key range,
69226 + * @to_key: the end of the deleted key range,
69227 + * @smallest_removed_p: where to store the smallest removed key, may be NULL,
69228 + * @object: owner of cutting items, may be NULL.
69229 + * @truncate: true if called for file truncate.
69230 + * @progress: return true if a progress in file items deletions was made,
69231 + *            the smallest removed key is actual in that case.
69232 + *
69233 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
69234 + * operation was interrupted for allowing atom commit.
69235 + */
69236 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
69237 +                           const reiser4_key * to_key,
69238 +                           reiser4_key * smallest_removed_p,
69239 +                           struct inode *object, int truncate, int *progress)
69240 +{
69241 +       lock_handle lock;
69242 +       int result;
69243 +       tap_t tap;
69244 +       coord_t right_coord;
69245 +       reiser4_key smallest_removed;
69246 +       int (*cut_tree_worker) (tap_t *, const reiser4_key *,
69247 +                               const reiser4_key *, reiser4_key *,
69248 +                               struct inode *, int, int *);
69249 +       STORE_COUNTERS;
69250 +
69251 +       assert("umka-329", tree != NULL);
69252 +       assert("umka-330", from_key != NULL);
69253 +       assert("umka-331", to_key != NULL);
69254 +       assert("zam-936", keyle(from_key, to_key));
69255 +
69256 +       if (smallest_removed_p == NULL)
69257 +               smallest_removed_p = &smallest_removed;
69258 +
69259 +       init_lh(&lock);
69260 +
69261 +       do {
69262 +               /* Find rightmost item to cut away from the tree. */
69263 +               result = reiser4_object_lookup(object, to_key, &right_coord,
69264 +                                              &lock, ZNODE_WRITE_LOCK,
69265 +                                              FIND_MAX_NOT_MORE_THAN,
69266 +                                              TWIG_LEVEL, LEAF_LEVEL,
69267 +                                              CBK_UNIQUE, NULL /*ra_info */);
69268 +               if (result != CBK_COORD_FOUND)
69269 +                       break;
69270 +               if (object == NULL
69271 +                   || inode_file_plugin(object)->cut_tree_worker == NULL)
69272 +                       cut_tree_worker = cut_tree_worker_common;
69273 +               else
69274 +                       cut_tree_worker =
69275 +                           inode_file_plugin(object)->cut_tree_worker;
69276 +               reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
69277 +               result =
69278 +                   cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
69279 +                                   object, truncate, progress);
69280 +               reiser4_tap_done(&tap);
69281 +
69282 +               reiser4_preempt_point();
69283 +
69284 +       } while (0);
69285 +
69286 +       done_lh(&lock);
69287 +
69288 +       if (result) {
69289 +               switch (result) {
69290 +               case -E_NO_NEIGHBOR:
69291 +                       result = 0;
69292 +                       break;
69293 +               case -E_DEADLOCK:
69294 +                       result = -E_REPEAT;     /* fall through */
69295 +               case -E_REPEAT:
69296 +               case -ENOMEM:
69297 +               case -ENOENT:
69298 +                       break;
69299 +               default:
69300 +                       warning("nikita-2861", "failure: %i", result);
69301 +               }
69302 +       }
69303 +
69304 +       CHECK_COUNTERS;
69305 +       return result;
69306 +}
69307 +
69308 +/* Call reiser4_cut_tree_object() for the key range @from..@to again and again
69309 + * while it returns -E_REPEAT. Unlike cut_file_items, this does not end current
69310 + * transaction on -E_REPEAT. Returns the last result of cut_tree_object. */
69311 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
69312 +                    const reiser4_key * to, struct inode *inode, int truncate)
69313 +{
69314 +       int result;
69315 +       int progress;   /* filled in by the callee, not examined here */
69316 +
69317 +       do {
69318 +               result = reiser4_cut_tree_object(tree, from, to, NULL,
69319 +                                                inode, truncate, &progress);
69320 +       } while (result == -E_REPEAT);
69321 +
69322 +       return result;
69323 +}
69324 +
69325 +/* finish initialization of @tree; returns 0 on success or an error code */
69326 +int reiser4_init_tree(reiser4_tree * tree      /* pointer to structure being
69327 +                                        * initialized */ ,
69328 +             const reiser4_block_nr * root_block       /* address of a root block
69329 +                                                        * on a disk */ ,
69330 +             tree_level height /* height of a tree */ ,
69331 +             node_plugin * nplug /* default node plugin */ )
69332 +{
69333 +       int result;
69334 +
69335 +       assert("nikita-306", tree != NULL);
69336 +       assert("nikita-307", root_block != NULL);
69337 +       assert("nikita-308", height > 0);
69338 +       assert("nikita-309", nplug != NULL);
69339 +       assert("zam-587", tree->super != NULL);
69340 +
69341 +       tree->root_block = *root_block;
69342 +       tree->height = height;
69343 +       tree->estimate_one_insert = calc_estimate_one_insert(height);
69344 +       tree->nplug = nplug;
69345 +
69346 +       tree->znode_epoch = 1ull;
69347 +
69348 +       cbk_cache_init(&tree->cbk_cache);
69349 +
69350 +       result = znodes_tree_init(tree);
69351 +       if (result == 0)
69352 +               result = jnodes_tree_init(tree);
69353 +       if (result == 0) {
69354 +               tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
69355 +                                 reiser4_ctx_gfp_mask_get());
69356 +               if (IS_ERR(tree->uber)) {
69357 +                       result = PTR_ERR(tree->uber);
69358 +                       tree->uber = NULL;      /* reiser4_done_tree() then skips zput() */
69359 +               }
69360 +       }
69361 +       return result;
69362 +}
69363 +
69364 +/* release resources associated with @tree; does nothing if @tree is NULL */
69365 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
69366 +{
69367 +       if (tree == NULL)
69368 +               return;
69369 +
69370 +       if (tree->uber != NULL) {
69371 +               zput(tree->uber);
69372 +               tree->uber = NULL;
69373 +       }
69374 +       znodes_tree_done(tree);
69375 +       jnodes_tree_done(tree);
69376 +       cbk_cache_done(&tree->cbk_cache);
69377 +}
69378 +
69379 +/* Make Linus happy.
69380 +   Local variables:
69381 +   c-indentation-style: "K&R"
69382 +   mode-name: "LC"
69383 +   c-basic-offset: 8
69384 +   tab-width: 8
69385 +   fill-column: 120
69386 +   scroll-step: 1
69387 +   End:
69388 +*/
69389 diff -puN /dev/null fs/reiser4/tree.h
69390 --- /dev/null
69391 +++ a/fs/reiser4/tree.h
69392 @@ -0,0 +1,577 @@
69393 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69394 + * reiser4/README */
69395 +
69396 +/* Tree operations. See fs/reiser4/tree.c for comments */
69397 +
69398 +#if !defined( __REISER4_TREE_H__ )
69399 +#define __REISER4_TREE_H__
69400 +
69401 +#include "forward.h"
69402 +#include "debug.h"
69403 +#include "dformat.h"
69404 +#include "plugin/node/node.h"
69405 +#include "plugin/plugin.h"
69406 +#include "znode.h"
69407 +#include "tap.h"
69408 +
69409 +#include <linux/types.h>       /* for __u??  */
69410 +#include <linux/fs.h>          /* for struct super_block  */
69411 +#include <linux/spinlock.h>
69412 +#include <linux/sched.h>       /* for struct task_struct */
69413 +
69414 +/* fictive block number never actually used */
69415 +extern const reiser4_block_nr UBER_TREE_ADDR;
69416 +
69417 +/* &cbk_cache_slot - entry in a coord cache.
69418 +
69419 +   This is entry in a coord_by_key (cbk) cache, represented by
69420 +   &cbk_cache.
69421 +
69422 +*/
69423 +typedef struct cbk_cache_slot {
69424 +       /* cached node */
69425 +       znode *node;
69426 +       /* linkage to the next cbk cache slot in a LRU order */
69427 +       struct list_head lru;
69428 +} cbk_cache_slot;
69429 +
69430 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
69431 +
69432 +   cbk_cache is supposed to speed up tree lookups by caching results of recent
69433 +   successful lookups (we don't cache negative results as dentry cache
69434 +   does). Cache consists of relatively small number of entries kept in a LRU
69435 +   order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
69436 +   which we can obtain a range of keys that are covered by this znode. Before
69437 +   embarking into real tree traversal we scan cbk_cache slot by slot and for
69438 +   each slot check whether key we are looking for is between minimal and
69439 +   maximal keys for node pointed to by this slot. If no match is found, real
69440 +   tree traversal is performed and if result is successful, appropriate entry
69441 +   is inserted into cache, possibly pulling least recently used entry out of
69442 +   it.
69443 +
69444 +   Tree spin lock is used to protect coord cache. If contention for this
69445 +   lock proves to be too high, finer grained locking can be added.
69446 +
69447 +   Invariants involving parts of this data-type:
69448 +
69449 +      [cbk-cache-invariant]
69450 +*/
69451 +typedef struct cbk_cache {
69452 +       /* lock serializing access to this cache */
69453 +       rwlock_t guard;
69454 +       int nr_slots;
69455 +       /* head of LRU list of cache slots */
69456 +       struct list_head lru;
69457 +       /* actual array of slots */
69458 +       cbk_cache_slot *slot;
69459 +} cbk_cache;
69460 +
69461 +/* level_lookup_result - possible outcome of looking up key at some level.
69462 +   This is used by coord_by_key when traversing tree downward. */
69463 +typedef enum {
69464 +       /* continue to the next level */
69465 +       LOOKUP_CONT,
69466 +       /* done. Either required item was found, or we can prove it
69467 +          doesn't exist, or some error occurred. */
69468 +       LOOKUP_DONE,
69469 +       /* restart traversal from the root. Infamous "repetition". */
69470 +       LOOKUP_REST
69471 +} level_lookup_result;
69472 +
69473 +/*    This is representation of internal reiser4 tree where all file-system
69474 +   data and meta-data are stored. This structure is passed to all tree
69475 +   manipulation functions. It's different from the super block because:
69476 +   we don't want to limit ourselves to strictly one to one mapping
69477 +   between super blocks and trees, and, because they are logically
69478 +   different: there are things in a super block that have no relation to
69479 +   the tree (bitmaps, journalling area, mount options, etc.) and there
69480 +   are things in a tree that bear no relation to the super block, like
69481 +   tree of znodes.
69482 +
69483 +   At this time, there is only one tree
69484 +   per filesystem, and this struct is part of the super block.  We only
69485 +   call the super block the super block for historical reasons (most
69486 +   other filesystems call the per filesystem metadata the super block).
69487 +*/
69488 +
69489 +struct reiser4_tree {
69490 +       /* block_nr == 0 is fake znode. Write lock it, while changing
69491 +          tree height. */
69492 +       /* disk address of root node of a tree */
69493 +       reiser4_block_nr root_block;
69494 +
69495 +       /* level of the root node. If this is 1, tree consists of root
69496 +          node only */
69497 +       tree_level height;
69498 +
69499 +       /*
69500 +        * this is cached here to avoid calling plugins through function
69501 +        * dereference all the time.
69502 +        */
69503 +       __u64 estimate_one_insert;
69504 +
69505 +       /* cache of recent tree lookup results */
69506 +       cbk_cache cbk_cache;
69507 +
69508 +       /* hash table to look up znodes by block number. */
69509 +       z_hash_table zhash_table;
69510 +       z_hash_table zfake_table;
69511 +       /* hash table to look up jnodes by inode and offset. */
69512 +       j_hash_table jhash_table;
69513 +
69514 +       /* lock protecting:
69515 +          - parent pointers,
69516 +          - sibling pointers,
69517 +          - znode hash table
69518 +          - coord cache
69519 +        */
69520 +       /* NOTE: The "giant" tree lock can be replaced by more spin locks,
69521 +          hoping they will be less contended. We can use one spin lock per one
69522 +          znode hash bucket.  With adding of some code complexity, sibling
69523 +          pointers can be protected by both znode spin locks.  Although it looks
69524 +          more SMP scalable, we should test this locking change on n-ways (n >
69525 +          4) SMP machines.  Current 4-ways machine test does not show that tree
69526 +          lock is contended and it is a bottleneck (2003.07.25). */
69527 +
69528 +       rwlock_t tree_lock;
69529 +
69530 +       /* lock protecting delimiting keys */
69531 +       rwlock_t dk_lock;
69532 +
69533 +       /* spin lock protecting znode_epoch */
69534 +       spinlock_t epoch_lock;
69535 +       /* version stamp used to mark znode updates. See seal.[ch] for more
69536 +        * information. */
69537 +       __u64 znode_epoch;
69538 +
69539 +       znode *uber;
69540 +       node_plugin *nplug;
69541 +       struct super_block *super;
69542 +       struct {
69543 +               /* carry flags used for insertion of new nodes */
69544 +               __u32 new_node_flags;
69545 +               /* carry flags used for insertion of new extents */
69546 +               __u32 new_extent_flags;
69547 +               /* carry flags used for paste operations */
69548 +               __u32 paste_flags;
69549 +               /* carry flags used for insert operations */
69550 +               __u32 insert_flags;
69551 +       } carry;
69552 +};
69553 +
69554 +extern int reiser4_init_tree(reiser4_tree * tree,
69555 +                            const reiser4_block_nr * root_block,
69556 +                            tree_level height, node_plugin * default_plugin);
69557 +extern void reiser4_done_tree(reiser4_tree * tree);
69558 +
69559 +/* cbk flags: options for coord_by_key() */
69560 +typedef enum {
69561 +       /* coord_by_key() is called for insertion. This is necessary because
69562 +          of extents being located at the twig level. For explanation, see
69563 +          comment just above is_next_item_internal().
69564 +        */
69565 +       CBK_FOR_INSERT = (1 << 0),
69566 +       /* coord_by_key() is called with key that is known to be unique */
69567 +       CBK_UNIQUE = (1 << 1),
69568 +       /* coord_by_key() can trust delimiting keys. This option is not user
69569 +          accessible. coord_by_key() will set it automatically. It will be
69570 +          only cleared by special-case in extents-on-the-twig-level handling
69571 +          where it is necessary to insert item with a key smaller than
69572 +          leftmost key in a node. This is necessary because of extents being
69573 +          located at the twig level. For explanation, see comment just above
69574 +          is_next_item_internal().
69575 +        */
69576 +       CBK_TRUST_DK = (1 << 2),
69577 +       CBK_READA = (1 << 3),   /* original: readahead leaves which contain items of certain file */
69578 +       CBK_READDIR_RA = (1 << 4),      /* readdir: readahead whole directory and all its stat datas */
69579 +       CBK_DKSET = (1 << 5),
69580 +       CBK_EXTENDED_COORD = (1 << 6),  /* coord_t is actually */
69581 +       CBK_IN_CACHE = (1 << 7),        /* node is already in cache */
69582 +       CBK_USE_CRABLOCK = (1 << 8)     /* use crab_lock instead of long term
69583 +                                        * lock */
69584 +} cbk_flags;
69585 +
69586 +/* insertion outcome. IBK = insert by key */
69587 +typedef enum {
69588 +       IBK_INSERT_OK = 0,
69589 +       IBK_ALREADY_EXISTS = -EEXIST,
69590 +       IBK_IO_ERROR = -EIO,
69591 +       IBK_NO_SPACE = -E_NODE_FULL,
69592 +       IBK_OOM = -ENOMEM
69593 +} insert_result;
69594 +
69595 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
69596 +
69597 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
69598 +                                    lock_handle * lh, void *arg);
69599 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
69600 +                               lock_handle * lh,
69601 +                               tree_iterate_actor_t actor, void *arg,
69602 +                               znode_lock_mode mode, int through_units_p);
69603 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
69604 +                         znode_lock_request pri, lock_handle * lh);
69605 +
69606 +/* return node plugin of @node; @node must be non-NULL and loaded (asserted) */
69607 +static inline node_plugin *node_plugin_by_node(const znode *
69608 +                                              node /* node to query */ )
69609 +{
69610 +       assert("vs-213", node != NULL);
69611 +       assert("vs-214", znode_is_loaded(node));
69612 +
69613 +       return node->nplug;
69614 +}
69615 +
69616 +/* number of items in @node, as stored in node->nr_items; @node must be loaded */
69617 +static inline pos_in_node_t node_num_items(const znode * node)
69618 +{
69619 +       assert("nikita-2754", znode_is_loaded(node));
69620 +       assert("nikita-2468",
69621 +              node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
69622 +
69623 +       return node->nr_items;
69624 +}
69625 +
69626 +/* Return the number of items in the node @coord is set to (coord->node).
69627 +   Asserts coord->node != NULL. */
69628 +static inline unsigned coord_num_items(const coord_t * coord)
69629 +{
69630 +       assert("jmacd-9805", coord->node != NULL);
69631 +
69632 +       return node_num_items(coord->node);
69633 +}
69634 +
69635 +/* true if @node contains no items */
69636 +static inline int node_is_empty(const znode * node)
69637 +{
69638 +       return node_num_items(node) == 0;
69639 +}
69640 +
69641 +typedef enum {
69642 +       SHIFTED_SOMETHING = 0,
69643 +       SHIFT_NO_SPACE = -E_NODE_FULL,
69644 +       SHIFT_IO_ERROR = -EIO,
69645 +       SHIFT_OOM = -ENOMEM,
69646 +} shift_result;
69647 +
69648 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
69649 +extern int is_coord_in_node(const coord_t * coord);
69650 +extern int key_in_node(const reiser4_key *, const coord_t *);
69651 +extern void coord_item_move_to(coord_t * coord, int items);
69652 +extern void coord_unit_move_to(coord_t * coord, int units);
69653 +
69654 +/* there are two types of repetitive accesses (ra): intra-syscall
69655 +   (local) and inter-syscall (global). Local ra is used when
69656 +   during single syscall we add/delete several items and units in the
69657 +   same place in a tree. Note that plan-A fragments local ra by
69658 +   separating stat-data and file body in key-space. Global ra is
69659 +   used when user does repetitive modifications in the same place in a
69660 +   tree.
69661 +
69662 +   Our ra implementation serves following purposes:
69663 +    1 it affects balancing decisions so that next operation in a row
69664 +      can be performed faster;
69665 +    2 it affects lower-level read-ahead in page-cache;
69666 +    3 it allows to avoid unnecessary lookups by maintaining some state
69667 +      across several operations (this is only for local ra);
69668 +    4 it leaves room for lazy-micro-balancing: when we start a sequence of
69669 +      operations they are performed without actually doing any intra-node
69670 +      shifts, until we finish sequence or scope of sequence leaves
69671 +      current node, only then we really pack node (local ra only).
69672 +*/
69673 +
69674 +/* another thing that can be useful is to keep per-tree and/or
69675 +   per-process cache of recent lookups. This cache can be organised as a
69676 +   list of block numbers of formatted nodes sorted by starting key in
69677 +   this node. Balancings should invalidate appropriate parts of this
69678 +   cache.
69679 +*/
69680 +
69681 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
69682 +                          coord_t * coord, lock_handle * handle,
69683 +                          znode_lock_mode lock, lookup_bias bias,
69684 +                          tree_level lock_level, tree_level stop_level,
69685 +                          __u32 flags, ra_info_t *);
69686 +
69687 +lookup_result reiser4_object_lookup(struct inode *object,
69688 +                                   const reiser4_key * key,
69689 +                                   coord_t * coord,
69690 +                                   lock_handle * lh,
69691 +                                   znode_lock_mode lock_mode,
69692 +                                   lookup_bias bias,
69693 +                                   tree_level lock_level,
69694 +                                   tree_level stop_level,
69695 +                                   __u32 flags, ra_info_t * info);
69696 +
69697 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
69698 +                           reiser4_item_data * data, coord_t * coord,
69699 +                           lock_handle * lh,
69700 +                           tree_level stop_level, __u32 flags);
69701 +insert_result insert_by_coord(coord_t * coord,
69702 +                             reiser4_item_data * data, const reiser4_key * key,
69703 +                             lock_handle * lh, __u32);
69704 +insert_result insert_extent_by_coord(coord_t * coord,
69705 +                                    reiser4_item_data * data,
69706 +                                    const reiser4_key * key, lock_handle * lh);
69707 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
69708 +                    const reiser4_key * to_key,
69709 +                    reiser4_key * smallest_removed);
69710 +int kill_node_content(coord_t * from, coord_t * to,
69711 +                     const reiser4_key * from_key, const reiser4_key * to_key,
69712 +                     reiser4_key * smallest_removed,
69713 +                     znode * locked_left_neighbor, struct inode *inode,
69714 +                     int truncate);
69715 +
69716 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
69717 +                       reiser4_key * key, lock_handle * lh, cop_insert_flag);
69718 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
69719 +                    reiser4_item_data * data, unsigned);
69720 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
69721 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
69722 +                      coord_t * result);
69723 +
69724 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
69725 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
69726 +
69727 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
69728 +
69729 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
69730 +                                 const reiser4_key *, reiser4_key *,
69731 +                                 struct inode *, int, int *);
69732 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
69733 +                                  const reiser4_key *, reiser4_key *,
69734 +                                  struct inode *, int, int *);
69735 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
69736 +                           const reiser4_key * to, struct inode *, int);
69737 +
69738 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
69739 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
69740 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
69741 +                             znode * left, coord_t * result);
69742 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
69743 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
69744 +                                    znode * child);
69745 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
69746 +                         int incore_p, int setup_dkeys_p);
69747 +
69748 +extern int cbk_cache_init(cbk_cache * cache);
69749 +extern void cbk_cache_done(cbk_cache * cache);
69750 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
69751 +
69752 +extern char *sprint_address(const reiser4_block_nr * block);
69753 +
69754 +#if REISER4_DEBUG /* debugging helpers: real functions only in debug builds */
69755 +extern void print_coord_content(const char *prefix, coord_t * p);
69756 +extern void reiser4_print_address(const char *prefix,
69757 +                       const reiser4_block_nr * block);
69758 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
69759 +                          __u32 flags);
69760 +extern void check_dkeys(znode *node);
69761 +#else /* NOTE(review): no stubs for print_tree_rec()/check_dkeys() here -- confirm callers are debug-only */
69762 +#define print_coord_content(p, c) noop
69763 +#define reiser4_print_address(p, b) noop
69764 +#endif /* REISER4_DEBUG */
69765 +
69766 +extern void forget_znode(lock_handle * handle);
69767 +extern int deallocate_znode(znode * node);
69768 +
69769 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
69770 +
69771 +/* struct used internally to pack all the numerous arguments of tree lookup.
69772 +    Used to avoid passing a lot of arguments to helper functions. */
69773 +typedef struct cbk_handle {
69774 +       /* tree we are in */
69775 +       reiser4_tree *tree;
69776 +       /* key we are going after */
69777 +       const reiser4_key *key;
69778 +       /* coord we will store result in */
69779 +       coord_t *coord;
69780 +       /* type of lock to take on target node */
69781 +       znode_lock_mode lock_mode;
69782 +       /* lookup bias. See comments at the declaration of lookup_bias */
69783 +       lookup_bias bias;
69784 +       /* lock level: level starting from which tree traversal starts taking
69785 +        * write locks. */
69786 +       tree_level lock_level;
69787 +       /* level where search will stop. Either item will be found between
69788 +          lock_level and stop_level, or CBK_COORD_NOTFOUND will be
69789 +          returned.
69790 +        */
69791 +       tree_level stop_level;
69792 +       /* level we are currently at */
69793 +       tree_level level;
69794 +       /* block number of @active node. Tree traversal operates on two
69795 +          nodes: active and parent.  */
69796 +       reiser4_block_nr block;
69797 +       /* error message, put here to be printed by caller */
69798 +       const char *error;
69799 +       /* result passed back to caller */
69800 +       lookup_result result;
69801 +       /* lock handles for active and parent */
69802 +       lock_handle *parent_lh;
69803 +       lock_handle *active_lh;
69804 +       reiser4_key ld_key; /* presumably left delimiting key -- TODO confirm */
69805 +       reiser4_key rd_key; /* presumably right delimiting key -- TODO confirm */
69806 +       /* flags, passed to the cbk routine. Bits of this bitmask are defined
69807 +          in tree.h:cbk_flags enum. */
69808 +       __u32 flags;
69809 +       ra_info_t *ra_info; /* NOTE(review): looks like the lookup's ra_info_t argument -- confirm */
69810 +       struct inode *object; /* NOTE(review): looks like reiser4_object_lookup()'s @object -- confirm */
69811 +} cbk_handle;
69812 +
69813 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
69814 +
69815 +/* eottl.c */
69816 +extern int handle_eottl(cbk_handle *h, int *outcome);
69817 +
69818 +int lookup_multikey(cbk_handle * handle, int nr_keys);
69819 +int lookup_couple(reiser4_tree * tree,
69820 +                 const reiser4_key * key1, const reiser4_key * key2,
69821 +                 coord_t * coord1, coord_t * coord2,
69822 +                 lock_handle * lh1, lock_handle * lh2,
69823 +                 znode_lock_mode lock_mode, lookup_bias bias,
69824 +                 tree_level lock_level, tree_level stop_level, __u32 flags,
69825 +                 int *result1, int *result2);
69826 +
69827 +static inline void read_lock_tree(reiser4_tree *tree)
69828 +{
69829 +       /* take tree_lock for reading; first check that tree is not locked */
69830 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
69831 +                   LOCK_CNT_NIL(read_locked_tree) &&
69832 +                   LOCK_CNT_NIL(write_locked_tree)));
69833 +       /* check that spinlocks of lower priorities (txnh, dk, stack) are not held */
69834 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
69835 +                   LOCK_CNT_NIL(rw_locked_dk) &&
69836 +                   LOCK_CNT_NIL(spin_locked_stack)));
69837 +
69838 +       read_lock(&(tree->tree_lock));
69839 +       /* account the lock in the counters; read_unlock_tree() undoes this */
69840 +       LOCK_CNT_INC(read_locked_tree);
69841 +       LOCK_CNT_INC(rw_locked_tree);
69842 +       LOCK_CNT_INC(spin_locked);
69843 +}
69844 +
69845 +static inline void read_unlock_tree(reiser4_tree *tree)
69846 +{
69847 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
69848 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
69849 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); /* NOTE(review): label reused */
69850 +       /* counterpart of read_lock_tree(): drop the counters, then the lock */
69851 +       LOCK_CNT_DEC(read_locked_tree);
69852 +       LOCK_CNT_DEC(rw_locked_tree);
69853 +       LOCK_CNT_DEC(spin_locked);
69854 +
69855 +       read_unlock(&(tree->tree_lock));
69856 +}
69857 +
69858 +static inline void write_lock_tree(reiser4_tree *tree)
69859 +{
69860 +       /* take tree_lock for writing; first check that tree is not locked */
69861 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
69862 +                   LOCK_CNT_NIL(read_locked_tree) &&
69863 +                   LOCK_CNT_NIL(write_locked_tree)));
69864 +       /* check that spinlocks of lower priorities (txnh, dk, stack) are not held */
69865 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
69866 +                   LOCK_CNT_NIL(rw_locked_dk) &&
69867 +                   LOCK_CNT_NIL(spin_locked_stack)));
69868 +
69869 +       write_lock(&(tree->tree_lock));
69870 +       /* account the lock in the counters; write_unlock_tree() undoes this */
69871 +       LOCK_CNT_INC(write_locked_tree);
69872 +       LOCK_CNT_INC(rw_locked_tree);
69873 +       LOCK_CNT_INC(spin_locked);
69874 +}
69875 +
69876 +static inline void write_unlock_tree(reiser4_tree *tree)
69877 +{
69878 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
69879 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
69880 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); /* NOTE(review): label reused */
69881 +       /* counterpart of write_lock_tree(): drop the counters, then the lock */
69882 +       LOCK_CNT_DEC(write_locked_tree);
69883 +       LOCK_CNT_DEC(rw_locked_tree);
69884 +       LOCK_CNT_DEC(spin_locked);
69885 +
69886 +       write_unlock(&(tree->tree_lock));
69887 +}
69888 +
69889 +static inline void read_lock_dk(reiser4_tree *tree)
69890 +{
69891 +       /* take dk_lock for reading; first check that dk is not locked */
69892 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
69893 +                   LOCK_CNT_NIL(read_locked_dk) &&
69894 +                   LOCK_CNT_NIL(write_locked_dk)));
69895 +       /* check that spinlocks of lower priorities (stack) are not held */
69896 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
69897 +
69898 +       read_lock(&((tree)->dk_lock));
69899 +       /* account the lock in the counters; read_unlock_dk() undoes this */
69900 +       LOCK_CNT_INC(read_locked_dk);
69901 +       LOCK_CNT_INC(rw_locked_dk);
69902 +       LOCK_CNT_INC(spin_locked);
69903 +}
69904 +
69905 +static inline void read_unlock_dk(reiser4_tree *tree)
69906 +{
69907 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
69908 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
69909 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); /* NOTE(review): label reused */
69910 +       /* counterpart of read_lock_dk(): drop the counters, then the lock */
69911 +       LOCK_CNT_DEC(read_locked_dk);
69912 +       LOCK_CNT_DEC(rw_locked_dk);
69913 +       LOCK_CNT_DEC(spin_locked);
69914 +
69915 +       read_unlock(&(tree->dk_lock));
69916 +}
69917 +
69918 +static inline void write_lock_dk(reiser4_tree *tree)
69919 +{
69920 +       /* take dk_lock for writing; first check that dk is not locked */
69921 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
69922 +                   LOCK_CNT_NIL(read_locked_dk) &&
69923 +                   LOCK_CNT_NIL(write_locked_dk)));
69924 +       /* check that spinlocks of lower priorities (stack) are not held */
69925 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
69926 +
69927 +       write_lock(&((tree)->dk_lock));
69928 +       /* account the lock in the counters; write_unlock_dk() undoes this */
69929 +       LOCK_CNT_INC(write_locked_dk);
69930 +       LOCK_CNT_INC(rw_locked_dk);
69931 +       LOCK_CNT_INC(spin_locked);
69932 +}
69933 +
69934 +static inline void write_unlock_dk(reiser4_tree *tree)
69935 +{
69936 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
69937 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
69938 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked)); /* NOTE(review): label reused */
69939 +       /* counterpart of write_lock_dk(): drop the counters, then the lock */
69940 +       LOCK_CNT_DEC(write_locked_dk);
69941 +       LOCK_CNT_DEC(rw_locked_dk);
69942 +       LOCK_CNT_DEC(spin_locked);
69943 +
69944 +       write_unlock(&(tree->dk_lock));
69945 +}
69946 +
69947 +/* estimate api. Implementation is in estimate.c */
69948 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
69949 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
69950 +reiser4_block_nr estimate_insert_flow(tree_level);
69951 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
69952 +reiser4_block_nr calc_estimate_one_insert(tree_level);
69953 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
69954 +reiser4_block_nr estimate_insert_cluster(struct inode *);
69955 +reiser4_block_nr estimate_update_cluster(struct inode *);
69956 +
69957 +/* __REISER4_TREE_H__ */
69958 +#endif
69959 +
69960 +/* Make Linus happy.
69961 +   Local variables:
69962 +   c-indentation-style: "K&R"
69963 +   mode-name: "LC"
69964 +   c-basic-offset: 8
69965 +   tab-width: 8
69966 +   fill-column: 120
69967 +   scroll-step: 1
69968 +   End:
69969 +*/
69970 diff -puN /dev/null fs/reiser4/tree_mod.c
69971 --- /dev/null
69972 +++ a/fs/reiser4/tree_mod.c
69973 @@ -0,0 +1,386 @@
69974 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69975 + * reiser4/README */
69976 +
69977 +/*
69978 + * Functions to add/delete new nodes to/from the tree.
69979 + *
69980 + * Functions from this file are used by carry (see carry*) to handle:
69981 + *
69982 + *     . insertion of new formatted node into tree
69983 + *
69984 + *     . addition of new tree root, increasing tree height
69985 + *
69986 + *     . removing tree root, decreasing tree height
69987 + *
69988 + */
69989 +
69990 +#include "forward.h"
69991 +#include "debug.h"
69992 +#include "dformat.h"
69993 +#include "key.h"
69994 +#include "coord.h"
69995 +#include "plugin/plugin.h"
69996 +#include "jnode.h"
69997 +#include "znode.h"
69998 +#include "tree_mod.h"
69999 +#include "block_alloc.h"
70000 +#include "tree_walk.h"
70001 +#include "tree.h"
70002 +#include "super.h"
70003 +
70004 +#include <linux/err.h>
70005 +
70006 +static int add_child_ptr(znode * parent, znode * child);
70007 +/* warn unless @error is -E_REPEAT; do/while(0) keeps it a single statement */
70008 +#define ewarning( error, ... )                 \
70009 +       do { if( ( error ) != -E_REPEAT )       \
70010 +               warning( __VA_ARGS__ ); } while (0)
70011 +
70012 +/* allocate new formatted node on the @level; returns znode or ERR_PTR(). */
70013 +znode * reiser4_new_node(znode * brother /* existing left neighbor
70014 +                                         *  of new node */,
70015 +                        tree_level level /* tree level at which new node is to
70016 +                                          * be allocated */)
70017 +{
70018 +       znode *result;
70019 +       int retcode;
70020 +       reiser4_block_nr blocknr;
70021 +
70022 +       assert("nikita-930", brother != NULL);
70023 +       assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
70024 +       /* NOTE(review): @brother is only used to find the tree; no sibling linking here */
70025 +       retcode = assign_fake_blocknr_formatted(&blocknr);
70026 +       if (retcode == 0) {
70027 +               result =
70028 +                   zget(znode_get_tree(brother), &blocknr, NULL, level,
70029 +                        reiser4_ctx_gfp_mask_get());
70030 +               if (IS_ERR(result)) {
70031 +                       ewarning(PTR_ERR(result), "nikita-929",
70032 +                                "Cannot allocate znode for carry: %li",
70033 +                                PTR_ERR(result));
70034 +                       return result;
70035 +               }
70036 +               /* cheap test, can be executed even when debugging is off */
70037 +               if (!znode_just_created(result)) {
70038 +                       warning("nikita-2213",
70039 +                               "Allocated already existing block: %llu",
70040 +                               (unsigned long long)blocknr);
70041 +                       zput(result);
70042 +                       return ERR_PTR(RETERR(-EIO));
70043 +               }
70044 +               /* new node uses the node plugin of the tree */
70045 +               assert("nikita-931", result != NULL);
70046 +               result->nplug = znode_get_tree(brother)->nplug;
70047 +               assert("nikita-933", result->nplug != NULL);
70048 +               /* on success the node is returned after zrelse(), without zput() */
70049 +               retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
70050 +               if (retcode == 0) {
70051 +                       ZF_SET(result, JNODE_CREATED);
70052 +                       zrelse(result);
70053 +               } else {
70054 +                       zput(result);
70055 +                       result = ERR_PTR(retcode);
70056 +               }
70057 +       } else {
70058 +               /* failure to allocate new node during balancing.
70059 +                  This should never happen. Ever. Returning -E_REPEAT
70060 +                  is not a viable solution, because "out of disk space"
70061 +                  is not a transient error that will go away by itself.
70062 +                */
70063 +               ewarning(retcode, "nikita-928",
70064 +                        "Cannot allocate block for carry: %i", retcode);
70065 +               result = ERR_PTR(retcode);
70066 +       }
70067 +       assert("nikita-1071", result != NULL);
70068 +       return result;
70069 +}
70070 +
70071 +/* allocate new root and add it to the tree, increasing tree height by one
70072 +
70073 +   This helper function is called by add_new_root(). Returns the new root,
70074 +   or an ERR_PTR() value on failure.
70075 +*/
70076 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
70077 +                    znode * fake /* "fake" znode */ )
70078 +{
70079 +       reiser4_tree *tree = znode_get_tree(old_root); /* NOTE(review): precedes the NULL assert on @old_root */
70080 +       znode *new_root = NULL; /* to shut gcc up */
70081 +       int result;
70082 +
70083 +       assert("nikita-1069", old_root != NULL);
70084 +       assert("umka-262", fake != NULL);
70085 +       assert("umka-263", tree != NULL);
70086 +
70087 +       /* "fake" znode---one always hanging just above current root. This
70088 +          node is locked when new root is created or existing root is
70089 +          deleted. Downward tree traversal takes lock on it before taking
70090 +          lock on a root node. This avoids race conditions with root
70091 +          manipulations.
70092 +
70093 +        */
70094 +       assert("nikita-1348", znode_above_root(fake));
70095 +       assert("nikita-1211", znode_is_root(old_root));
70096 +
70097 +       result = 0;
70098 +       if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
70099 +               warning("nikita-1344", "Tree is too tall: %i", tree->height);
70100 +               /* ext2 returns -ENOSPC when it runs out of free inodes with a
70101 +                  following comment (fs/ext2/ialloc.c:441): Is it really
70102 +                  ENOSPC?
70103 +
70104 +                  -EXFULL? -EINVAL?
70105 +                */
70106 +               result = RETERR(-ENOSPC);
70107 +       } else {
70108 +               /* Allocate block for new root. It's not that
70109 +                  important where it will be allocated, as root is
70110 +                  almost always in memory. Moreover, allocate on
70111 +                  flush can be going here.
70112 +                */
70113 +               assert("nikita-1448", znode_is_root(old_root));
70114 +               new_root = reiser4_new_node(fake, tree->height + 1);
70115 +               if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
70116 +                       lock_handle rlh;
70117 +
70118 +                       init_lh(&rlh);
70119 +                       result =
70120 +                           longterm_lock_znode(&rlh, new_root,
70121 +                                               ZNODE_WRITE_LOCK,
70122 +                                               ZNODE_LOCK_LOPRI);
70123 +                       if (result == 0) {
70124 +                               parent_coord_t *in_parent;
70125 +
70126 +                               znode_make_dirty(fake);
70127 +
70128 +                               /* new root is a child of "fake" node */
70129 +                               write_lock_tree(tree);
70130 +
70131 +                               ++tree->height;
70132 +
70133 +                               /* recalculate max balance overhead */
70134 +                               tree->estimate_one_insert =
70135 +                                   estimate_one_insert_item(tree);
70136 +
70137 +                               tree->root_block = *znode_get_block(new_root);
70138 +                               in_parent = &new_root->in_parent;
70139 +                               init_parent_coord(in_parent, fake);
70140 +                               /* manually insert new root into sibling
70141 +                                * list. With this all nodes involved in
70142 +                                * balancing are connected after balancing is
70143 +                                * done---useful invariant to check. */
70144 +                               sibling_list_insert_nolock(new_root, NULL);
70145 +                               write_unlock_tree(tree);
70146 +
70147 +                               /* insert into new root pointer to the
70148 +                                  @old_root. */
70149 +                               assert("nikita-1110",
70150 +                                      WITH_DATA(new_root,
70151 +                                                node_is_empty(new_root)));
70152 +                               write_lock_dk(tree);
70153 +                               znode_set_ld_key(new_root, reiser4_min_key());
70154 +                               znode_set_rd_key(new_root, reiser4_max_key());
70155 +                               write_unlock_dk(tree);
70156 +                               if (REISER4_DEBUG) {
70157 +                                       ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
70158 +                                       ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
70159 +                                       ZF_SET(old_root, JNODE_ORPHAN);
70160 +                               }
70161 +                               result = add_child_ptr(new_root, old_root);
70162 +                               done_lh(&rlh);
70163 +                       }
70164 +                       zrelse(new_root);
70165 +               }
70166 +       }
70167 +       if (result != 0)
70168 +               new_root = ERR_PTR(result); /* NOTE(review): node pointer overwritten without zput() here -- confirm */
70169 +       return new_root;
70170 +}
70171 +
70172 +/* fill in &reiser4_item_data describing a child pointer
70173 +
70174 +   On return @data describes a node pointer item referring to @child; it can
70175 +   later be used to insert that pointer into the parent of @child.
70176 +*/
70177 +void build_child_ptr_data(znode * child        /* node, pointer to which is
70178 +                                        * going to be inserted */ ,
70179 +                         reiser4_item_data * data /* result is stored here */ )
70180 +{
70181 +       const reiser4_block_nr *blocknr;
70182 +
70183 +       assert("nikita-1116", child != NULL);
70184 +       assert("nikita-1117", data != NULL);
70185 +
70186 +       /*
70187 +        * NOTE: item body is the child's block number itself, taken by
70188 +        * address, so it gets into on-disk structure in cpu byte order.
70189 +        * internal's create_hook converts it to little endian byte order.
70190 +        */
70191 +       blocknr = znode_get_block(child);
70192 +       data->data = (char *)blocknr;
70193 +       data->length = sizeof *blocknr;
70194 +       /* data->data points into kernel space, not user space */
70195 +       data->user = 0;
70196 +       /* FIXME-VS: hardcoded internal item? */
70197 +       /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
70198 +       data->iplug = item_plugin_by_id(NODE_POINTER_ID);
70199 +}
70200 +
70201 +/* add pointer to @child into empty @parent.
70202 +   Returns 0, or the error from zload() or from the create_item() method.
70203 +   This is used when pointer to old root is inserted into new root which is
70204 +   empty.
70205 +*/
70206 +static int add_child_ptr(znode * parent, znode * child)
70207 +{
70208 +       coord_t coord;
70209 +       reiser4_item_data data;
70210 +       int result;
70211 +       reiser4_key key;
70212 +
70213 +       assert("nikita-1111", parent != NULL);
70214 +       assert("nikita-1112", child != NULL);
70215 +       assert("nikita-1115",
70216 +              znode_get_level(parent) == znode_get_level(child) + 1);
70217 +       /* @parent stays loaded until the zrelse() at the end */
70218 +       result = zload(parent);
70219 +       if (result != 0)
70220 +               return result;
70221 +       assert("nikita-1113", node_is_empty(parent));
70222 +       coord_init_first_unit(&coord, parent);
70223 +
70224 +       build_child_ptr_data(child, &data);
70225 +       data.arg = NULL;
70226 +       /* copy @child's ld key under the dk read lock */
70227 +       read_lock_dk(znode_get_tree(parent));
70228 +       key = *znode_get_ld_key(child);
70229 +       read_unlock_dk(znode_get_tree(parent));
70230 +       /* @parent is marked dirty below even when create_item() fails */
70231 +       result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
70232 +                                                         NULL);
70233 +       znode_make_dirty(parent);
70234 +       zrelse(parent);
70235 +       return result;
70236 +}
70237 +
70238 +/* actually remove tree root; returns 0 or error from get_uber_znode()/init() */
70239 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
70240 +                                                 * being removed */,
70241 +                            znode * old_root /* root node that is being
70242 +                                              * removed */ ,
70243 +                            znode * new_root   /* new root---sole child of
70244 +                                                * @old_root */,
70245 +                    const reiser4_block_nr * new_root_blk /* disk address of
70246 +                                                           * @new_root */)
70247 +{
70248 +       znode *uber;
70249 +       int result;
70250 +       lock_handle handle_for_uber;
70251 +
70252 +       assert("umka-265", tree != NULL);
70253 +       assert("nikita-1198", new_root != NULL);
70254 +       assert("nikita-1199",
70255 +              znode_get_level(new_root) + 1 == znode_get_level(old_root));
70256 +
70257 +       assert("nikita-1201", znode_is_write_locked(old_root));
70258 +
70259 +       assert("nikita-1203",
70260 +              disk_addr_eq(new_root_blk, znode_get_block(new_root)));
70261 +
70262 +       init_lh(&handle_for_uber);
70263 +       /* obtain and lock "fake" znode protecting changes in tree height. */
70264 +       result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
70265 +                               &handle_for_uber);
70266 +       if (result == 0) {
70267 +               uber = handle_for_uber.node;
70268 +
70269 +               znode_make_dirty(uber);
70270 +
70271 +               /* don't take long term lock on @new_root. Take spinlock. */
70272 +
70273 +               write_lock_tree(tree);
70274 +
70275 +               tree->root_block = *new_root_blk;
70276 +               --tree->height;
70277 +
70278 +               /* recalculate max balance overhead */
70279 +               tree->estimate_one_insert = estimate_one_insert_item(tree);
70280 +
70281 +               assert("nikita-1202",
70282 +                      tree->height == znode_get_level(new_root));
70283 +
70284 +               /* new root is child of "fake" node */
70285 +               init_parent_coord(&new_root->in_parent, uber);
70286 +               ++uber->c_count;
70287 +
70288 +               /* sibling_list_insert_nolock(new_root, NULL); */
70289 +               write_unlock_tree(tree);
70290 +
70291 +               /* reinitialise old root; it is marked dirty even if init fails */
70292 +               result = node_plugin_by_node(old_root)->init(old_root);
70293 +               znode_make_dirty(old_root);
70294 +               if (result == 0) {
70295 +                       assert("nikita-1279", node_is_empty(old_root));
70296 +                       ZF_SET(old_root, JNODE_HEARD_BANSHEE);
70297 +                       old_root->c_count = 0;
70298 +               }
70299 +       }
70300 +       done_lh(&handle_for_uber);
70301 +
70302 +       return result;
70303 +}
70304 +
70305 +/* remove tree root, decreasing tree height by one
70306 +
70307 +   On entry @old_root is the write-locked tree root holding exactly one item:
70308 +   the pointer to its only child, which becomes the new root (the asserts
70309 +   below check this).
70310 +
70311 +   Changes of tree height are protected by the lock on the special "fake"
70312 +   znode; that lock is taken in reiser4_kill_root(). See comments in
70313 +   reiser4_add_tree_root() for more on this.
70314 +
70315 +   The work is split in two: this outer function looks up the child znode and
70316 +   collects the arguments, reiser4_kill_root() does the actual job, including
70317 +   the update of the new root's parent pointer.
70318 +
70319 +   Returns result of reiser4_kill_root(), or error from child_znode().
70320 +*/
70321 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
70322 +                                              removing */)
70323 +{
70324 +       reiser4_tree *tree;
70325 +       znode *child;
70326 +       coord_t link;
70327 +       int ret;
70328 +
70329 +       assert("umka-266", current_tree != NULL);
70330 +       assert("nikita-1194", old_root != NULL);
70331 +       assert("nikita-1196", znode_is_root(old_root));
70332 +       assert("nikita-1200", node_num_items(old_root) == 1);
70333 +       assert("nikita-1401", znode_is_write_locked(old_root));
70334 +
70335 +       /* the only item of the root points down to the future root */
70336 +       coord_init_first_unit(&link, old_root);
70337 +       tree = znode_get_tree(old_root);
70338 +       child = child_znode(&link, old_root, 0, 1);
70339 +       if (IS_ERR(child))
70340 +               return PTR_ERR(child);
70341 +
70342 +       ret = reiser4_kill_root(tree, old_root, child,
70343 +                               znode_get_block(child));
70344 +       /* done with the child znode */
70345 +       zput(child);
70346 +
70347 +       return ret;
70348 +}
70349 +
70350 +/* Make Linus happy.
70351 +   Local variables:
70352 +   c-indentation-style: "K&R"
70353 +   mode-name: "LC"
70354 +   c-basic-offset: 8
70355 +   tab-width: 8
70356 +   fill-column: 120
70357 +   scroll-step: 1
70358 +   End:
70359 +*/
70360 diff -puN /dev/null fs/reiser4/tree_mod.h
70361 --- /dev/null
70362 +++ a/fs/reiser4/tree_mod.h
70363 @@ -0,0 +1,29 @@
70364 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70365 + * reiser4/README */
70366 +
70367 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
70368 + * comments. */
70369 +
70370 +#if !defined( __REISER4_TREE_MOD_H__ )
70371 +#define __REISER4_TREE_MOD_H__
70372 +
70373 +#include "forward.h"
70374 +
70375 +znode *reiser4_new_node(znode * brother, tree_level level);
70376 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
70377 +int reiser4_kill_tree_root(znode * old_root);
70378 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
70379 +
70380 +/* __REISER4_TREE_MOD_H__ */
70381 +#endif
70382 +
70383 +/* Make Linus happy.
70384 +   Local variables:
70385 +   c-indentation-style: "K&R"
70386 +   mode-name: "LC"
70387 +   c-basic-offset: 8
70388 +   tab-width: 8
70389 +   fill-column: 120
70390 +   scroll-step: 1
70391 +   End:
70392 +*/
70393 diff -puN /dev/null fs/reiser4/tree_walk.c
70394 --- /dev/null
70395 +++ a/fs/reiser4/tree_walk.c
70396 @@ -0,0 +1,927 @@
70397 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70398 + * reiser4/README */
70399 +
70400 +/* Routines and macros to:
70401 +
70402 +   get_left_neighbor()
70403 +
70404 +   get_right_neighbor()
70405 +
70406 +   get_parent()
70407 +
70408 +   get_first_child()
70409 +
70410 +   get_last_child()
70411 +
70412 +   various routines to walk the whole tree and do things to it like
70413 +   repack it, or move it to tertiary storage.  Please make them as
70414 +   generic as is reasonable.
70415 +
70416 +*/
70417 +
70418 +#include "forward.h"
70419 +#include "debug.h"
70420 +#include "dformat.h"
70421 +#include "coord.h"
70422 +#include "plugin/item/item.h"
70423 +#include "jnode.h"
70424 +#include "znode.h"
70425 +#include "tree_walk.h"
70426 +#include "tree.h"
70427 +#include "super.h"
70428 +
70429 +/* These macros are used internally in tree_walk.c: a znode pointer field
70430 +   (parent, left or right) is named by its byte offset within znode, so that
70431 +   lock_neighbor() can serve lock_parent(), lock_right_neighbor, lock_left_neighbor */
70432 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
70433 +#define FIELD_OFFSET(name)  offsetof(znode, name)
70434 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
70435 +#define LEFT_PTR_OFFSET   FIELD_OFFSET(left)
70436 +#define RIGHT_PTR_OFFSET  FIELD_OFFSET(right)
70437 +
70438 +/* Generic procedure to find and long-term lock a "neighbor" of @node: its
70439 +    left sibling, right sibling or parent. The algorithm is the same in all
70440 +    three cases; only the znode pointer field that is followed differs, and it
70441 +    is selected by @ptr_offset, the byte offset of that field within znode.
70442 +    Must be called with the tree lock held; the lock is dropped and re-taken
70443 +    around the long-term lock request and is held again on every return. */
70444 +static int lock_neighbor(
70445 +                               /* resulting lock handle */
70446 +                               lock_handle * result,
70447 +                               /* znode whose neighbor (or parent) gets locked */
70448 +                               znode * node,
70449 +                               /* pointer to neighbor (or parent) znode field offset, in bytes from
70450 +                                  the base address of znode structure  */
70451 +                               int ptr_offset,
70452 +                               /* lock mode for longterm_lock_znode call */
70453 +                               znode_lock_mode mode,
70454 +                               /* lock request for longterm_lock_znode call */
70455 +                               znode_lock_request req,
70456 +                               /* GN_* flags; @rlocked: tree lock is held for read, not write */
70457 +                               int flags, int rlocked)
70458 +{
70459 +       reiser4_tree *tree = znode_get_tree(node);
70460 +       znode *neighbor;
70461 +       int ret;
70462 +
70463 +       assert("umka-236", node != NULL);
70464 +       assert("umka-237", tree != NULL);
70465 +       assert_rw_locked(&(tree->tree_lock));
70466 +
70467 +       if (flags & GN_TRY_LOCK)
70468 +               req |= ZNODE_LOCK_NONBLOCK;
70469 +       if (flags & GN_SAME_ATOM)
70470 +               req |= ZNODE_LOCK_DONT_FUSE;
70471 +
70472 +       /* get the neighbor's address through the link pointer; leave the
70473 +          loop (and return) if the link is not available. */
70474 +       while (1) {
70475 +               neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
70476 +
70477 +               /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
70478 +                * node pointed by it is not connected.
70479 +                *
70480 +                * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
70481 +                * check and allows passing reference to not connected znode to
70482 +                * subsequent longterm_lock_znode() call.  This kills possible
70483 +                * busy loop if we are trying to get longterm lock on locked but
70484 +                * not yet connected parent node. */
70485 +               if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
70486 +                                         || znode_is_connected(neighbor))) {
70487 +                       return RETERR(-E_NO_NEIGHBOR);
70488 +               }
70489 +
70490 +               /* protect it from deletion while the tree lock is dropped. */
70491 +               zref(neighbor);
70492 +
70493 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
70494 +
70495 +               ret = longterm_lock_znode(result, neighbor, mode, req);
70496 +
70497 +               /* The lock handle obtains its own reference, release the one from above. */
70498 +               zput(neighbor);
70499 +
70500 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
70501 +
70502 +               /* restart if the node we got a reference to is being
70503 +                  invalidated; we should not get a reference to this node
70504 +                  again. */
70505 +               if (ret == -EINVAL)
70506 +                       continue;
70507 +               if (ret)
70508 +                       return ret;
70509 +
70510 +               /* check if neighbor link still points to just locked znode;
70511 +                  the link could have been changed while the process slept. */
70512 +               if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
70513 +                       return 0;
70514 +
70515 +               /* znode was locked by mistake; unlock it and restart locking
70516 +                  process from beginning. */
70517 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
70518 +               longterm_unlock_znode(result);
70519 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
70520 +       }
70521 +}
70522 +
70523 +/* Lock the parent of @node with a long-term lock; accepts GN_* flags. */
70524 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
70525 +                            znode * node /* child node */ ,
70526 +                            znode_lock_mode mode
70527 +                            /* type of lock: read or write */ ,
70528 +                            int flags /* GN_* flags */ )
70529 +{
70530 +       int result;
70531 +
70532 +       read_lock_tree(znode_get_tree(node));
70533 +       result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
70534 +                              ZNODE_LOCK_HIPRI, flags, 1);
70535 +       read_unlock_tree(znode_get_tree(node));
70536 +       return result;
70537 +}
70538 +
70539 +/* Wrapper that locks the left neighbor if GN_GO_LEFT is set in @flags (low
70540 +   priority request) and the right neighbor otherwise (high priority). */
70541 +/* Audited by: umka (2002.06.14) */
70542 +static inline int
70543 +lock_side_neighbor(lock_handle * result,
70544 +                  znode * node, znode_lock_mode mode, int flags, int rlocked)
70545 +{
70546 +       int ret;
70547 +       int ptr_offset;
70548 +       znode_lock_request req;
70549 +
70550 +       if (flags & GN_GO_LEFT) {
70551 +               ptr_offset = LEFT_PTR_OFFSET;
70552 +               req = ZNODE_LOCK_LOPRI;
70553 +       } else {
70554 +               ptr_offset = RIGHT_PTR_OFFSET;
70555 +               req = ZNODE_LOCK_HIPRI;
70556 +       }
70557 +
70558 +       ret =
70559 +           lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
70560 +
70561 +       if (ret == -E_NO_NEIGHBOR)      /* when walking left or right, -E_NO_NEIGHBOR
70562 +                                        * does not guarantee that the neighbor is
70563 +                                        * absent from the tree; we return -ENOENT
70564 +                                        * instead, meaning the neighbor was at
70565 +                                        * least not found in cache */
70566 +               return RETERR(-ENOENT);
70567 +
70568 +       return ret;
70569 +}
70570 +
70571 +#if REISER4_DEBUG
70572 +/* Debug-only: assert that sibling links around @node are consistent; returns 1. */
70573 +int check_sibling_list(znode * node)
70574 +{
70575 +       znode *scan;
70576 +       znode *next;
70577 +
70578 +       assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
70579 +
70580 +       if (node == NULL)
70581 +               return 1;
70582 +
70583 +       if (ZF_ISSET(node, JNODE_RIP))
70584 +               return 1;
70585 +
70586 +       assert("nikita-3270", node != NULL);
70587 +       assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
70588 +
70589 +       for (scan = node; znode_is_left_connected(scan); scan = next) {
70590 +               next = scan->left;
70591 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
70592 +                       assert("nikita-3271", znode_is_right_connected(next));
70593 +                       assert("nikita-3272", next->right == scan);
70594 +               } else
70595 +                       break;
70596 +       }
70597 +       for (scan = node; znode_is_right_connected(scan); scan = next) {
70598 +               next = scan->right;
70599 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
70600 +                       assert("nikita-3273", znode_is_left_connected(next));
70601 +                       assert("nikita-3274", next->left == scan);
70602 +               } else
70603 +                       break;
70604 +       }
70605 +       return 1;
70606 +}
70607 +
70608 +#endif /* REISER4_DEBUG */
70609 +
70610 +/* Znode sibling pointers maintenance. */
70611 +
70612 +/* Znode sibling pointers are established between any neighboring nodes which
70613 +   are in cache.  There are two znode state bits (JNODE_LEFT_CONNECTED,
70614 +   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
70615 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
70616 +
70617 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
70618 +   take care of searching for znode neighbors (a hash table lookup may be
70619 +   required), establishing sibling pointers between them and setting the
70620 +   JNODE_*_CONNECTED state bits. */
70621 +
70622 +/* Adjust sibling pointers and `connected' state bits for two neighbors;
70623 +   also works if one of the neighbors is NULL (was not found). */
70624 +
70625 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
70626 +void link_left_and_right(znode * left, znode * right)
70627 +{
70628 +       assert("nikita-3275", check_sibling_list(left));
70629 +       assert("nikita-3275", check_sibling_list(right));
70630 +
70631 +       if (left != NULL) {
70632 +               if (left->right == NULL) {
70633 +                       left->right = right;
70634 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
70635 +
70636 +                       ON_DEBUG(left->right_version =
70637 +                                atomic_inc_return(&delim_key_version);
70638 +                           );
70639 +
70640 +               } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
70641 +                          && left->right != right) {
70642 +
70643 +                       ON_DEBUG(left->right->left_version =
70644 +                                atomic_inc_return(&delim_key_version);
70645 +                                left->right_version =
70646 +                                atomic_inc_return(&delim_key_version););
70647 +
70648 +                       left->right->left = NULL;
70649 +                       left->right = right;
70650 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
70651 +               } else
70652 +                       /*
70653 +                        * There is a race condition in renew_sibling_link(),
70654 +                        * and the assertions below check that it is the only
70655 +                        * one. Thread T1 calls renew_sibling_link() and
70656 +                        * zlook() doesn't find the neighbor node (NOTE(review):
70657 +                        * zlook() is the GN_NO_ALLOC path there), but before
70658 +                        * T1 gets to link_left_and_right(), another thread T2
70659 +                        * creates the neighbor node and connects it. The
70660 +                        * check for left->right == NULL above protects T1
70661 +                        * from overwriting the correct left->right pointer
70662 +                        * installed by T2.
70663 +                        */
70664 +                       assert("nikita-3302",
70665 +                              right == NULL || left->right == right);
70666 +       }
70667 +       if (right != NULL) {
70668 +               if (right->left == NULL) {
70669 +                       right->left = left;
70670 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
70671 +
70672 +                       ON_DEBUG(right->left_version =
70673 +                                atomic_inc_return(&delim_key_version);
70674 +                           );
70675 +
70676 +               } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
70677 +                          && right->left != left) {
70678 +
70679 +                       ON_DEBUG(right->left->right_version =
70680 +                                atomic_inc_return(&delim_key_version);
70681 +                                right->left_version =
70682 +                                atomic_inc_return(&delim_key_version););
70683 +
70684 +                       right->left->right = NULL;
70685 +                       right->left = left;
70686 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
70687 +
70688 +               } else
70689 +                       assert("nikita-3303",
70690 +                              left == NULL || right->left == left);
70691 +       }
70692 +       assert("nikita-3275", check_sibling_list(left));
70693 +       assert("nikita-3275", check_sibling_list(right));
70694 +}
70695 +
70696 +/* Link @second to the left (if @to_left) or right of @first. Audited by: umka (2002.06.14) */
70697 +static void link_znodes(znode * first, znode * second, int to_left)
70698 +{
70699 +       if (to_left)
70700 +               link_left_and_right(second, first);
70701 +       else
70702 +               link_left_and_right(first, second);
70703 +}
70704 +
70705 +/* Move @coord to the next unit in horizontal direction (to the left if
70706 +   GN_GO_LEFT is set in @flags, to the right otherwise), even across a node
70707 +   boundary. Should be called under the write tree lock; it protects the
70708 +   nonexistence of the sibling link on the parent level if
70709 +   lock_side_neighbor() fails with -ENOENT. */
70710 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
70711 +{
70712 +       int ret;
70713 +       znode *node;
70714 +       reiser4_tree *tree;
70715 +
70716 +       assert("umka-243", coord != NULL);
70717 +       assert("umka-244", handle != NULL);
70718 +       assert("zam-1069", handle->node == NULL);
70719 +
70720 +       ret =
70721 +           (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
70722 +           coord_next_unit(coord);
70723 +       if (!ret)
70724 +               return 0;
70725 +
70726 +       ret =
70727 +           lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
70728 +       if (ret)
70729 +               return ret;
70730 +
70731 +       node = handle->node;
70732 +       tree = znode_get_tree(node);
70733 +       write_unlock_tree(tree);
70734 +
70735 +       coord_init_zero(coord);
70736 +
70737 +       /* Avoid a synchronous read here if GN_ASYNC is specified. */
70738 +       if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
70739 +               ret = jstartio(ZJNODE(handle->node));
70740 +               if (!ret)
70741 +                       ret = -E_REPEAT;
70742 +               goto error_locked;
70743 +       }
70744 +
70745 +       /* the corresponding zrelse() must be called by the clients of
70746 +          far_next_coord(), at the place where this node gets unlocked. */
70747 +       ret = zload(handle->node);
70748 +       if (ret)
70749 +               goto error_locked;
70750 +
70751 +       if (flags & GN_GO_LEFT)
70752 +               coord_init_last_unit(coord, node);
70753 +       else
70754 +               coord_init_first_unit(coord, node);
70755 +
70756 +       if (0) {
70757 +             error_locked:
70758 +               longterm_unlock_znode(handle);
70759 +       }
70760 +       write_lock_tree(tree);
70761 +       return ret;
70762 +}
70763 +
70764 +/* Very significant function which performs a step in horizontal direction
70765 +   when the sibling pointer is not available.  Actually, it is the only
70766 +   function which does it.
70767 +   Note: this function does not restore the locking status at exit; the
70768 +   caller must take care of proper unlocking and zrelse-ing. */
70769 +static int
70770 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
70771 +                  tree_level level, int flags, int *nr_locked)
70772 +{
70773 +       int ret;
70774 +       int to_left = flags & GN_GO_LEFT;
70775 +       reiser4_block_nr da;
70776 +       /* parent of the neighbor node; it starts as coord->node and is changed
70777 +          once far_next_coord() has locked a different node */
70778 +       znode *side_parent = coord->node;
70779 +       reiser4_tree *tree = znode_get_tree(child);
70780 +       znode *neighbor = NULL;
70781 +
70782 +       assert("umka-245", coord != NULL);
70783 +       assert("umka-246", handle != NULL);
70784 +       assert("umka-247", child != NULL);
70785 +       assert("umka-303", tree != NULL);
70786 +
70787 +       init_lh(handle);
70788 +       write_lock_tree(tree);
70789 +       ret = far_next_coord(coord, handle, flags);
70790 +
70791 +       if (ret) {
70792 +               if (ret != -ENOENT) {
70793 +                       write_unlock_tree(tree);
70794 +                       return ret;
70795 +               }
70796 +       } else {
70797 +               item_plugin *iplug;
70798 +
70799 +               if (handle->node != NULL) {
70800 +                       (*nr_locked)++;
70801 +                       side_parent = handle->node;
70802 +               }
70803 +
70804 +               /* does the coord point to an internal item? We do not
70805 +                  support sibling pointers between znodes for formatted and
70806 +                  unformatted nodes and return -E_NO_NEIGHBOR in that case. */
70807 +               iplug = item_plugin_by_coord(coord);
70808 +               if (!item_is_internal(coord)) {
70809 +                       link_znodes(child, NULL, to_left);
70810 +                       write_unlock_tree(tree);
70811 +                       /* we know there can't be formatted neighbor */
70812 +                       return RETERR(-E_NO_NEIGHBOR);
70813 +               }
70814 +               write_unlock_tree(tree);
70815 +
70816 +               iplug->s.internal.down_link(coord, NULL, &da);
70817 +
70818 +               if (flags & GN_NO_ALLOC) {
70819 +                       neighbor = zlook(tree, &da);
70820 +               } else {
70821 +                       neighbor =
70822 +                           zget(tree, &da, side_parent, level,
70823 +                                reiser4_ctx_gfp_mask_get());
70824 +               }
70825 +
70826 +               if (IS_ERR(neighbor)) {
70827 +                       ret = PTR_ERR(neighbor);
70828 +                       return ret;
70829 +               }
70830 +
70831 +               if (neighbor)
70832 +                       /* update delimiting keys */
70833 +                       set_child_delimiting_keys(coord->node, coord, neighbor);
70834 +
70835 +               write_lock_tree(tree);
70836 +       }
70837 +
70838 +       if (likely(neighbor == NULL ||
70839 +                  (znode_get_level(child) == znode_get_level(neighbor)
70840 +                   && child != neighbor)))
70841 +               link_znodes(child, neighbor, to_left);
70842 +       else {
70843 +               warning("nikita-3532",
70844 +                       "Sibling nodes on the different levels: %i != %i\n",
70845 +                       znode_get_level(child), znode_get_level(neighbor));
70846 +               ret = RETERR(-EIO);
70847 +       }
70848 +
70849 +       write_unlock_tree(tree);
70850 +
70851 +       /* if GN_NO_ALLOC isn't set we keep the reference to the neighbor znode */
70852 +       if (neighbor != NULL && (flags & GN_NO_ALLOC))
70853 +               /* atomic_dec(&ZJNODE(neighbor)->x_count); */
70854 +               zput(neighbor);
70855 +
70856 +       return ret;
70857 +}
70858 +
70859 +/* Establish the sibling link of @node on one side (left if GN_GO_LEFT is set). */
70860 +/* Audited by: umka (2002.06.14) */
70861 +static int connect_one_side(coord_t * coord, znode * node, int flags)
70862 +{
70863 +       coord_t local;
70864 +       lock_handle handle;
70865 +       int nr_locked = 0;      /* renew_sibling_link() increments it */
70866 +       int ret;
70867 +
70868 +       assert("umka-248", coord != NULL);
70869 +       assert("umka-249", node != NULL);
70870 +
70871 +       coord_dup_nocheck(&local, coord);
70872 +
70873 +       init_lh(&handle);
70874 +
70875 +       ret =
70876 +           renew_sibling_link(&local, &handle, node, znode_get_level(node),
70877 +                              flags | GN_NO_ALLOC, &nr_locked);
70878 +
70879 +       if (handle.node != NULL) {
70880 +               /* complementary operations for zload() and lock() in far_next_coord() */
70881 +               zrelse(handle.node);
70882 +               longterm_unlock_znode(&handle);
70883 +       }
70884 +
70885 +       /* -ENOENT and -E_NO_NEIGHBOR are not errors for us, because we run
70886 +          renew_sibling_link() only for znode connection; report success. */
70887 +       if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
70888 +               return 0;
70889 +
70890 +       return ret;
70891 +}
70892 +
70893 +/* If @child is not in `connected' state, perform hash searches for its left
70894 +   and right neighbor nodes and establish horizontal sibling links. */
70895 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
70896 +int connect_znode(coord_t * parent_coord, znode * child)
70897 +{
70898 +       reiser4_tree *tree = znode_get_tree(child);
70899 +       int ret = 0;
70900 +
70901 +       assert("zam-330", parent_coord != NULL);
70902 +       assert("zam-331", child != NULL);
70903 +       assert("zam-332", parent_coord->node != NULL);
70904 +       assert("umka-305", tree != NULL);
70905 +
70906 +       /* it is trivial to `connect' the root znode because it can't have
70907 +          neighbors */
70908 +       if (znode_above_root(parent_coord->node)) {
70909 +               child->left = NULL;
70910 +               child->right = NULL;
70911 +               ZF_SET(child, JNODE_LEFT_CONNECTED);
70912 +               ZF_SET(child, JNODE_RIGHT_CONNECTED);
70913 +
70914 +               ON_DEBUG(child->left_version =
70915 +                        atomic_inc_return(&delim_key_version);
70916 +                        child->right_version =
70917 +                        atomic_inc_return(&delim_key_version););
70918 +
70919 +               return 0;
70920 +       }
70921 +
70922 +       /* load parent node; released at zrelse_and_ret below */
70923 +       coord_clear_iplug(parent_coord);
70924 +       ret = zload(parent_coord->node);
70925 +
70926 +       if (ret != 0)
70927 +               return ret;
70928 +
70929 +       /* protect `connected' state check by tree_lock */
70930 +       read_lock_tree(tree);
70931 +
70932 +       if (!znode_is_right_connected(child)) {
70933 +               read_unlock_tree(tree);
70934 +               /* connect right (default is right) */
70935 +               ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
70936 +               if (ret)
70937 +                       goto zrelse_and_ret;
70938 +
70939 +               read_lock_tree(tree);
70940 +       }
70941 +
70942 +       ret = znode_is_left_connected(child);
70943 +
70944 +       read_unlock_tree(tree);
70945 +
70946 +       if (!ret) {
70947 +               ret =
70948 +                   connect_one_side(parent_coord, child,
70949 +                                    GN_NO_ALLOC | GN_GO_LEFT);
70950 +       } else
70951 +               ret = 0;
70952 +
70953 +      zrelse_and_ret:
70954 +       zrelse(parent_coord->node);
70955 +
70956 +       return ret;
70957 +}
70958 +
70959 +/* This function is like renew_sibling_link() but allocates the neighbor node
70960 +   if it doesn't exist and `connects' it. It may require making two steps in
70961 +   horizontal direction: the first one to find/allocate the neighbor node,
70962 +   the second one to find the neighbor of the neighbor, in order to connect
70963 +   the freshly allocated znode. */
70964 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
70965 +static int
70966 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
70967 +{
70968 +       coord_t local;
70969 +       lock_handle empty[2];
70970 +       reiser4_tree *tree = znode_get_tree(node);
70971 +       znode *neighbor = NULL;
70972 +       int nr_locked = 0;
70973 +       int ret;
70974 +
70975 +       assert("umka-250", coord != NULL);
70976 +       assert("umka-251", node != NULL);
70977 +       assert("umka-307", tree != NULL);
70978 +       assert("umka-308", level <= tree->height);
70979 +
70980 +       /* umka (2002.06.14)
70981 +          There should probably be a check here that the given "level" is valid.
70982 +          Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
70983 +        */
70984 +
70985 +       coord_dup(&local, coord);
70986 +
70987 +       ret =
70988 +           renew_sibling_link(&local, &empty[0], node, level,
70989 +                              flags & ~GN_NO_ALLOC, &nr_locked);
70990 +       if (ret)
70991 +               goto out;
70992 +
70993 +       /* tree lock is not needed here because we keep parent node(s) locked
70994 +          and reference to neighbor znode incremented */
70995 +       neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
70996 +
70997 +       read_lock_tree(tree);
70998 +       ret = znode_is_connected(neighbor);
70999 +       read_unlock_tree(tree);
71000 +       if (ret) {
71001 +               ret = 0;
71002 +               goto out;
71003 +       }
71004 +
71005 +       ret =
71006 +           renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
71007 +                              flags | GN_NO_ALLOC, &nr_locked);
71008 +       /* the second renew_sibling_link() call is used for znode connection
71009 +          only, so we can live with these errors */
71010 +       if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
71011 +               ret = 0;
71012 +
71013 +      out:
71014 +       /* release every node locked (and loaded) by the calls above */
71015 +       for (--nr_locked; nr_locked >= 0; --nr_locked) {
71016 +               zrelse(empty[nr_locked].node);
71017 +               longterm_unlock_znode(&empty[nr_locked]);
71018 +       }
71019 +
71020 +       if (neighbor != NULL)
71021 +               /* decrement znode reference counter without actually
71022 +                  releasing it. */
71023 +               atomic_dec(&ZJNODE(neighbor)->x_count);
71024 +
71025 +       return ret;
71026 +}
71027 +
71028 +/*
71029 +   reiser4_get_neighbor() -- lock node's neighbor.
71030 +
71031 +   reiser4_get_neighbor() locks node's neighbor (the left or the right one,
71032 +   depending on @flags) using the sibling link to it. If the sibling link is
71033 +   not available (i.e. neighbor znode is not in cache) and flags allow it, we go
71034 +   one level up for information about neighbor's disk address. We lock node's
71035 +   parent, if it is common parent for both 'node' and its neighbor, neighbor's
71036 +   disk address is in next (to left or to right) down link from link that points
71037 +   to original node. If not, we need to lock parent's neighbor, read its content
71038 +   and take first(last) downlink with neighbor's disk address.  That locking
71039 +   could be done by using sibling link and lock_neighbor() function, if sibling
71040 +   link exists. Otherwise we have to go one level up again until we find a
71041 +   common parent or a valid sibling link. Then we go down
71042 +   allocating/connecting/locking/reading nodes until neighbor of first one is
71043 +   locked.
71044 +
71045 +   @neighbor:  result lock handle,
71046 +   @node: a node which we lock neighbor of,
71047 +   @lock_mode: lock mode {LM_READ, LM_WRITE},
71048 +   @flags: logical OR of {GN_*} (see description above) subset.
71049 +
71050 +   @return: 0 on success, negative value if locking was impossible due to an
71051 +   error or lack of neighbor node.
71052 +*/
71053 +
71054 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
71055 +int
71056 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
71057 +                    znode_lock_mode lock_mode, int flags)
71058 +{
71059 +       reiser4_tree *tree = znode_get_tree(node);
71060 +       lock_handle path[REAL_MAX_ZTREE_HEIGHT];
71061 +
71062 +       coord_t coord;
71063 +
71064 +       tree_level base_level;
71065 +       tree_level h = 0;
71066 +       int ret;
71067 +
71068 +       assert("umka-252", tree != NULL);
71069 +       assert("umka-253", neighbor != NULL);
71070 +       assert("umka-254", node != NULL);
71071 +
71072 +       base_level = znode_get_level(node);
71073 +
71074 +       assert("umka-310", base_level <= tree->height);
71075 +
71076 +       coord_init_zero(&coord);
71077 +
71078 +      again:
71079 +       /* first, we try to use simple lock_neighbor() which requires sibling
71080 +          link existence */
71081 +       read_lock_tree(tree);
71082 +       ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
71083 +       read_unlock_tree(tree);
71084 +       if (!ret) {
71085 +               /* load if requested. NOTE(review): loads @node, not neighbor->node; confirm */
71086 +               if (flags & GN_LOAD_NEIGHBOR) {
71087 +                       ret = zload(node);
71088 +                       if (ret)
71089 +                               longterm_unlock_znode(neighbor);
71090 +               }
71091 +               return ret;
71092 +       }
71093 +
71094 +       /* only -ENOENT means we may look upward and try to connect
71095 +          @node with its neighbor (if @flags allow us to do it) */
71096 +       if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
71097 +               return ret;
71098 +
71099 +       /* before establishing the sibling link we lock the parent node; it
71100 +          is required by renew_neighbor() to work.  */
71101 +       init_lh(&path[0]);
71102 +       ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
71103 +       if (ret)
71104 +               return ret;
71105 +       if (znode_above_root(path[0].node)) {
71106 +               longterm_unlock_znode(&path[0]);
71107 +               return RETERR(-E_NO_NEIGHBOR);
71108 +       }
71109 +
71110 +       while (1) {
71111 +               znode *child = (h == 0) ? node : path[h - 1].node;
71112 +               znode *parent = path[h].node;
71113 +
71114 +               ret = zload(parent);
71115 +               if (ret)
71116 +                       break;
71117 +
71118 +               ret = find_child_ptr(parent, child, &coord);
71119 +
71120 +               if (ret) {
71121 +                       zrelse(parent);
71122 +                       break;
71123 +               }
71124 +
71125 +               /* try to establish missing sibling link */
71126 +               ret = renew_neighbor(&coord, child, h + base_level, flags);
71127 +
71128 +               zrelse(parent);
71129 +
71130 +               switch (ret) {
71131 +               case 0:
71132 +                       /* unlocking of parent znode prevents simple
71133 +                          deadlock situation */
71134 +                       done_lh(&path[h]);
71135 +
71136 +                       /* depending on the tree level we are on, we repeat
71137 +                          the first locking attempt ...  */
71138 +                       if (h == 0)
71139 +                               goto again;
71140 +
71141 +                       /* ... or repeat establishing of sibling link at
71142 +                          one level below. */
71143 +                       --h;
71144 +                       break;
71145 +
71146 +               case -ENOENT:
71147 +                       /* sibling link is not available -- we go
71148 +                          upward. */
71149 +                       init_lh(&path[h + 1]);
71150 +                       ret =
71151 +                           reiser4_get_parent(&path[h + 1], parent,
71152 +                                              ZNODE_READ_LOCK);
71153 +                       if (ret)
71154 +                               goto fail;
71155 +                       ++h;
71156 +                       if (znode_above_root(path[h].node)) {
71157 +                               ret = RETERR(-E_NO_NEIGHBOR);
71158 +                               goto fail;
71159 +                       }
71160 +                       break;
71161 +
71162 +               case -E_DEADLOCK:
71163 +                       /* there was a lock request from a hi-pri locker. if
71164 +                          it is possible we unlock the last parent node and
71165 +                          re-lock it again. */
71166 +                       for (; reiser4_check_deadlock(); h--) {
71167 +                               done_lh(&path[h]);
71168 +                               if (h == 0)
71169 +                                       goto fail;
71170 +                       }
71171 +
71172 +                       break;
71173 +
71174 +               default:        /* other errors. */
71175 +                       goto fail;
71176 +               }
71177 +       }
71178 +      fail:
71179 +       ON_DEBUG(check_lock_node_data(node));
71180 +       ON_DEBUG(check_lock_data());
71181 +
71182 +       /* unlock path */
71183 +       do {
71184 +               /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
71185 +                  fail; path[0] is already done_lh-ed, therefore
71186 +                  longterm_unlock_znode(&path[h]); is not applicable */
71187 +               done_lh(&path[h]);
71188 +               --h;
71189 +       } while (h + 1 != 0);
71190 +
71191 +       return ret;
71192 +}
71193 +
71194 +/* Remove @node from the sibling list; the tree lock must be write-locked. */
71195 +/* Audited by: umka (2002.06.14) */
71196 +void sibling_list_remove(znode * node)
71197 +{
71198 +       reiser4_tree *tree;
71199 +
71200 +       tree = znode_get_tree(node);
71201 +       assert("umka-255", node != NULL);
71202 +       assert_rw_write_locked(&(tree->tree_lock));
71203 +       assert("nikita-3275", check_sibling_list(node));
71204 +
71205 +       write_lock_dk(tree);
71206 +       if (znode_is_right_connected(node) && node->right != NULL &&
71207 +           znode_is_left_connected(node) && node->left != NULL) {
71208 +               assert("zam-32245",
71209 +                      keyeq(znode_get_rd_key(node),
71210 +                            znode_get_ld_key(node->right)));
71211 +               znode_set_rd_key(node->left, znode_get_ld_key(node->right));
71212 +       }
71213 +       write_unlock_dk(tree);
71214 +
71215 +       if (znode_is_right_connected(node) && node->right != NULL) {
71216 +               assert("zam-322", znode_is_left_connected(node->right));
71217 +               node->right->left = node->left;
71218 +               ON_DEBUG(node->right->left_version =
71219 +                        atomic_inc_return(&delim_key_version);
71220 +                   );
71221 +       }
71222 +       if (znode_is_left_connected(node) && node->left != NULL) {
71223 +               assert("zam-323", znode_is_right_connected(node->left));
71224 +               node->left->right = node->right;
71225 +               ON_DEBUG(node->left->right_version =
71226 +                        atomic_inc_return(&delim_key_version);
71227 +                   );
71228 +       }
71229 +
71230 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
71231 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
71232 +       ON_DEBUG(node->left = node->right = NULL;
71233 +                node->left_version = atomic_inc_return(&delim_key_version);
71234 +                node->right_version = atomic_inc_return(&delim_key_version););
71235 +       assert("nikita-3276", check_sibling_list(node));
71236 +}
71237 +
71238 +/* Cut @node out of the sibling list; neighbors are NOT linked together. */
71239 +void sibling_list_drop(znode * node)
71240 +{
71241 +       znode *right;
71242 +       znode *left;
71243 +
71244 +       assert("nikita-2464", node != NULL);
71245 +       assert("nikita-3277", check_sibling_list(node));
71246 +
71247 +       right = node->right;
71248 +       if (right != NULL) {
71249 +               assert("nikita-2465", znode_is_left_connected(right));
71250 +               right->left = NULL;
71251 +               ON_DEBUG(right->left_version =
71252 +                        atomic_inc_return(&delim_key_version);
71253 +                   );
71254 +       }
71255 +       left = node->left;
71256 +       if (left != NULL) {
71257 +               assert("zam-323", znode_is_right_connected(left));
71258 +               left->right = NULL;
71259 +               ON_DEBUG(left->right_version =
71260 +                        atomic_inc_return(&delim_key_version);
71261 +                   );
71262 +       }
71263 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
71264 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
71265 +       ON_DEBUG(node->left = node->right = NULL;
71266 +                node->left_version = atomic_inc_return(&delim_key_version);
71267 +                node->right_version = atomic_inc_return(&delim_key_version););
71268 +}
71269 +
71270 +/* Insert @new into the sibling list. Regular balancing inserts @new to the
71271 +   right of an existing, locked node (@before). The one exception is adding a
71272 +   new tree root, where @before must be NULL and @new gets no neighbors. */
71273 +void sibling_list_insert_nolock(znode * new, znode * before)
71274 +{
71275 +       assert("zam-334", new != NULL);
71276 +       assert("nikita-3298", !znode_is_left_connected(new));
71277 +       assert("nikita-3299", !znode_is_right_connected(new));
71278 +       assert("nikita-3300", new->left == NULL);
71279 +       assert("nikita-3301", new->right == NULL);
71280 +       assert("nikita-3278", check_sibling_list(new));
71281 +       assert("nikita-3279", check_sibling_list(before));
71282 +
71283 +       if (before != NULL) {
71284 +               assert("zam-333", znode_is_connected(before));
71285 +               new->right = before->right;
71286 +               new->left = before;
71287 +               ON_DEBUG(new->right_version =
71288 +                        atomic_inc_return(&delim_key_version);
71289 +                        new->left_version =
71290 +                        atomic_inc_return(&delim_key_version););
71291 +               if (before->right != NULL) {
71292 +                       before->right->left = new;
71293 +                       ON_DEBUG(before->right->left_version =
71294 +                                atomic_inc_return(&delim_key_version);
71295 +                           );
71296 +               }
71297 +               before->right = new;
71298 +               ON_DEBUG(before->right_version =
71299 +                        atomic_inc_return(&delim_key_version);
71300 +                   );
71301 +       } else {
71302 +               new->right = NULL;
71303 +               new->left = NULL;
71304 +               ON_DEBUG(new->right_version =
71305 +                        atomic_inc_return(&delim_key_version);
71306 +                        new->left_version =
71307 +                        atomic_inc_return(&delim_key_version););
71308 +       }
71309 +       ZF_SET(new, JNODE_LEFT_CONNECTED);
71310 +       ZF_SET(new, JNODE_RIGHT_CONNECTED);
71311 +       assert("nikita-3280", check_sibling_list(new));
71312 +       assert("nikita-3281", check_sibling_list(before));
71313 +}
71314 +
71315 +/*
71316 +   Local variables:
71317 +   c-indentation-style: "K&R"
71318 +   mode-name: "LC"
71319 +   c-basic-offset: 8
71320 +   tab-width: 8
71321 +   fill-column: 80
71322 +   End:
71323 +*/
71324 diff -puN /dev/null fs/reiser4/tree_walk.h
71325 --- /dev/null
71326 +++ a/fs/reiser4/tree_walk.h
71327 @@ -0,0 +1,125 @@
71328 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
71329 +
71330 +/* definitions of reiser4 tree walk functions */
71331 +
71332 +#ifndef __FS_REISER4_TREE_WALK_H__
71333 +#define __FS_REISER4_TREE_WALK_H__
71334 +
71335 +#include "debug.h"
71336 +#include "forward.h"
71337 +
71338 +/* establishes horizontal links between cached znodes */
71339 +int connect_znode(coord_t * coord, znode * node);
71340 +
71341 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
71342 +  share the following conventions:
71343 +
71344 +  return codes:
71345 +
71346 +  @return : 0        - OK,
71347 +
71348 +ZAM-FIXME-HANS: wrong return code name.  Change them all.
71349 +           -ENOENT  - neighbor is not in cache, which is detected by the
71350 +                      absence of a sibling link.
71351 +
71352 +           -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
71353 +                      found (because we are left-/right- most node of the
71354 +                      tree, for example). Also, this return code is for
71355 +                      reiser4_get_parent() when we see no parent link -- it
71356 +                      means that our node is root node.
71357 +
71358 +           -E_DEADLOCK - deadlock detected (request from high-priority process
71359 +                      received); other error codes conform to
71360 +                      /usr/include/asm/errno.h .
71361 +*/
71362 +
71363 +int
71364 +reiser4_get_parent_flags(lock_handle * result, znode * node,
71365 +                        znode_lock_mode mode, int flags);
71366 +
71367 +/* bit definitions for the `flags' argument of reiser4_get_neighbor() */
71368 +typedef enum {
71369 +       /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
71370 +        * find a not allocated, not connected neighbor by going through
71371 +        * upper levels */
71372 +       GN_CAN_USE_UPPER_LEVELS = 0x1,
71373 +       /* lock the left neighbor instead of the right one */
71374 +       GN_GO_LEFT = 0x2,
71375 +       /* automatically load neighbor node content */
71376 +       GN_LOAD_NEIGHBOR = 0x4,
71377 +       /* return -E_REPEAT if can't lock */
71378 +       GN_TRY_LOCK = 0x8,
71379 +       /* used internally in tree_walk.c, causes renew_sibling to not
71380 +          allocate neighbor znode, but only search for it in znode cache */
71381 +       GN_NO_ALLOC = 0x10,
71382 +       /* do not go across atom boundaries */
71383 +       GN_SAME_ATOM = 0x20,
71384 +       /* allow locking of not connected nodes */
71385 +       GN_ALLOW_NOT_CONNECTED = 0x40,
71386 +       /* avoid synchronous jload; instead, call jstartio() and return -E_REPEAT */
71387 +       GN_ASYNC = 0x80
71388 +} znode_get_neigbor_flags;
71389 +
71390 +/* Wrapper for reiser4_get_parent_flags() passing GN_ALLOW_NOT_CONNECTED. */
71391 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
71392 +                                    znode_lock_mode mode)
71393 +{
71394 +       return reiser4_get_parent_flags(result, node, mode,
71395 +                                       GN_ALLOW_NOT_CONNECTED);
71396 +}
71397 +
71398 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
71399 +                        znode_lock_mode lock_mode, int flags);
71400 +
71401 +/* wrappers for reiser4_get_neighbor() that set/clear GN_GO_LEFT in @flags */
71402 +static inline int
71403 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
71404 +                         int flags)
71405 +{
71406 +       return reiser4_get_neighbor(result, node, lock_mode,
71407 +                                   flags | GN_GO_LEFT);
71408 +}
71409 +
71410 +static inline int
71411 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
71412 +                          int flags)
71413 +{
71414 +       ON_DEBUG(check_lock_node_data(node));
71415 +       ON_DEBUG(check_lock_data());
71416 +       return reiser4_get_neighbor(result, node, lock_mode,
71417 +                                   flags & (~GN_GO_LEFT));
71418 +}
71419 +
71420 +extern void sibling_list_remove(znode * node);
71421 +extern void sibling_list_drop(znode * node);
71422 +extern void sibling_list_insert_nolock(znode * new, znode * before);
71423 +extern void link_left_and_right(znode * left, znode * right);
71424 +
71425 +/* Callbacks invoked by tree_walk() when it ...  */
71426 +struct tree_walk_actor {
71427 +       /* ... meets a formatted node, */
71428 +       int (*process_znode) (tap_t *, void *);
71429 +       /* ... meets an extent, */
71430 +       int (*process_extent) (tap_t *, void *);
71431 +       /* ... begins tree traversal, or restarts it after a node or extent
71432 +        * processing function returned -E_REPEAT. */
71433 +       int (*before) (void *);
71434 +};
71435 +
71436 +#if REISER4_DEBUG
71437 +int check_sibling_list(znode * node);
71438 +#else /* !REISER4_DEBUG */
71439 +#define check_sibling_list(n) (1)      /* stub: always passes, @n unused */
71440 +#endif /* REISER4_DEBUG */
71441 +
71442 +#endif                         /* __FS_REISER4_TREE_WALK_H__ */
71443 +
71444 +/*
71445 +   Local variables:
71446 +   c-indentation-style: "K&R"
71447 +   mode-name: "LC"
71448 +   c-basic-offset: 8
71449 +   tab-width: 8
71450 +   fill-column: 120
71451 +   End:
71452 +*/
71453 diff -puN /dev/null fs/reiser4/txnmgr.c
71454 --- /dev/null
71455 +++ a/fs/reiser4/txnmgr.c
71456 @@ -0,0 +1,3164 @@
71457 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71458 + * reiser4/README */
71459 +
71460 +/* Joshua MacDonald wrote the first draft of this code. */
71461 +
71462 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
71463 +filesystem scales only as well as its worst locking design.  You need to
71464 +substantially restructure this code. Josh was not as experienced a programmer
71465 +as you.  Particularly review how the locking style differs from what you did
71466 +for znodes using hi-lo priority locking, and present to me an opinion on
71467 +whether the differences are well founded.  */
71468 +
71469 +/* I cannot help but to disagree with the sentiment above. Locking of
71470 + * transaction manager is _not_ badly designed, and, at the very least, is not
71471 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
71472 + * locking on znodes, especially on the root node of the tree. --nikita,
71473 + * 2003.10.13 */
71474 +
71475 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles.  The
71476 +   txnmgr processes capture_block requests and manages the relationship between jnodes and
71477 +   atoms through the various stages of a transcrash, and it also oversees the fusion and
71478 +   capture-on-copy processes.  The main difficulty with this task is maintaining a
71479 +   deadlock-free lock ordering between atoms and jnodes/handles.  The reason for the
71480 +   difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
71481 +   must be broken.  The main requirement is that atom-fusion be deadlock free, so once you
71482 +   hold the atom_lock you may then wait to acquire any jnode or handle lock.  This implies
71483 +   that any time you check the atom-pointer of a jnode or handle and then try to lock that
71484 +   atom, you must use trylock() and possibly reverse the order.
71485 +
71486 +   This code implements the design documented at:
71487 +
71488 +     http://namesys.com/txn-doc.html
71489 +
71490 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
71491 +above document and reference the new.  Be sure to provide some credit to Josh.  I already have some writings on this
71492 +topic in v4.html, but they are lacking in details present in the above.  Cure that.  Remember to write for the bright 12
71493 +year old --- define all technical terms used.
71494 +
71495 +*/
71496 +
71497 +/* Thoughts on the external transaction interface:
71498 +
71499 +   In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
71500 +   creates state that lasts for the duration of a system call and is called at the start
71501 +   of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
71502 +   occupying the scope of a single system call.  We wish to give certain applications an
71503 +   interface to begin and close (commit) transactions.  Since our implementation of
71504 +   transactions does not yet support isolation, allowing an application to open a
71505 +   transaction implies trusting it to later close the transaction.  Part of the
71506 +   transaction interface will be aimed at enabling that trust, but the interface for
71507 +   actually using transactions is fairly narrow.
71508 +
71509 +   BEGIN_TRANSCRASH: Returns a transcrash identifier.  It should be possible to translate
71510 +   this identifier into a string that a shell-script could use, allowing you to start a
71511 +   transaction by issuing a command.  Once open, the transcrash should be set in the task
71512 +   structure, and there should be options (I suppose) to allow it to be carried across
71513 +   fork/exec.  A transcrash has several options:
71514 +
71515 +     - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
71516 +     on writes (WRITE_FUSING) and allow "dirty reads".  If the application wishes to
71517 +     capture on reads as well, it should set READ_FUSING.
71518 +
71519 +     - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
71520 +     eventually close (or else the machine must crash).  If the application dies an
71521 +     unexpected death with an open transcrash, for example, or if it hangs for a long
71522 +     duration, one solution (to avoid crashing the machine) is to simply close it anyway.
71523 +     This is a dangerous option, but it is one way to solve the problem until isolated
71524 +     transcrashes are available for untrusted applications.
71525 +
71526 +     It seems to be what databases do, though it is unclear how one avoids a DoS attack
71527 +     creating a vulnerability based on resource starvation.  Guaranteeing that some
71528 +     minimum amount of computational resources are made available would seem more correct
71529 +     than guaranteeing some amount of time.  When we again have someone to code the work,
71530 +     this issue should be considered carefully.  -Hans
71531 +
71532 +   RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
71533 +   many dirty blocks it expects.  The reserve_blocks interface should be called at a point
71534 +   where it is safe for the application to fail, because the system may not be able to
71535 +   grant the allocation and the application must be able to back-out.  For this reason,
71536 +   the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
71537 +   the application may also wish to extend the allocation after beginning its transcrash.
71538 +
71539 +   CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
71540 +   modifications that require transaction protection.  When isolated transactions are
71541 +   supported the CLOSE operation is replaced by either COMMIT or ABORT.  For example, if a
71542 +   RESERVE_BLOCKS call fails for the application, it should "abort" by calling
71543 +   CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
71544 +   why, for safety, the application should call RESERVE_BLOCKS before making any changes).
71545 +
71546 +   For actually implementing these out-of-system-call-scoped transcrashes, the
71547 +   reiser4_context has a "txn_handle *trans" pointer that may be set to an open
71548 +   transcrash.  Currently there are no dynamically-allocated transcrashes, but there is a
71549 +   "struct kmem_cache *_txnh_slab" created for that purpose in this file.
71550 +*/
71551 +
71552 +/* Extending the other system call interfaces for future transaction features:
71553 +
71554 +   Specialized applications may benefit from passing flags to the ordinary system call
71555 +   interface such as read(), write(), or stat().  For example, the application specifies
71556 +   WRITE_FUSING by default but wishes to add that a certain read() command should be
71557 +   treated as READ_FUSING.  But which read?  Is it the directory-entry read, the stat-data
71558 +   read, or the file-data read?  These issues are straight-forward, but there are a lot of
71559 +   them and adding the necessary flags-passing code will be tedious.
71560 +
71561 +   When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
71562 +   flag, which specifies that although it is a read operation being requested, a
71563 +   write-lock should be taken.  The reason is that read-locks are shared while write-locks
71564 +   are exclusive, so taking a read-lock when a later-write is known in advance will often
71565 +   lead to deadlock.  If a reader knows it will write later, it should issue read
71566 +   requests with the RMW flag set.
71567 +*/
71568 +
71569 +/*
71570 +   The znode/atom deadlock avoidance.
71571 +
71572 +   FIXME(Zam): writing of this comment is in progress.
71573 +
71574 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
71575 +   atom locking, which makes the reiser4 locking scheme more complex.  It had
71576 +   deadlocks until we implemented deadlock avoidance algorithms.  Those deadlocks
71577 +   looked like this: one stopped thread waits for a long-term lock on a znode,
71578 +   while the thread that owns that lock waits until fusion with another atom is
71579 +   allowed.
71580 +
71581 +   The source of the deadlocks is an optimization of not capturing index nodes
71582 +   for read.  Let's prove it.  Suppose we have dumb node capturing scheme which
71583 +   unconditionally captures each block before locking it.
71584 +
71585 +   That scheme has no deadlocks.  Let's begin with a thread whose stage is
71586 +   ASTAGE_CAPTURE_WAIT and which waits for a znode lock.  The thread can't wait
71587 +   for a capture because its stage allows fusion with any atom except those
71588 +   being committed currently.  The process of atom commit can't deadlock because
71589 +   the atom commit procedure does not acquire locks and does not fuse with other
71590 +   atoms.  Reiser4 does capturing right before going to sleep inside the
71591 +   longterm_lock_znode() function, which means the znode we want to lock is
71592 +   already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage.  If we
71593 +   continue the analysis, we see that no process in the sequence can be waiting
71594 +   for atom fusion.  Therefore there are no deadlocks of the described kind.
71595 +
71596 +   The capturing optimization makes the deadlocks possible.  A thread can wait
71597 +   for a lock whose owner did not capture that node.  The lock owner's current
71598 +   atom is not fused with the first atom and it does not get the
71599 +   ASTAGE_CAPTURE_WAIT state.  A deadlock is possible when that atom meets
71600 +   another one which is in ASTAGE_CAPTURE_WAIT already.
71601 +
71602 +   The deadlock avoidance scheme includes two algorithms:
71603 +
71604 +   The first algorithm is used when a thread captures a node which is locked but
71605 +   not captured by another thread.  Those nodes are marked MISSED_IN_CAPTURE at
71606 +   the moment we skip their capturing.  If such a node (marked MISSED_IN_CAPTURE)
71607 +   is being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT,
71608 +   the routine which forces all lock owners to join the current atom is executed.
71609 +
71610 +   Second algorithm does not allow to skip capturing of already captured nodes.
71611 +
71612 +   Together, both algorithms prevent waiting for a longterm lock without atom
71613 +   fusion with the atoms of all lock owners; such waiting is the key ingredient
71614 +   of atom/znode locking deadlocks.
71615 +*/
71616 +
71617 +/*
71618 + * Transactions and mmap(2).
71619 + *
71620 + *     1. Transactions are not supported for accesses through mmap(2), because
71621 + *     this would effectively amount to user-level transactions whose duration
71622 + *     is beyond control of the kernel.
71623 + *
71624 + *     2. That said, we still want to preserve some decency with regard to
71625 + *     mmap(2). During normal write(2) call, following sequence of events
71626 + *     happens:
71627 + *
71628 + *         1. page is created;
71629 + *
71630 + *         2. jnode is created, dirtied and captured into current atom.
71631 + *
71632 + *         3. extent is inserted and modified.
71633 + *
71634 + *     Steps (2) and (3) take place under long term lock on the twig node.
71635 + *
71636 + *     When file is accessed through mmap(2) page is always created during
71637 + *     page fault.
71638 + *     After this (in reiser4_readpage()->reiser4_readpage_extent()):
71639 + *
71640 + *         1. if access is made to non-hole page new jnode is created, (if
71641 + *         necessary)
71642 + *
71643 + *         2. if access is made to the hole page, jnode is not created (XXX
71644 + *         not clear why).
71645 + *
71646 + *     Also, even if page is created by write page fault it is not marked
71647 + *     dirty immediately by handle_mm_fault(). Probably this is to avoid races
71648 + *     with page write-out.
71649 + *
71650 + *     Dirty bit installed by hardware is only transferred to the struct page
71651 + *     later, when page is unmapped (in zap_pte_range(), or
71652 + *     try_to_unmap_one()).
71653 + *
71654 + *     So, with mmap(2) we have to handle following irksome situations:
71655 + *
71656 + *         1. there exists modified page (clean or dirty) without jnode
71657 + *
71658 + *         2. there exists modified page (clean or dirty) with clean jnode
71659 + *
71660 + *         3. clean page which is a part of atom can be transparently modified
71661 + *         at any moment through mapping without becoming dirty.
71662 + *
71663 + *     (1) and (2) can lead to the out-of-memory situation: ->writepage()
71664 + *     doesn't know what to do with such pages and ->sync_sb()/->writepages()
71665 + *     don't see them, because these methods operate on atoms.
71666 + *
71667 + *     (3) can lead to the loss of data: suppose we have dirty page with dirty
71668 + *     captured jnode captured by some atom. As part of early flush (for
71669 + *     example) page was written out. Dirty bit was cleared on both page and
71670 + *     jnode. After this page is modified through mapping, but kernel doesn't
71671 + *     notice and just discards page and jnode as part of commit. (XXX
71672 + *     actually it doesn't, because to reclaim page ->releasepage() has to be
71673 + *     called and before this dirty bit will be transferred to the struct
71674 + *     page).
71675 + *
71676 + */
71677 +
71678 +#include "debug.h"
71679 +#include "txnmgr.h"
71680 +#include "jnode.h"
71681 +#include "znode.h"
71682 +#include "block_alloc.h"
71683 +#include "tree.h"
71684 +#include "wander.h"
71685 +#include "ktxnmgrd.h"
71686 +#include "super.h"
71687 +#include "page_cache.h"
71688 +#include "reiser4.h"
71689 +#include "vfs_ops.h"
71690 +#include "inode.h"
71691 +#include "flush.h"
71692 +
71693 +#include <asm/atomic.h>
71694 +#include <linux/types.h>
71695 +#include <linux/fs.h>
71696 +#include <linux/mm.h>
71697 +#include <linux/slab.h>
71698 +#include <linux/pagemap.h>
71699 +#include <linux/writeback.h>
71700 +#include <linux/swap.h>                /* for totalram_pages */
71701 +
71702 +static void atom_free(txn_atom * atom);
71703 +
71704 +static int commit_txnh(txn_handle * txnh);
71705 +
71706 +static void wakeup_atom_waitfor_list(txn_atom * atom);
71707 +static void wakeup_atom_waiting_list(txn_atom * atom);
71708 +
71709 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
71710 +
71711 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
71712 +
71713 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
71714 +
71715 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
71716 +                              txn_capture mode);
71717 +
71718 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
71719 +
71720 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
71721 +
71722 +void reiser4_invalidate_list(struct list_head *);
71723 +
71724 +/* GENERIC STRUCTURES */
71725 +
71726 +typedef struct _txn_wait_links txn_wait_links;
71727 +/* NOTE(review): presumably links a waiter into atom f*_list lists -- confirm */
71728 +struct _txn_wait_links {
71729 +       lock_stack *_lock_stack;
71730 +       struct list_head _fwaitfor_link;
71731 +       struct list_head _fwaiting_link;
71732 +       int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
71733 +       int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
71734 +};
71735 +
71736 +/* FIXME: In theory, we should be using the slab cache init & destructor
71737 +   methods instead of, e.g., jnode_init, etc. */
71738 +static struct kmem_cache *_atom_slab = NULL;
71739 +/* this is for user-visible, cross system-call transactions. */
71740 +static struct kmem_cache *_txnh_slab = NULL;
71741 +
71742 +/**
71743 + * init_txnmgr_static - create transaction manager slab caches
71744 + *
71745 + * Creates the txn_atom and txn_handle slab caches during reiser4 module
71746 + * initialization. Returns 0, or -ENOMEM if either cache cannot be created.
71747 + */
71748 +int init_txnmgr_static(void)
71749 +{
71750 +       assert("jmacd-600", _atom_slab == NULL);
71751 +       assert("jmacd-601", _txnh_slab == NULL);
71752 +
71753 +       ON_DEBUG(atomic_set(&flush_cnt, 0));
71754 +
71755 +       _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
71756 +                                      SLAB_HWCACHE_ALIGN |
71757 +                                      SLAB_RECLAIM_ACCOUNT, NULL);
71758 +       if (_atom_slab == NULL)
71759 +               return RETERR(-ENOMEM);
71760 +
71761 +       _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
71762 +                             SLAB_HWCACHE_ALIGN, NULL);
71763 +       if (_txnh_slab == NULL) {
71764 +               kmem_cache_destroy(_atom_slab);
71765 +               _atom_slab = NULL;
71766 +               return RETERR(-ENOMEM);
71767 +       }
71768 +
71769 +       return 0;
71770 +}
71771 +
71772 +/**
71773 + * done_txnmgr_static - delete txn_atom and txn_handle caches
71774 + *
71775 + * Counterpart of init_txnmgr_static(); called on module unload or shutdown.
71776 + */
71777 +void done_txnmgr_static(void)
71778 +{
71779 +       destroy_reiser4_cache(&_atom_slab);
71780 +       destroy_reiser4_cache(&_txnh_slab);
71781 +}
71782 +
71783 +/**
71784 + * reiser4_init_txnmgr - initialize a new transaction manager
71785 + * @mgr: pointer to transaction manager embedded in reiser4 super block
71786 + *
71787 + * Called on mount. Resets atom accounting and sets up the list and locks.
71788 + */
71789 +void reiser4_init_txnmgr(txn_mgr *mgr)
71790 +{
71791 +       assert("umka-169", mgr != NULL);
71792 +
71793 +       mgr->atom_count = 0;
71794 +       mgr->id_count = 1;
71795 +       INIT_LIST_HEAD(&mgr->atoms_list);
71796 +       spin_lock_init(&mgr->tmgr_lock);
71797 +       mutex_init(&mgr->commit_mutex);
71798 +}
71799 +
71800 +/**
71801 + * reiser4_done_txnmgr - stop transaction manager
71802 + * @mgr: pointer to transaction manager embedded in reiser4 super block
71803 + *
71804 + * Called on umount. Only asserts that no atoms remain; releases nothing.
71805 + */
71806 +void reiser4_done_txnmgr(txn_mgr *mgr)
71807 +{
71808 +       assert("umka-170", mgr != NULL);
71809 +       assert("umka-1701", list_empty_careful(&mgr->atoms_list));
71810 +       assert("umka-1702", mgr->atom_count == 0);
71811 +}
71812 +
71813 +/* Reset @txnh: given @mode, no atom, no flags; fresh lock and list link. */
71814 +/* Audited by: umka (2002.06.13) */
71815 +static void txnh_init(txn_handle * txnh, txn_mode mode)
71816 +{
71817 +       assert("umka-171", txnh != NULL);
71818 +
71819 +       txnh->mode = mode;
71820 +       txnh->atom = NULL;
71821 +       reiser4_ctx_gfp_mask_set();
71822 +       txnh->flags = 0;
71823 +       spin_lock_init(&txnh->hlock);
71824 +       INIT_LIST_HEAD(&txnh->txnh_link);
71825 +}
71826 +
71827 +#if REISER4_DEBUG
71828 +/* Debug-only: true if @txnh has no atom and LOCK_CNT_NIL(spin_locked_txnh). */
71829 +static int txnh_isclean(txn_handle * txnh)
71830 +{
71831 +       assert("umka-172", txnh != NULL);
71832 +       return txnh->atom == NULL &&
71833 +               LOCK_CNT_NIL(spin_locked_txnh);
71834 +}
71835 +#endif
71836 +
71837 +/* Zero @atom, mark it ASTAGE_FREE, and initialize its lists, lock and sets. */
71838 +static void atom_init(txn_atom * atom)
71839 +{
71840 +       int level;
71841 +
71842 +       assert("umka-173", atom != NULL);
71843 +
71844 +       memset(atom, 0, sizeof(txn_atom));
71845 +
71846 +       atom->stage = ASTAGE_FREE;
71847 +       atom->start_time = jiffies;
71848 +
71849 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
71850 +               INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
71851 +
71852 +       INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
71853 +       INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
71854 +       INIT_LIST_HEAD(ATOM_WB_LIST(atom));
71855 +       INIT_LIST_HEAD(&atom->inodes);
71856 +       spin_lock_init(&(atom->alock));
71857 +       /* list of transaction handles */
71858 +       INIT_LIST_HEAD(&atom->txnh_list);
71859 +       /* link to transaction manager's list of atoms */
71860 +       INIT_LIST_HEAD(&atom->atom_link);
71861 +       INIT_LIST_HEAD(&atom->fwaitfor_list);
71862 +       INIT_LIST_HEAD(&atom->fwaiting_list);
71863 +       blocknr_set_init(&atom->delete_set);
71864 +       blocknr_set_init(&atom->wandered_map);
71865 +
71866 +       init_atom_fq_parts(atom);
71867 +}
71868 +
71869 +#if REISER4_DEBUG
71870 +/* Debug-only check that @atom is free: zero counts and empty/unlinked lists. */
71871 +static int atom_isclean(txn_atom * atom)
71872 +{
71873 +       int level;
71874 +
71875 +       assert("umka-174", atom != NULL);
71876 +
71877 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
71878 +               if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
71879 +                       return 0;
71880 +               }
71881 +       }
71882 +
71883 +       return  atom->stage == ASTAGE_FREE &&
71884 +               atom->txnh_count == 0 &&
71885 +               atom->capture_count == 0 &&
71886 +               atomic_read(&atom->refcount) == 0 &&
71887 +               (&atom->atom_link == atom->atom_link.next &&
71888 +                &atom->atom_link == atom->atom_link.prev) &&
71889 +               list_empty_careful(&atom->txnh_list) &&
71890 +               list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
71891 +               list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
71892 +               list_empty_careful(ATOM_WB_LIST(atom)) &&
71893 +               list_empty_careful(&atom->fwaitfor_list) &&
71894 +               list_empty_careful(&atom->fwaiting_list) &&
71895 +               atom_fq_parts_are_clean(atom);
71896 +}
71897 +#endif
71898 +
71899 +/* Begin a transaction in this context.  The handle used is the context's own
71900 +   trans_in_ctx member, so no allocation happens here; the context must not already
71901 +   have a transaction (asserted).  The handle is initialized as TXN_WRITE_FUSING. */
71902 +/* Audited by: umka (2002.06.13) */
71903 +void reiser4_txn_begin(reiser4_context * context)
71904 +{
71905 +       assert("jmacd-544", context->trans == NULL);
71906 +
71907 +       context->trans = &context->trans_in_ctx;
71908 +
71909 +       /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
71910 +          transcrash.  Default should be TXN_WRITE_FUSING.  Also, the _trans variable is
71911 +          stack allocated right now, but we would like to allow for dynamically allocated
71912 +          transcrashes that span multiple system calls.
71913 +        */
71914 +       txnh_init(context->trans, TXN_WRITE_FUSING);
71915 +}
71916 +
71917 +/* Finish the context's transaction handle: commit through commit_txnh() if an atom is attached, then detach the handle. */
71918 +int reiser4_txn_end(reiser4_context * context)
71919 +{
71920 +       long ret = 0; /* stays 0 when there is no handle, or the handle has no atom */
71921 +       txn_handle *txnh;
71922 +
71923 +       assert("umka-283", context != NULL);
71924 +       assert("nikita-3012", reiser4_schedulable());
71925 +       assert("vs-24", context == get_current_context());
71926 +       assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
71927 +
71928 +       txnh = context->trans;
71929 +       if (txnh != NULL) {
71930 +               if (txnh->atom != NULL)
71931 +                       ret = commit_txnh(txnh);
71932 +               assert("jmacd-633", txnh_isclean(txnh)); /* checked whether or not a commit ran */
71933 +               context->trans = NULL;
71934 +       }
71935 +       return ret; /* NOTE(review): ret is long but the function returns int */
71936 +}
71937 +
71938 +void reiser4_txn_restart(reiser4_context * context) /* end the context's transaction and begin a fresh one */
71939 +{
71940 +       reiser4_txn_end(context); /* the result of the end (commit) is discarded */
71941 +       reiser4_preempt_point();
71942 +       reiser4_txn_begin(context);
71943 +}
71944 +
71945 +void reiser4_txn_restart_current(void) /* reiser4_txn_restart() applied to get_current_context() */
71946 +{
71947 +       reiser4_txn_restart(get_current_context());
71948 +}
71949 +
71950 +/* TXN_ATOM */
71951 +
71952 +/* Get the atom of @txnh.  Entered with @txnh unlocked; always returns with @txnh spin-locked, and with
71953 +   the atom spin-locked as well when the result is not NULL.  The atom lock is first tried with
71954 +   spin_trylock_atom(); on failure both locks are retaken atom-first to break the lock-ordering cycle. */
71955 +static txn_atom *txnh_get_atom(txn_handle * txnh)
71956 +{
71957 +       txn_atom *atom;
71958 +
71959 +       assert("umka-180", txnh != NULL);
71960 +       assert_spin_not_locked(&(txnh->hlock));
71961 +
71962 +       while (1) {
71963 +               spin_lock_txnh(txnh);
71964 +               atom = txnh->atom;
71965 +
71966 +               if (atom == NULL) /* no atom: return NULL with only txnh locked */
71967 +                       break;
71968 +
71969 +               if (spin_trylock_atom(atom)) /* fast path: both locks held now */
71970 +                       break;
71971 +
71972 +               atomic_inc(&atom->refcount); /* pin the atom while the txnh lock is dropped */
71973 +
71974 +               spin_unlock_txnh(txnh);
71975 +               spin_lock_atom(atom);
71976 +               spin_lock_txnh(txnh);
71977 +
71978 +               if (txnh->atom == atom) { /* still the same atom: drop the pin and return it locked */
71979 +                       atomic_dec(&atom->refcount);
71980 +                       break;
71981 +               }
71982 +
71983 +               spin_unlock_txnh(txnh); /* txnh->atom changed meanwhile: release everything and retry */
71984 +               atom_dec_and_unlock(atom);
71985 +       }
71986 +
71987 +       return atom;
71988 +}
71989 +
71990 +/* Get the atom of the current context's transaction handle and return it spin-locked; returns NULL if the handle has no atom. */
71991 +txn_atom *get_current_atom_locked_nocheck(void)
71992 +{
71993 +       reiser4_context *cx;
71994 +       txn_atom *atom;
71995 +       txn_handle *txnh;
71996 +
71997 +       cx = get_current_context();
71998 +       assert("zam-437", cx != NULL);
71999 +
72000 +       txnh = cx->trans;
72001 +       assert("zam-435", txnh != NULL);
72002 +
72003 +       atom = txnh_get_atom(txnh); /* returns with txnh locked, plus the atom if there is one */
72004 +
72005 +       spin_unlock_txnh(txnh); /* only the atom lock (if any) is handed to the caller */
72006 +       return atom;
72007 +}
72008 +
72009 +/* Get the atom belonging to a jnode, which is initially locked.  Return with
72010 +   both jnode and atom locked.  The atom lock is first tried with
72011 +   spin_trylock_atom() to break the lock-ordering cycle.  Returns NULL, with
72012 +   only the jnode locked, if the jnode has no atom. */
72013 +txn_atom *jnode_get_atom(jnode * node)
72014 +{
72015 +       txn_atom *atom;
72016 +
72017 +       assert("umka-181", node != NULL);
72018 +
72019 +       while (1) {
72020 +               assert_spin_locked(&(node->guard));
72021 +
72022 +               atom = node->atom;
72023 +               /* node is not in any atom */
72024 +               if (atom == NULL)
72025 +                       break;
72026 +
72027 +               /* If atom is not locked, grab the lock and return */
72028 +               if (spin_trylock_atom(atom))
72029 +                       break;
72030 +
72031 +               /* At least one jnode (this one) belongs to the atom, which guarantees
72032 +                * that atom->refcount > 0, so we can safely increment refcount. */
72033 +               atomic_inc(&atom->refcount);
72034 +               spin_unlock_jnode(node);
72035 +
72036 +               /* re-acquire spin locks in the right order */
72037 +               spin_lock_atom(atom);
72038 +               spin_lock_jnode(node);
72039 +
72040 +               /* check if node still points to the same atom. */
72041 +               if (node->atom == atom) {
72042 +                       atomic_dec(&atom->refcount);
72043 +                       break;
72044 +               }
72045 +
72046 +               /* releasing of atom lock and reference requires not holding
72047 +                * locks on jnodes.  */
72048 +               spin_unlock_jnode(node);
72049 +
72050 +               /* We are not sure that this atom has any references besides
72051 +                * ours, so we must call the proper function, which may free
72052 +                * the atom if the last reference is released. */
72053 +               atom_dec_and_unlock(atom);
72054 +
72055 +               /* lock jnode again for getting valid node->atom pointer
72056 +                * value. */
72057 +               spin_lock_jnode(node);
72058 +       }
72059 +
72060 +       return atom;
72061 +}
72062 +
72063 +/* Returns true if @check is dirty and belongs to the same atom as @node (and, for a znode, is
72064 +   connected).  If @alloc_check is set, jnode_is_flushprepped(@check) must also equal @alloc_value.
72065 +   Used by flush code to indicate whether the next node (in some direction) is suitable for flushing. */
72066 +int
72067 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
72068 +{
72069 +       int compat;
72070 +       txn_atom *atom;
72071 +
72072 +       assert("umka-182", node != NULL);
72073 +       assert("umka-183", check != NULL);
72074 +
72075 +       /* Not sure what this function is supposed to do if supplied with @check that is
72076 +          neither formatted nor unformatted (bitmap or so). */
72077 +       assert("nikita-2373", jnode_is_znode(check)
72078 +              || jnode_is_unformatted(check));
72079 +
72080 +       /* Need a lock on CHECK to get its atom and to check various state bits.
72081 +          Don't need a lock on NODE once we get the atom lock. */
72082 +       /* It is not enough to lock two nodes and check (node->atom ==
72083 +          check->atom) because atom could be locked and being fused at that
72084 +          moment, jnodes of the atom of that state (being fused) can point to
72085 +          different objects, but the atom is the same. */
72086 +       spin_lock_jnode(check);
72087 +
72088 +       atom = jnode_get_atom(check); /* returns with @check's atom locked, or NULL */
72089 +
72090 +       if (atom == NULL) {
72091 +               compat = 0;
72092 +       } else {
72093 +               compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY)); /* node->atom is read under the atom lock only */
72094 +
72095 +               if (compat && jnode_is_znode(check)) {
72096 +                       compat &= znode_is_connected(JZNODE(check));
72097 +               }
72098 +
72099 +               if (compat && alloc_check) {
72100 +                       compat &= (alloc_value == jnode_is_flushprepped(check));
72101 +               }
72102 +
72103 +               spin_unlock_atom(atom);
72104 +       }
72105 +
72106 +       spin_unlock_jnode(check);
72107 +
72108 +       return compat;
72109 +}
72110 +
72111 +/* Drop one reference to an atom that is spin-locked on entry, and release the atom lock; if the count reaches zero the atom is freed. */
72112 +void atom_dec_and_unlock(txn_atom * atom)
72113 +{
72114 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
72115 +
72116 +       assert("umka-186", atom != NULL);
72117 +       assert_spin_locked(&(atom->alock));
72118 +       assert("zam-1039", atomic_read(&atom->refcount) > 0);
72119 +
72120 +       if (atomic_dec_and_test(&atom->refcount)) {
72121 +               /* take txnmgr lock and atom lock in proper order. */
72122 +               if (!spin_trylock_txnmgr(mgr)) {
72123 +                       /* This atom should exist after we re-acquire its
72124 +                        * spinlock, so we increment its reference counter. */
72125 +                       atomic_inc(&atom->refcount);
72126 +                       spin_unlock_atom(atom);
72127 +                       spin_lock_txnmgr(mgr);
72128 +                       spin_lock_atom(atom);
72129 +
72130 +                       if (!atomic_dec_and_test(&atom->refcount)) { /* someone took a reference meanwhile: do not free */
72131 +                               spin_unlock_atom(atom);
72132 +                               spin_unlock_txnmgr(mgr);
72133 +                               return;
72134 +                       }
72135 +               }
72136 +               assert_spin_locked(&(mgr->tmgr_lock));
72137 +               atom_free(atom); /* atom_free() releases the atom lock itself */
72138 +               spin_unlock_txnmgr(mgr);
72139 +       } else
72140 +               spin_unlock_atom(atom);
72141 +}
72142 +
72143 +/* Create new atom and connect it to given transaction handle.  This adds the
72144 +   atom to the transaction manager's list and takes one reference, which is
72145 +   kept until it commits.  *@atom_alloc caches the allocation across retries, so
72146 +   no allocation happens under spinlocks.  Returns -ENOMEM or, otherwise, always -E_REPEAT. */
72147 +
72148 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
72149 +{
72150 +       txn_atom *atom;
72151 +       txn_mgr *mgr;
72152 +
72153 +       if (REISER4_DEBUG && rofs_tree(current_tree)) {
72154 +               warning("nikita-3366", "Creating atom on rofs");
72155 +               dump_stack();
72156 +       }
72157 +
72158 +       if (*atom_alloc == NULL) { /* allocate only if the caller did not pass in a spare atom */
72159 +               (*atom_alloc) = kmem_cache_alloc(_atom_slab,
72160 +                                                reiser4_ctx_gfp_mask_get());
72161 +
72162 +               if (*atom_alloc == NULL)
72163 +                       return RETERR(-ENOMEM);
72164 +       }
72165 +
72166 +       /* and, also, txnmgr spin lock should be taken before jnode and txnh
72167 +          locks. */
72168 +       mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
72169 +       spin_lock_txnmgr(mgr);
72170 +       spin_lock_txnh(txnh);
72171 +
72172 +       /* Check whether new atom still needed */
72173 +       if (txnh->atom != NULL) {
72174 +               /* NOTE-NIKITA probably it is rather better to free
72175 +                * atom_alloc here than thread it up to reiser4_try_capture() */
72176 +
72177 +               spin_unlock_txnh(txnh);
72178 +               spin_unlock_txnmgr(mgr);
72179 +
72180 +               return -E_REPEAT; /* *atom_alloc is left set for the caller to reuse or release */
72181 +       }
72182 +
72183 +       atom = *atom_alloc;
72184 +       *atom_alloc = NULL; /* the allocation is consumed from here on */
72185 +
72186 +       atom_init(atom);
72187 +
72188 +       assert("jmacd-17", atom_isclean(atom));
72189 +
72190 +        /*
72191 +        * lock ordering is broken here. It is ok, as long as @atom is new
72192 +        * and inaccessible for others. We can't use spin_lock_atom or
72193 +        * spin_lock(&atom->alock) because they care about locking
72194 +        * dependencies. A trylock (spin_trylock_atom) doesn't.
72195 +        */
72196 +       check_me("", spin_trylock_atom(atom));
72197 +
72198 +       /* add atom to the end of transaction manager's list of atoms */
72199 +       list_add_tail(&atom->atom_link, &mgr->atoms_list);
72200 +       atom->atom_id = mgr->id_count++;
72201 +       mgr->atom_count += 1;
72202 +
72203 +       /* Release txnmgr lock */
72204 +       spin_unlock_txnmgr(mgr);
72205 +
72206 +       /* One reference until it commits. */
72207 +       atomic_inc(&atom->refcount);
72208 +       atom->stage = ASTAGE_CAPTURE_FUSE;
72209 +       atom->super = reiser4_get_current_sb();
72210 +       capture_assign_txnh_nolock(atom, txnh);
72211 +
72212 +       spin_unlock_atom(atom);
72213 +       spin_unlock_txnh(txnh);
72214 +
72215 +       return -E_REPEAT; /* success is also reported as -E_REPEAT; presumably the caller retries and finds txnh->atom set */
72216 +}
72217 +
72218 +/* Return true if an atom is currently "open": its stage is above 0 and has not yet reached ASTAGE_PRE_COMMIT. */
72219 +static int atom_isopen(const txn_atom * atom)
72220 +{
72221 +       assert("umka-185", atom != NULL);
72222 +
72223 +       return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
72224 +}
72225 +
72226 +/* Return the number of pointers to this atom that must be updated during fusion: the sum
72227 +   of its transaction handle count and its capture count.  This approximates the work to be
72228 +   done; fusion chooses the atom with fewer pointers to fuse into the one with more. */
72229 +static int atom_pointer_count(const txn_atom * atom)
72230 +{
72231 +       assert("umka-187", atom != NULL);
72232 +
72233 +       /* This is a measure of the amount of work needed to fuse this atom
72234 +        * into another. */
72235 +       return atom->txnh_count + atom->capture_count;
72236 +}
72237 +
72238 +/* Called holding both the atom lock and the txnmgr lock (both asserted): removes the atom from the
72239 +   transaction manager list, releases the atom lock and frees the atom.  The txnmgr lock stays held. */
72240 +static void atom_free(txn_atom * atom)
72241 +{
72242 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
72243 +
72244 +       assert("umka-188", atom != NULL);
72245 +       assert_spin_locked(&(atom->alock));
72246 +
72247 +       /* Remove from the txn_mgr's atom list */
72248 +       assert_spin_locked(&(mgr->tmgr_lock));
72249 +       mgr->atom_count -= 1;
72250 +       list_del_init(&atom->atom_link);
72251 +
72252 +       /* Clean the atom */
72253 +       assert("jmacd-16",
72254 +              (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
72255 +       atom->stage = ASTAGE_FREE;
72256 +
72257 +       blocknr_set_destroy(&atom->delete_set);
72258 +       blocknr_set_destroy(&atom->wandered_map);
72259 +
72260 +       assert("jmacd-16", atom_isclean(atom)); /* NOTE(review): same assertion label as the stage check above */
72261 +
72262 +       spin_unlock_atom(atom); /* unlock before the memory goes back to the slab */
72263 +
72264 +       kmem_cache_free(_atom_slab, atom);
72265 +}
72266 +
72267 +static int atom_is_dotard(const txn_atom * atom) /* true once the atom is older than the tmgr's atom_max_age */
72268 +{
72269 +       return time_after(jiffies, atom->start_time +
72270 +                         get_current_super_private()->tmgr.atom_max_age);
72271 +}
72272 +
72273 +static int atom_can_be_committed(txn_atom * atom) /* true when exactly one handle of the atom is not a waiter */
72274 +{
72275 +       assert_spin_locked(&(atom->alock));
72276 +       assert("zam-885", atom->txnh_count > atom->nr_waiters);
72277 +       return atom->txnh_count == atom->nr_waiters + 1;
72278 +}
72279 +
72280 +/* An atom is due for commit when forced by flag, grown past the tmgr's atom_max_size, or too old. */
72281 +static int atom_should_commit(const txn_atom * atom)
72282 +{
72283 +       assert("umka-189", atom != NULL);
72284 +       if (atom->flags & ATOM_FORCE_COMMIT)
72285 +               return 1;
72286 +       if ((unsigned)atom_pointer_count(atom) >
72287 +           get_current_super_private()->tmgr.atom_max_size)
72288 +               return 1;
72289 +       return atom_is_dotard(atom) != 0;
72290 +}
72291 +
72292 +/* Report (1 or 0) whether the current atom, if there is one, requires commit. */
72293 +int current_atom_should_commit(void)
72294 +{
72295 +       int due;
72296 +       txn_atom *cur = get_current_atom_locked_nocheck();
72297 +
72298 +       if (cur == NULL)
72299 +               return 0;
72300 +       /* the atom came back spin-locked; release it once examined */
72301 +       due = atom_should_commit(cur);
72302 +       spin_unlock_atom(cur);
72303 +       return due;
72304 +}
72305 +
72306 +static int atom_should_commit_asap(const txn_atom * atom) /* true if the atom pins too much memory or has been flushed often */
72307 +{
72308 +       unsigned int captured;
72309 +       unsigned int pinnedpages;
72310 +
72311 +       assert("nikita-3309", atom != NULL);
72312 +
72313 +       captured = (unsigned)atom->capture_count;
72314 +       pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode); /* NOTE(review): the shift runs before the multiply, so this is 0 while captured < (1 << PAGE_CACHE_SHIFT) -- confirm intended */
72315 +
72316 +       return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100); /* thresholds: 1/8 of totalram_pages, or more than 100 in atom->flushed */
72317 +}
72318 +
72319 +/* Return the first jnode on @head that a flush with @flags may take, or NULL if there is none. */
72320 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
72321 +{
72322 +       jnode *node;
72323 +       int committing = (flags & JNODE_FLUSH_COMMIT) != 0;
72324 +
72325 +       list_for_each_entry(node, head, capture_link) {
72326 +               if (committing)
72327 +                       return node;
72328 +               /*
72329 +                * otherwise skip jnodes which "heard banshee" or have active I/O
72330 +                */
72331 +               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE) &&
72332 +                   !JF_ISSET(node, JNODE_WRITEBACK))
72333 +                       return node;
72334 +       }
72335 +       return NULL;
72336 +}
72337 +
72338 +/* Return the first suitable dirty jnode of @atom (which must be spin-locked), searching the
72339 +   per-level dirty lists from level 1 upwards and list #0 last; NULL if no list yields one. */
72340 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
72341 +{
72342 +       jnode *found;
72343 +       tree_level level;
72344 +
72345 +       assert_spin_locked(&(atom->alock));
72346 +
72347 +       /* The flush starts from LEAF_LEVEL (=1). */
72348 +       for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level) {
72349 +               struct list_head *dirty = ATOM_DIRTY_LIST(atom, level);
72350 +
72351 +               /* cheap emptiness test first, then the filtered scan */
72352 +               if (!list_empty_careful(dirty)) {
72353 +                       found = find_first_dirty_in_list(dirty, flags);
72354 +                       if (found != NULL)
72355 +                               return found;
72356 +               }
72357 +       }
72358 +
72359 +       /* znode-above-root is on the list #0. */
72360 +       return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
72361 +}
72362 +
72363 +/* Sort out the atom's writeback list: every jnode without JNODE_WRITEBACK set is either
72364 +   queued on @fq (if dirty) or moved to the atom's clean list; the rest stay put. */
72365 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
72366 +{
72367 +       jnode *node, *following;
72368 +
72369 +       assert("zam-905", atom_is_protected(atom));
72370 +
72371 +       /* _safe iteration: the body may take @node off the wb list */
72372 +       list_for_each_entry_safe(node, following, ATOM_WB_LIST(atom),
72373 +                                capture_link) {
72374 +               spin_lock_jnode(node);
72375 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
72376 +                       /* writeback flag still set: leave it where it is */
72377 +               } else if (JF_ISSET(node, JNODE_DIRTY)) {
72378 +                       /* dirty and not under writeback: hand it to the queue */
72379 +                       queue_jnode(fq, node);
72380 +               } else {
72381 +                       /* move jnode to atom's clean list */
72382 +                       list_move_tail(&node->capture_link,
72383 +                                     ATOM_CLEAN_LIST(atom));
72384 +               }
72385 +               spin_unlock_jnode(node);
72386 +       }
72387 +}
72388 +
72389 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
72390 + * jnodes to disk.  Returns an error from getting the flush queue, else the reiser4_write_fq() result. */
72391 +static int submit_wb_list(void)
72392 +{
72393 +       int ret;
72394 +       flush_queue_t *fq;
72395 +
72396 +       fq = get_fq_for_current_atom();
72397 +       if (IS_ERR(fq))
72398 +               return PTR_ERR(fq);
72399 +
72400 +       dispatch_wb_list(fq->atom, fq);
72401 +       spin_unlock_atom(fq->atom); /* presumably get_fq_for_current_atom() returned with the atom locked -- confirm */
72402 +
72403 +       ret = reiser4_write_fq(fq, NULL, 1);
72404 +       reiser4_fq_put(fq);
72405 +
72406 +       return ret;
72407 +}
72408 +
72409 +/* Wait for completion of all writes, re-submitting the atom's writeback list before and after the wait.  Returns the first error met. */
72410 +static int current_atom_complete_writes(void)
72411 +{
72412 +       int ret;
72413 +
72414 +       /* Each jnode from that list was modified and dirtied when it had i/o
72415 +        * request running already. After i/o completion we have to resubmit
72416 +        * them to disk again.*/
72417 +       ret = submit_wb_list();
72418 +       if (ret < 0) /* only negative values are treated as failure here */
72419 +               return ret;
72420 +
72421 +       /* Wait for completion of all i/o */
72422 +       ret = current_atom_finish_all_fq();
72423 +       if (ret)
72424 +               return ret;
72425 +
72426 +       /* Scan wb list again; all i/o should be completed, we re-submit dirty
72427 +        * nodes to disk */
72428 +       ret = submit_wb_list();
72429 +       if (ret < 0)
72430 +               return ret;
72431 +
72432 +       /* Wait for all nodes we just submitted */
72433 +       return current_atom_finish_all_fq();
72434 +}
72435 +
72436 +#if REISER4_DEBUG
72437 +/* Debug-only: print one line describing @atom, tagged with @prefix; a NULL atom prints "no atom". */
72438 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
72439 +{
72440 +       if (atom == NULL) {
72441 +               printk("%s: no atom\n", prefix);
72442 +               return;
72443 +       }
72444 +
72445 +       printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
72446 +              " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
72447 +              atomic_read(&atom->refcount), atom->atom_id, atom->flags,
72448 +              atom->txnh_count, atom->capture_count, atom->stage,
72449 +              atom->start_time, atom->flushed);
72450 +}
72451 +
72452 +#else  /*  REISER4_DEBUG  */
72453 +/* Non-debug builds: same signature, empty body. */
72454 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
72455 +
72456 +#endif  /*  REISER4_DEBUG  */
72457 +
72458 +#define TOOMANYFLUSHES (1 << 13) /* flush iterations after which commit_current_atom() starts warning */
72459 +
72460 +/* Called with the atom locked and no open "active" transaction handlers except
72461 +   ours, this function calls flush_current_atom() until all dirty nodes are
72462 +   processed.  Then it initiates commit processing.
72463 +
72464 +   Called by the single remaining open "active" txnh, which is closing. Other
72465 +   open txnhs belong to processes which wait atom commit in commit_txnh()
72466 +   routine. They are counted as "waiters" in atom->nr_waiters.  Therefore as
72467 +   long as we hold the atom lock none of the jnodes can be captured and/or
72468 +   locked.
72469 +
72470 +   Return value is an error code if commit fails; -E_REPEAT (atom unlocked) if other active handles appeared.
72471 +*/
72472 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
72473 +{
72474 +       reiser4_super_info_data *sbinfo = get_current_super_private();
72475 +       long ret = 0;
72476 +       /* how many times jnode_flush() was called as a part of attempt to
72477 +        * commit this atom. */
72478 +       int flushiters;
72479 +
72480 +       assert("zam-888", atom != NULL && *atom != NULL);
72481 +       assert_spin_locked(&((*atom)->alock));
72482 +       assert("zam-887", get_current_context()->trans->atom == *atom);
72483 +       assert("jmacd-151", atom_isopen(*atom));
72484 +
72485 +       assert("nikita-3184",
72486 +              get_current_super_private()->delete_mutex_owner != current);
72487 +
72488 +       for (flushiters = 0;; ++flushiters) { /* repeat for as long as the flush answers -E_REPEAT */
72489 +               ret =
72490 +                   flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
72491 +                                      JNODE_FLUSH_COMMIT,
72492 +                                      LONG_MAX /* nr_to_write */ ,
72493 +                                      nr_submitted, atom, NULL);
72494 +               if (ret != -E_REPEAT)
72495 +                       break;
72496 +
72497 +               /* if atom's dirty list contains one znode which is
72498 +                  HEARD_BANSHEE and is locked we have to allow lock owner to
72499 +                  continue and uncapture that znode */
72500 +               reiser4_preempt_point();
72501 +
72502 +               *atom = get_current_atom_locked(); /* re-read the current atom, locked, for the next round */
72503 +               if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { /* IS_POW presumably limits the warning to power-of-two counts -- confirm */
72504 +                       warning("nikita-3176",
72505 +                               "Flushing like mad: %i", flushiters);
72506 +                       reiser4_info_atom("atom", *atom);
72507 +                       DEBUGON(flushiters > (1 << 20));
72508 +               }
72509 +       }
72510 +
72511 +       if (ret) /* NOTE(review): atom lock state on this error path depends on flush_current_atom() -- confirm */
72512 +               return ret;
72513 +
72514 +       assert_spin_locked(&((*atom)->alock));
72515 +
72516 +       if (!atom_can_be_committed(*atom)) {
72517 +               spin_unlock_atom(*atom);
72518 +               return RETERR(-E_REPEAT);
72519 +       }
72520 +
72521 +       if ((*atom)->capture_count == 0) /* nothing captured: skip straight to marking the atom done */
72522 +               goto done;
72523 +
72524 +       /* Up to this point we have been flushing and after flush is called we
72525 +          return -E_REPEAT.  Now we can commit.  We cannot return -E_REPEAT
72526 +          at this point, commit should be successful. */
72527 +       reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
72528 +       ON_DEBUG(((*atom)->committer = current));
72529 +       spin_unlock_atom(*atom);
72530 +
72531 +       ret = current_atom_complete_writes();
72532 +       if (ret) /* NOTE(review): returns with the atom unlocked and still in ASTAGE_PRE_COMMIT */
72533 +               return ret;
72534 +
72535 +       assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
72536 +
72537 +       /* isolate critical code path which should be executed by only one
72538 +        * thread using tmgr mutex */
72539 +       mutex_lock(&sbinfo->tmgr.commit_mutex);
72540 +
72541 +       ret = reiser4_write_logs(nr_submitted);
72542 +       if (ret < 0)
72543 +               reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
72544 +
72545 +       /* The atom->ovrwr_nodes list is processed under commit mutex held
72546 +          because of bitmap nodes which are captured by special way in
72547 +          reiser4_pre_commit_hook_bitmap(), that way does not include
72548 +          capture_fuse_wait() as a capturing of other nodes does -- the commit
72549 +          mutex is used for transaction isolation instead. */
72550 +       reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
72551 +       mutex_unlock(&sbinfo->tmgr.commit_mutex);
72552 +
72553 +       reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
72554 +       reiser4_invalidate_list(ATOM_WB_LIST(*atom));
72555 +       assert("zam-927", list_empty(&(*atom)->inodes));
72556 +
72557 +       spin_lock_atom(*atom); /* re-lock so that both paths reach "done" with the atom locked */
72558 + done:
72559 +       reiser4_atom_set_stage(*atom, ASTAGE_DONE);
72560 +       ON_DEBUG((*atom)->committer = NULL);
72561 +
72562 +       /* Atom's state changes, so wake up everybody waiting for this
72563 +          event. */
72564 +       wakeup_atom_waiting_list(*atom);
72565 +
72566 +       /* Decrement the "until commit" reference, at least one txnh (the caller) is
72567 +          still open. */
72568 +       atomic_dec(&(*atom)->refcount);
72569 +
72570 +       assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
72571 +       assert("jmacd-1062", (*atom)->capture_count == 0);
72572 +       BUG_ON((*atom)->capture_count != 0); /* same condition as the assert above, checked with BUG_ON as well */
72573 +       assert_spin_locked(&((*atom)->alock));
72574 +
72575 +       return ret; /* 0 via the "done" shortcut, otherwise the non-negative reiser4_write_logs() result */
72576 +}
72577 +
72578 +/* TXN_TXNH */
72579 +
72580 +/**
72581 + * force_commit_atom - commit current atom and wait commit completion
72582 + * @txnh: transaction handle whose atom is to be committed; its atom must not be NULL
72583 + *
72584 + * Commits current atom and waits for commit completion; current atom and @txnh have
72585 + * to be spinlocked before call, this function unlocks them on exit.  Always returns 0.
72586 + */
72587 +int force_commit_atom(txn_handle *txnh)
72588 +{
72589 +       txn_atom *atom;
72590 +
72591 +       assert("zam-837", txnh != NULL);
72592 +       assert_spin_locked(&(txnh->hlock));
72593 +       assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
72594 +
72595 +       atom = txnh->atom;
72596 +
72597 +       assert("zam-834", atom != NULL);
72598 +       assert_spin_locked(&(atom->alock));
72599 +
72600 +       /*
72601 +        * Set flags for atom and txnh: forcing atom commit and waiting for
72602 +        * commit completion
72603 +        */
72604 +       txnh->flags |= TXNH_WAIT_COMMIT;
72605 +       atom->flags |= ATOM_FORCE_COMMIT;
72606 +
72607 +       spin_unlock_txnh(txnh);
72608 +       spin_unlock_atom(atom);
72609 +
72610 +       /* commit is here */
72611 +       reiser4_txn_restart_current(); /* ends (and so commits) the current transaction, then begins a new one */
72612 +       return 0;
72613 +}
72614 +
72615 +/* Called to force commit of any outstanding atoms.  @commit_all_atoms controls
72616 + * whether we commit all atoms, including new ones which are created after this
72617 + * function is called.  Returns 0, or the error returned by force_commit_atom(). */
72618 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
72619 +{
72620 +       int ret;
72621 +       txn_atom *atom;
72622 +       txn_mgr *mgr;
72623 +       txn_handle *txnh;
72624 +       unsigned long start_time = jiffies; /* atoms started after this are skipped unless @commit_all_atoms */
72625 +       reiser4_context *ctx = get_current_context();
72626 +
72627 +       assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
72628 +       assert("nikita-3058", reiser4_commit_check_locks());
72629 +
72630 +       reiser4_txn_restart_current();
72631 +
72632 +       mgr = &get_super_private(super)->tmgr;
72633 +
72634 +       txnh = ctx->trans;
72635 +
72636 +      again:
72637 +
72638 +       spin_lock_txnmgr(mgr);
72639 +
72640 +       list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
72641 +               spin_lock_atom(atom);
72642 +
72643 +               /* Commit any atom which can be committed.  If @commit_all_atoms
72644 +                * is not set we commit only atoms which were created before
72645 +                * this call is started. */
72646 +               if (commit_all_atoms
72647 +                   || time_before_eq(atom->start_time, start_time)) {
72648 +                       if (atom->stage <= ASTAGE_POST_COMMIT) {
72649 +                               spin_unlock_txnmgr(mgr);
72650 +
72651 +                               if (atom->stage < ASTAGE_PRE_COMMIT) {
72652 +                                       spin_lock_txnh(txnh);
72653 +                                       /* Add force-context txnh */
72654 +                                       capture_assign_txnh_nolock(atom, txnh);
72655 +                                       ret = force_commit_atom(txnh); /* unlocks both txnh and atom */
72656 +                                       if (ret)
72657 +                                               return ret;
72658 +                               } else
72659 +                                       /* wait atom commit */
72660 +                                       reiser4_atom_wait_event(atom); /* presumably releases the atom lock while waiting -- confirm */
72661 +
72662 +                               goto again; /* the txnmgr lock was dropped, so rescan the list from its head */
72663 +                       }
72664 +               }
72665 +
72666 +               spin_unlock_atom(atom);
72667 +       }
72668 +
72669 +#if REISER4_DEBUG
72670 +       if (commit_all_atoms) {
72671 +               reiser4_super_info_data *sbinfo = get_super_private(super);
72672 +               spin_lock_reiser4_super(sbinfo);
72673 +               assert("zam-813",
72674 +                      sbinfo->blocks_fake_allocated_unformatted == 0);
72675 +               assert("zam-812", sbinfo->blocks_fake_allocated == 0);
72676 +               spin_unlock_reiser4_super(sbinfo);
72677 +       }
72678 +#endif
72679 +
72680 +       spin_unlock_txnmgr(mgr);
72681 +
72682 +       return 0;
72683 +}
72684 +
72685 +/* commit_some_atoms() can commit @atom if its stage is below PRE_COMMIT, its
72686 + * txnh_count equals nr_waiters and it should commit. Locking is up to caller */
72687 +static int atom_is_committable(txn_atom * atom)
72688 +{
72689 +       return
72690 +           atom->stage < ASTAGE_PRE_COMMIT &&
72691 +           atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
72692 +}
72693 +
72694 +/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
72695 + * lock (mgr->daemon->guard) on every exit path and always returns 0 */
72696 +int commit_some_atoms(txn_mgr * mgr)
72697 +{
72698 +       int ret = 0;
72699 +       txn_atom *atom;
72700 +       txn_handle *txnh;
72701 +       reiser4_context *ctx;
72702 +       struct list_head *pos, *tmp;
72703 +
72704 +       ctx = get_current_context();
72705 +       assert("nikita-2444", ctx != NULL);
72706 +
72707 +       txnh = ctx->trans;
72708 +       spin_lock_txnmgr(mgr);
72709 +
72710 +       /*
72711 +        * this is to keep gcc from complaining that atom might be used
72712 +        * uninitialized
72713 +        */
72714 +       atom = NULL;
72715 +
72716 +       /* look for atom to commit */
72717 +       list_for_each_safe(pos, tmp, &mgr->atoms_list) {
72718 +               atom = list_entry(pos, txn_atom, atom_link);
72719 +               /*
72720 +                * first test, without taking atom spin lock, whether it is
72721 +                * eligible for committing at all
72722 +                */
72723 +               if (atom_is_committable(atom)) {
72724 +                       /* now, take spin lock and re-check */
72725 +                       spin_lock_atom(atom);
72726 +                       if (atom_is_committable(atom))
72727 +                               break;
72728 +                       spin_unlock_atom(atom);
72729 +               }
72730 +       }
72731 +
72732 +       ret = (&mgr->atoms_list == pos);
72733 +       spin_unlock_txnmgr(mgr);
72734 +
72735 +       if (ret) {
72736 +               /* no committable atom found */
72737 +               spin_unlock(&mgr->daemon->guard);
72738 +               return 0;
72739 +       }
72740 +
72741 +       spin_lock_txnh(txnh);
72742 +
72743 +       BUG_ON(atom == NULL);
72744 +       /* Set the atom to force committing */
72745 +       atom->flags |= ATOM_FORCE_COMMIT;
72746 +
72747 +       /* Add force-context txnh */
72748 +       capture_assign_txnh_nolock(atom, txnh);
72749 +
72750 +       spin_unlock_txnh(txnh);
72751 +       spin_unlock_atom(atom);
72752 +
72753 +       /* we are about to release daemon spin lock, notify daemon it
72754 +          has to rescan atoms */
72755 +       mgr->daemon->rescan = 1;
72756 +       spin_unlock(&mgr->daemon->guard);
72757 +       reiser4_txn_restart_current();
72758 +       return 0;
72759 +}
72760 +
72761 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
72762 +{
72763 +       int atom_stage;
72764 +       txn_atom *atom_2;
72765 +       int repeat;
72766 +
72767 +       assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
72768 +
72769 +       atom_stage = atom->stage;
72770 +       repeat = 0;
72771 +
72772 +       if (!spin_trylock_txnmgr(tmgr)) {
72773 +               atomic_inc(&atom->refcount);
72774 +               spin_unlock_atom(atom);
72775 +               spin_lock_txnmgr(tmgr);
72776 +               spin_lock_atom(atom);
72777 +               repeat = 1;
72778 +               if (atom->stage != atom_stage) {
72779 +                       spin_unlock_txnmgr(tmgr);
72780 +                       atom_dec_and_unlock(atom);
72781 +                       return -E_REPEAT;
72782 +               }
72783 +               atomic_dec(&atom->refcount);
72784 +       }
72785 +
72786 +       list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
72787 +               if (atom == atom_2)
72788 +                       continue;
72789 +               /*
72790 +                * only trylock @atom_2: if that does not succeed we just do
72791 +                * not fuse with that atom and go on to the next one.
72792 +                */
72793 +               if (spin_trylock_atom(atom_2)) {
72794 +                       if (atom_2->stage < ASTAGE_PRE_COMMIT) {
72795 +                               spin_unlock_txnmgr(tmgr);
72796 +                               capture_fuse_into(atom_2, atom);
72797 +                               /* all locks are lost, we can only repeat here */
72798 +                               return -E_REPEAT;
72799 +                       }
72800 +                       spin_unlock_atom(atom_2);
72801 +               }
72802 +       }
72803 +       atom->flags |= ATOM_CANCEL_FUSION;
72804 +       spin_unlock_txnmgr(tmgr);
72805 +       if (repeat) {
72806 +               spin_unlock_atom(atom);
72807 +               return -E_REPEAT;
72808 +       }
72809 +       return 0;
72810 +}
72811 +
72812 +/* Calls jnode_flush for current atom if it exists; if not, just take another
72813 +   atom and call jnode_flush() for it.  If current transaction handle already
72814 +   has an assigned atom (current atom) we have to close current transaction
72815 +   prior to switching to another atom or doing something with current atom.
72816 +   This code tries to flush current atom.
72817 +
72818 +   flush_some_atom() is called as part of memory clearing process. It is
72819 +   invoked from balance_dirty_pages(), pdflushd, and entd.
72820 +
72821 +   If we can flush no nodes, atom is committed, because this frees memory.
72822 +
72823 +   If atom is too large or too old it is committed also.
72824 +*/
72825 +int
72826 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
72827 +               int flags)
72828 +{
72829 +       reiser4_context *ctx = get_current_context();
72830 +       txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
72831 +       txn_handle *txnh = ctx->trans;
72832 +       txn_atom *atom;
72833 +       int ret;
72834 +
72835 +       BUG_ON(wbc->nr_to_write == 0);
72836 +       BUG_ON(*nr_submitted != 0);
72837 +       assert("zam-1042", txnh != NULL);
72838 +      repeat:
72839 +       if (txnh->atom == NULL) {
72840 +               /* current atom is not available, take first from txnmgr */
72841 +               spin_lock_txnmgr(tmgr);
72842 +
72843 +               /* traverse the list of all atoms */
72844 +               list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
72845 +                       /* lock atom before checking its state */
72846 +                       spin_lock_atom(atom);
72847 +
72848 +                       /*
72849 +                        * we need an atom which is not being committed and
72850 +                        * which has no flushers (jnode_flush() adds one flusher
72851 +                        * at the beginning and subtracts one at the end).
72852 +                        */
72853 +                       if (atom->stage < ASTAGE_PRE_COMMIT &&
72854 +                           atom->nr_flushers == 0) {
72855 +                               spin_lock_txnh(txnh);
72856 +                               capture_assign_txnh_nolock(atom, txnh);
72857 +                               spin_unlock_txnh(txnh);
72858 +
72859 +                               goto found;
72860 +                       }
72861 +
72862 +                       spin_unlock_atom(atom);
72863 +               }
72864 +
72865 +               /*
72866 +                * Write throttling in case no atom can be
72867 +                * flushed/committed.
72868 +                */
72869 +               if (!current_is_pdflush() && !wbc->nonblocking) {
72870 +                       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
72871 +                               spin_lock_atom(atom);
72872 +                               /* Repeat the check from above. */
72873 +                               if (atom->stage < ASTAGE_PRE_COMMIT
72874 +                                   && atom->nr_flushers == 0) {
72875 +                                       spin_lock_txnh(txnh);
72876 +                                       capture_assign_txnh_nolock(atom, txnh);
72877 +                                       spin_unlock_txnh(txnh);
72878 +
72879 +                                       goto found;
72880 +                               }
72881 +                               if (atom->stage <= ASTAGE_POST_COMMIT) {
72882 +                                       spin_unlock_txnmgr(tmgr);
72883 +                                       /*
72884 +                                        * we just wait until atom's flusher
72885 +                                        * makes progress in flushing or
72886 +                                        * committing the atom
72887 +                                        */
72888 +                                       reiser4_atom_wait_event(atom);
72889 +                                       goto repeat;
72890 +                               }
72891 +                               spin_unlock_atom(atom);
72892 +                       }
72893 +               }
72894 +               spin_unlock_txnmgr(tmgr);
72895 +               return 0;
72896 +             found:
72897 +               spin_unlock_txnmgr(tmgr);
72898 +       } else
72899 +               atom = get_current_atom_locked();
72900 +
72901 +       BUG_ON(atom->super != ctx->super);
72902 +       assert("vs-35", atom->super == ctx->super);
72903 +       if (start) {
72904 +               spin_lock_jnode(start);
72905 +               ret = (atom == start->atom) ? 1 : 0;
72906 +               spin_unlock_jnode(start);
72907 +               if (ret == 0)
72908 +                       start = NULL;
72909 +       }
72910 +       ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
72911 +       if (ret == 0) {
72912 +               /* flush_current_atom returns 0 only if it submitted nothing
72913 +                  for write */
72914 +               BUG_ON(*nr_submitted != 0);
72915 +               if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
72916 +                       if (atom->capture_count < tmgr->atom_min_size &&
72917 +                           !(atom->flags & ATOM_CANCEL_FUSION)) {
72918 +                               ret = txn_try_to_fuse_small_atom(tmgr, atom);
72919 +                               if (ret == -E_REPEAT) {
72920 +                                       reiser4_preempt_point();
72921 +                                       goto repeat;
72922 +                               }
72923 +                       }
72924 +                       /* if early flushing could not make more nodes clean,
72925 +                        * or atom is too old/large,
72926 +                        * we force current atom to commit */
72927 +                       /* wait for commit completion but only if this
72928 +                        * wouldn't stall pdflushd and ent thread. */
72929 +                       if (!wbc->nonblocking && !ctx->entd)
72930 +                               txnh->flags |= TXNH_WAIT_COMMIT;
72931 +                       atom->flags |= ATOM_FORCE_COMMIT;
72932 +               }
72933 +               spin_unlock_atom(atom);
72934 +       } else if (ret == -E_REPEAT) {
72935 +               if (*nr_submitted == 0) {
72936 +                       /* let others who hamper flushing (holding longterm
72937 +                          locks, for instance) free the way for flush */
72938 +                       reiser4_preempt_point();
72939 +                       goto repeat;
72940 +               }
72941 +               ret = 0;
72942 +       }
72943 +/*
72944 +       if (*nr_submitted > wbc->nr_to_write)
72945 +               warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
72946 +*/
72947 +       reiser4_txn_restart(ctx);
72948 +
72949 +       return ret;
72950 +}
72951 +
72952 +/* Remove processed nodes from atom's clean list @head (thereby removing them from the transaction). */
72953 +void reiser4_invalidate_list(struct list_head *head)
72954 +{
72955 +       while (!list_empty(head)) {
72956 +               jnode *node;
72957 +
72958 +               node = list_entry(head->next, jnode, capture_link);
72959 +               spin_lock_jnode(node);
72960 +               reiser4_uncapture_block(node);
72961 +               jput(node);
72962 +       }
72963 +}
72964 +
72965 +static void init_wlinks(txn_wait_links * wlinks)
72966 +{
72967 +       wlinks->_lock_stack = get_current_lock_stack();
72968 +       INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
72969 +       INIT_LIST_HEAD(&wlinks->_fwaiting_link);
72970 +       wlinks->waitfor_cb = NULL;
72971 +       wlinks->waiting_cb = NULL;      /* both callbacks start out unset */
72972 +}
72973 +
72974 +/* Queue on @atom's waitfor list and sleep until woken; atom is locked on entry, unlocked on return */
72975 +void reiser4_atom_wait_event(txn_atom * atom)
72976 +{
72977 +       txn_wait_links _wlinks;
72978 +
72979 +       assert_spin_locked(&(atom->alock));
72980 +       assert("nikita-3156",
72981 +              lock_stack_isclean(get_current_lock_stack()) ||
72982 +              atom->nr_running_queues > 0);
72983 +
72984 +       init_wlinks(&_wlinks);
72985 +       list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
72986 +       atomic_inc(&atom->refcount);
72987 +       spin_unlock_atom(atom);
72988 +
72989 +       reiser4_prepare_to_sleep(_wlinks._lock_stack);
72990 +       reiser4_go_to_sleep(_wlinks._lock_stack);
72991 +
72992 +       spin_lock_atom(atom);
72993 +       list_del(&_wlinks._fwaitfor_link);
72994 +       atom_dec_and_unlock(atom);
72995 +}
72996 +
72997 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
72998 +{
72999 +       assert("nikita-3535", atom != NULL);
73000 +       assert_spin_locked(&(atom->alock));
73001 +       assert("nikita-3536", stage <= ASTAGE_INVALID);
73002 +       /* atom stage may only advance, never go back */
73003 +       assert("nikita-3537", stage >= atom->stage);
73004 +       if (atom->stage != stage) {
73005 +               atom->stage = stage;
73006 +               reiser4_atom_send_event(atom);
73007 +       }
73008 +}
73009 +
73010 +/* wake all threads which wait for an event on @atom; atom lock must be held */
73011 +void reiser4_atom_send_event(txn_atom * atom)
73012 +{
73013 +       assert_spin_locked(&(atom->alock));
73014 +       wakeup_atom_waitfor_list(atom);
73015 +}
73016 +
73017 +/* Tells txn manager code whether owner of this txn_handle should wait for atom commit
73018 +   completion, i.e. TXNH_WAIT_COMMIT is set (for example, because it does fsync(2)) */
73019 +static int should_wait_commit(txn_handle * h)
73020 +{
73021 +       return h->flags & TXNH_WAIT_COMMIT;
73022 +}
73023 +
73024 +typedef struct commit_data {
73025 +       txn_atom *atom;
73026 +       txn_handle *txnh;
73027 +       long nr_written;
73028 +       /* as an optimization we start committing atom by first trying to
73029 +        * flush it a few times without switching into ASTAGE_CAPTURE_WAIT.
73030 +        * This reduces stalls due to other threads waiting for atom in
73031 +        * ASTAGE_CAPTURE_WAIT stage. ->preflush is the counter of these
73032 +        * preliminary flushes. */
73033 +       int preflush;
73034 +       /* have we waited on atom (and bumped its nr_waiters). */
73035 +       int wait;
73036 +       int failed;
73037 +       int wake_ktxnmgrd_up;
73038 +} commit_data;
73039 +
73040 +/*
73041 + * Called from commit_txnh() repeatedly, until either error happens, or atom
73042 + * commits successfully.
73043 + */
73044 +static int try_commit_txnh(commit_data * cd)
73045 +{
73046 +       int result;
73047 +
73048 +       assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
73049 +
73050 +       /* Get the atom and txnh locked. */
73051 +       cd->atom = txnh_get_atom(cd->txnh);
73052 +       assert("jmacd-309", cd->atom != NULL);
73053 +       spin_unlock_txnh(cd->txnh);
73054 +
73055 +       if (cd->wait) {
73056 +               cd->atom->nr_waiters--;
73057 +               cd->wait = 0;
73058 +       }
73059 +
73060 +       if (cd->atom->stage == ASTAGE_DONE)
73061 +               return 0;
73062 +
73063 +       if (cd->failed)
73064 +               return 0;
73065 +
73066 +       if (atom_should_commit(cd->atom)) {
73067 +               /* if atom is _very_ large schedule it for commit as soon as
73068 +                * possible. */
73069 +               if (atom_should_commit_asap(cd->atom)) {
73070 +                       /*
73071 +                        * When atom is in PRE_COMMIT or later stage following
73072 +                        * invariant (encoded   in    atom_can_be_committed())
73073 +                        * holds:  there is exactly one non-waiter transaction
73074 +                        * handle opened  on this atom.  When  thread wants to
73075 +                        * wait  until atom  commits (for  example  sync()) it
73076 +                        * waits    on    atom  event     after     increasing
73077 +                        * atom->nr_waiters (see below in  this  function). It
73078 +                        * cannot be guaranteed that atom is already committed
73079 +                        * after    receiving event,  so     loop has   to  be
73080 +                        * re-started. But  if  atom switched into  PRE_COMMIT
73081 +                        * stage and became  too  large, we cannot  change its
73082 +                        * state back   to CAPTURE_WAIT (atom  stage can  only
73083 +                        * increase monotonically), hence this check.
73084 +                        */
73085 +                       if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
73086 +                               reiser4_atom_set_stage(cd->atom,
73087 +                                                      ASTAGE_CAPTURE_WAIT);
73088 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
73089 +               }
73090 +               if (cd->txnh->flags & TXNH_DONT_COMMIT) {
73091 +                       /*
73092 +                        * this  thread (transaction  handle  that is) doesn't
73093 +                        * want to commit  atom. Notify waiters that handle is
73094 +                        * closed. This can happen, for  example, when we  are
73095 +                        * under  VFS directory lock  and don't want to commit
73096 +                        * atom  right   now to  avoid  stalling other threads
73097 +                        * working in the same directory.
73098 +                        */
73099 +
73100 +                       /* Wake  the ktxnmgrd up if  the ktxnmgrd is needed to
73101 +                        * commit this  atom: no  atom  waiters  and only  one
73102 +                        * (our) open transaction handle. */
73103 +                       cd->wake_ktxnmgrd_up =
73104 +                           cd->atom->txnh_count == 1 &&
73105 +                           cd->atom->nr_waiters == 0;
73106 +                       reiser4_atom_send_event(cd->atom);
73107 +                       result = 0;
73108 +               } else if (!atom_can_be_committed(cd->atom)) {
73109 +                       if (should_wait_commit(cd->txnh)) {
73110 +                               /* sync(): wait for commit */
73111 +                               cd->atom->nr_waiters++;
73112 +                               cd->wait = 1;
73113 +                               reiser4_atom_wait_event(cd->atom);
73114 +                               result = RETERR(-E_REPEAT);
73115 +                       } else {
73116 +                               result = 0;
73117 +                       }
73118 +               } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
73119 +                       /*
73120 +                        * optimization: flush  atom without switching it into
73121 +                        * ASTAGE_CAPTURE_WAIT.
73122 +                        *
73123 +                        * But don't  do this for  ktxnmgrd, because  ktxnmgrd
73124 +                        * should never block on atom fusion.
73125 +                        */
73126 +                       result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
73127 +                                                   LONG_MAX, &cd->nr_written,
73128 +                                                   &cd->atom, NULL);
73129 +                       if (result == 0) {
73130 +                               spin_unlock_atom(cd->atom);
73131 +                               cd->preflush = 0;
73132 +                               result = RETERR(-E_REPEAT);
73133 +                       } else  /* Atom wasn't flushed
73134 +                                * completely. Rinse. Repeat. */
73135 +                               --cd->preflush;
73136 +               } else {
73137 +                       /* We change   atom state  to   ASTAGE_CAPTURE_WAIT to
73138 +                          prevent atom fusion and count  ourself as an active
73139 +                          flusher */
73140 +                       reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
73141 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
73142 +
73143 +                       result =
73144 +                           commit_current_atom(&cd->nr_written, &cd->atom);
73145 +                       if (result != 0 && result != -E_REPEAT)
73146 +                               cd->failed = 1;
73147 +               }
73148 +       } else
73149 +               result = 0;
73150 +
73151 +#if REISER4_DEBUG
73152 +       if (result == 0)
73153 +               assert_spin_locked(&(cd->atom->alock));
73154 +#endif
73155 +
73156 +       /* perfectly valid assertion, except that when atom/txnh is not locked
73157 +        * fusion can take place, and cd->atom points nowhere. */
73158 +       /*
73159 +          assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
73160 +        */
73161 +       return result;
73162 +}
73163 +
73164 +/* Called to commit a transaction handle.  This decrements the atom's number of open
73165 +   handles and if it is the last handle to commit and the atom should commit, initiates
73166 +   atom commit. Detaches @txnh from its atom and always returns 0. */
73167 +static int commit_txnh(txn_handle * txnh)
73168 +{
73169 +       commit_data cd;
73170 +       assert("umka-192", txnh != NULL);
73171 +
73172 +       memset(&cd, 0, sizeof cd);
73173 +       cd.txnh = txnh;
73174 +       cd.preflush = 10;
73175 +
73176 +       /* calls try_commit_txnh() until it returns 0 (atom committed, or
73177 +        * nothing more for this handle to do) */
73178 +       while (try_commit_txnh(&cd) != 0)
73179 +               reiser4_preempt_point();
73180 +
73181 +       spin_lock_txnh(txnh);
73182 +
73183 +       cd.atom->txnh_count -= 1;
73184 +       txnh->atom = NULL;
73185 +       /* remove transaction handle from atom's list of transaction handles */
73186 +       list_del_init(&txnh->txnh_link);
73187 +
73188 +       spin_unlock_txnh(txnh);
73189 +       atom_dec_and_unlock(cd.atom);
73190 +       /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
73191 +        * because it takes time) by current thread, we do that work
73192 +        * asynchronously by ktxnmgrd daemon. */
73193 +       if (cd.wake_ktxnmgrd_up)
73194 +               ktxnmgrd_kick(&get_current_super_private()->tmgr);
73195 +
73196 +       return 0;
73197 +}
73198 +
73199 +/* TRY_CAPTURE */
73200 +
73201 +/* This routine attempts a single block-capture request.  It may return -E_REPEAT if some
73202 +   condition indicates that the request should be retried, and it may block if the
73203 +   txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
73204 +
73205 +   This routine encodes the basic logic of block capturing described by:
73206 +
73207 +     http://namesys.com/v4/v4.html
73208 +
73209 +   Our goal here is to ensure that any two blocks that contain dependent modifications
73210 +   should commit at the same time.  This function enforces this discipline by initiating
73211 +   fusion whenever a transaction handle belonging to one atom requests to read or write a
73212 +   block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
73213 +
73214 +   In addition, this routine handles the initial assignment of atoms to blocks and
73215 +   transaction handles.  These are possible outcomes of this function:
73216 +
73217 +   1. The block and handle are already part of the same atom: return immediate success
73218 +
73219 +   2. The block is assigned but the handle is not: call capture_assign_txnh to assign
73220 +      the handle to the block's atom.
73221 +
73222 +   3. The handle is assigned but the block is not: call capture_assign_block to assign
73223 +      the block to the handle's atom.
73224 +
73225 +   4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
73226 +      to fuse atoms.
73227 +
73228 +   5. Neither block nor handle are assigned: create a new atom and assign them both.
73229 +
73230 +   6. A read request for a non-captured block: return immediate success.
73231 +
73232 +   This function acquires and releases the handle's spinlock.  This function is called
73233 +   under the jnode lock and if the return value is 0, it returns with the jnode lock still
73234 +   held.  If the return is -E_REPEAT or some other error condition, the jnode lock is
73235 +   released.  The external interface (reiser4_try_capture) manages re-acquiring the jnode
73236 +   lock in the failure case.
73237 +*/
73238 +static int try_capture_block(
73239 +       txn_handle * txnh, jnode * node, txn_capture mode,
73240 +       txn_atom ** atom_alloc)
73241 +{
73242 +       txn_atom *block_atom;
73243 +       txn_atom *txnh_atom;
73244 +
73245 +       /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
73246 +       assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
73247 +
73248 +       /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
73249 +        * node->tree somewhere. */
73250 +       assert("umka-194", txnh != NULL);
73251 +       assert("umka-195", node != NULL);
73252 +
73253 +       /* The jnode is already locked!  Being called from reiser4_try_capture(). */
73254 +       assert_spin_locked(&(node->guard));
73255 +       block_atom = node->atom;
73256 +
73257 +       /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
73258 +          let us touch the atoms themselves. */
73259 +       spin_lock_txnh(txnh);
73260 +       txnh_atom = txnh->atom;
73261 +       /* Process of capturing continues into one of four branches depending on
73262 +          which atoms from (block atom (node->atom), current atom (txnh->atom))
73263 +          exist. */
73264 +       if (txnh_atom == NULL) {
73265 +               if (block_atom == NULL) {
73266 +                       spin_unlock_txnh(txnh);
73267 +                       spin_unlock_jnode(node);
73268 +                       /* assign empty atom to the txnh and repeat */
73269 +                       return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
73270 +               } else {
73271 +                       atomic_inc(&block_atom->refcount);
73272 +                       /* node spin-lock isn't needed anymore */
73273 +                       spin_unlock_jnode(node);
73274 +                       if (!spin_trylock_atom(block_atom)) {
73275 +                               spin_unlock_txnh(txnh);
73276 +                               spin_lock_atom(block_atom);
73277 +                               spin_lock_txnh(txnh);
73278 +                       }
73279 +                       /* re-check state after getting txnh and the node
73280 +                        * atom spin-locked */
73281 +                       if (node->atom != block_atom || txnh->atom != NULL) {
73282 +                               spin_unlock_txnh(txnh);
73283 +                               atom_dec_and_unlock(block_atom);
73284 +                               return RETERR(-E_REPEAT);
73285 +                       }
73286 +                       atomic_dec(&block_atom->refcount);
73287 +                       if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
73288 +                           (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
73289 +                            block_atom->txnh_count != 0))
73290 +                               return capture_fuse_wait(txnh, block_atom, NULL, mode);
73291 +                       capture_assign_txnh_nolock(block_atom, txnh);
73292 +                       spin_unlock_txnh(txnh);
73293 +                       spin_unlock_atom(block_atom);
73294 +                       return RETERR(-E_REPEAT);
73295 +               }
73296 +       } else {
73297 +               /* It is time to perform deadlock prevention check over the
73298 +                  node we want to capture.  It is possible this node was locked
73299 +                  for read without capturing it. The optimization which allows
73300 +                  to do it helps us in keeping atoms independent as long as
73301 +                  possible but it may cause lock/fuse deadlock problems.
73302 +
73303 +                  A number of similar deadlock situations with locked but not
73304 +                  captured nodes were found.  In each situation there are two
73305 +                  or more threads: one of them does flushing while another one
73306 +                  does routine balancing or tree lookup.  The flushing thread
73307 +                  (F) sleeps in long term locking request for node (N), another
73308 +                  thread (A) sleeps in trying to capture some node already
73309 +                  belonging to the atom of F, and F has a state which prevents
73310 +                  immediate fusion.
73311 +
73312 +                  Deadlocks of this kind cannot happen if node N was properly
73313 +                  captured by thread A. The F thread fuses atoms before locking,
73314 +                  therefore current atom of thread F and current atom of thread
73315 +                  A become the same atom and thread A may proceed.  This does
73316 +                  not work if node N was not captured because the fusion of
73317 +                  atoms does not happen.
73318 +
73319 +                  The following scheme solves the deadlock: If
73320 +                  longterm_lock_znode locks and does not capture a znode, that
73321 +                  znode is marked as MISSED_IN_CAPTURE.  A node marked this way
73322 +                  is processed by the code below which restores the missed
73323 +                  capture and fuses current atoms of all the node lock owners
73324 +                  by calling the fuse_not_fused_lock_owners() function. */
73325 +               if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
73326 +                       JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
73327 +                       if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
73328 +                               spin_unlock_txnh(txnh);
73329 +                               spin_unlock_jnode(node);
73330 +                               fuse_not_fused_lock_owners(txnh, JZNODE(node));
73331 +                               return RETERR(-E_REPEAT);
73332 +                       }
73333 +               }
73334 +               if (block_atom == NULL) {
73335 +                       atomic_inc(&txnh_atom->refcount);
73336 +                       spin_unlock_txnh(txnh);
73337 +                       if (!spin_trylock_atom(txnh_atom)) {
73338 +                               spin_unlock_jnode(node);
73339 +                               spin_lock_atom(txnh_atom);
73340 +                               spin_lock_jnode(node);
73341 +                       }
73342 +                       if (txnh->atom != txnh_atom || node->atom != NULL
73343 +                               || JF_ISSET(node, JNODE_IS_DYING)) {
73344 +                               spin_unlock_jnode(node);
73345 +                               atom_dec_and_unlock(txnh_atom);
73346 +                               return RETERR(-E_REPEAT);
73347 +                       }
73348 +                       atomic_dec(&txnh_atom->refcount);
73349 +                       capture_assign_block_nolock(txnh_atom, node);
73350 +                       spin_unlock_atom(txnh_atom);
73351 +               } else {
73352 +                       if (txnh_atom != block_atom) {
73353 +                               if (mode & TXN_CAPTURE_DONT_FUSE) {
73354 +                                       spin_unlock_txnh(txnh);
73355 +                                       spin_unlock_jnode(node);
73356 +                                       /* we are in a "no-fusion" mode and @node is
73357 +                                        * already part of transaction. */
73358 +                                       return RETERR(-E_NO_NEIGHBOR);
73359 +                               }
73360 +                               return capture_init_fusion(node, txnh, mode);
73361 +                       }
73362 +                       spin_unlock_txnh(txnh);
73363 +               }
73364 +       }
73365 +       return 0;
73366 +}
73367 +
73368 +static txn_capture
73369 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
73370 +{
73371 +       txn_capture cap_mode;
73372 +       /* Returns the capture mode for @node, or 0 if no capture is needed. */
73373 +       assert_spin_locked(&(node->guard));
73374 +
73375 +       /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
73376 +       /* Write locks and nodes already in an atom are captured for write. */
73377 +       if (lock_mode == ZNODE_WRITE_LOCK) {
73378 +               cap_mode = TXN_CAPTURE_WRITE;
73379 +       } else if (node->atom != NULL) {
73380 +               cap_mode = TXN_CAPTURE_WRITE;
73381 +       } else if (0 &&         /* txnh->mode == TXN_READ_FUSING && */
73382 +                  jnode_get_level(node) == LEAF_LEVEL) {
73383 +               /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
73384 +               /* We only need a READ_FUSING capture at the leaf level.  This
73385 +                  is because the internal levels of the tree (twigs included)
73386 +                  are redundant from the point of the user that asked for a
73387 +                  read-fusing transcrash.  The user only wants to read-fuse
73388 +                  atoms due to reading uncommitted data that another user has
73389 +                  written.  It is the file system that reads/writes the
73390 +                  internal tree levels, the user only reads/writes leaves. */
73391 +               cap_mode = TXN_CAPTURE_READ_ATOMIC;
73392 +       } else {
73393 +               /* Read lock of a node that belongs to no atom (at any level, as
73394 +                * the branch above is disabled): there's no reason to capture. */
73395 +               /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
73396 +               return 0;
73397 +       }
73398 +
73399 +       cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
73400 +       assert("nikita-3186", cap_mode != 0);
73401 +       return cap_mode;
73402 +}
73403 +
73404 +/* This is an external interface to try_capture_block(), it calls
73405 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
73406 +
73407 +   @node:         node to capture,
73408 +   @lock_mode:    read or write lock is used in capture mode calculation,
73409 +   @flags:        see txn_capture flags enumeration,
73410 +   @can_coc     : can copy-on-capture
73411 +
73412 +   @return: 0 - node was successfully captured, -E_REPEAT - capture request
73413 +            cannot be processed immediately as it was requested in flags,
73414 +           < 0 - other errors.
73415 +*/
73416 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
73417 +                       txn_capture flags)
73418 +{
73419 +       txn_atom *atom_alloc = NULL;
73420 +       txn_capture cap_mode;
73421 +       txn_handle *txnh = get_current_context()->trans;
73422 +       int ret;
73423 +
73424 +       assert_spin_locked(&(node->guard));
73425 +
73426 +      repeat:
73427 +       if (JF_ISSET(node, JNODE_IS_DYING))
73428 +               return RETERR(-EINVAL);
73429 +       if (node->atom != NULL && txnh->atom == node->atom)
73430 +               return 0;
73431 +       cap_mode = build_capture_mode(node, lock_mode, flags);
73432 +       if (cap_mode == 0 ||
73433 +           (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
73434 +               /* Mark this node as "MISSED".  It helps in further deadlock
73435 +                * analysis */
73436 +               if (jnode_is_znode(node))
73437 +                       JF_SET(node, JNODE_MISSED_IN_CAPTURE);
73438 +               return 0;
73439 +       }
73440 +       /* Repeat try_capture as long as -E_REPEAT is returned. */
73441 +       ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
73442 +       /* Regardless of non_blocking:
73443 +
73444 +          If ret == 0 then jnode is still locked.
73445 +          If ret != 0 then jnode is unlocked.
73446 +        */
73447 +#if REISER4_DEBUG
73448 +       if (ret == 0)
73449 +               assert_spin_locked(&(node->guard));
73450 +       else
73451 +               assert_spin_not_locked(&(node->guard));
73452 +#endif
73453 +       assert_spin_not_locked(&(txnh->guard));
73454 +
73455 +       if (ret == -E_REPEAT) {
73456 +               /* E_REPEAT implies all locks were released, therefore we need
73457 +                  to take the jnode's lock again. */
73458 +               spin_lock_jnode(node);
73459 +
73460 +               /* Although this may appear to be a busy loop, it is not.
73461 +                  There are several conditions that cause E_REPEAT to be
73462 +                  returned by the call to try_capture_block, all cases
73463 +                  indicating some kind of state change that means you should
73464 +                  retry the request and will get a different result.  In some
73465 +                  cases this could be avoided with some extra code, but
73466 +                  generally it is done because the necessary locks were
73467 +                  released as a result of the operation and repeating is the
73468 +                  simplest thing to do (less bug potential).  The cases are:
73469 +                  atom fusion returns E_REPEAT after it completes (jnode and
73470 +                  txnh were unlocked); race conditions in assign_block,
73471 +                  assign_txnh, and init_fusion return E_REPEAT (trylock
73472 +                  failure); after going to sleep in capture_fuse_wait
73473 +                  (request was blocked but may now succeed).  I'm not quite
73474 +                  sure how capture_copy works yet, but it may also return
73475 +                  E_REPEAT.  When the request is legitimately blocked, the
73476 +                  requestor goes to sleep in fuse_wait, so this is not a busy
73477 +                  loop. */
73478 +               /* NOTE-NIKITA: still don't understand:
73479 +
73480 +                  try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
73481 +
73482 +                  looks like busy loop?
73483 +                */
73484 +               goto repeat;
73485 +       }
73486 +
73487 +       /* free extra atom object that was possibly allocated by
73488 +          try_capture_block().
73489 +
73490 +          Do this before acquiring jnode spin lock to
73491 +          minimize time spent under lock. --nikita */
73492 +       if (atom_alloc != NULL) {
73493 +               kmem_cache_free(_atom_slab, atom_alloc);
73494 +       }
73495 +
73496 +       if (ret != 0) {
73497 +               if (ret == -E_BLOCK) {
73498 +                       assert("nikita-3360",
73499 +                              cap_mode & TXN_CAPTURE_NONBLOCKING);
73500 +                       ret = -E_REPEAT;
73501 +               }
73502 +
73503 +               /* Failure means jnode is not locked.  FIXME_LATER_JMACD May
73504 +                  want to fix the above code to avoid releasing the lock and
73505 +                  re-acquiring it, but there are cases were failure occurs
73506 +                  when the lock is not held, and those cases would need to be
73507 +                  modified to re-take the lock. */
73508 +               spin_lock_jnode(node);
73509 +       }
73510 +
73511 +       /* Jnode is still locked. */
73512 +       assert_spin_locked(&(node->guard));
73513 +       return ret;
73514 +}
73515 +
73516 +static void release_two_atoms(txn_atom *one, txn_atom *two)
73517 +{
73518 +       spin_unlock_atom(one);  /* re-taken below, once @two is released */
73519 +       atom_dec_and_unlock(two);
73520 +       spin_lock_atom(one);
73521 +       atom_dec_and_unlock(one);
73522 +}
73523 +
73524 +/* reiser4_try_capture() (above) sets up a call to try_capture_block and repeats as long
73525 +   as -E_REPEAT is returned by that routine.  The txn_capture request mode is computed
73526 +   there depending on the lock request and the capture flags.  It is called from the
73527 +   depths of the lock manager with the jnode lock held and it always returns with the
73528 +   jnode lock held.
73529 +*/
73530 +
73531 +/* Make every other lock owner of @node share an atom with @txnh (assign or fuse). */
73532 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
73533 +{
73534 +       lock_handle *lh;
73535 +       int repeat;
73536 +       txn_atom *atomh, *atomf;
73537 +       reiser4_context *me = get_current_context();
73538 +       reiser4_context *ctx = NULL;
73539 +
73540 +       assert_spin_not_locked(&(ZJNODE(node)->guard));
73541 +       assert_spin_not_locked(&(txnh->hlock));
73542 +
73543 + repeat:
73544 +       repeat = 0;
73545 +       atomh = txnh_get_atom(txnh);
73546 +       spin_unlock_txnh(txnh);
73547 +       assert("zam-692", atomh != NULL);
73548 +
73549 +       spin_lock_zlock(&node->lock);
73550 +       /* inspect list of lock owners */
73551 +       list_for_each_entry(lh, &node->lock.owners, owners_link) {
73552 +               ctx = get_context_by_lock_stack(lh->owner);
73553 +               if (ctx == me)
73554 +                       continue;
73555 +               /* below we use two assumptions to avoid additional spin-locks
73556 +                  for checking the condition :
73557 +
73558 +                  1) if the lock stack has lock, the transaction should be
73559 +                  opened, i.e. ctx->trans != NULL;
73560 +
73561 +                  2) reading of well-aligned ctx->trans->atom is atomic, if it
73562 +                  equals to the address of spin-locked atomh, we take that
73563 +                  the atoms are the same, nothing has to be captured. */
73564 +               if (atomh != ctx->trans->atom) {
73565 +                       reiser4_wake_up(lh->owner);
73566 +                       repeat = 1;
73567 +                       break;
73568 +               }
73569 +       }
73570 +       if (repeat) {
73571 +               if (!spin_trylock_txnh(ctx->trans)) {
73572 +                       spin_unlock_zlock(&node->lock);
73573 +                       spin_unlock_atom(atomh);
73574 +                       goto repeat;
73575 +               }
73576 +               atomf = ctx->trans->atom;
73577 +               if (atomf == NULL) {
73578 +                       capture_assign_txnh_nolock(atomh, ctx->trans);
73579 +                       /* release zlock lock _after_ assigning the atom to the
73580 +                        * transaction handle, otherwise the lock owner thread
73581 +                        * may unlock all znodes, exit kernel context and here
73582 +                        * we would access an invalid transaction handle. */
73583 +                       spin_unlock_zlock(&node->lock);
73584 +                       spin_unlock_atom(atomh);
73585 +                       spin_unlock_txnh(ctx->trans);
73586 +                       goto repeat;
73587 +               }
73588 +               assert("zam-1059", atomf != atomh);
73589 +               spin_unlock_zlock(&node->lock);
73590 +               atomic_inc(&atomh->refcount);
73591 +               atomic_inc(&atomf->refcount);
73592 +               spin_unlock_txnh(ctx->trans);
73593 +               if (atomf > atomh) { /* lower-addressed atom is locked first */
73594 +                       spin_lock_atom_nested(atomf);
73595 +               } else {
73596 +                       spin_unlock_atom(atomh);
73597 +                       spin_lock_atom(atomf);
73598 +                       spin_lock_atom_nested(atomh);
73599 +               }
73600 +               if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
73601 +                       release_two_atoms(atomf, atomh);
73602 +                       goto repeat;
73603 +               }
73604 +               atomic_dec(&atomh->refcount);
73605 +               atomic_dec(&atomf->refcount);
73606 +               capture_fuse_into(atomf, atomh);
73607 +               goto repeat;
73608 +       }
73609 +       spin_unlock_zlock(&node->lock);
73610 +       spin_unlock_atom(atomh);
73611 +}
73612 +
73613 +/* This is the interface to capture unformatted nodes via their struct page
73614 +   reference (@pg must be locked). Currently only used in reiser4_invalidatepage */
73615 +int try_capture_page_to_invalidate(struct page *pg)
73616 +{
73617 +       int ret;
73618 +       jnode *node;
73619 +
73620 +       assert("umka-292", pg != NULL);
73621 +       assert("nikita-2597", PageLocked(pg));
73622 +
73623 +       if (IS_ERR(node = jnode_of_page(pg))) {
73624 +               return PTR_ERR(node);
73625 +       }
73626 +
73627 +       spin_lock_jnode(node);
73628 +       unlock_page(pg);
73629 +       /* the page stays unlocked while capturing and is re-locked below */
73630 +       ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
73631 +       spin_unlock_jnode(node);
73632 +       jput(node);
73633 +       lock_page(pg);
73634 +       return ret;
73635 +}
73636 +
73637 +/* This informs the transaction manager when a node is deleted.  Add the block to the
73638 +   atom's delete set and uncapture the block.  @pg must be locked by the caller.
73639 +
73640 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
73641 +explanations.  find all the functions that use it, and unless there is some very
73642 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
73643 +move the loop to inside the function.
73644 +
73645 +VS-FIXME-HANS: can this code be at all streamlined?  In particular, can you lock and unlock the jnode fewer times?
73646 +  */
73647 +void reiser4_uncapture_page(struct page *pg)
73648 +{
73649 +       jnode *node;
73650 +       txn_atom *atom;
73651 +
73652 +       assert("umka-199", pg != NULL);
73653 +       assert("nikita-3155", PageLocked(pg));
73654 +
73655 +       clear_page_dirty_for_io(pg);
73656 +
73657 +       reiser4_wait_page_writeback(pg);
73658 +
73659 +       node = jprivate(pg);
73660 +       BUG_ON(node == NULL);
73661 +
73662 +       spin_lock_jnode(node);
73663 +
73664 +       atom = jnode_get_atom(node);
73665 +       if (atom == NULL) {
73666 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
73667 +               spin_unlock_jnode(node);
73668 +               return;
73669 +       }
73670 +
73671 +       /* We can remove jnode from transaction even if it is on flush queue
73672 +        * prepped list, we only need to be sure that flush queue is not being
73673 +        * written by reiser4_write_fq().  reiser4_write_fq() does not use atom
73674 +        * spin lock for protection of the prepped nodes list, instead
73675 +        * write_fq() increments atom's nr_running_queues counters for the time
73676 +        * when prepped list is not protected by spin lock.  Here we check this
73677 +        * counter if we want to remove jnode from flush queue and, if the
73678 +        * counter is not zero, wait for all reiser4_write_fq() for this atom to
73679 +        * complete. This is not significant overhead. */
73680 +       while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
73681 +               spin_unlock_jnode(node);
73682 +               /*
73683 +                * at this moment we want to wait for "atom event", viz. wait
73684 +                * until @node can be removed from flush queue. But
73685 +                * reiser4_atom_wait_event() cannot be called with page locked,
73686 +                * because it deadlocks with jnode_extent_write(). Unlock page,
73687 +                * after making sure (through page_cache_get()) that it cannot
73688 +                * be released from memory.
73689 +                */
73690 +               page_cache_get(pg);
73691 +               unlock_page(pg);
73692 +               reiser4_atom_wait_event(atom);
73693 +               lock_page(pg);
73694 +               /*
73695 +                * page may have been detached by ->writepage()->releasepage().
73696 +                */
73697 +               reiser4_wait_page_writeback(pg);
73698 +               spin_lock_jnode(node);
73699 +               page_cache_release(pg);
73700 +               atom = jnode_get_atom(node);
73701 +/* VS-FIXME-HANS: improve the commenting in this function */
73702 +               if (atom == NULL) {
73703 +                       spin_unlock_jnode(node);
73704 +                       return;
73705 +               }
73706 +       }
73707 +       reiser4_uncapture_block(node);
73708 +       spin_unlock_atom(atom);
73709 +       jput(node);
73710 +}
73711 +
73712 +/* This is used in extent's kill hook to uncapture and unhash jnodes attached to
73713 + * inode's tree of jnodes.  Caller holds the jnode spin lock. */
73714 +void reiser4_uncapture_jnode(jnode * node)
73715 +{
73716 +       txn_atom *atom;
73717 +
73718 +       assert_spin_locked(&(node->guard));
73719 +       assert("", node->pg == 0);
73720 +
73721 +       atom = jnode_get_atom(node);
73722 +       if (atom == NULL) {
73723 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
73724 +               spin_unlock_jnode(node);
73725 +               return;
73726 +       }
73727 +
73728 +       reiser4_uncapture_block(node);
73729 +       spin_unlock_atom(atom);
73730 +       jput(node);
73731 +}
73732 +
73733 +/* No-locking version of assign_txnh: caller holds the txnh and atom spin locks.
73734 +   Sets txnh->atom, increases atom refcount and txnh_count, adds to txnh_list. */
73735 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
73736 +{
73737 +       assert("umka-200", atom != NULL);
73738 +       assert("umka-201", txnh != NULL);
73739 +
73740 +       assert_spin_locked(&(txnh->hlock));
73741 +       assert_spin_locked(&(atom->alock));
73742 +       assert("jmacd-824", txnh->atom == NULL);
73743 +       assert("nikita-3540", atom_isopen(atom));
73744 +       BUG_ON(txnh->atom != NULL);
73745 +
73746 +       atomic_inc(&atom->refcount);
73747 +       txnh->atom = atom;
73748 +       reiser4_ctx_gfp_mask_set();
73749 +       list_add_tail(&txnh->txnh_link, &atom->txnh_list);
73750 +       atom->txnh_count += 1;
73751 +}
73752 +
73753 +/* No-locking version of assign_block.  Sets the block's atom pointer, references the
73754 +   block, adds it to the atom's clean list, increments capture_count. */
73755 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
73756 +{
73757 +       assert("umka-202", atom != NULL);
73758 +       assert("umka-203", node != NULL);
73759 +       assert_spin_locked(&(node->guard));
73760 +       assert_spin_locked(&(atom->alock));
73761 +       assert("jmacd-323", node->atom == NULL);
73762 +       BUG_ON(!list_empty_careful(&node->capture_link));
73763 +       assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
73764 +
73765 +       /* Pointer from jnode to atom is not counted in atom->refcount. */
73766 +       node->atom = atom;
73767 +
73768 +       list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
73769 +       atom->capture_count += 1;
73770 +       /* reference to jnode is acquired by atom. */
73771 +       jref(node);
73772 +
73773 +       ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
73774 +
73775 +       LOCK_CNT_INC(t_refs);
73776 +}
73777 +
73778 +/* Common code for dirtying both unformatted jnodes and formatted znodes. */
73779 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
73780 +{
73781 +       assert_spin_locked(&(node->guard));
73782 +       assert_spin_locked(&(atom->alock));
73783 +       assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
73784 +
73785 +       JF_SET(node, JNODE_DIRTY);
73786 +
73787 +       get_current_context()->nr_marked_dirty++;
73788 +
73789 +       /* We grab2flush_reserve one additional block only if node is a
73790 +          leaf (and not a cluster page), was not CREATED and jnode_flush
73791 +          did not sort it into either relocate or overwrite set. If node
73792 +          is in overwrite or relocate set we assume that atom's flush
73793 +          reserved counter was already adjusted. */
73794 +       if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
73795 +           && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
73796 +           && !jnode_is_cluster_page(node)) {
73797 +               assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
73798 +               assert("vs-1506", *jnode_get_block(node) != 0);
73799 +               grabbed2flush_reserved_nolock(atom, (__u64) 1);
73800 +               JF_SET(node, JNODE_FLUSH_RESERVED);
73801 +       }
73802 +
73803 +       if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73804 +               /* If the atom is not set yet, it will be added to the appropriate list in
73805 +                  capture_assign_block_nolock. */
73806 +               /* Sometimes a node is set dirty before being captured -- the case for new
73807 +                  jnodes.  In that case the jnode will be added to the appropriate list
73808 +                  in capture_assign_block_nolock. Another reason not to re-link jnode is
73809 +                  that jnode is on a flush queue (see flush.c for details) */
73810 +
73811 +               int level = jnode_get_level(node);
73812 +
73813 +               assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
73814 +               assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
73815 +               assert("nikita-2607", 0 <= level);
73816 +               assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
73817 +
73818 +               /* move node to atom's dirty list */
73819 +               list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
73820 +               ON_DEBUG(count_jnode
73821 +                        (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
73822 +       }
73823 +}
73824 +
73825 +/* Mark this (spin locked) jnode dirty; the jnode must belong to an atom. */
73826 +void jnode_make_dirty_locked(jnode * node)
73827 +{
73828 +       assert("umka-204", node != NULL);
73829 +       assert_spin_locked(&(node->guard));
73830 +
73831 +       if (REISER4_DEBUG && rofs_jnode(node)) {
73832 +               warning("nikita-3365", "Dirtying jnode on rofs");
73833 +               dump_stack();
73834 +       }
73835 +
73836 +       /* Fast check for already dirty node */
73837 +       if (!JF_ISSET(node, JNODE_DIRTY)) {
73838 +               txn_atom *atom;
73839 +
73840 +               atom = jnode_get_atom(node);
73841 +               assert("vs-1094", atom);
73842 +               /* Check jnode dirty status again because node spin lock might
73843 +                * be released inside jnode_get_atom(). */
73844 +               if (likely(!JF_ISSET(node, JNODE_DIRTY)))
73845 +                       do_jnode_make_dirty(node, atom);
73846 +               spin_unlock_atom(atom);
73847 +       }
73848 +}
73849 +
73850 +/* Set the dirty status for this (write locked) znode and bump its version. */
73851 +void znode_make_dirty(znode * z)
73852 +{
73853 +       jnode *node;
73854 +       struct page *page;
73855 +
73856 +       assert("umka-204", z != NULL);
73857 +       assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
73858 +       assert("nikita-3560", znode_is_write_locked(z));
73859 +
73860 +       node = ZJNODE(z);
73861 +       /* znode is longterm locked, we can check dirty bit without spinlock */
73862 +       if (JF_ISSET(node, JNODE_DIRTY)) {
73863 +               /* znode is dirty already. All we have to do is to change znode version */
73864 +               z->version = znode_build_version(jnode_get_tree(node));
73865 +               return;
73866 +       }
73867 +
73868 +       spin_lock_jnode(node);
73869 +       jnode_make_dirty_locked(node);
73870 +       page = jnode_page(node);
73871 +       if (page != NULL) {
73872 +               /* this is a useful assertion (allows one to check that no
73873 +                * modifications are lost due to update of in-flight page),
73874 +                * but it requires locking on page to check PG_writeback
73875 +                * bit. */
73876 +               /* assert("nikita-3292",
73877 +                  !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
73878 +               page_cache_get(page);
73879 +
73880 +               /* jnode lock is not needed for the rest of
73881 +                * znode_make_dirty(). */
73882 +               spin_unlock_jnode(node);
73883 +               /* reiser4 file write code calls set_page_dirty for
73884 +                * unformatted nodes, for formatted nodes we do it here. */
73885 +               reiser4_set_page_dirty_internal(page);
73886 +               page_cache_release(page);
73887 +               /* bump version counter in znode */
73888 +               z->version = znode_build_version(jnode_get_tree(node));
73889 +       } else {
73890 +               assert("zam-596", znode_above_root(JZNODE(node)));
73891 +               spin_unlock_jnode(node);
73892 +       }
73893 +
73894 +       assert("nikita-1900", znode_is_write_locked(z));
73895 +       assert("jmacd-9777", node->atom != NULL);
73896 +}
73897 +
73898 +int reiser4_sync_atom(txn_atom * atom)
73899 +{
73900 +       int result;
73901 +       txn_handle *txnh;
73902 +
73903 +       txnh = get_current_context()->trans;
73904 +       /* NOTE(review): @atom, when not NULL, appears to arrive spin locked */
73905 +       result = 0;
73906 +       if (atom != NULL) {
73907 +               if (atom->stage < ASTAGE_PRE_COMMIT) {
73908 +                       spin_lock_txnh(txnh);
73909 +                       capture_assign_txnh_nolock(atom, txnh);
73910 +                       result = force_commit_atom(txnh);
73911 +               } else if (atom->stage < ASTAGE_POST_COMMIT) {
73912 +                       /* wait atom commit */
73913 +                       reiser4_atom_wait_event(atom);
73914 +                       /* try once more */
73915 +                       result = RETERR(-E_REPEAT);
73916 +               } else
73917 +                       spin_unlock_atom(atom);
73918 +       }
73919 +       return result;
73920 +}
73921 +
73922 +#if REISER4_DEBUG
73923 +
73924 +/* move jnode from one list to another
73925 +   call this after atom->capture_count is updated */
73926 +void
73927 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
73928 +           atom_list new_list, int check_lists)
73929 +{
73930 +       struct list_head *pos;
73931 +
73932 +       assert("zam-1018", atom_is_protected(atom));
73933 +       assert_spin_locked(&(node->guard));
73934 +       assert("", NODE_LIST(node) == old_list);
73935 +
73936 +       switch (NODE_LIST(node)) {
73937 +       case NOT_CAPTURED:
73938 +               break;
73939 +       case DIRTY_LIST:
73940 +               assert("", atom->dirty > 0);
73941 +               atom->dirty--;
73942 +               break;
73943 +       case CLEAN_LIST:
73944 +               assert("", atom->clean > 0);
73945 +               atom->clean--;
73946 +               break;
73947 +       case FQ_LIST:
73948 +               assert("", atom->fq > 0);
73949 +               atom->fq--;
73950 +               break;
73951 +       case WB_LIST:
73952 +               assert("", atom->wb > 0);
73953 +               atom->wb--;
73954 +               break;
73955 +       case OVRWR_LIST:
73956 +               assert("", atom->ovrwr > 0);
73957 +               atom->ovrwr--;
73958 +               break;
73959 +       default:
73960 +               impossible("", "");
73961 +       }
73962 +
73963 +       switch (new_list) {
73964 +       case NOT_CAPTURED:
73965 +               break;
73966 +       case DIRTY_LIST:
73967 +               atom->dirty++;
73968 +               break;
73969 +       case CLEAN_LIST:
73970 +               atom->clean++;
73971 +               break;
73972 +       case FQ_LIST:
73973 +               atom->fq++;
73974 +               break;
73975 +       case WB_LIST:
73976 +               atom->wb++;
73977 +               break;
73978 +       case OVRWR_LIST:
73979 +               atom->ovrwr++;
73980 +               break;
73981 +       default:
73982 +               impossible("", "");
73983 +       }
73984 +       ASSIGN_NODE_LIST(node, new_list);
73985 +       if (0 && check_lists) { /* the list walk below is disabled */
73986 +               int count;
73987 +               tree_level level;
73988 +
73989 +               count = 0;
73990 +
73991 +               /* flush queue list */
73992 +               /* reiser4_check_fq(atom); */
73993 +
73994 +               /* dirty list */
73995 +               count = 0;
73996 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73997 +                       list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
73998 +                               count++;
73999 +               }
74000 +               if (count != atom->dirty)
74001 +                       warning("", "dirty counter %d, real %d\n", atom->dirty,
74002 +                               count);
74003 +
74004 +               /* clean list */
74005 +               count = 0;
74006 +               list_for_each(pos, ATOM_CLEAN_LIST(atom))
74007 +                       count++;
74008 +               if (count != atom->clean)
74009 +                       warning("", "clean counter %d, real %d\n", atom->clean,
74010 +                               count);
74011 +
74012 +               /* wb list */
74013 +               count = 0;
74014 +               list_for_each(pos, ATOM_WB_LIST(atom))
74015 +                       count++;
74016 +               if (count != atom->wb)
74017 +                       warning("", "wb counter %d, real %d\n", atom->wb,
74018 +                               count);
74019 +
74020 +               /* overwrite list */
74021 +               count = 0;
74022 +               list_for_each(pos, ATOM_OVRWR_LIST(atom))
74023 +                       count++;
74024 +
74025 +               if (count != atom->ovrwr)
74026 +                       warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
74027 +                               count);
74028 +       }
74029 +       assert("vs-1624", atom->num_queued == atom->fq);
74030 +       if (atom->capture_count !=
74031 +           atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
74032 +               printk
74033 +                   ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
74034 +                    atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
74035 +                    atom->wb, atom->fq);
74036 +               assert("vs-1622",
74037 +                      atom->capture_count ==
74038 +                      atom->dirty + atom->clean + atom->ovrwr + atom->wb +
74039 +                      atom->fq);
74040 +       }
74041 +}
74042 +
74043 +#endif
74044 +
74045 +/* Mark dirty node OVRWR and move it to its atom's overwrite list.  The atom lock
74046 + * and jnode lock should be taken before calling this function. */
74047 +void jnode_make_wander_nolock(jnode * node)
74048 +{
74049 +       txn_atom *atom;
74050 +
74051 +       assert("nikita-2431", node != NULL);
74052 +       assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
74053 +       assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
74054 +       assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
74055 +       assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
74056 +
74057 +       atom = node->atom;
74058 +
74059 +       assert("zam-895", atom != NULL);
74060 +       assert("zam-894", atom_is_protected(atom));
74061 +
74062 +       JF_SET(node, JNODE_OVRWR);
74063 +       /* move node to atom's overwrite list */
74064 +       list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
74065 +       ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
74066 +}
74067 +
74068 +/* Same as jnode_make_wander_nolock(), but locks the jnode and obtains the atom
74069 + * (jnode_get_atom()) itself; atom and jnode are unlocked before returning. */
74070 +void jnode_make_wander(jnode * node)
74071 +{
74072 +       txn_atom *atom;
74073 +
74074 +       spin_lock_jnode(node);
74075 +       atom = jnode_get_atom(node);
74076 +       assert("zam-913", atom != NULL);
74077 +       assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
74078 +
74079 +       jnode_make_wander_nolock(node);
74080 +       spin_unlock_atom(atom);
74081 +       spin_unlock_jnode(node);
74082 +}
74083 +
74084 +/* Set the RELOC bit on dirty @node; node->guard must be held. @fq is not used here. */
74085 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
74086 +{
74087 +       assert_spin_locked(&(node->guard));
74088 +       assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
74089 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
74090 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
74091 +       assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
74092 +       assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
74093 +       jnode_set_reloc(node);
74094 +}
74095 +
74096 +/* Make znode RELOC and put it on flush queue @fq; all locking is done inside. */
74097 +void znode_make_reloc(znode * z, flush_queue_t * fq)
74098 +{
74099 +       jnode *node;
74100 +       txn_atom *atom;
74101 +
74102 +       node = ZJNODE(z);
74103 +       spin_lock_jnode(node);
74104 +
74105 +       atom = jnode_get_atom(node);
74106 +       assert("zam-919", atom != NULL);
74107 +
74108 +       jnode_make_reloc_nolock(fq, node);
74109 +       queue_jnode(fq, node);
74110 +
74111 +       spin_unlock_atom(atom);
74112 +       spin_unlock_jnode(node);
74113 +
74114 +}
74115 +
74116 +/* Make unformatted @node RELOC and put it on flush queue @fq; takes no locks here. */
74117 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
74118 +{
74119 +       assert("vs-1479", jnode_is_unformatted(node));
74120 +
74121 +       jnode_make_reloc_nolock(fq, node);
74122 +       queue_jnode(fq, node);
74123 +}
74124 +
74125 +/* Take the uber znode under a write lock, grab one reserved block for the
74126 + * superblock and mark the uber znode dirty.  Returns 0 or an error code. */
74127 +int reiser4_capture_super_block(struct super_block *s)
74128 +{
74129 +       int result;
74130 +       lock_handle lh;
74131 +
74132 +       init_lh(&lh);
74133 +       result = get_uber_znode(reiser4_get_tree(s),
74134 +                               ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
74135 +       if (result)
74136 +               return result;
74137 +
74138 +       /* Grabbing one block for superblock */
74139 +       result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
74140 +       if (result == 0)
74141 +               znode_make_dirty(lh.node);
74142 +
74143 +       /* Release the lock handle on both paths: returning early on a failed
74144 +        * grab would leave the uber znode write-locked via @lh. */
74145 +       done_lh(&lh);
74146 +       return result;
74147 +}
74148 +
74149 +/* Wake handles on the atom's WAITFOR list whose waitfor_cb is NULL or returns !0 */
74150 +static void wakeup_atom_waitfor_list(txn_atom * atom)
74151 +{
74152 +       txn_wait_links *wlinks;
74153 +
74154 +       assert("umka-210", atom != NULL);
74155 +
74156 +       /* atom is locked (no lock is taken in this function) */
74157 +       list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
74158 +               if (wlinks->waitfor_cb == NULL ||
74159 +                   wlinks->waitfor_cb(atom, wlinks))
74160 +                       /* Wake up. */
74161 +                       reiser4_wake_up(wlinks->_lock_stack);
74162 +       }
74163 +}
74164 +
74165 +/* Wake handles on the atom's WAITING list whose waiting_cb is NULL or returns !0 */
74166 +static void wakeup_atom_waiting_list(txn_atom * atom)
74167 +{
74168 +       txn_wait_links *wlinks;
74169 +
74170 +       assert("umka-211", atom != NULL);
74171 +
74172 +       /* atom is locked (no lock is taken in this function) */
74173 +       list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
74174 +               if (wlinks->waiting_cb == NULL ||
74175 +                   wlinks->waiting_cb(atom, wlinks))
74176 +                       /* Wake up. */
74177 +                       reiser4_wake_up(wlinks->_lock_stack);
74178 +       }
74179 +}
74180 +
74181 +/* waitfor_cb set by capture_fuse_wait() to avoid "spurious wake-ups"; @wlinks unused */
74182 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
74183 +{
74184 +       assert("nikita-3330", atom != NULL);
74185 +       assert_spin_locked(&(atom->alock));
74186 +       /* Return nonzero (wake the waiter) when @atom is not in the CAPTURE_WAIT stage. */
74187 +       /* atom->txnh_count == 1 is for waking waiters up if we are releasing
74188 +        * last transaction handle. */
74189 +       return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
74190 +}
74191 +
74192 +/* The general purpose of this function is to wait on the first of two possible events.
74193 +   The situation is that a handle (and its atom atomh) is blocked trying to capture a
74194 +   block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state.  The
74195 +   handle's atom (atomh) is not in the CAPTURE_WAIT state.  However, atomh could fuse with
74196 +   another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
74197 +   needs to unblock the handle to avoid deadlock.  When the txnh is unblocked it will
74198 +   proceed and fuse the two atoms in the CAPTURE_WAIT state.
74199 +
74200 +   In other words, if either atomh or atomf change state, the handle will be awakened,
74201 +   thus there are two lists per atom: WAITING and WAITFOR.
74202 +
74203 +   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
74204 +   close but it is not assigned to an atom of its own.
74205 +
74206 +   Locking: TXNH_LOCK and the atom locks (atomf, plus atomh if not NULL) are held on
74207 +   entry and all released on return; no jnode lock is touched here.  Returns -E_BLOCK
74208 +   for NONBLOCKING requests, else -E_REPEAT after sleeping (or the prepare error). */
74209 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
74210 +                   txn_atom * atomh, txn_capture mode)
74211 +{
74212 +       int ret;
74213 +       txn_wait_links wlinks;
74214 +
74215 +       assert("umka-213", txnh != NULL);
74216 +       assert("umka-214", atomf != NULL);
74217 +
74218 +       if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
74219 +               spin_unlock_txnh(txnh);
74220 +               spin_unlock_atom(atomf);
74221 +
74222 +               if (atomh) {
74223 +                       spin_unlock_atom(atomh);
74224 +               }
74225 +
74226 +               return RETERR(-E_BLOCK);
74227 +       }
74228 +
74229 +       /* Initialize the waiting list links. */
74230 +       init_wlinks(&wlinks);
74231 +
74232 +       /* Add txnh to atomf's waitfor list, take a ref on atomf, unlock atomf. */
74233 +       list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
74234 +       wlinks.waitfor_cb = wait_for_fusion;
74235 +       atomic_inc(&atomf->refcount);
74236 +       spin_unlock_atom(atomf);
74237 +
74238 +       if (atomh) {
74239 +               /* Add txnh to atomh's waiting list, take a ref, unlock atomh. */
74240 +               list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
74241 +               atomic_inc(&atomh->refcount);
74242 +               spin_unlock_atom(atomh);
74243 +       }
74244 +
74245 +       /* Go to sleep. */
74246 +       spin_unlock_txnh(txnh);
74247 +
74248 +       ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
74249 +       if (ret == 0) {
74250 +               reiser4_go_to_sleep(wlinks._lock_stack);
74251 +               ret = RETERR(-E_REPEAT);
74252 +       }
74253 +
74254 +       /* Remove from the waitfor list and drop the reference taken above. */
74255 +       spin_lock_atom(atomf);
74256 +
74257 +       list_del(&wlinks._fwaitfor_link);
74258 +       atom_dec_and_unlock(atomf);
74259 +
74260 +       if (atomh) {
74261 +               /* Remove from the waiting list, drop the reference. */
74262 +               spin_lock_atom(atomh);
74263 +               list_del(&wlinks._fwaiting_link);
74264 +               atom_dec_and_unlock(atomh);
74265 +       }
74266 +       return ret;
74267 +}
74268 +
74269 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
74270 +{
74271 +       assert("zam-1067", one != two);
74272 +       /* Take both atom locks; ordering by address gives every caller the same order. */
74273 +       /* lock the atom with lesser address first */
74274 +       if (one < two) {
74275 +               spin_lock_atom(one);
74276 +               spin_lock_atom_nested(two);
74277 +       } else {
74278 +               spin_lock_atom(two);
74279 +               spin_lock_atom_nested(one);
74280 +       }
74281 +}
74282 +
74283 +/* Perform the necessary work to prepare for fusing two atoms, which involves
74284 + * acquiring two atom locks in the proper order.  If either ->atom pointer has
74285 + * changed by the time both locks are held, -E_REPEAT is returned.  The handle's
74286 + * atom is fused into the node's atom (capture_fuse_into) when its stage is not
74287 + * less than the node's atom stage, or when the node's atom is in CAPTURE_WAIT
74288 + * with no open handles; this also returns -E_REPEAT.  Otherwise the handle's
74289 + * request is put to sleep in capture_fuse_wait().
74290 + */
74291 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
74292 +{
74293 +       txn_atom * txnh_atom = txnh->atom;
74294 +       txn_atom * block_atom = node->atom;
74295 +
74296 +       atomic_inc(&txnh_atom->refcount);
74297 +       atomic_inc(&block_atom->refcount);
74298 +
74299 +       spin_unlock_txnh(txnh);
74300 +       spin_unlock_jnode(node);
74301 +
74302 +       lock_two_atoms(txnh_atom, block_atom);
74303 +
74304 +       if (txnh->atom != txnh_atom || node->atom != block_atom ) {
74305 +               release_two_atoms(txnh_atom, block_atom);
74306 +               return RETERR(-E_REPEAT);
74307 +       }
74308 +
74309 +       atomic_dec(&txnh_atom->refcount);
74310 +       atomic_dec(&block_atom->refcount);
74311 +
74312 +       assert ("zam-1066", atom_isopen(txnh_atom));
74313 +
74314 +       if (txnh_atom->stage >= block_atom->stage ||
74315 +           (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
74316 +               capture_fuse_into(txnh_atom, block_atom);
74317 +               return RETERR(-E_REPEAT);
74318 +       }
74319 +       spin_lock_txnh(txnh);
74320 +       return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
74321 +}
74322 +
74323 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
74324 +   the small list to point to the large atom.  Returns the number of jnodes moved. */
74325 +static int
74326 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
74327 +                        struct list_head *small_head)
74328 +{
74329 +       int count = 0;
74330 +       jnode *node;
74331 +
74332 +       assert("umka-218", large != NULL);
74333 +       assert("umka-219", large_head != NULL);
74334 +       assert("umka-220", small_head != NULL);
74335 +       /* small atom should be locked also. */
74336 +       assert_spin_locked(&(large->alock));
74337 +
74338 +       /* For every jnode on small's capture list... */
74339 +       list_for_each_entry(node, small_head, capture_link) {
74340 +               count += 1;
74341 +
74342 +               /* With the jnode lock held, update atom pointer. */
74343 +               spin_lock_jnode(node);
74344 +               node->atom = large;
74345 +               spin_unlock_jnode(node);
74346 +       }
74347 +
74348 +       /* Splice small's list after the last entry of large's; small_head ends up empty. */
74349 +       list_splice_init(small_head, large_head->prev);
74350 +
74351 +       return count;
74352 +}
74353 +
74354 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
74355 +   the small list to point to the large atom.  Returns the number of handles moved. */
74356 +static int
74357 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
74358 +                       struct list_head *small_head)
74359 +{
74360 +       int count = 0;
74361 +       txn_handle *txnh;
74362 +
74363 +       assert("umka-221", large != NULL);
74364 +       assert("umka-222", large_head != NULL);
74365 +       assert("umka-223", small_head != NULL);
74366 +
74367 +       /* Adjust every txnh to the new atom. */
74368 +       list_for_each_entry(txnh, small_head, txnh_link) {
74369 +               count += 1;
74370 +
74371 +               /* With the txnh lock held, update atom pointer. */
74372 +               spin_lock_txnh(txnh);
74373 +               txnh->atom = large;
74374 +               spin_unlock_txnh(txnh);
74375 +       }
74376 +
74377 +       /* Splice small's handles after the last entry of large's; small_head ends up empty. */
74378 +       list_splice_init(small_head, large_head->prev);
74379 +
74380 +       return count;
74381 +}
74382 +
74383 +/* This function fuses two atoms.  The captured nodes and handles belonging to SMALL are
74384 +   added to LARGE and their ->atom pointers are all updated.  The associated counts are
74385 +   updated as well, and any waiting handles belonging to either are awakened.  Both atoms
74386 +   must be locked and open on entry; both locks are dropped and SMALL's refcount is
74387 +   decremented (atom_dec_and_unlock) before returning. */
74388 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
74389 +{
74390 +       int level;
74391 +       unsigned zcount = 0;
74392 +       unsigned tcount = 0;
74393 +
74394 +       assert("umka-224", small != NULL);
74395 +       assert("umka-225", large != NULL);
74396 +
74397 +       assert_spin_locked(&(large->alock));
74398 +       assert_spin_locked(&(small->alock));
74399 +
74400 +       assert("jmacd-201", atom_isopen(small));
74401 +       assert("jmacd-202", atom_isopen(large));
74402 +
74403 +       /* Splice and update the per-level dirty jnode lists */
74404 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
74405 +               zcount +=
74406 +                   capture_fuse_jnode_lists(large,
74407 +                                            ATOM_DIRTY_LIST(large, level),
74408 +                                            ATOM_DIRTY_LIST(small, level));
74409 +       }
74410 +
74411 +       /* Splice and update the clean, overwrite, writeback, inode and txnh lists */
74412 +       zcount +=
74413 +           capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
74414 +                                    ATOM_CLEAN_LIST(small));
74415 +       zcount +=
74416 +           capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
74417 +                                    ATOM_OVRWR_LIST(small));
74418 +       zcount +=
74419 +           capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
74420 +                                    ATOM_WB_LIST(small));
74421 +       zcount +=
74422 +           capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
74423 +       tcount +=
74424 +           capture_fuse_txnh_lists(large, &large->txnh_list,
74425 +                                   &small->txnh_list);
74426 +
74427 +       /* Check our accounting. */
74428 +       assert("jmacd-1063",
74429 +              zcount + small->num_queued == small->capture_count);
74430 +       assert("jmacd-1065", tcount == small->txnh_count);
74431 +
74432 +       /* sum numbers of waiters threads */
74433 +       large->nr_waiters += small->nr_waiters;
74434 +       small->nr_waiters = 0;
74435 +
74436 +       /* splice flush queues */
74437 +       reiser4_fuse_fq(large, small);
74438 +
74439 +       /* update the per-list jnode counters of both atoms (debug only) */
74440 +       ON_DEBUG(large->dirty += small->dirty;
74441 +                small->dirty = 0;
74442 +                large->clean += small->clean;
74443 +                small->clean = 0;
74444 +                large->ovrwr += small->ovrwr;
74445 +                small->ovrwr = 0;
74446 +                large->wb += small->wb;
74447 +                small->wb = 0;
74448 +                large->fq += small->fq;
74449 +                small->fq = 0;);
74450 +
74451 +       /* count flushers in result atom */
74452 +       large->nr_flushers += small->nr_flushers;
74453 +       small->nr_flushers = 0;
74454 +
74455 +       /* update counts of flushed nodes */
74456 +       large->flushed += small->flushed;
74457 +       small->flushed = 0;
74458 +
74459 +       /* Transfer list counts to large. */
74460 +       large->txnh_count += small->txnh_count;
74461 +       large->capture_count += small->capture_count;
74462 +
74463 +       /* Add all txnh references to large. */
74464 +       atomic_add(small->txnh_count, &large->refcount);
74465 +       atomic_sub(small->txnh_count, &small->refcount);
74466 +
74467 +       /* Reset small counts */
74468 +       small->txnh_count = 0;
74469 +       small->capture_count = 0;
74470 +
74471 +       /* Assign the oldest start_time, merge flags. */
74472 +       large->start_time = min(large->start_time, small->start_time);
74473 +       large->flags |= small->flags;
74474 +
74475 +       /* Merge blocknr sets. */
74476 +       blocknr_set_merge(&small->delete_set, &large->delete_set);
74477 +       blocknr_set_merge(&small->wandered_map, &large->wandered_map);
74478 +
74479 +       /* Merge allocated/deleted file counts */
74480 +       large->nr_objects_deleted += small->nr_objects_deleted;
74481 +       large->nr_objects_created += small->nr_objects_created;
74482 +
74483 +       small->nr_objects_deleted = 0;
74484 +       small->nr_objects_created = 0;
74485 +
74486 +       /* Merge allocated blocks counts */
74487 +       large->nr_blocks_allocated += small->nr_blocks_allocated;
74488 +
74489 +       large->nr_running_queues += small->nr_running_queues;
74490 +       small->nr_running_queues = 0;
74491 +
74492 +       /* Merge blocks reserved for overwrite set. */
74493 +       large->flush_reserved += small->flush_reserved;
74494 +       small->flush_reserved = 0;
74495 +
74496 +       if (large->stage < small->stage) {
74497 +               /* Large only needs to notify if it has changed state. */
74498 +               reiser4_atom_set_stage(large, small->stage);
74499 +               wakeup_atom_waiting_list(large);
74500 +       }
74501 +
74502 +       reiser4_atom_set_stage(small, ASTAGE_INVALID);
74503 +
74504 +       /* Notify any waiters--small needs to unload its wait lists.  Waiters
74505 +          actually remove themselves from the list before returning from the
74506 +          fuse_wait function. */
74507 +       wakeup_atom_waiting_list(small);
74508 +
74509 +       /* Unlock atoms */
74510 +       spin_unlock_atom(large);
74511 +       atom_dec_and_unlock(small);
74512 +}
74513 +
74514 +/* TXNMGR STUFF */
74515 +
74516 +/* Release a block from the atom, reversing the effects of being captured.
74517 +   The jnode must be spin-locked on entry (asserted) and is unlocked here.
74518 +   Currently this is only called when the atom commits.
74519 +
74520 +   NOTE: this function does not release a (journal) reference to jnode
74521 +   due to locking optimizations, you should call jput() somewhere after
74522 +   calling reiser4_uncapture_block(). */
74523 +void reiser4_uncapture_block(jnode * node)
74524 +{
74525 +       txn_atom *atom;
74526 +
74527 +       assert("umka-226", node != NULL);
74528 +       atom = node->atom;
74529 +       assert("umka-228", atom != NULL);
74530 +
74531 +       assert("jmacd-1021", node->atom == atom);
74532 +       assert_spin_locked(&(node->guard));
74533 +       assert("jmacd-1023", atom_is_protected(atom));
74534 +
74535 +       JF_CLR(node, JNODE_DIRTY);
74536 +       JF_CLR(node, JNODE_RELOC);
74537 +       JF_CLR(node, JNODE_OVRWR);
74538 +       JF_CLR(node, JNODE_CREATED);
74539 +       JF_CLR(node, JNODE_WRITEBACK);
74540 +       JF_CLR(node, JNODE_REPACK);
74541 +       /* unlink the node from whichever capture list it is currently on */
74542 +       list_del_init(&node->capture_link);
74543 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
74544 +               assert("zam-925", atom_isopen(atom));
74545 +               assert("vs-1623", NODE_LIST(node) == FQ_LIST);
74546 +               ON_DEBUG(atom->num_queued--);
74547 +               JF_CLR(node, JNODE_FLUSH_QUEUED);
74548 +       }
74549 +       atom->capture_count -= 1;
74550 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
74551 +       node->atom = NULL;
74552 +       /* the jnode lock is dropped here, not by the caller */
74553 +       spin_unlock_jnode(node);
74554 +       LOCK_CNT_DEC(t_refs);
74555 +}
74556 +
74557 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
74558 +   bitmap-based allocator code for adding modified bitmap blocks to the
74559 +   transaction. @atom and @node are spin locked; a jnode reference is taken (jref) */
74560 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
74561 +{
74562 +       assert("zam-538", atom_is_protected(atom));
74563 +       assert_spin_locked(&(node->guard));
74564 +       assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
74565 +       assert("zam-543", node->atom == NULL);
74566 +       assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
74567 +
74568 +       list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
74569 +       jref(node);
74570 +       node->atom = atom;
74571 +       atom->capture_count++;
74572 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
74573 +}
74574 +
74575 +static int count_deleted_blocks_actor(txn_atom * atom,
74576 +                                     const reiser4_block_nr * a,
74577 +                                     const reiser4_block_nr * b, void *data)
74578 +{
74579 +       reiser4_block_nr *counter = data;
74580 +       /* blocknr_set_iterator() callback: add 1 if @b is NULL, else *@b, to *@data */
74581 +       assert("zam-995", data != NULL);
74582 +       assert("zam-996", a != NULL);
74583 +       if (b == NULL)
74584 +               *counter += 1;
74585 +       else
74586 +               *counter += *b;
74587 +       return 0;
74588 +}
74589 +
74590 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
74591 +{
74592 +       reiser4_block_nr result;
74593 +       txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
74594 +       txn_atom *atom;
74595 +
74596 +       result = 0;
74597 +       /* sum the delete sets of all open atoms; each atom is locked while it is read */
74598 +       spin_lock_txnmgr(tmgr);
74599 +       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74600 +               spin_lock_atom(atom);
74601 +               if (atom_isopen(atom))
74602 +                       blocknr_set_iterator(
74603 +                               atom, &atom->delete_set,
74604 +                               count_deleted_blocks_actor, &result, 0);
74605 +               spin_unlock_atom(atom);
74606 +       }
74607 +       spin_unlock_txnmgr(tmgr);
74608 +
74609 +       return result;
74610 +}
74611 +
74612 +/*
74613 + * Local variables:
74614 + * c-indentation-style: "K&R"
74615 + * mode-name: "LC"
74616 + * c-basic-offset: 8
74617 + * tab-width: 8
74618 + * fill-column: 79
74619 + * End:
74620 + */
74621 diff -puN /dev/null fs/reiser4/txnmgr.h
74622 --- /dev/null
74623 +++ a/fs/reiser4/txnmgr.h
74624 @@ -0,0 +1,701 @@
74625 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74626 + * reiser4/README */
74627 +
74628 +/* data-types and function declarations for transaction manager. See txnmgr.c
74629 + * for details. */
74630 +
74631 +#ifndef __REISER4_TXNMGR_H__
74632 +#define __REISER4_TXNMGR_H__
74633 +
74634 +#include "forward.h"
74635 +#include "dformat.h"
74636 +
74637 +#include <linux/fs.h>
74638 +#include <linux/mm.h>
74639 +#include <linux/types.h>
74640 +#include <linux/spinlock.h>
74641 +#include <asm/atomic.h>
74642 +#include <linux/wait.h>
74643 +
74644 +/* TYPE DECLARATIONS */
74645 +
74646 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
74647 +   A capture request dynamically assigns a block to the calling thread's transaction
74648 +   handle. */
74649 +typedef enum {
74650 +       /* A READ_ATOMIC request indicates that a block will be read and that the caller's
74651 +          atom should fuse in order to ensure that the block commits atomically with the
74652 +          caller. */
74653 +       TXN_CAPTURE_READ_ATOMIC = (1 << 0),
74654 +
74655 +       /* A READ_NONCOM request indicates that a block will be read and that the caller is
74656 +          willing to read a non-committed block without causing atoms to fuse. */
74657 +       TXN_CAPTURE_READ_NONCOM = (1 << 1),
74658 +
74659 +       /* A READ_MODIFY request indicates that a block will be read but that the caller
74660 +          wishes for the block to be captured as it will be written.  This capture request
74661 +          mode is not currently used, but eventually it will be useful for preventing
74662 +          deadlock in read-modify-write cycles. */
74663 +       TXN_CAPTURE_READ_MODIFY = (1 << 2),
74664 +
74665 +       /* A WRITE capture request indicates that a block will be modified and that atoms
74666 +          should fuse to make the commit atomic. */
74667 +       TXN_CAPTURE_WRITE = (1 << 3),
74668 +
74669 +       /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
74670 +          exclusive type designation from extra bits that may be supplied -- see
74671 +          below. */
74672 +       TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
74673 +                            TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
74674 +                            TXN_CAPTURE_WRITE),
74675 +
74676 +       /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
74677 +          indicate modification will occur. */
74678 +       TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
74679 +
74680 +       /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
74681 +          prefer not to sleep waiting for an aging atom to commit. */
74682 +       TXN_CAPTURE_NONBLOCKING = (1 << 4),
74683 +
74684 +       /* An option to reiser4_try_capture to prevent atom fusion, just simple
74685 +          capturing is allowed */
74686 +       TXN_CAPTURE_DONT_FUSE = (1 << 5)
74687 +
74688 +       /* This macro selects only the exclusive capture request types, stripping out any
74689 +          option bits that were supplied (e.g., NONBLOCKING, DONT_FUSE). */
74690 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
74691 +} txn_capture;
74692 +
74693 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
74694 +   difference is in the handling of read requests.  A WRITE_FUSING transaction handle
74695 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
74696 +   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
74697 +typedef enum {
74698 +       TXN_WRITE_FUSING = (1 << 0),
74699 +       TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,  /* READ implies WRITE */
74700 +} txn_mode;
74701 +
74702 +/* Every atom has a stage, which is one of these exclusive values: */
74703 +typedef enum {
74704 +       /* Initially an atom is free. */
74705 +       ASTAGE_FREE = 0,
74706 +
74707 +       /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
74708 +          blocks and fuse with other atoms. */
74709 +       ASTAGE_CAPTURE_FUSE = 1,
74710 +
74711 +       /* TODO: we need an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
74712 +
74713 +       /* When an atom reaches a certain age it must do all it can to commit.  An atom in
74714 +          the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
74715 +          atoms in the CAPTURE_FUSE stage. */
74716 +       ASTAGE_CAPTURE_WAIT = 2,
74717 +
74718 +       /* Waiting for I/O before commit.  Copy-on-capture (see
74719 +          http://namesys.com/v4/v4.html). */
74720 +       ASTAGE_PRE_COMMIT = 3,
74721 +
74722 +       /* Post-commit overwrite I/O.  Steal-on-capture. */
74723 +       ASTAGE_POST_COMMIT = 4,
74724 +
74725 +       /* Atom which waits for the removal of the last reference to it
74726 +        * (presumably the atom itself -- confirm) to be deleted from memory */
74727 +       ASTAGE_DONE = 5,
74728 +
74729 +       /* invalid atom. */
74730 +       ASTAGE_INVALID = 6,
74731 +
74732 +} txn_stage;
74733 +
74734 +/* Flags that may be set in the txn_atom->flags field (bit mask, may be combined). */
74735 +typedef enum {
74736 +       /* Indicates that the atom should commit as soon as possible. */
74737 +       ATOM_FORCE_COMMIT = (1 << 0),
74738 +       /* Set after a failed attempt to fuse an atom that was considered too
74739 +        * small, to avoid an endless loop. */
74740 +       ATOM_CANCEL_FUSION = (1 << 1)
74741 +} txn_flags;
74742 +
74743 +/* Flags for controlling commit_txnh (bit values; 0x1 is not used by this enum) */
74744 +typedef enum {
74745 +       /* Wait for the atom's commit to complete in commit_txnh */
74746 +       TXNH_WAIT_COMMIT = 0x2,
74747 +       /* Don't commit atom when this handle is closed */
74748 +       TXNH_DONT_COMMIT = 0x4
74749 +} txn_handle_flags_t;
74750 +
74751 +/* TYPE DEFINITIONS */
74752 +
74753 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
74754 +   fields, so typically an operation on the atom through either of these objects must (1)
74755 +   lock the object, (2) read the atom pointer, (3) lock the atom.
74756 +
74757 +   During atom fusion, the process holds locks on both atoms at once.  Then, it iterates
74758 +   through the list of handles and pages held by the smaller of the two atoms.  For each
74759 +   handle and page referencing the smaller atom, the fusing process must: (1) lock the
74760 +   object, and (2) update the atom pointer.
74761 +
74762 +   You can see that there is a conflict of lock ordering here, so the more-complex
74763 +   procedure should have priority, i.e., the fusing process has priority so that it is
74764 +   guaranteed to make progress and to avoid restarts.
74765 +
74766 +   This decision, however, means additional complexity for acquiring the atom lock in the
74767 +   first place.
74768 +
74769 +   The general original procedure followed in the code was:
74770 +
74771 +       TXN_OBJECT *obj = ...;
74772 +       TXN_ATOM   *atom;
74773 +
74774 +       spin_lock (& obj->_lock);
74775 +
74776 +       atom = obj->_atom;
74777 +
74778 +       if (! spin_trylock_atom (atom))
74779 +         {
74780 +           spin_unlock (& obj->_lock);
74781 +           RESTART OPERATION, THERE WAS A RACE;
74782 +         }
74783 +
74784 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
74785 +
74786 +   It has however been found that this wastes CPU a lot in a manner that is
74787 +   hard to profile. So, proper refcounting was added to atoms, and new
74788 +   standard locking sequence is like following:
74789 +
74790 +       TXN_OBJECT *obj = ...;
74791 +       TXN_ATOM   *atom;
74792 +
74793 +       spin_lock (& obj->_lock);
74794 +
74795 +       atom = obj->_atom;
74796 +
74797 +       if (! spin_trylock_atom (atom))
74798 +         {
74799 +           atomic_inc (& atom->refcount);
74800 +           spin_unlock (& obj->_lock);
74801 +           spin_lock (&atom->_lock);
74802 +           atomic_dec (& atom->refcount);
74803 +           // HERE atom is locked
74804 +           spin_unlock (&atom->_lock);
74805 +           RESTART OPERATION, THERE WAS A RACE;
74806 +         }
74807 +
74808 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
74809 +
74810 +   (core of this is implemented in trylock_throttle() function)
74811 +
74812 +   See the jnode_get_atom() function for a common case.
74813 +
74814 +   As an additional (and important) optimization allowing to avoid restarts,
74815 +   it is possible to re-check required pre-conditions at the HERE point in
74816 +   code above and proceed without restarting if they are still satisfied.
74817 +*/
74818 +
74819 +/* An atomic transaction: this is the underlying system representation
74820 +   of a transaction, not the one seen by clients.
74821 +
74822 +   Invariants involving this data-type:
74823 +
74824 +      [sb-fake-allocated]
74825 +*/
74826 +struct txn_atom {
74827 +       /* The spinlock protecting the atom, held during fusion and various other state
74828 +          changes. */
74829 +       spinlock_t alock;
74830 +
74831 +       /* The atom's reference counter, increasing (in case of a duplication
74832 +          of an existing reference or when we are sure that some other
74833 +          reference exists) may be done without taking spinlock, decrementing
74834 +          of the ref. counter requires a spinlock to be held.
74835 +
74836 +          Each transaction handle counts in ->refcount. All jnodes count as
74837 +          one reference acquired in atom_begin_andlock(), released in
74838 +          commit_current_atom().
74839 +        */
74840 +       atomic_t refcount;
74841 +
74842 +       /* The atom_id identifies the atom in persistent records such as the log. */
74843 +       __u32 atom_id;
74844 +
74845 +       /* Flags holding any of the txn_flags enumerated values (e.g.,
74846 +          ATOM_FORCE_COMMIT). */
74847 +       __u32 flags;
74848 +
74849 +       /* Number of open handles. */
74850 +       __u32 txnh_count;
74851 +
74852 +       /* The number of znodes captured by this atom.  Equal to the sum of lengths of the
74853 +          dirty_nodes[level] and clean_nodes lists. */
74854 +       __u32 capture_count;
74855 +
74856 +#if REISER4_DEBUG
74857 +       int clean;
74858 +       int dirty;
74859 +       int ovrwr;
74860 +       int wb;
74861 +       int fq;
74862 +#endif
74863 +
74864 +       __u32 flushed;
74865 +
74866 +       /* Current transaction stage. */
74867 +       txn_stage stage;
74868 +
74869 +       /* Start time. */
74870 +       unsigned long start_time;
74871 +
74872 +       /* The atom's delete set. It collects block numbers of the nodes
74873 +          which were deleted during the transaction. */
74874 +       struct list_head delete_set;
74875 +
74876 +       /* The atom's wandered_block mapping. */
74877 +       struct list_head wandered_map;
74878 +
74879 +       /* The transaction's list of dirty captured nodes--per level.  Index
74880 +          by (level). dirty_nodes[0] is for znode-above-root */
74881 +       struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
74882 +
74883 +       /* The transaction's list of clean captured nodes. */
74884 +       struct list_head clean_nodes;
74885 +
74886 +       /* The atom's overwrite set */
74887 +       struct list_head ovrwr_nodes;
74888 +
74889 +       /* nodes which are being written to disk */
74890 +       struct list_head writeback_nodes;
74891 +
74892 +       /* list of inodes */
74893 +       struct list_head inodes;
74894 +
74895 +       /* List of handles associated with this atom. */
74896 +       struct list_head txnh_list;
74897 +
74898 +       /* Transaction list link: list of atoms in the transaction manager. */
74899 +       struct list_head atom_link;
74900 +
74901 +       /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
74902 +       struct list_head fwaitfor_list;
74903 +
74904 +       /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
74905 +       struct list_head fwaiting_list;
74906 +
74907 +       /* Numbers of objects which were deleted/created in this transaction,
74908 +          and thereby the numbers of object IDs which were released/allocated. */
74909 +       int nr_objects_deleted;
74910 +       int nr_objects_created;
74911 +       /* number of blocks allocated during the transaction */
74912 +       __u64 nr_blocks_allocated;
74913 +       /* All atom's flush queue objects are on this list  */
74914 +       struct list_head flush_queues;
74915 +#if REISER4_DEBUG
74916 +       /* number of flush queues for this atom. */
74917 +       int nr_flush_queues;
74918 +       /* Number of jnodes which were removed from atom's lists and put
74919 +          on flush_queue */
74920 +       int num_queued;
74921 +#endif
74922 +       /* number of threads who wait for this atom to complete commit */
74923 +       int nr_waiters;
74924 +       /* number of threads which do jnode_flush() over this atom */
74925 +       int nr_flushers;
74926 +       /* number of flush queues which are IN_USE and jnodes from fq->prepped
74927 +          are submitted to disk by the reiser4_write_fq() routine. */
74928 +       int nr_running_queues;
74929 +       /* A counter of grabbed unformatted nodes, see a description of the
74930 +        * reiser4 space reservation scheme at block_alloc.c */
74931 +       reiser4_block_nr flush_reserved;
74932 +#if REISER4_DEBUG
74933 +       void *committer;
74934 +#endif
74935 +       struct super_block *super;
74936 +};
74937 +
74938 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
74939 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
74940 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
74941 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
74942 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
74943 +
74944 +#define NODE_LIST(node) (node)->list
74945 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
74946 +ON_DEBUG(void
74947 +        count_jnode(txn_atom *, jnode *, atom_list old_list,
74948 +                    atom_list new_list, int check_lists));
74949 +
74950 +/* A transaction handle: the client obtains and commits this handle which is assigned by
74951 +   the system to a txn_atom. */
74952 +struct txn_handle {
74953 +       /* Spinlock protecting ->atom pointer */
74954 +       spinlock_t hlock;
74955 +
74956 +       /* Flags for controlling commit_txnh() behavior */
74957 +       /* from txn_handle_flags_t */
74958 +       txn_handle_flags_t flags;
74959 +
74960 +       /* Whether it is READ_FUSING or WRITE_FUSING. */
74961 +       txn_mode mode;
74962 +
74963 +       /* If assigned, the atom it is part of. */
74964 +       txn_atom *atom;
74965 +
74966 +       /* Transaction list link. Head is in txn_atom. */
74967 +       struct list_head txnh_link;
74968 +};
74969 +
74970 +/* The transaction manager: one is contained in the reiser4_super_info_data */
74971 +struct txn_mgr {
74972 +       /* A spinlock protecting the atom list, id_count, flush_control */
74973 +       spinlock_t tmgr_lock;
74974 +
74975 +       /* List of atoms. */
74976 +       struct list_head atoms_list;
74977 +
74978 +       /* Number of atoms. */
74979 +       int atom_count;
74980 +
74981 +       /* A counter used to assign atom->atom_id values. */
74982 +       __u32 id_count;
74983 +
74984 +       /* a mutex object for commit serialization */
74985 +       struct mutex commit_mutex;
74986 +
74987 +       /* a list of all txnmgrs served by a particular daemon. */
74988 +       struct list_head linkage;
74989 +
74990 +       /* description of daemon for this txnmgr */
74991 +       ktxnmgrd_context *daemon;
74992 +
74993 +       /* parameters. Adjustable through mount options. */
74994 +       unsigned int atom_max_size;
74995 +       unsigned int atom_max_age;
74996 +       unsigned int atom_min_size;
74997 +       /* max number of concurrent flushers for one atom, 0 - unlimited.  */
74998 +       unsigned int atom_max_flushers;
74999 +       struct dentry *debugfs_atom_count;
75000 +       struct dentry *debugfs_id_count;
75001 +};
75002 +
75003 +/* FUNCTION DECLARATIONS */
75004 +
75005 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
75006 +   are prefixed with "txn_".  For comments, see txnmgr.c. */
75007 +
75008 +extern int init_txnmgr_static(void);
75009 +extern void done_txnmgr_static(void);
75010 +
75011 +extern void reiser4_init_txnmgr(txn_mgr *);
75012 +extern void reiser4_done_txnmgr(txn_mgr *);
75013 +
75014 +extern int reiser4_txn_reserve(int reserved);
75015 +
75016 +extern void reiser4_txn_begin(reiser4_context * context);
75017 +extern int reiser4_txn_end(reiser4_context * context);
75018 +
75019 +extern void reiser4_txn_restart(reiser4_context * context);
75020 +extern void reiser4_txn_restart_current(void);
75021 +
75022 +extern int txnmgr_force_commit_all(struct super_block *, int);
75023 +extern int current_atom_should_commit(void);
75024 +
75025 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
75026 +
75027 +extern int commit_some_atoms(txn_mgr *);
75028 +extern int force_commit_atom(txn_handle *);
75029 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
75030 +
75031 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
75032 +
75033 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
75034 +
75035 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
75036 +                          int alloc_value);
75037 +extern void atom_dec_and_unlock(txn_atom * atom);
75038 +
75039 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
75040 +extern int try_capture_page_to_invalidate(struct page *pg);
75041 +
75042 +extern void reiser4_uncapture_page(struct page *pg);
75043 +extern void reiser4_uncapture_block(jnode *);
75044 +extern void reiser4_uncapture_jnode(jnode *);
75045 +
75046 +extern int reiser4_capture_inode(struct inode *);
75047 +extern int reiser4_uncapture_inode(struct inode *);
75048 +
75049 +extern txn_atom *get_current_atom_locked_nocheck(void);
75050 +
75051 +#if REISER4_DEBUG
75052 +
75053 +/**
75054 + * atom_is_protected - check that the atom cannot change under the caller
75055 + * @atom: atom to be checked
75056 + *
75057 + * Always returns 1.  An atom whose stage is below ASTAGE_PRE_COMMIT must have
75058 + * its spinlock held, which is verified with assert_spin_locked().
75059 + */
75060 +static inline int atom_is_protected(txn_atom *atom)
75061 +{
75062 +       /* commit stages need no lock; any earlier stage must hold ->alock */
75063 +       if (atom->stage < ASTAGE_PRE_COMMIT)
75064 +               assert_spin_locked(&atom->alock);
75065 +       return 1;
75066 +}
75067 +
75068 +#endif
75069 +
75070 +/* Return the current atom, spin-locked, as handed back by
75071 +   get_current_atom_locked_nocheck().  The result must not be NULL (asserted). */
75072 +static inline txn_atom *get_current_atom_locked(void)
75073 +{
75074 +       txn_atom *atom = get_current_atom_locked_nocheck();
75075 +
75076 +       assert("zam-761", atom != NULL);
75077 +
75078 +       return atom;
75079 +}
75080 +
75081 +extern txn_atom *jnode_get_atom(jnode *);
75082 +
75083 +extern void reiser4_atom_wait_event(txn_atom *);
75084 +extern void reiser4_atom_send_event(txn_atom *);
75085 +
75086 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
75087 +extern int reiser4_capture_super_block(struct super_block *s);
75088 +int capture_bulk(jnode **, int count);
75089 +
75090 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
75091 +   calling convention of these three routines. */
75092 +extern void blocknr_set_init(struct list_head * bset);
75093 +extern void blocknr_set_destroy(struct list_head * bset);
75094 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
75095 +extern int blocknr_set_add_extent(txn_atom * atom,
75096 +                                 struct list_head * bset,
75097 +                                 blocknr_set_entry ** new_bsep,
75098 +                                 const reiser4_block_nr * start,
75099 +                                 const reiser4_block_nr * len);
75100 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
75101 +                               blocknr_set_entry ** new_bsep,
75102 +                               const reiser4_block_nr * a,
75103 +                               const reiser4_block_nr * b);
75104 +
75105 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
75106 +                                   const reiser4_block_nr *, void *);
75107 +
75108 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
75109 +                               blocknr_set_actor_f actor, void *data,
75110 +                               int delete);
75111 +
75112 +/* flush code takes care about how to fuse flush queues */
75113 +extern void flush_init_atom(txn_atom * atom);
75114 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
75115 +
75116 +static inline void spin_lock_atom(txn_atom *atom)
75117 +{
75118 +       /* lock ordering: no txnh, atom, jnode, zlock, dk or tree lock may be held */
75119 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
75120 +                   LOCK_CNT_NIL(spin_locked_atom) &&
75121 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
75122 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
75123 +                   LOCK_CNT_NIL(rw_locked_dk) &&
75124 +                   LOCK_CNT_NIL(rw_locked_tree)));
75125 +
75126 +       spin_lock(&(atom->alock));
75127 +       /* counted here; spin_unlock_atom() checks and drops both counters */
75128 +       LOCK_CNT_INC(spin_locked_atom);
75129 +       LOCK_CNT_INC(spin_locked);
75130 +}
75131 +
75132 +static inline void spin_lock_atom_nested(txn_atom *atom)
75133 +{
75134 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
75135 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
75136 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
75137 +                   LOCK_CNT_NIL(rw_locked_dk) &&
75138 +                   LOCK_CNT_NIL(rw_locked_tree)));
75139 +       /* unlike spin_lock_atom(), spin_locked_atom is not checked: nested acquisition */
75140 +       spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
75141 +
75142 +       LOCK_CNT_INC(spin_locked_atom);
75143 +       LOCK_CNT_INC(spin_locked);
75144 +}
75145 +
75146 +static inline int spin_trylock_atom(txn_atom *atom)
75147 +{
75148 +       /* returns 0 without touching the lock counters if ->alock is not obtained */
75149 +       if (!spin_trylock(&atom->alock))
75150 +               return 0;
75151 +       LOCK_CNT_INC(spin_locked_atom);
75152 +       LOCK_CNT_INC(spin_locked);
75153 +       return 1;
75154 +}
75155 +
75156 +static inline void spin_unlock_atom(txn_atom *atom)
75157 +{
75158 +       assert_spin_locked(&(atom->alock));
75159 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
75160 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
75161 +       /* drop the lock counters first, then release the lock itself */
75162 +       LOCK_CNT_DEC(spin_locked_atom);
75163 +       LOCK_CNT_DEC(spin_locked);
75164 +
75165 +       spin_unlock(&(atom->alock));
75166 +}
75167 +
75168 +static inline void spin_lock_txnh(txn_handle *txnh)
75169 +{
75170 +       /* lock ordering: no dk, zlock or tree lock may be held when taking ->hlock */
75171 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
75172 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
75173 +                   LOCK_CNT_NIL(rw_locked_tree)));
75174 +
75175 +       spin_lock(&(txnh->hlock));
75176 +       /* counted here; spin_unlock_txnh() checks and drops both counters */
75177 +       LOCK_CNT_INC(spin_locked_txnh);
75178 +       LOCK_CNT_INC(spin_locked);
75179 +}
75180 +
75181 +static inline int spin_trylock_txnh(txn_handle *txnh)
75182 +{
75183 +       /* returns 0 without touching the lock counters if ->hlock is not obtained */
75184 +       if (!spin_trylock(&txnh->hlock))
75185 +               return 0;
75186 +       LOCK_CNT_INC(spin_locked_txnh);
75187 +       LOCK_CNT_INC(spin_locked);
75188 +       return 1;
75189 +}
75190 +
75191 +static inline void spin_unlock_txnh(txn_handle *txnh)
75192 +{
75193 +       assert_spin_locked(&(txnh->hlock));
75194 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
75195 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
75196 +       /* NOTE(review): ids nikita-1375/1376 are also used by spin_unlock_atom() */
75197 +       LOCK_CNT_DEC(spin_locked_txnh);
75198 +       LOCK_CNT_DEC(spin_locked);
75199 +       /* counters are dropped above; the lock itself is released last */
75200 +       spin_unlock(&(txnh->hlock));
75201 +}
75202 +
75203 +#define spin_ordering_pred_txnmgr(tmgr)                \
75204 +       ( LOCK_CNT_NIL(spin_locked_atom) &&     \
75205 +         LOCK_CNT_NIL(spin_locked_txnh) &&     \
75206 +         LOCK_CNT_NIL(spin_locked_jnode) &&    \
75207 +         LOCK_CNT_NIL(spin_locked_zlock) &&    \
75208 +         LOCK_CNT_NIL(rw_locked_dk) &&         \
75209 +         LOCK_CNT_NIL(rw_locked_tree) ) /* same set as asserted in spin_lock_txnmgr() */
75210 +
75211 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
75212 +{
75213 +       /* lock ordering: no atom, txnh, jnode, zlock, dk or tree lock may be held */
75214 +       assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
75215 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
75216 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
75217 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
75218 +                   LOCK_CNT_NIL(rw_locked_dk) &&
75219 +                   LOCK_CNT_NIL(rw_locked_tree)));
75220 +
75221 +       spin_lock(&(mgr->tmgr_lock));
75222 +       /* counted here; spin_unlock_txnmgr() checks and drops both counters */
75223 +       LOCK_CNT_INC(spin_locked_txnmgr);
75224 +       LOCK_CNT_INC(spin_locked);
75225 +}
75226 +
75227 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
75228 +{
75229 +       /* returns 0 without touching the lock counters if ->tmgr_lock is not obtained */
75230 +       if (!spin_trylock(&mgr->tmgr_lock))
75231 +               return 0;
75232 +       LOCK_CNT_INC(spin_locked_txnmgr);
75233 +       LOCK_CNT_INC(spin_locked);
75234 +       return 1;
75235 +}
75236 +
75237 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
75238 +{
75239 +       assert_spin_locked(&(mgr->tmgr_lock));
75240 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
75241 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
75242 +       /* NOTE(review): ids nikita-1375/1376 are also used by spin_unlock_atom() */
75243 +       LOCK_CNT_DEC(spin_locked_txnmgr);
75244 +       LOCK_CNT_DEC(spin_locked);
75245 +       /* counters are dropped above; the lock itself is released last */
75246 +       spin_unlock(&(mgr->tmgr_lock));
75247 +}
75248 +
75249 +typedef enum {
75250 +       FQ_IN_USE = 0x1
75251 +} flush_queue_state_t;
75252 +
75253 +typedef struct flush_queue flush_queue_t;
75254 +
75255 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
75256 +   is filled by the jnode_flush() routine, and written to disk under memory
75257 +   pressure or at atom commit time. */
75258 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
75259 +   field and fq->prepped list can be modified if atom is spin-locked and fq
75260 +   object is "in-use" state.  For read-only traversal of the fq->prepped list
75261 +   and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
75262 +   only have atom spin-locked. */
75263 +struct flush_queue {
75264 +       /* linkage element is the first in this structure to make debugging
75265 +          easier.  See field in atom struct for description of list. */
75266 +       struct list_head alink;
75267 +       /* A spinlock to protect changes of fq state and fq->atom pointer */
75268 +       spinlock_t guard;
75269 +       /* flush_queue state: [in_use | ready] */
75270 +       flush_queue_state_t state;
75271 +       /* A list which contains queued nodes, queued nodes are removed from any
75272 +        * atom's list and put on this ->prepped one. */
75273 +       struct list_head prepped;
75274 +       /* number of submitted i/o requests */
75275 +       atomic_t nr_submitted;
75276 +       /* number of i/o errors */
75277 +       atomic_t nr_errors;
75278 +       /* An atom this flush queue is attached to */
75279 +       txn_atom *atom;
75280 +       /* A wait queue head to wait on i/o completion */
75281 +       wait_queue_head_t wait;
75282 +#if REISER4_DEBUG
75283 +       /* A thread which took this fq in exclusive use, NULL if fq is free,
75284 +        * used for debugging. */
75285 +       struct task_struct *owner;
75286 +#endif
75287 +};
75288 +
75289 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
75290 +extern void reiser4_fq_put_nolock(flush_queue_t *);
75291 +extern void reiser4_fq_put(flush_queue_t *);
75292 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
75293 +extern void queue_jnode(flush_queue_t *, jnode *);
75294 +
75295 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
75296 +extern int current_atom_finish_all_fq(void);
75297 +extern void init_atom_fq_parts(txn_atom *);
75298 +
75299 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
75300 +
75301 +extern void znode_make_dirty(znode * node);
75302 +extern void jnode_make_dirty_locked(jnode * node);
75303 +
75304 +extern int reiser4_sync_atom(txn_atom * atom);
75305 +
75306 +#if REISER4_DEBUG
75307 +extern int atom_fq_parts_are_clean(txn_atom *);
75308 +#endif
75309 +
75310 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
75311 +extern flush_queue_t *get_fq_for_current_atom(void);
75312 +
75313 +void reiser4_invalidate_list(struct list_head * head);
75314 +
75315 +# endif                                /* __REISER4_TXNMGR_H__ */
75316 +
75317 +/* Make Linus happy.
75318 +   Local variables:
75319 +   c-indentation-style: "K&R"
75320 +   mode-name: "LC"
75321 +   c-basic-offset: 8
75322 +   tab-width: 8
75323 +   fill-column: 120
75324 +   End:
75325 +*/
75326 diff -puN /dev/null fs/reiser4/type_safe_hash.h
75327 --- /dev/null
75328 +++ a/fs/reiser4/type_safe_hash.h
75329 @@ -0,0 +1,320 @@
75330 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75331 + * reiser4/README */
75332 +
75333 +/* A hash table class that uses hash chains (singly-linked) and is
75334 +   parametrized to provide type safety.  */
75335 +
75336 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
75337 +#define __REISER4_TYPE_SAFE_HASH_H__
75338 +
75339 +#include "debug.h"
75340 +
75341 +#include <asm/errno.h>
75342 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
75343 +   based on the object type.  You need to declare the item type before
75344 +   this definition, define it after this definition. */
75345 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE)                                                     \
75346 +                                                                                              \
75347 +typedef struct PREFIX##_hash_table_  PREFIX##_hash_table;                                     \
75348 +typedef struct PREFIX##_hash_link_   PREFIX##_hash_link;                                      \
75349 +                                                                                              \
75350 +struct PREFIX##_hash_table_                                                                   \
75351 +{                                                                                             \
75352 +  ITEM_TYPE  **_table;                                                                        \
75353 +  __u32        _buckets;                                                                      \
75354 +};                                                                                            \
75355 +                                                                                              \
75356 +struct PREFIX##_hash_link_                                                                    \
75357 +{                                                                                             \
75358 +  ITEM_TYPE *_next;                                                                           \
75359 +}
75360 +
75361 +/* Step 2: Define the object type of the hash: give it field of type
75362 +   PREFIX_hash_link. */
75363 +
75364 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
75365 +   the type and field name used in step 2.  The arguments are:
75366 +
75367 +   ITEM_TYPE    The item type being hashed
75368 +   KEY_TYPE     The type of key being hashed
75369 +   KEY_NAME     The name of the key field within the item
75370 +   LINK_NAME    The name of the link field within the item (which you must make type PREFIX_hash_link)
75371 +   HASH_FUNC    The name of the hash function (or macro, takes const pointer to key)
75372 +   EQ_FUNC      The name of the equality function (or macro, takes const pointer to two keys)
75373 +
75374 +   It implements these functions:
75375 +
75376 +   prefix_hash_init           Initialize the table given its size.
75377 +   prefix_hash_insert         Insert an item
75378 +   prefix_hash_insert_index   Insert an item w/ precomputed hash_index
75379 +   prefix_hash_find           Find an item by key
75380 +   prefix_hash_find_index     Find an item w/ precomputed hash_index
75381 +   prefix_hash_remove         Remove an item, returns 1 if found, 0 if not found
75382 +   prefix_hash_remove_index   Remove an item w/ precomputed hash_index
75383 +
75384 +   If you'd like something to be done differently, feel free to ask me
75385 +   for modifications.  Additional features that could be added but
75386 +   have not been:
75387 +
75388 +   prefix_hash_remove_key           Find and remove an item by key
75389 +   prefix_hash_remove_key_index     Find and remove an item by key w/ precomputed hash_index
75390 +
75391 +   The hash_function currently receives only the key as an argument,
75392 +   meaning it must somehow know the number of buckets.  If this is a
75393 +   problem let me know.
75394 +
75395 +   This hash table uses a single-linked hash chain.  This means
75396 +   insertion is fast but deletion requires searching the chain.
75397 +
75398 +   There is also the doubly-linked hash chain approach, under which
75399 +   deletion requires no search but the code is longer and it takes two
75400 +   pointers per item.
75401 +
75402 +   The circularly-linked approach has the shortest code but requires
75403 +   two pointers per bucket, doubling the size of the bucket array (in
75404 +   addition to two pointers per item).
75405 +*/
75406 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC)  \
75407 +                                                                                       \
75408 +static __inline__ void                                                                 \
75409 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG,                            \
75410 +                    __u32                hash UNUSED_ARG)                              \
75411 +{                                                                                      \
75412 +       assert("nikita-2780", hash < table->_buckets);                                  \
75413 +}                                                                                      \
75414 +                                                                                       \
75415 +static __inline__ int                                                                  \
75416 +PREFIX##_hash_init (PREFIX##_hash_table *hash,                                         \
75417 +                   __u32                buckets)                                       \
75418 +{                                                                                      \
75419 +  hash->_table   = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets);              \
75420 +  hash->_buckets = buckets;                                                            \
75421 +  if (hash->_table == NULL)                                                            \
75422 +    {                                                                                  \
75423 +      return RETERR(-ENOMEM);                                                          \
75424 +    }                                                                                  \
75425 +  memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets);                             \
75426 +  ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets));                      \
75427 +  return 0;                                                                            \
75428 +}                                                                                      \
75429 +                                                                                       \
75430 +static __inline__ void                                                                 \
75431 +PREFIX##_hash_done (PREFIX##_hash_table *hash)                                         \
75432 +{                                                                                      \
75433 +  if (REISER4_DEBUG && hash->_table != NULL) {                                          \
75434 +           __u32 i;                                                                    \
75435 +           for (i = 0 ; i < hash->_buckets ; ++ i)                                     \
75436 +                   assert("nikita-2905", hash->_table[i] == NULL);                     \
75437 +  }                                                                                     \
75438 +  if (hash->_table != NULL)                                                            \
75439 +    KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets);                                \
75440 +  hash->_table = NULL;                                                                 \
75441 +}                                                                                      \
75442 +                                                                                       \
75443 +static __inline__ void                                                                 \
75444 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item)                                          \
75445 +{                                                                                      \
75446 +       prefetch(item->LINK_NAME._next);                                                \
75447 +}                                                                                      \
75448 +                                                                                       \
75449 +static __inline__ void                                                                 \
75450 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash,                              \
75451 +                              __u32                index)                              \
75452 +{                                                                                      \
75453 +       prefetch(hash->_table[index]);                                                  \
75454 +}                                                                                      \
75455 +/* Scan bucket @hash_index; return the item whose key matches @find_key, or NULL */    \
75456 +static __inline__ ITEM_TYPE*                                                           \
75457 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash,                                   \
75458 +                         __u32                hash_index,                              \
75459 +                         KEY_TYPE const      *find_key)                                \
75460 +{                                                                                      \
75461 +  ITEM_TYPE *item;                                                                     \
75462 +                                                                                       \
75463 +  PREFIX##_check_hash(hash, hash_index);                                               \
75464 +  /* prefetch the next item, then its key (byte offset, hence the char * cast) */      \
75465 +  for (item  = hash->_table[hash_index];                                               \
75466 +       item != NULL;                                                                   \
75467 +       item  = item->LINK_NAME._next)                                                  \
75468 +    {                                                                                  \
75469 +      prefetch(item->LINK_NAME._next);                                                 \
75470 +      prefetch((char *)item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME));         \
75471 +      if (EQ_FUNC (& item->KEY_NAME, find_key))                                                \
75472 +        {                                                                              \
75473 +          return item;                                                                 \
75474 +        }                                                                              \
75475 +    }                                                                                  \
75476 +                                                                                       \
75477 +  return NULL;                                                                         \
75478 +}                                                                                      \
75479 +/* As _hash_find_index, but a found item is also moved to the head of its bucket */    \
75480 +static __inline__ ITEM_TYPE*                                                           \
75481 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash,                               \
75482 +                             __u32                hash_index,                          \
75483 +                             KEY_TYPE const      *find_key)                            \
75484 +{                                                                                      \
75485 +  ITEM_TYPE ** item = &hash->_table[hash_index];                                        \
75486 +                                                                                       \
75487 +  PREFIX##_check_hash(hash, hash_index);                                               \
75488 +  /* @item always points at the link that leads to the item being examined */          \
75489 +  while (*item != NULL) {                                                               \
75490 +    prefetch(&(*item)->LINK_NAME._next);                                               \
75491 +    if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) {                                       \
75492 +      ITEM_TYPE *found;                                                                \
75493 +      /* unlink the match, then relink it as the first item of the bucket */           \
75494 +      found = *item;                                                                   \
75495 +      *item = found->LINK_NAME._next;                                                   \
75496 +      found->LINK_NAME._next = hash->_table[hash_index];                               \
75497 +      hash->_table[hash_index] = found;                                                        \
75498 +      return found;                                                                     \
75499 +    }                                                                                   \
75500 +    item = &(*item)->LINK_NAME._next;                                                   \
75501 +  }                                                                                    \
75502 +  return NULL;                                                                         \
75503 +}                                                                                      \
75504 +/* Unlink @del_item from bucket @hash_index; return 1 if it was there, 0 if not */     \
75505 +static __inline__ int                                                                  \
75506 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash,                                 \
75507 +                           __u32                hash_index,                            \
75508 +                           ITEM_TYPE           *del_item)                              \
75509 +{                                                                                      \
75510 +  ITEM_TYPE ** hash_item_p = &hash->_table[hash_index];                                 \
75511 +                                                                                       \
75512 +  PREFIX##_check_hash(hash, hash_index);                                               \
75513 +  /* items are compared by address, not by key */                                      \
75514 +  while (*hash_item_p != NULL) {                                                        \
75515 +    prefetch(&(*hash_item_p)->LINK_NAME._next);                                                \
75516 +    if (*hash_item_p == del_item) {                                                     \
75517 +      *hash_item_p = (*hash_item_p)->LINK_NAME._next;                                   \
75518 +      return 1;                                                                         \
75519 +    }                                                                                   \
75520 +    hash_item_p = &(*hash_item_p)->LINK_NAME._next;                                     \
75521 +  }                                                                                    \
75522 +  return 0;                                                                            \
75523 +}                                                                                      \
75524 +/* Link @ins_item in as the new first item of bucket @hash_index */                    \
75525 +static __inline__ void                                                                 \
75526 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash,                                 \
75527 +                           __u32                hash_index,                            \
75528 +                           ITEM_TYPE           *ins_item)                              \
75529 +{                                                                                      \
75530 +  PREFIX##_check_hash(hash, hash_index);                                               \
75531 +                                                                                       \
75532 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
75533 +  hash->_table[hash_index]  = ins_item;                                                        \
75534 +}                                                                                      \
75535 +/* As _hash_insert_index, with smp_wmb() between setting _next and the bucket head */  \
75536 +static __inline__ void                                                                 \
75537 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash,                             \
75538 +                               __u32                hash_index,                        \
75539 +                               ITEM_TYPE           *ins_item)                          \
75540 +{                                                                                      \
75541 +  PREFIX##_check_hash(hash, hash_index);                                               \
75542 +                                                                                       \
75543 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
75544 +  smp_wmb();                                                                           \
75545 +  hash->_table[hash_index]  = ins_item;                                                        \
75546 +}                                                                                      \
75547 +/* Look @find_key up in the bucket selected by HASH_FUNC; NULL if it is not found */   \
75548 +static __inline__ ITEM_TYPE*                                                           \
75549 +PREFIX##_hash_find (PREFIX##_hash_table *hash,                                         \
75550 +                   KEY_TYPE const      *find_key)                                      \
75551 +{                                                                                      \
75552 +  return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key);         \
75553 +}                                                                                      \
75554 +/* Look @find_key up via _hash_find_index_lru in the bucket selected by HASH_FUNC */   \
75555 +static __inline__ ITEM_TYPE*                                                           \
75556 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash,                                     \
75557 +                       KEY_TYPE const      *find_key)                                  \
75558 +{                                                                                      \
75559 +  return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key);     \
75560 +}                                                                                      \
75561 +/* Remove @del_item from the bucket HASH_FUNC selects for its key; 1 if removed */     \
75562 +static __inline__ int                                                                  \
75563 +PREFIX##_hash_remove (PREFIX##_hash_table *hash,                                       \
75564 +                     ITEM_TYPE           *del_item)                                    \
75565 +{                                                                                      \
75566 +  return PREFIX##_hash_remove_index (hash,                                             \
75567 +                                     HASH_FUNC(hash, &del_item->KEY_NAME), del_item);  \
75568 +}                                                                                      \
75569 +/* Plain wrapper: forwards to _hash_remove and returns its result */                   \
75570 +static __inline__ int                                                                  \
75571 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash,                                   \
75572 +                     ITEM_TYPE           *del_item)                                    \
75573 +{                                                                                      \
75574 +  return PREFIX##_hash_remove (hash, del_item);                                                \
75575 +}                                                                                      \
75576 +/* Push @ins_item onto the bucket that HASH_FUNC selects for its key */                \
75577 +static __inline__ void                                                                 \
75578 +PREFIX##_hash_insert (PREFIX##_hash_table *hash,                                       \
75579 +                     ITEM_TYPE           *ins_item)                                    \
75580 +{                                                                                      \
75581 +  PREFIX##_hash_insert_index (hash,                                                    \
75582 +                              HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item);         \
75583 +}                                                                                      \
75584 +/* As _hash_insert, but the item is linked in through _hash_insert_index_rcu */        \
75585 +static __inline__ void                                                                 \
75586 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash,                                   \
75587 +                         ITEM_TYPE           *ins_item)                                \
75588 +{                                                                                      \
75589 +  PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME),          \
75590 +                                  ins_item);                                           \
75591 +}                                                                                      \
75592 +/* Head of the first non-empty bucket with index >= @ind, or NULL if there is none */  \
75593 +static __inline__ ITEM_TYPE *                                                          \
75594 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind)                             \
75595 +{                                                                                      \
75596 +  ITEM_TYPE *first;                                                                    \
75597 +                                                                                       \
75598 +  for (first = NULL; ind < hash->_buckets; ++ ind) {                                   \
75599 +    first = hash->_table[ind];                                                         \
75600 +    if (first != NULL)                                                                 \
75601 +      break;                                                                           \
75602 +  }                                                                                    \
75603 +  return first;                                                                                \
75604 +}                                                                                      \
75605 +/* Item after @item: next in its chain, else head of a later bucket; NULL at the end */\
75606 +static __inline__ ITEM_TYPE *                                                          \
75607 +PREFIX##_hash_next (PREFIX##_hash_table *hash,                                         \
75608 +                   ITEM_TYPE           *item)                                          \
75609 +{                                                                                      \
75610 +  ITEM_TYPE  *next;                                                                    \
75611 +                                                                                       \
75612 +  if (item == NULL)                                                                    \
75613 +    return NULL;                                                                       \
75614 +  next = item->LINK_NAME._next;                                                                \
75615 +  if (next == NULL)                                                                    \
75616 +    next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1);           \
75617 +  return next;                                                                         \
75618 +}                                                                                      \
75619 +/* no ";" below - presumably so that the macro invocation can end with one (confirm) */\
75620 +typedef struct {} PREFIX##_hash_dummy
75621 +/* Iterate @head over the addresses of all _buckets slots of @table->_table. */
75622 +#define for_all_ht_buckets(table, head)                                        \
75623 +for ((head) = &(table) -> _table[ 0 ] ;                                        \
75624 +     (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
75625 +/* Iterate @item over the chain at *@bucket; @next is read before each loop body runs. */
75626 +#define for_all_in_bucket(bucket, item, next, field)                           \
75627 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ;      \
75628 +     (item) != NULL ;                                                          \
75629 +     (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
75630 +/* Iterate @item over every item of @table; @next is computed before each loop body runs. */
75631 +#define for_all_in_htable(table, prefix, item, next)   \
75632 +for ((item) = prefix ## _hash_first ((table), 0),      \
75633 +     (next) = prefix ## _hash_next ((table), (item)) ; \
75634 +     (item) != NULL ;                                  \
75635 +     (item) = (next),                                  \
75636 +     (next) = prefix ## _hash_next ((table), (item)))
75637 +
75638 +/* __REISER4_TYPE_SAFE_HASH_H__ */
75639 +#endif
75640 +
75641 +/* Make Linus happy.
75642 +   Local variables:
75643 +   c-indentation-style: "K&R"
75644 +   mode-name: "LC"
75645 +   c-basic-offset: 8
75646 +   tab-width: 8
75647 +   fill-column: 120
75648 +   End:
75649 +*/
75650 diff -puN /dev/null fs/reiser4/vfs_ops.c
75651 --- /dev/null
75652 +++ a/fs/reiser4/vfs_ops.c
75653 @@ -0,0 +1,259 @@
75654 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75655 + * reiser4/README */
75656 +
75657 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
75658 +   here. */
75659 +
75660 +#include "forward.h"
75661 +#include "debug.h"
75662 +#include "dformat.h"
75663 +#include "coord.h"
75664 +#include "plugin/item/item.h"
75665 +#include "plugin/file/file.h"
75666 +#include "plugin/security/perm.h"
75667 +#include "plugin/disk_format/disk_format.h"
75668 +#include "plugin/plugin.h"
75669 +#include "plugin/plugin_set.h"
75670 +#include "plugin/object.h"
75671 +#include "txnmgr.h"
75672 +#include "jnode.h"
75673 +#include "znode.h"
75674 +#include "block_alloc.h"
75675 +#include "tree.h"
75676 +#include "vfs_ops.h"
75677 +#include "inode.h"
75678 +#include "page_cache.h"
75679 +#include "ktxnmgrd.h"
75680 +#include "super.h"
75681 +#include "reiser4.h"
75682 +#include "entd.h"
75683 +#include "status_flags.h"
75684 +#include "flush.h"
75685 +#include "dscale.h"
75686 +
75687 +#include <linux/profile.h>
75688 +#include <linux/types.h>
75689 +#include <linux/mount.h>
75690 +#include <linux/vfs.h>
75691 +#include <linux/mm.h>
75692 +#include <linux/buffer_head.h>
75693 +#include <linux/dcache.h>
75694 +#include <linux/list.h>
75695 +#include <linux/pagemap.h>
75696 +#include <linux/slab.h>
75697 +#include <linux/seq_file.h>
75698 +#include <linux/init.h>
75699 +#include <linux/module.h>
75700 +#include <linux/writeback.h>
75701 +#include <linux/blkdev.h>
75702 +#include <linux/quotaops.h>
75703 +#include <linux/security.h>
75704 +#include <linux/reboot.h>
75705 +#include <linux/rcupdate.h>
75706 +
75707 +/* update inode stat-data by calling the file plugin's ->write_sd_by_inode() */
75708 +int reiser4_update_sd(struct inode *object)
75709 +{
75710 +       file_plugin *fplug;
75711 +
75712 +       assert("nikita-2338", object != NULL);
75713 +       /* read-only file system: nothing is written and 0 is returned */
75714 +       if (IS_RDONLY(object))
75715 +               return 0;
75716 +
75717 +       fplug = inode_file_plugin(object);
75718 +       assert("nikita-2339", fplug != NULL);
75719 +       return fplug->write_sd_by_inode(object);
75720 +}
75721 +
75722 +/* helper function: increase inode nlink count (through the file plugin's
75723 +   ->add_link()) and, when @write_sd_p is set, save updated stat-data.
75724 +   Returns -EMLINK (via RETERR) if ->can_add_link() refuses, else the plugin result.
75725 +   Used by link/create and during creation of dot and dotdot in mkdir
75726 +*/
75727 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
75728 +                     struct inode *parent /* parent where new entry will be */
75729 +                     ,
75730 +                     int write_sd_p    /* true if stat-data has to be
75731 +                                        * updated */ )
75732 +{
75733 +       file_plugin *fplug;
75734 +       int result;
75735 +
75736 +       assert("nikita-1351", object != NULL);
75737 +
75738 +       fplug = inode_file_plugin(object);
75739 +       assert("nikita-1445", fplug != NULL);
75740 +
75741 +       /* ask plugin whether it can add yet another link to this
75742 +          object */
75743 +       if (!fplug->can_add_link(object))
75744 +               return RETERR(-EMLINK);
75745 +
75746 +       assert("nikita-2211", fplug->add_link != NULL);
75747 +       /* call plugin to do actual addition of link */
75748 +       result = fplug->add_link(object, parent);
75749 +
75750 +       /* optionally update stat data; skipped when ->add_link() failed */
75751 +       if (result == 0 && write_sd_p)
75752 +               result = fplug->write_sd_by_inode(object);
75753 +       return result;
75754 +}
75755 +
75756 +/* helper function: decrease inode nlink count (through the file plugin's
75757 +   ->rem_link()) and, when @write_sd_p is set, save updated stat-data.
75758 +   Returns the result of ->rem_link(), or of ->write_sd_by_inode() if that ran.
75759 +   Used by unlink/create
75760 +*/
75761 +int reiser4_del_nlink(struct inode *object     /* object from which link is
75762 +                                                * removed */ ,
75763 +                     struct inode *parent /* parent where entry was */ ,
75764 +                     int write_sd_p    /* true if stat-data has to be
75765 +                                        * updated */ )
75766 +{
75767 +       file_plugin *fplug;
75768 +       int result;
75769 +
75770 +       assert("nikita-1349", object != NULL);
75771 +
75772 +       fplug = inode_file_plugin(object);
75773 +       assert("nikita-1350", fplug != NULL);
75774 +       assert("nikita-1446", object->i_nlink > 0);
75775 +       assert("nikita-2210", fplug->rem_link != NULL);
75776 +
75777 +       /* call plugin to do actual deletion of link */
75778 +       result = fplug->rem_link(object, parent);
75779 +
75780 +       /* optionally update stat data; skipped when ->rem_link() failed */
75781 +       if (result == 0 && write_sd_p)
75782 +               result = fplug->write_sd_by_inode(object);
75783 +       return result;
75784 +}
75785 +
75786 +/* Release reiser4 dentry by passing it to reiser4_free_dentry_fsdata(). This is d_op->d_release() method. */
75787 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
75788 +{
75789 +       reiser4_free_dentry_fsdata(dentry);
75790 +}
75791 +
75792 +/*
75793 + * Called by reiser4_sync_inodes(), during speculative write-back (through
75794 + * pdflush, or balance_dirty_pages()).
75795 + *
75796 + * If @wbc->sync_mode is not WB_SYNC_NONE, all atoms are force-committed and
75797 + * that is all. Otherwise atoms are flushed until @wbc->nr_to_write runs out,
75798 + * a pass submits nothing, or a non-blocking caller meets a congested device.
75799 + */
75800 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
75801 +{
75802 +       int result;
75803 +       struct address_space *mapping;
75804 +
75805 +       /*
75806 +        * Performs early flushing, trying to free some memory. If there is
75807 +        * nothing to flush, commits some atoms.
75808 +        */
75809 +
75810 +       /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
75811 +          sys_fsync(). */
75812 +       if (wbc->sync_mode != WB_SYNC_NONE) {
75813 +               txnmgr_force_commit_all(sb, 0);
75814 +               return;
75815 +       }
75816 +
75817 +       BUG_ON(reiser4_get_super_fake(sb) == NULL);
75818 +       mapping = reiser4_get_super_fake(sb)->i_mapping;
75819 +       do {
75820 +               long nr_submitted = 0;
75821 +               jnode *node = NULL;
75822 +
75823 +               /* do not put more requests to overload write queue */
75824 +               if (wbc->nonblocking &&
75825 +                   bdi_write_congested(mapping->backing_dev_info)) {
75826 +                       blk_run_address_space(mapping);
75827 +                       wbc->encountered_congestion = 1;
75828 +                       break;
75829 +               }
75830 +               BUG_ON(wbc->nr_to_write <= 0);
75831 +
75832 +               if (get_current_context()->entd) {
75833 +                       entd_context *ent = get_entd_context(sb);
75834 +
75835 +                       if (ent->cur_request->node)
75836 +                               /*
75837 +                                * this is ent thread and it managed to capture
75838 +                                * requested page itself - start flush from
75839 +                                * that page
75840 +                                */
75841 +                               node = jref(ent->cur_request->node);
75842 +               }
75843 +
75844 +               result = flush_some_atom(node, &nr_submitted, wbc,
75845 +                                        JNODE_FLUSH_WRITE_BLOCKS);
75846 +               if (result != 0)
75847 +                       warning("nikita-31001", "Flush failed: %i", result);
75848 +               if (node)
75849 +                       jput(node);
75850 +               if (!nr_submitted)
75851 +                       break;
75852 +
75853 +               /* charge what this pass submitted against the caller's budget */
75854 +               wbc->nr_to_write -= nr_submitted;
75855 +       } while (wbc->nr_to_write > 0);
75856 +}
75856 +/* Calls reiser4_txn_restart_current(), then balance_dirty_pages_ratelimited() on @inode's mapping. */
75857 +void reiser4_throttle_write(struct inode *inode)
75858 +{
75859 +       reiser4_txn_restart_current();
75860 +       balance_dirty_pages_ratelimited(inode->i_mapping);
75861 +}
75862 +/* NOTE(review): presumably the on-disk magic that marks a reiser4 volume - confirm against the disk format code */
75863 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
75864 +const int REISER4_MAGIC_OFFSET = 16 * 4096;    /* offset to magic string from the
75865 +                                                * beginning of device */
75866 +
75867 +/*
75868 + * Reiser4 initialization/shutdown.
75869 + *
75870 + * Code below performs global reiser4 initialization that is done either as
75871 + * part of kernel initialization (when reiser4 is statically built-in), or
75872 + * during reiser4 module load (when compiled as module).
75873 + */
75874 +/* On error: write REISER4_STATUS_DAMAGED; onerror 0 panics, otherwise the sb is made read-only */
75875 +void reiser4_handle_error(void)
75876 +{
75877 +       struct super_block *sb = reiser4_get_current_sb();
75878 +
75879 +       if (!sb)        /* no current super block: nothing to do */
75880 +               return;
75881 +       reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
75882 +                            "Filesystem error occured");
75883 +       switch (get_super_private(sb)->onerror) {
75884 +       case 0:
75885 +               reiser4_panic("foobar-42", "Filesystem error occured\n");
75886 +       case 1: /* no break above: also reached from case 0 if reiser4_panic() returns */
75887 +       default:
75888 +               if (sb->s_flags & MS_RDONLY)
75889 +                       return;
75890 +               sb->s_flags |= MS_RDONLY;
75891 +               break;
75892 +       }
75893 +}
75894 +/* dentry operations: only ->d_release is provided; every other method listed is NULL */
75895 +struct dentry_operations reiser4_dentry_operations = {
75896 +       .d_revalidate = NULL,
75897 +       .d_hash = NULL,
75898 +       .d_compare = NULL,
75899 +       .d_delete = NULL,
75900 +       .d_release = reiser4_d_release,
75901 +       .d_iput = NULL,
75902 +};
75903 +
75904 +/* Make Linus happy.
75905 +   Local variables:
75906 +   c-indentation-style: "K&R"
75907 +   mode-name: "LC"
75908 +   c-basic-offset: 8
75909 +   tab-width: 8
75910 +   fill-column: 120
75911 +   End:
75912 +*/
75913 diff -puN /dev/null fs/reiser4/vfs_ops.h
75914 --- /dev/null
75915 +++ a/fs/reiser4/vfs_ops.h
75916 @@ -0,0 +1,53 @@
75917 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75918 + * reiser4/README */
75919 +
75920 +/* vfs_ops.c's exported symbols */
75921 +
75922 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
75923 +#define __FS_REISER4_VFS_OPS_H__
75924 +
75925 +#include "forward.h"
75926 +#include "coord.h"
75927 +#include "seal.h"
75928 +#include "plugin/file/file.h"
75929 +#include "super.h"
75930 +#include "readahead.h"
75931 +
75932 +#include <linux/types.h>       /* for loff_t */
75933 +#include <linux/fs.h>          /* for struct address_space */
75934 +#include <linux/dcache.h>      /* for struct dentry */
75935 +#include <linux/mm.h>
75936 +#include <linux/backing-dev.h>
75937 +
75938 +/* address space operations */
75939 +int reiser4_writepage(struct page *, struct writeback_control *);
75940 +int reiser4_set_page_dirty(struct page *);
75941 +void reiser4_invalidatepage(struct page *, unsigned long offset);
75942 +int reiser4_releasepage(struct page *, gfp_t);
75943 +
75944 +extern int reiser4_update_sd(struct inode *);  /* defined in vfs_ops.c, like the two below */
75945 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
75946 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
75947 +
75948 +extern int reiser4_start_up_io(struct page *page);
75949 +extern void reiser4_throttle_write(struct inode *);
75950 +extern int jnode_is_releasable(jnode *);
75951 +
75952 +#define CAPTURE_APAGE_BURST (1024l)
75953 +void reiser4_writeout(struct super_block *, struct writeback_control *);
75954 +
75955 +extern void reiser4_handle_error(void);        /* defined in vfs_ops.c */
75956 +
75957 +/* __FS_REISER4_VFS_OPS_H__ */
75958 +#endif
75959 +
75960 +/* Make Linus happy.
75961 +   Local variables:
75962 +   c-indentation-style: "K&R"
75963 +   mode-name: "LC"
75964 +   c-basic-offset: 8
75965 +   tab-width: 8
75966 +   fill-column: 120
75967 +   scroll-step: 1
75968 +   End:
75969 +*/
75970 diff -puN /dev/null fs/reiser4/wander.c
75971 --- /dev/null
75972 +++ a/fs/reiser4/wander.c
75973 @@ -0,0 +1,1797 @@
75974 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
75975 + * reiser4/README */
75976 +
75977 +/* Reiser4 Wandering Log */
75978 +
75979 +/* You should read http://www.namesys.com/txn-doc.html
75980 +
75981 +   That describes how filesystem operations are performed as atomic
75982 +   transactions, and how we try to arrange it so that we can write most of the
75983 +   data only once while performing the operation atomically.
75984 +
75985 +   For the purposes of this code, it is enough for it to understand that it
75986 +   has been told a given block should be written either once, or twice (if
75987 +   twice then once to the wandered location and once to the real location).
75988 +
75989 +   This code guarantees that those blocks that are defined to be part of an
75990 +   atom either all take effect or none of them take effect.
75991 +
75992 +   Relocate set nodes are submitted to write by the jnode_flush() routine, and
75993 +   the overwrite set is submitted by reiser4_write_log().  This is because with
75994 +   the overwrite set we seek to optimize writes, and with the relocate set we
75995 +   seek to cause disk order to correlate with the parent first pre-order.
75996 +
75997 +   reiser4_write_log() allocates and writes wandered blocks and maintains
75998 +   additional on-disk structures of the atom as wander records (each wander
75999 +   record occupies one block) for storing of the "wandered map" (a table which
76000 +   contains a relation between wandered and real block numbers) and other
76001 +   information which might be needed at transaction recovery time.
76002 +
76003 +   The wander records are unidirectionally linked into a circle: each wander
76004 +   record contains a block number of the next wander record, the last wander
76005 +   record points to the first one.
76006 +
76007 +   One wander record (named "tx head" in this file) has a format which is
76008 +   different from the other wander records. The "tx head" has a reference to the
76009 +   "tx head" block of the previously committed atom.  Also, "tx head" contains
76010 +   fs information (the free blocks counter, and the oid allocator state) which
76011 +   is logged in a special way.
76012 +
76013 +   There are two journal control blocks, named journal header and journal
76014 +   footer which have fixed on-disk locations.  The journal header has a
76015 +   reference to the "tx head" block of the last committed atom.  The journal
76016 +   footer points to the "tx head" of the last flushed atom.  The atom is
76017 +   "played" when all blocks from its overwrite set are written to disk the
76018 +   second time (i.e. written to their real locations).
76019 +
76020 +   NOTE: People who know reiserfs internals and its journal structure might be
76021 +   confused with these terms journal footer and journal header. There is a table
76022 +   with terms of similar semantics in reiserfs (reiser3) and reiser4:
76023 +
76024 +   REISER3 TERM        |  REISER4 TERM         | DESCRIPTION
76025 +   --------------------+-----------------------+----------------------------
76026 +   commit record       |  journal header       | atomic write of this record
76027 +                       |                       | ends transaction commit
76028 +   --------------------+-----------------------+----------------------------
76029 +   journal header      |  journal footer       | atomic write of this record
76030 +                       |                       | ends post-commit writes.
76031 +                       |                       | After this record is
76032 +                       |                       | written successfully, the
76033 +                       |                       | journal blocks (in reiser3) or
76034 +                       |                       | wandered blocks/records are
76035 +                       |                       | free for re-use.
76036 +   --------------------+-----------------------+----------------------------
76037 +
76038 +   The atom commit process is the following:
76039 +
76040 +   1. The overwrite set is taken from atom's clean list, and its size is
76041 +      counted.
76042 +
76043 +   2. The number of necessary wander records (including tx head) is calculated,
76044 +      and the wander record blocks are allocated.
76045 +
76046 +   3. Allocate wandered blocks and populate wander records by wandered map.
76047 +
76048 +   4. submit write requests for wander records and wandered blocks.
76049 +
76050 +   5. wait until submitted write requests complete.
76051 +
76052 +   6. update journal header: change the pointer to the block number of just
76053 +   written tx head, submit an i/o for modified journal header block and wait
76054 +   for i/o completion.
76055 +
76056 +   NOTE: The special logging for bitmap blocks and some reiser4 super block
76057 +   fields makes processes of atom commit, flush and recovering a bit more
76058 +   complex (see comments in the source code for details).
76059 +
76060 +   The atom playing process is the following:
76061 +
76062 +   1. Write atom's overwrite set in-place.
76063 +
76064 +   2. Wait on i/o.
76065 +
76066 +   3. Update journal footer: change the pointer to block number of tx head
76067 +   block of the atom we are currently flushing, submit an i/o, wait on i/o
76068 +   completion.
76069 +
76070 +   4. Free disk space which was used for wandered blocks and wander records.
76071 +
76072 +   After the freeing of wandered blocks and wander records we have that journal
76073 +   footer points to the on-disk structure which might be overwritten soon.
76074 +   Neither the log writer nor the journal recovery procedure use that pointer
76075 +   for accessing the data.  When the journal recovery procedure finds the oldest
76076 +   transaction it compares the journal footer pointer value with the "prev_tx"
76077 +   pointer value in tx head, if values are equal the oldest not flushed
76078 +   transaction is found.
76079 +
76080 +   NOTE on disk space leakage: the information about which blocks and how many
76081 +   blocks are allocated for wandered blocks and wander records is not written to
76082 +   the disk because of special logging for bitmaps and some super block
76083 +   counters.  After a system crash reiser4 does not remember the allocation of
76084 +   those objects, thus there is no disk space leakage of this kind.
76085 +*/
76086 +
76087 +/* Special logging of reiser4 super block fields. */
76088 +
76089 +/* There are some reiser4 super block fields (free block count and OID allocator
76090 +   state (number of files and next free OID)) which are logged separately from
76091 +   super block to avoid unnecessary atom fusion.
76092 +
76093 +   So, the reiser4 super block need not be captured by a transaction which
76094 +   allocates/deallocates disk blocks or creates/deletes file objects.  Moreover,
76095 +   the reiser4 on-disk super block is not touched when such a transaction is
76096 +   committed and flushed.  Those "counters logged specially" are logged in "tx
76097 +   head" blocks and in the journal footer block.
76098 +
76099 +   A step-by-step description of special logging:
76100 +
76101 +   0. The per-atom information about deleted or created files and allocated or
76102 +   freed blocks is collected during the transaction.  The atom's
76103 +   ->nr_objects_created and ->nr_objects_deleted are for object
76104 +   deletion/creation tracking, the numbers of allocated and freed blocks are
76105 +   calculated using atom's delete set and atom's capture list -- all new and
76106 +   relocated nodes should be on atom's clean list and should have JNODE_RELOC
76107 +   bit set.
76108 +
76109 +   1. The "logged specially" reiser4 super block fields have their "committed"
76110 +   versions in the reiser4 in-memory super block.  They get modified only at
76111 +   atom commit time.  The atom's commit thread has an exclusive access to those
76112 +   "committed" fields because the log writer implementation supports only one
76113 +   atom commit at a time (there is a per-fs "commit" mutex).  At
76114 +   that time "committed" counters are modified using per-atom information
76115 +   collected during the transaction. These counters are stored on disk as a
76116 +   part of tx head block when atom is committed.
76117 +
76118 +   2. When the atom is flushed the value of the free block counter and the OID
76119 +   allocator state get written to the journal footer block.  A special journal
76120 +   procedure (journal_recover_sb_data()) takes those values from the journal
76121 +   footer and updates the reiser4 in-memory super block.
76122 +
76123 +   NOTE: That means free block count and OID allocator state are logged
76124 +   separately from the reiser4 super block regardless of the fact that the
76125 +   reiser4 super block has fields to store both the free block counter and the
76126 +   OID allocator.
76127 +
76128 +   Writing the whole super block at commit time requires knowing true values of
76129 +   all its fields without changes made by not yet committed transactions. It is
76130 +   possible by having a "committed" version of the super block like the
76131 +   reiser4 bitmap blocks have "committed" and "working" versions.  However,
76132 +   another scheme was implemented which stores special logged values in the
76133 +   unused free space inside transaction head block.  In my opinion it has an
76134 +   advantage of not writing whole super block when only part of it was
76135 +   modified. */
76136 +
76137 +#include "debug.h"
76138 +#include "dformat.h"
76139 +#include "txnmgr.h"
76140 +#include "jnode.h"
76141 +#include "znode.h"
76142 +#include "block_alloc.h"
76143 +#include "page_cache.h"
76144 +#include "wander.h"
76145 +#include "reiser4.h"
76146 +#include "super.h"
76147 +#include "vfs_ops.h"
76148 +#include "writeout.h"
76149 +#include "inode.h"
76150 +#include "entd.h"
76151 +
76152 +#include <linux/types.h>
76153 +#include <linux/fs.h>          /* for struct super_block  */
76154 +#include <linux/mm.h>          /* for struct page */
76155 +#include <linux/pagemap.h>
76156 +#include <linux/bio.h>         /* for struct bio */
76157 +#include <linux/blkdev.h>
76158 +
76159 +static int write_jnodes_to_disk_extent(
76160 +       jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
76161 +
76162 +/* The commit_handle is a container for objects needed at atom commit time  */
76163 +struct commit_handle {
76164 +       /* A pointer to atom's list of OVRWR nodes */
76165 +       struct list_head *overwrite_set;
76166 +       /* atom's overwrite set size (number of jnodes counted on that list) */
76167 +       int overwrite_set_size;
76168 +       /* list of jnodes for wander record blocks; first entry is the tx head */
76169 +       struct list_head tx_list;
76170 +       /* number of wander records, tx head included */
76171 +       __u32 tx_size;
76172 +       /* 'committed' sb counters are saved here until atom is completely
76173 +          flushed; they are copied into tx head and journal footer blocks */
76174 +       __u64 free_blocks;
76175 +       __u64 nr_files;
76176 +       __u64 next_oid;
76177 +       /* A pointer to the atom which is being committed */
76178 +       txn_atom *atom;
76179 +       /* A pointer to current super block */
76180 +       struct super_block *super;
76181 +       /* number of JNODE_BITMAP jnodes seen while scanning the overwrite set */
76182 +       reiser4_block_nr nr_bitmap;
76183 +};
76184 +
76185 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
76186 +{      /* prepare @ch for committing @atom on the current super block */
76187 +       memset(ch, 0, sizeof(struct commit_handle)); /* all counters start at 0 */
76188 +       INIT_LIST_HEAD(&ch->tx_list); /* no wander records yet */
76189 +
76190 +       ch->atom = atom;
76191 +       ch->super = reiser4_get_current_sb();
76192 +}
76193 +
76194 +static void done_commit_handle(struct commit_handle *ch)
76195 +{      /* releases nothing; only asserts that tx_list is already empty */
76196 +       assert("zam-690", list_empty(&ch->tx_list));
76197 +}
76198 +
76199 +static inline int reiser4_use_write_barrier(struct super_block * s)
76200 +{      /* non-zero unless REISER4_NO_WRITE_BARRIER is reported as set for @s */
76201 +       return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
76202 +}
76203 +
76204 +static void disable_write_barrier(struct super_block * s)
76205 +{      /* emit a notice and set REISER4_NO_WRITE_BARRIER in fs_flags of @s */
76206 +       notice("zam-1055", "%s does not support write barriers,"
76207 +              " using synchronous write instead.", s->s_id);
76208 +       set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
76209 +}
76210 +
76211 +/* fill journal header block: ->last_committed_tx = block number of tx head */
76212 +static void format_journal_header(struct commit_handle *ch)
76213 +{
76214 +       struct reiser4_super_info_data *sbinfo;
76215 +       struct journal_header *header;
76216 +       jnode *txhead;
76217 +
76218 +       sbinfo = get_super_private(ch->super);
76219 +       assert("zam-479", sbinfo != NULL);
76220 +       assert("zam-480", sbinfo->journal_header != NULL);
76221 +
76222 +       txhead = list_entry(ch->tx_list.next, jnode, capture_link); /* first record */
76223 +
76224 +       jload(sbinfo->journal_header); /* NOTE(review): return value is not checked */
76225 +
76226 +       header = (struct journal_header *)jdata(sbinfo->journal_header);
76227 +       assert("zam-484", header != NULL);
76228 +
76229 +       put_unaligned(cpu_to_le64(*jnode_get_block(txhead)), /* little-endian on disk */
76230 +                     &header->last_committed_tx);
76231 +
76232 +       jrelse(sbinfo->journal_header); /* pairs with jload() above */
76233 +}
76234 +
76235 +/* fill journal footer block: last flushed tx head + saved 'committed' counters */
76236 +static void format_journal_footer(struct commit_handle *ch)
76237 +{
76238 +       struct reiser4_super_info_data *sbinfo;
76239 +       struct journal_footer *footer;
76240 +       jnode *tx_head;
76241 +
76242 +       sbinfo = get_super_private(ch->super);
76243 +
76244 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link); /* first record */
76245 +
76246 +       assert("zam-493", sbinfo != NULL);
76247 +       assert("zam-494", sbinfo->journal_footer != NULL); /* the jnode used below */
76248 +
76249 +       check_me("zam-691", jload(sbinfo->journal_footer) == 0);
76250 +
76251 +       footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
76252 +       assert("zam-495", footer != NULL);
76253 +
76254 +       put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)), /* little-endian on disk */
76255 +                     &footer->last_flushed_tx);
76256 +       put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
76257 +
76258 +       put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
76259 +       put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
76260 +
76261 +       jrelse(sbinfo->journal_footer); /* pairs with jload() above */
76262 +}
76263 +
76264 +/* number of wander entries that fit in one block after the record header */
76265 +static int wander_record_capacity(const struct super_block *super)
76266 +{
76267 +       return (super->s_blocksize -
76268 +               sizeof(struct wander_record_header)) /
76269 +           sizeof(struct wander_entry);
76270 +}
76271 +
76272 +/* Fill first wander record (tx head) from the data kept in the commit handle */
76273 +static void format_tx_head(struct commit_handle *ch)
76274 +{
76275 +       jnode *tx_head;
76276 +       jnode *next;
76277 +       struct tx_header *header;
76278 +
76279 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
76280 +       assert("zam-692", &ch->tx_list != &tx_head->capture_link); /* list not empty */
76281 +
76282 +       next = list_entry(tx_head->capture_link.next, jnode, capture_link);
76283 +       if (&ch->tx_list == &next->capture_link)
76284 +               next = tx_head; /* tx head is the only record: it links to itself */
76285 +
76286 +       header = (struct tx_header *)jdata(tx_head);
76287 +
76288 +       assert("zam-460", header != NULL);
76289 +       assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
76290 +
76291 +       memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize); /* clear whole block */
76292 +       memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
76293 +
76294 +       put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
76295 +       put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
76296 +                     &header->prev_tx);
76297 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
76298 +       put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
76299 +       put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
76300 +       put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
76301 +}
76302 +
76303 +/* prepare ordinary wander record block @node (fill all service fields) */
76304 +static void
76305 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
76306 +{
76307 +       struct wander_record_header *LRH;
76308 +       jnode *next;
76309 +
76310 +       assert("zam-464", node != NULL);
76311 +
76312 +       LRH = (struct wander_record_header *)jdata(node);
76313 +       next = list_entry(node->capture_link.next, jnode, capture_link);
76314 +
76315 +       if (&ch->tx_list == &next->capture_link) /* last record: wrap to the first */
76316 +               next = list_entry(ch->tx_list.next, jnode, capture_link);
76317 +
76318 +       assert("zam-465", LRH != NULL);
76319 +       assert("zam-463",
76320 +              ch->super->s_blocksize > sizeof(struct wander_record_header));
76321 +
76322 +       memset(jdata(node), 0, (size_t) ch->super->s_blocksize); /* clear whole block */
76323 +       memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
76324 +
76325 +       put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
76326 +       put_unaligned(cpu_to_le32(serial), &LRH->serial);
76327 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
76328 +}
76329 +
76330 +/* store pair (@a original, @b wandered) into slot @index of wander record @node */
76331 +static void
76332 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
76333 +           const reiser4_block_nr * b)
76334 +{
76335 +       char *data;
76336 +       struct wander_entry *pairs;
76337 +
76338 +       data = jdata(node);
76339 +       assert("zam-451", data != NULL);
76340 +
76341 +       pairs = /* entries start right after the record header */
76342 +           (struct wander_entry *)(data + sizeof(struct wander_record_header));
76343 +
76344 +       put_unaligned(cpu_to_le64(*a), &pairs[index].original);
76345 +       put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
76346 +}
76347 +
76348 +/* currently, wander records contain only the wandered map, so their number
76349 +   depends on overwrite set size; the result is stored in ch->tx_size */
76350 +static void get_tx_size(struct commit_handle *ch)
76351 +{
76352 +       assert("zam-440", ch->overwrite_set_size != 0);
76353 +       assert("zam-695", ch->tx_size == 0);
76354 +
76355 +       /* count all ordinary wander records
76356 +          (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
76357 +          for tx head block */
76358 +       ch->tx_size =
76359 +           (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
76360 +           2;
76361 +}
76362 +
76363 +/* A special structure for using in store_wmap_actor() for saving its state
76364 +   between calls */
76365 +struct store_wmap_params {
76366 +       jnode *cur;             /* jnode of current wander record to fill */
76367 +       int idx;                /* free element index in wander record  */
76368 +       int capacity;           /* max number of entries per wander record */
76369 +
76370 +#if REISER4_DEBUG
76371 +       struct list_head *tx_list; /* list head, used only in an assertion */
76372 +#endif
76373 +};
76374 +
76375 +/* an actor for use in blocknr_set_iterator routine which populates the list
76376 +   of pre-formatted wander records by wandered map info; always returns 0 */
76377 +static int
76378 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
76379 +                const reiser4_block_nr * b, void *data)
76380 +{
76381 +       struct store_wmap_params *params = data;
76382 +
76383 +       if (params->idx >= params->capacity) { /* current record is full */
76384 +               /* a new wander record should be taken from the tx_list */
76385 +               params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
76386 +               assert("zam-454",
76387 +                      params->tx_list != &params->cur->capture_link);
76388 +
76389 +               params->idx = 0;
76390 +       }
76391 +
76392 +       store_entry(params->cur, params->idx, a, b);
76393 +       params->idx++;
76394 +
76395 +       return 0;
76396 +}
76397 +
76398 +/* This function is called after Relocate set gets written to disk, Overwrite
76399 +   set is written to wandered locations and all wander records are written
76400 +   also. The updated journal header block contains a pointer (block number) to
76401 +   the first wander record of the just written transaction.  Returns 0 or error */
76402 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
76403 +{
76404 +       struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
76405 +       jnode *jh = sbinfo->journal_header;
76406 +       jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
76407 +       int ret;
76408 +
76409 +       format_journal_header(ch);
76410 +
76411 +       ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
76412 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
76413 +       if (ret)
76414 +               return ret;
76415 +
76416 +       /* blk_run_address_space(sbinfo->fake->i_mapping);
76417 +        * blk_run_queues(); */
76418 +
76419 +       ret = jwait_io(jh, WRITE); /* wait for the header block i/o */
76420 +
76421 +       if (ret)
76422 +               return ret;
76423 +
76424 +       sbinfo->last_committed_tx = *jnode_get_block(head); /* only after success */
76425 +
76426 +       return 0;
76427 +}
76428 +
76429 +/* This function is called after write-back is finished. It formats the journal
76430 +   footer block, submits it for writing and waits for the i/o; it does not free
76431 +   wandered blocks or wander records itself.  Returns 0 or an error code */
76432 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
76433 +{
76434 +       reiser4_super_info_data *sbinfo = get_super_private(ch->super);
76435 +
76436 +       jnode *jf = sbinfo->journal_footer;
76437 +
76438 +       int ret;
76439 +
76440 +       format_journal_footer(ch);
76441 +
76442 +       ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
76443 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
76444 +       if (ret)
76445 +               return ret;
76446 +
76447 +       /* blk_run_address_space(sbinfo->fake->i_mapping);
76448 +        * blk_run_queue(); */
76449 +
76450 +       ret = jwait_io(jf, WRITE); /* wait for the footer block i/o */
76451 +       if (ret)
76452 +               return ret;
76453 +
76454 +       return 0;
76455 +}
76456 +
76457 +/* free block numbers of wander records of already written in place transaction */
76458 +static void dealloc_tx_list(struct commit_handle *ch)
76459 +{
76460 +       while (!list_empty(&ch->tx_list)) { /* pop records until the list is empty */
76461 +               jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
76462 +               list_del(&cur->capture_link);
76463 +               ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
76464 +               reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
76465 +                                     BA_FORMATTED);
76466 +
76467 +               unpin_jnode_data(cur);
76468 +               reiser4_drop_io_head(cur);
76469 +       }
76470 +}
76471 +
76472 +/* An actor for use in blocknr_set_iterator() routine which frees wandered
76473 +   blocks (@b of each pair) from atom's wandered map; always returns 0. */
76474 +static int
76475 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
76476 +                  const reiser4_block_nr * a UNUSED_ARG,
76477 +                  const reiser4_block_nr * b, void *data UNUSED_ARG)
76478 +{
76479 +
76480 +       assert("zam-499", b != NULL);
76481 +       assert("zam-500", *b != 0);
76482 +       assert("zam-501", !reiser4_blocknr_is_fake(b));
76483 +
76484 +       reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
76485 +       return 0;
76486 +}
76487 +
76488 +/* free wandered block locations of already written in place transaction */
76489 +static void dealloc_wmap(struct commit_handle *ch)
76490 +{
76491 +       assert("zam-696", ch->atom != NULL);
76492 +
76493 +       blocknr_set_iterator(ch->atom, &ch->atom->wandered_map, /* visit each pair */
76494 +                            dealloc_wmap_actor, NULL, 1);
76495 +}
76496 +
76497 +/* helper for wandered block allocation: requests @count blocks; the extent start
76498 +   is stored in @start and the length set by reiser4_alloc_blocks() in @len */
76499 +static int
76500 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
76501 +{
76502 +       reiser4_blocknr_hint hint;
76503 +       int ret;
76504 +
76505 +       reiser4_block_nr wide_len = count;
76506 +
76507 +       /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
76508 +          ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
76509 +          reserved allocation area so as to get the best qualities of fixed
76510 +          journals? */
76511 +       reiser4_blocknr_hint_init(&hint);
76512 +       hint.block_stage = BLOCK_GRABBED;
76513 +
76514 +       ret = reiser4_alloc_blocks(&hint, start, &wide_len,
76515 +                                  BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
76516 +       *len = (int)wide_len; /* written even when ret reports an error */
76517 +
76518 +       return ret;
76519 +}
76520 +
76521 +/*
76522 + * roll back per-page and per-jnode state set up for @bio; also drops the bio.
76523 + */
76524 +static void undo_bio(struct bio *bio)
76525 +{
76526 +       int i;
76527 +
76528 +       for (i = 0; i < bio->bi_vcnt; ++i) { /* every page attached to the bio */
76529 +               struct page *pg;
76530 +               jnode *node;
76531 +
76532 +               pg = bio->bi_io_vec[i].bv_page;
76533 +               end_page_writeback(pg);
76534 +               node = jprivate(pg);
76535 +               spin_lock_jnode(node);
76536 +               JF_CLR(node, JNODE_WRITEBACK); /* no longer under writeback ... */
76537 +               JF_SET(node, JNODE_DIRTY); /* ... and marked dirty again */
76538 +               spin_unlock_jnode(node);
76539 +       }
76540 +       bio_put(bio);
76541 +}
76542 +
76543 +/* release overwrite set: call jrelse_tail() on every jnode of ch->overwrite_set */
76544 +static void put_overwrite_set(struct commit_handle *ch)
76545 +{
76546 +       jnode *cur;
76547 +
76548 +       list_for_each_entry(cur, ch->overwrite_set, capture_link)
76549 +               jrelse_tail(cur);
76550 +}
76551 +
76552 +/* Count overwrite set size, grab disk space for wandered blocks allocation.
76553 +   Scans the atom's overwrite list, loads every node with jload_gfp() and counts
76554 +   bitmaps and non-leaf nodes whose wandered blocks need grabbed space.  Returns
76555 +   the overwrite set size, or an error from log_super() / space grabbing. */
76556 +static int get_overwrite_set(struct commit_handle *ch)
76557 +{
76558 +       int ret;
76559 +       jnode *cur;
76560 +       __u64 nr_not_leaves = 0;
76561 +#if REISER4_DEBUG
76562 +       __u64 nr_formatted_leaves = 0;
76563 +       __u64 nr_unformatted_leaves = 0;
76564 +#endif
76565 +
76566 +       assert("zam-697", ch->overwrite_set_size == 0);
76567 +
76568 +       ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
76569 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
76570 +
76571 +       while (ch->overwrite_set != &cur->capture_link) {
76572 +               jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
76573 +
76574 +               /* Count bitmap jnodes for getting correct statistics what number
76575 +                * of blocks were cleared by the transaction commit. */
76576 +               if (jnode_get_type(cur) == JNODE_BITMAP)
76577 +                       ch->nr_bitmap++;
76578 +
76579 +               assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
76580 +                      || jnode_get_type(cur) == JNODE_BITMAP);
76581 +
76582 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
76583 +                       /* we replace fake znode by another (real)
76584 +                          znode which is suggested by disk_layout
76585 +                          plugin */
76586 +
76587 +                       /* FIXME: it looks like fake znode should be
76588 +                          replaced by jnode supplied by
76589 +                          disk_layout. */
76590 +
76591 +                       struct super_block *s = reiser4_get_current_sb();
76592 +                       reiser4_super_info_data *sbinfo =
76593 +                           get_current_super_private();
76594 +
76595 +                       if (sbinfo->df_plug->log_super) {
76596 +                               jnode *sj = sbinfo->df_plug->log_super(s);
76597 +
76598 +                               assert("zam-593", sj != NULL);
76599 +
76600 +                               if (IS_ERR(sj))
76601 +                                       return PTR_ERR(sj);
76602 +
76603 +                               spin_lock_jnode(sj);
76604 +                               JF_SET(sj, JNODE_OVRWR);
76605 +                               insert_into_atom_ovrwr_list(ch->atom, sj);
76606 +                               spin_unlock_jnode(sj);
76607 +
76608 +                               /* jload it as the rest of overwrite set (result unchecked) */
76609 +                               jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
76610 +
76611 +                               ch->overwrite_set_size++;
76612 +                       }
76613 +                       spin_lock_jnode(cur);
76614 +                       reiser4_uncapture_block(cur);
76615 +                       jput(cur);
76616 +
76617 +               } else {
76618 +                       int ret; /* NOTE(review): shadows the function-level ret */
76619 +                       ch->overwrite_set_size++;
76620 +                       ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
76621 +                       if (ret)
76622 +                               reiser4_panic("zam-783",
76623 +                                             "cannot load e-flushed jnode back (ret = %d)\n",
76624 +                                             ret);
76625 +               }
76626 +
76627 +               /* Count not leaves here because we have to grab disk space
76628 +                * for wandered blocks. They were not counted as "flush
76629 +                * reserved". Counting should be done _after_ nodes are pinned
76630 +                * into memory by jload(). */
76631 +               if (!jnode_is_leaf(cur))
76632 +                       nr_not_leaves++;
76633 +               else {
76634 +#if REISER4_DEBUG
76635 +                       /* at this point @cur either has JNODE_FLUSH_RESERVED
76636 +                        * or is eflushed. Locking is not strong enough to
76637 +                        * write an assertion checking for this. */
76638 +                       if (jnode_is_znode(cur))
76639 +                               nr_formatted_leaves++;
76640 +                       else
76641 +                               nr_unformatted_leaves++;
76642 +#endif
76643 +                       JF_CLR(cur, JNODE_FLUSH_RESERVED);
76644 +               }
76645 +
76646 +               cur = next;
76647 +       }
76648 +
76649 +       /* Grab space for writing (wandered blocks) of not leaves found in
76650 +        * overwrite set. */
76651 +       ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
76652 +       if (ret)
76653 +               return ret;
76654 +
76655 +       /* Disk space for allocation of wandered blocks of leaf nodes already
76656 +        * reserved as "flush reserved", move it to grabbed space counter. */
76657 +       spin_lock_atom(ch->atom);
76658 +       assert("zam-940",
76659 +              nr_formatted_leaves + nr_unformatted_leaves <=
76660 +              ch->atom->flush_reserved);
76661 +       flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
76662 +       spin_unlock_atom(ch->atom);
76663 +
76664 +       return ch->overwrite_set_size;
76665 +}
76666 +
76667 +/**
76668 + * write_jnodes_to_disk_extent - submit write request
76669 + * @head:
76670 + * @first: first jnode of the list
76671 + * @nr: number of jnodes on the list
76672 + * @block_p:
76673 + * @fq:
76674 + * @flags: used to decide whether page is to get PG_reclaim flag
76675 + *
76676 + * Submits a write request for @nr jnodes beginning from the @first, other
76677 + * jnodes are after the @first on the double-linked "capture" list.  All jnodes
76678 + * will be written to the disk region of @nr blocks starting with @block_p block
76679 + * number.  If @fq is not NULL it means that waiting for i/o completion will be
76680 + * done more efficiently by using flush_queue_t objects.
76681 + * This function is the one which writes list of jnodes in batch mode. It does
76682 + * all low-level things as bio construction and page states manipulation.
76683 + *
76684 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
76685 + * aggregated in this function instead of being left to the layers below
76686 + *
76687 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
76688 + * Why that layer needed? Why BIOs cannot be constructed here?
76689 + */
76690 +static int write_jnodes_to_disk_extent(
76691 +       jnode *first, int nr, const reiser4_block_nr *block_p,
76692 +       flush_queue_t *fq, int flags)
76693 +{
76694 +       struct super_block *super = reiser4_get_current_sb();
76695 +       int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
76696 +       int max_blocks;
76697 +       jnode *cur = first;
76698 +       reiser4_block_nr block;
76699 +
76700 +       assert("zam-571", first != NULL);
76701 +       assert("zam-572", block_p != NULL);
76702 +       assert("zam-570", nr > 0);
76703 +
76704 +       block = *block_p;
76705 +       max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
76706 +
76707 +       while (nr > 0) {
76708 +               struct bio *bio;
76709 +               int nr_blocks = min(nr, max_blocks);
76710 +               int i;
76711 +               int nr_used;
76712 +
76713 +               bio = bio_alloc(GFP_NOIO, nr_blocks);
76714 +               if (!bio)
76715 +                       return RETERR(-ENOMEM);
76716 +
76717 +               bio->bi_bdev = super->s_bdev;
76718 +               bio->bi_sector = block * (super->s_blocksize >> 9);
76719 +               for (nr_used = 0, i = 0; i < nr_blocks; i++) {
76720 +                       struct page *pg;
76721 +
76722 +                       pg = jnode_page(cur);
76723 +                       assert("zam-573", pg != NULL);
76724 +
76725 +                       page_cache_get(pg);
76726 +
76727 +                       lock_and_wait_page_writeback(pg);
76728 +
76729 +                       if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
76730 +                               /*
76731 +                                * underlying device is satiated. Stop adding
76732 +                                * pages to the bio.
76733 +                                */
76734 +                               unlock_page(pg);
76735 +                               page_cache_release(pg);
76736 +                               break;
76737 +                       }
76738 +
76739 +                       spin_lock_jnode(cur);
76740 +                       assert("nikita-3166",
76741 +                              pg->mapping == jnode_get_mapping(cur));
76742 +                       assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
76743 +#if REISER4_DEBUG
76744 +                       spin_lock(&cur->load);
76745 +                       assert("nikita-3165", !jnode_is_releasable(cur));
76746 +                       spin_unlock(&cur->load);
76747 +#endif
76748 +                       JF_SET(cur, JNODE_WRITEBACK);
76749 +                       JF_CLR(cur, JNODE_DIRTY);
76750 +                       ON_DEBUG(cur->written++);
76751 +                       spin_unlock_jnode(cur);
76752 +
76753 +                       ClearPageError(pg);
76754 +                       set_page_writeback(pg);
76755 +
76756 +                       if (get_current_context()->entd) {
76757 +                               /* this is ent thread */
76758 +                               entd_context *ent = get_entd_context(super);
76759 +                               struct wbq *rq, *next;
76760 +
76761 +                               spin_lock(&ent->guard);
76762 +
76763 +                               if (pg == ent->cur_request->page) {
76764 +                                       /*
76765 +                                        * entd is called for this page. This
76766 +                                        * request is not in th etodo list
76767 +                                        */
76768 +                                       ent->cur_request->written = 1;
76769 +                               } else {
76770 +                                       /*
76771 +                                        * if we have written a page for which writepage
76772 +                                        * is called for - move request to another list.
76773 +                                        */
76774 +                                       list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
76775 +                                               assert("", rq->magic == WBQ_MAGIC);
76776 +                                               if (pg == rq->page) {
76777 +                                                       /*
76778 +                                                        * remove request from
76779 +                                                        * entd's queue, but do
76780 +                                                        * not wake up a thread
76781 +                                                        * which put this
76782 +                                                        * request
76783 +                                                        */
76784 +                                                       list_del_init(&rq->link);
76785 +                                                       ent->nr_todo_reqs --;
76786 +                                                       list_add_tail(&rq->link, &ent->done_list);
76787 +                                                       ent->nr_done_reqs ++;
76788 +                                                       rq->written = 1;
76789 +                                                       break;
76790 +                                               }
76791 +                                       }
76792 +                               }
76793 +                               spin_unlock(&ent->guard);
76794 +                       }
76795 +
76796 +                       clear_page_dirty_for_io(pg);
76797 +
76798 +                       unlock_page(pg);
76799 +
76800 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
76801 +                       nr_used++;
76802 +               }
76803 +               if (nr_used > 0) {
76804 +                       assert("nikita-3453",
76805 +                              bio->bi_size == super->s_blocksize * nr_used);
76806 +                       assert("nikita-3454", bio->bi_vcnt == nr_used);
76807 +
76808 +                       /* Check if we are allowed to write at all */
76809 +                       if (super->s_flags & MS_RDONLY)
76810 +                               undo_bio(bio);
76811 +                       else {
76812 +                               int not_supported;
76813 +
76814 +                               add_fq_to_bio(fq, bio);
76815 +                               bio_get(bio);
76816 +                               reiser4_submit_bio(write_op, bio);
76817 +                               not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
76818 +                               bio_put(bio);
76819 +                               if (not_supported)
76820 +                                       return -EOPNOTSUPP;
76821 +                       }
76822 +
76823 +                       block += nr_used - 1;
76824 +                       update_blocknr_hint_default(super, &block);
76825 +                       block += 1;
76826 +               } else {
76827 +                       bio_put(bio);
76828 +               }
76829 +               nr -= nr_used;
76830 +       }
76831 +
76832 +       return 0;
76833 +}
76834 +
76835 +/* Walk the j-node list @head, find runs of j-nodes whose disk block numbers
76836 +   are contiguous, and submit one write_jnodes_to_disk_extent() request per
76837 +   run, passing @fq and @flags through. Returns 0 or the first submit error. */
76838 +int
76839 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
76840 +                long *nr_submitted, int flags)
76841 +{
76842 +       int ret;
76843 +       jnode *beg = list_entry(head->next, jnode, capture_link);
76844 +
76845 +       while (head != &beg->capture_link) {
76846 +               int nr = 1;
76847 +               jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
76848 +               /* grow the run while cur's block is exactly nr blocks past beg's */
76849 +               while (head != &cur->capture_link) {
76850 +                       if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
76851 +                               break;
76852 +                       ++nr;
76853 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
76854 +               }
76855 +               /* submit the nr j-nodes starting at beg as one extent */
76856 +               ret = write_jnodes_to_disk_extent(
76857 +                       beg, nr, jnode_get_block(beg), fq, flags);
76858 +               if (ret)
76859 +                       return ret;
76860 +               /* @nr_submitted is optional; only runs submitted without error are counted */
76861 +               if (nr_submitted)
76862 +                       *nr_submitted += nr;
76863 +               /* cur is the first j-node after the run (or the list head) */
76864 +               beg = cur;
76865 +       }
76866 +
76867 +       return 0;
76868 +}
76869 +/* Add @len pairs (j-node's own block -> wandered block) to the current atom's
76870 +   wandered map: j-nodes start at @cur, wandered blocks at *@block_p. Returns 0 or an error. */
76871 +static int
76872 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
76873 +{
76874 +       int ret;
76875 +       blocknr_set_entry *new_bsep = NULL;
76876 +       reiser4_block_nr block;
76877 +
76878 +       txn_atom *atom;
76879 +
76880 +       assert("zam-568", block_p != NULL);
76881 +       block = *block_p;
76882 +       assert("zam-569", len > 0);
76883 +       /* one pair per iteration; len is post-decremented, so inside the body it counts the pairs still to come */
76884 +       while ((len--) > 0) {
76885 +               do { /* the atom is re-acquired on every retry */
76886 +                       atom = get_current_atom_locked();
76887 +                       assert("zam-536",
76888 +                              !reiser4_blocknr_is_fake(jnode_get_block(cur)));
76889 +                       ret =
76890 +                           blocknr_set_add_pair(atom, &atom->wandered_map,
76891 +                                                &new_bsep,
76892 +                                                jnode_get_block(cur), &block);
76893 +               } while (ret == -E_REPEAT);
76894 +
76895 +               if (ret) {
76896 +                       /* deallocate blocks which were not added to wandered
76897 +                          map */
76898 +                       reiser4_block_nr wide_len = len;
76899 +                       /* NOTE(review): len was already decremented, so this frees len blocks from block although the current pair was not added either — confirm */
76900 +                       reiser4_dealloc_blocks(&block, &wide_len,
76901 +                                              BLOCK_NOT_COUNTED,
76902 +                                              BA_FORMATTED
76903 +                                              /* formatted, without defer */ );
76904 +                       /* NOTE(review): no spin_unlock_atom() on this path; presumably blocknr_set_add_pair() drops the lock on failure — confirm */
76905 +                       return ret;
76906 +               }
76907 +
76908 +               spin_unlock_atom(atom);
76909 +               /* move on to the next j-node and the next wandered block */
76910 +               cur = list_entry(cur->capture_link.next, jnode, capture_link);
76911 +               ++block;
76912 +       }
76913 +
76914 +       return 0;
76915 +}
76916 +
76917 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and immediately
76918 +   submit IO for the allocated blocks via @fq. We assume that the current atom is in a
76919 +   stage where atom fusion is impossible, so leaving it unlocked is safe. Returns 0 or an error. */
76920 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
76921 +{
76922 +       reiser4_block_nr block;
76923 +
76924 +       int rest;
76925 +       int len;
76926 +       int ret;
76927 +
76928 +       jnode *cur;
76929 +
76930 +       assert("zam-534", ch->overwrite_set_size > 0);
76931 +       /* rest: overwrite-set j-nodes that still have no wandered block */
76932 +       rest = ch->overwrite_set_size;
76933 +
76934 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
76935 +       while (ch->overwrite_set != &cur->capture_link) {
76936 +               assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
76937 +               /* presumably yields an extent of len blocks starting at block, with len <= rest — confirm */
76938 +               ret = get_more_wandered_blocks(rest, &block, &len);
76939 +               if (ret)
76940 +                       return ret;
76941 +
76942 +               rest -= len;
76943 +               /* record the original -> wandered pairs for the next len j-nodes */
76944 +               ret = add_region_to_wmap(cur, len, &block);
76945 +               if (ret)
76946 +                       return ret;
76947 +               /* write those len j-nodes to the wandered extent */
76948 +               ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
76949 +               if (ret)
76950 +                       return ret;
76951 +               /* step cur over the len j-nodes just handled */
76952 +               while ((len--) > 0) {
76953 +                       assert("zam-604",
76954 +                              ch->overwrite_set != &cur->capture_link);
76955 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
76956 +               }
76957 +       }
76958 +
76959 +       return 0;
76960 +}
76961 +
76962 +/* Allocate ch->tx_size blocks for the tx head and wander records, link their j-nodes
76963 +   into ch->tx_list, format and fill them, then submit the list via @fq. Returns 0 or an error code. */
76964 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
76965 +{
76966 +       reiser4_blocknr_hint hint;
76967 +       reiser4_block_nr allocated = 0;
76968 +       reiser4_block_nr first, len;
76969 +       jnode *cur;
76970 +       jnode *txhead;
76971 +       int ret;
76972 +       reiser4_context *ctx;
76973 +       reiser4_super_info_data *sbinfo;
76974 +
76975 +       assert("zam-698", ch->tx_size > 0);
76976 +       assert("zam-699", list_empty_careful(&ch->tx_list));
76977 +
76978 +       ctx = get_current_context();
76979 +       sbinfo = get_super_private(ctx->super);
76980 +       /* allocate until tx_size blocks are covered, possibly in several extents; one j-node per block */
76981 +       while (allocated < (unsigned)ch->tx_size) {
76982 +               len = (ch->tx_size - allocated);
76983 +
76984 +               reiser4_blocknr_hint_init(&hint);
76985 +
76986 +               hint.block_stage = BLOCK_GRABBED;
76987 +
76988 +               /* FIXME: there should be some block allocation policy for
76989 +                  nodes which contain wander records */
76990 +
76991 +               /* We assume that disk space for wandered record blocks can be
76992 +                * taken from reserved area. */
76993 +               ret = reiser4_alloc_blocks(&hint, &first, &len,
76994 +                                          BA_FORMATTED | BA_RESERVED |
76995 +                                          BA_USE_DEFAULT_SEARCH_START);
76996 +               reiser4_blocknr_hint_done(&hint);
76997 +
76998 +               if (ret)
76999 +                       return ret;
77000 +               /* len is passed by address above; presumably it now holds the number of blocks actually allocated */
77001 +               allocated += len;
77002 +
77003 +               /* create jnodes for all wander records */
77004 +               while (len--) {
77005 +                       cur = reiser4_alloc_io_head(&first);
77006 +
77007 +                       if (cur == NULL) {
77008 +                               ret = RETERR(-ENOMEM);
77009 +                               goto free_not_assigned;
77010 +                       }
77011 +
77012 +                       ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
77013 +
77014 +                       if (ret != 0) {
77015 +                               jfree(cur);
77016 +                               goto free_not_assigned;
77017 +                       }
77018 +
77019 +                       pin_jnode_data(cur);
77020 +
77021 +                       list_add_tail(&cur->capture_link, &ch->tx_list);
77022 +                       /* next j-node gets the next block of the extent */
77023 +                       first++;
77024 +               }
77025 +       }
77026 +
77027 +       { /* format an on-disk linked list of wander records; the first list entry is the tx head */
77028 +               int serial = 1;
77029 +
77030 +               txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77031 +               format_tx_head(ch);
77032 +
77033 +               cur = list_entry(txhead->capture_link.next, jnode, capture_link);
77034 +               while (&ch->tx_list != &cur->capture_link) {
77035 +                       format_wander_record(ch, cur, serial++);
77036 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
77037 +               }
77038 +       }
77039 +
77040 +       { /* Fill wander records with Wandered Set */
77041 +               struct store_wmap_params params;
77042 +               txn_atom *atom;
77043 +               /* start with the first wander record, i.e. the entry after the tx head */
77044 +               params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
77045 +
77046 +               params.idx = 0;
77047 +               params.capacity =
77048 +                   wander_record_capacity(reiser4_get_current_sb());
77049 +
77050 +               atom = get_current_atom_locked();
77051 +               blocknr_set_iterator(atom, &atom->wandered_map,
77052 +                                    &store_wmap_actor, &params, 0);
77053 +               spin_unlock_atom(atom);
77054 +       }
77055 +
77056 +       { /* release all jnodes from tx_list */
77057 +               cur = list_entry(ch->tx_list.next, jnode, capture_link);
77058 +               while (&ch->tx_list != &cur->capture_link) {
77059 +                       jrelse(cur);
77060 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
77061 +               }
77062 +       }
77063 +       /* submit the whole tx_list; no submitted-count is collected here */
77064 +       ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
77065 +
77066 +       return ret;
77067 +
77068 +      free_not_assigned:
77069 +       /* We deallocate blocks not yet assigned to jnodes on tx_list. The
77070 +          caller takes care of invalidating the tx list. NOTE(review): len was already decremented by the loop test, so the last unassigned block may be left out — confirm */
77071 +       reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
77072 +
77073 +       return ret;
77074 +}
77075 +/* Grab space for ch->tx_size records, write the overwrite set to wandered blocks and the tx records, then update the journal header. Returns 0 or an error code. */
77076 +static int commit_tx(struct commit_handle *ch)
77077 +{
77078 +       flush_queue_t *fq;
77079 +       int barrier;
77080 +       int ret;
77081 +
77082 +       /* Grab more space for wandered records. */
77083 +       ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
77084 +       if (ret)
77085 +               return ret;
77086 +
77087 +       fq = get_fq_for_current_atom();
77088 +       if (IS_ERR(fq))
77089 +               return PTR_ERR(fq);
77090 +       /* presumably get_fq_for_current_atom() returns with fq->atom locked — confirm */
77091 +       spin_unlock_atom(fq->atom);
77092 +       do { /* single pass; break on error so that fq is put below either way */
77093 +               ret = alloc_wandered_blocks(ch, fq);
77094 +               if (ret)
77095 +                       break;
77096 +               ret = alloc_tx(ch, fq);
77097 +               if (ret)
77098 +                       break;
77099 +       } while (0);
77100 +
77101 +       reiser4_fq_put(fq);
77102 +       if (ret)
77103 +               return ret;
77104 + repeat_wo_barrier:
77105 +       barrier = reiser4_use_write_barrier(ch->super);
77106 +       if (!barrier) { /* without a barrier, finish all flush queues before the header update */
77107 +               ret = current_atom_finish_all_fq();
77108 +               if (ret)
77109 +                       return ret;
77110 +       }
77111 +       ret = update_journal_header(ch, barrier);
77112 +       if (barrier) {
77113 +               if (ret) {
77114 +                       if (ret == -EOPNOTSUPP) { /* barrier write not supported: turn barriers off and retry */
77115 +                               disable_write_barrier(ch->super);
77116 +                               goto repeat_wo_barrier;
77117 +                       }
77118 +                       return ret;
77119 +               }
77120 +               ret = current_atom_finish_all_fq(); /* with a barrier the queues are finished after the header update */
77121 +       }
77122 +       return ret;
77123 +}
77124 +/* Write the j-nodes of ch->overwrite_set to their own block numbers, then update the journal footer. Returns 0 or an error code. */
77125 +static int write_tx_back(struct commit_handle * ch)
77126 +{
77127 +       flush_queue_t *fq;
77128 +       int ret;
77129 +       int barrier;
77130 +
77131 +       reiser4_post_commit_hook();
77132 +       fq = get_fq_for_current_atom();
77133 +       if (IS_ERR(fq))
77134 +               return  PTR_ERR(fq);
77135 +       spin_unlock_atom(fq->atom); /* presumably the atom comes back locked from get_fq_for_current_atom() — confirm */
77136 +       ret = write_jnode_list(
77137 +               ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
77138 +       reiser4_fq_put(fq);
77139 +       if (ret)
77140 +               return ret;
77141 + repeat_wo_barrier:
77142 +       barrier = reiser4_use_write_barrier(ch->super);
77143 +       if (!barrier) { /* without a barrier, finish all flush queues before the footer update */
77144 +               ret = current_atom_finish_all_fq();
77145 +               if (ret)
77146 +                       return ret;
77147 +       }
77148 +       ret = update_journal_footer(ch, barrier);
77149 +       if (barrier) {
77150 +               if (ret) {
77151 +                       if (ret == -EOPNOTSUPP) { /* barrier write not supported: turn barriers off and retry */
77152 +                               disable_write_barrier(ch->super);
77153 +                               goto repeat_wo_barrier;
77154 +                       }
77155 +                       return ret;
77156 +               }
77157 +               ret = current_atom_finish_all_fq(); /* with a barrier the queues are finished after the footer update */
77158 +       }
77159 +       if (ret)
77160 +               return ret;
77161 +       reiser4_post_write_back_hook(); /* reached only on success */
77162 +       return 0;
77163 +}
77164 +
77165 +/* We assume that at this moment all captured blocks are marked as RELOC or
77166 +   WANDER (belong to Relocate or Overwrite set), all nodes from Relocate set
77167 +   are submitted to write. @nr_submitted is dereferenced unconditionally and
77168 +   grows by overwrite_set_size - nr_bitmap. Returns 0 or an error code. */
77169 +
77170 +int reiser4_write_logs(long *nr_submitted)
77171 +{
77172 +       txn_atom *atom;
77173 +       struct super_block *super = reiser4_get_current_sb();
77174 +       reiser4_super_info_data *sbinfo = get_super_private(super);
77175 +       struct commit_handle ch;
77176 +       int ret;
77177 +
77178 +       writeout_mode_enable();
77179 +
77180 +       /* block allocator may add j-nodes to the clean_list */
77181 +       ret = reiser4_pre_commit_hook();
77182 +       if (ret)
77183 +               return ret; /* NOTE(review): returns without writeout_mode_disable() — confirm */
77184 +
77185 +       /* No locks are required if we take atom which stage >=
77186 +        * ASTAGE_PRE_COMMIT */
77187 +       atom = get_current_context()->trans->atom;
77188 +       assert("zam-965", atom != NULL);
77189 +
77190 +       /* relocate set is on the atom->clean_nodes list after
77191 +        * current_atom_complete_writes() finishes. It can be safely
77192 +        * uncaptured after commit_mutex is locked, because any atom that
77193 +        * captures these nodes is guaranteed to commit after current one.
77194 +        *
77195 +        * This can only be done after reiser4_pre_commit_hook(), because it is where
77196 +        * early flushed jnodes with CREATED bit are transferred to the
77197 +        * overwrite list. */
77198 +       reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
77199 +       spin_lock_atom(atom);
77200 +       /* There might be waiters for the relocate nodes which we have
77201 +        * released, wake them up. */
77202 +       reiser4_atom_send_event(atom);
77203 +       spin_unlock_atom(atom);
77204 +
77205 +       if (REISER4_DEBUG) {
77206 +               int level;
77207 +               /* every per-level dirty list of the atom is expected to be empty by now */
77208 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
77209 +                       assert("nikita-3352",
77210 +                              list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
77211 +       }
77212 +
77213 +       sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
77214 +       sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
77215 +
77216 +       init_commit_handle(&ch, atom);
77217 +       /* snapshot the super block counters that go into this commit handle */
77218 +       ch.free_blocks = sbinfo->blocks_free_committed;
77219 +       ch.nr_files = sbinfo->nr_files_committed;
77220 +       /* ZAM-FIXME-HANS: email me what the contention level is for the super
77221 +        * lock. */
77222 +       ch.next_oid = oid_next(super);
77223 +
77224 +       /* count overwrite set and place it in a separate list */
77225 +       ret = get_overwrite_set(&ch);
77226 +
77227 +       if (ret <= 0) {
77228 +               /* It is possible that overwrite set is empty here, it means
77229 +                  all captured nodes are clean; a negative ret is presumably an error code */
77230 +               goto up_and_ret;
77231 +       }
77232 +
77233 +       /* Inform the caller about what number of dirty pages will be
77234 +        * submitted to disk. */
77235 +       *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
77236 +
77237 +       /* count all records needed for storing of the wandered set */
77238 +       get_tx_size(&ch);
77239 +
77240 +       ret = commit_tx(&ch);
77241 +       if (ret)
77242 +               goto up_and_ret;
77243 +
77244 +       spin_lock_atom(atom);
77245 +       reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
77246 +       spin_unlock_atom(atom);
77247 +
77248 +       ret = write_tx_back(&ch);
77249 +       reiser4_post_write_back_hook(); /* NOTE(review): write_tx_back() already calls this hook on success, so it runs twice there — confirm */
77250 +
77251 +      up_and_ret:
77252 +       if (ret) {
77253 +               /* there could be fq attached to current atom; the only way to
77254 +                  remove them is: */
77255 +               current_atom_finish_all_fq();
77256 +       }
77257 +
77258 +       /* free blocks of flushed transaction */
77259 +       dealloc_tx_list(&ch);
77260 +       dealloc_wmap(&ch);
77261 +
77262 +       put_overwrite_set(&ch);
77263 +
77264 +       done_commit_handle(&ch);
77265 +
77266 +       writeout_mode_disable();
77267 +
77268 +       return ret;
77269 +}
77270 +
77271 +/* Consistency checks for journal data/control blocks: header, footer, log
77272 +   records, transaction head blocks. All functions return zero on success. */
77273 +
77274 +static int check_journal_header(const jnode * node UNUSED_ARG)
77275 +{
77276 +       /* FIXME: journal header has no magic field yet, so nothing is checked and 0 is always returned. */
77277 +       return 0;
77278 +}
77279 +
77280 +/* wait for write completion for all jnodes from given list; returns the number of jnodes whose page has the error flag set */
77281 +static int wait_on_jnode_list(struct list_head *head)
77282 +{
77283 +       jnode *scan;
77284 +       int ret = 0;
77285 +
77286 +       list_for_each_entry(scan, head, capture_link) {
77287 +               struct page *pg = jnode_page(scan);
77288 +               /* jnodes without a page are skipped */
77289 +               if (pg) {
77290 +                       if (PageWriteback(pg))
77291 +                               wait_on_page_writeback(pg);
77292 +                       /* count pages flagged with an error */
77293 +                       if (PageError(pg))
77294 +                               ret++;
77295 +               }
77296 +       }
77297 +
77298 +       return ret;
77299 +}
77300 +
77301 +static int check_journal_footer(const jnode * node UNUSED_ARG)
77302 +{
77303 +       /* FIXME: journal footer has no magic field yet, so nothing is checked and 0 is always returned. */
77304 +       return 0;
77305 +}
77306 +/* compare the magic of a tx head block with TX_HEADER_MAGIC; returns 0, or warns and returns -EIO on mismatch */
77307 +static int check_tx_head(const jnode * node)
77308 +{
77309 +       struct tx_header *header = (struct tx_header *)jdata(node); /* presumably @node must already be loaded — confirm */
77310 +
77311 +       if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
77312 +               warning("zam-627", "tx head at block %s corrupted\n",
77313 +                       sprint_address(jnode_get_block(node)));
77314 +               return RETERR(-EIO);
77315 +       }
77316 +
77317 +       return 0;
77318 +}
77319 +/* compare the magic of a wander record with WANDER_RECORD_MAGIC; returns 0, or warns and returns -EIO on mismatch */
77320 +static int check_wander_record(const jnode * node)
77321 +{
77322 +       struct wander_record_header *RH =
77323 +           (struct wander_record_header *)jdata(node); /* presumably @node must already be loaded — confirm */
77324 +
77325 +       if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
77326 +           0) {
77327 +               warning("zam-628", "wander record at block %s corrupted\n",
77328 +                       sprint_address(jnode_get_block(node)));
77329 +               return RETERR(-EIO);
77330 +       }
77331 +
77332 +       return 0;
77333 +}
77334 +
77335 +/* fill the commit_handle structure with everything needed for update_journal_footer: free_blocks, nr_files and next_oid are read from @tx_head, which is then linked into ch->tx_list; returns 0 or the jload() error */
77336 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
77337 +{
77338 +       struct tx_header *TXH;
77339 +       int ret;
77340 +
77341 +       ret = jload(tx_head);
77342 +       if (ret)
77343 +               return ret;
77344 +
77345 +       TXH = (struct tx_header *)jdata(tx_head);
77346 +       /* the fields are converted from little-endian with unaligned-safe reads */
77347 +       ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
77348 +       ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
77349 +       ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
77350 +
77351 +       jrelse(tx_head);
77352 +
77353 +       list_add(&tx_head->capture_link, &ch->tx_list);
77354 +
77355 +       return 0;
77356 +}
77357 +/* replay one transaction: walk the wander record chain from *@log_rec_block_p to
77358 +   *@end_block, restore the overwrite set, write it in place and update the journal footer */
77359 +static int replay_transaction(const struct super_block *s,
77360 +                             jnode * tx_head,
77361 +                             const reiser4_block_nr * log_rec_block_p,
77362 +                             const reiser4_block_nr * end_block,
77363 +                             unsigned int nr_wander_records)
77364 +{
77365 +       reiser4_block_nr log_rec_block = *log_rec_block_p;
77366 +       struct commit_handle ch;
77367 +       LIST_HEAD(overwrite_set);
77368 +       jnode *log;
77369 +       int ret;
77370 +
77371 +       init_commit_handle(&ch, NULL);
77372 +       ch.overwrite_set = &overwrite_set;
77373 +       /* NOTE(review): the return value of restore_commit_handle() is ignored — confirm */
77374 +       restore_commit_handle(&ch, tx_head);
77375 +
77376 +       while (log_rec_block != *end_block) {
77377 +               struct wander_record_header *header;
77378 +               struct wander_entry *entry;
77379 +
77380 +               int i;
77381 +
77382 +               if (nr_wander_records == 0) {
77383 +                       warning("zam-631",
77384 +                               "number of wander records in the linked list"
77385 +                               " greater than number stored in tx head.\n");
77386 +                       ret = RETERR(-EIO);
77387 +                       goto free_ow_set;
77388 +               }
77389 +
77390 +               log = reiser4_alloc_io_head(&log_rec_block);
77391 +               if (log == NULL)
77392 +                       return RETERR(-ENOMEM);
77393 +               /* NOTE(review): the return above and the two returns below skip the free_ow_set cleanup — confirm */
77394 +               ret = jload(log);
77395 +               if (ret < 0) {
77396 +                       reiser4_drop_io_head(log);
77397 +                       return ret;
77398 +               }
77399 +
77400 +               ret = check_wander_record(log);
77401 +               if (ret) {
77402 +                       jrelse(log);
77403 +                       reiser4_drop_io_head(log);
77404 +                       return ret;
77405 +               }
77406 +
77407 +               header = (struct wander_record_header *)jdata(log);
77408 +               log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
77409 +               /* the entries start right after the record header */
77410 +               entry = (struct wander_entry *)(header + 1);
77411 +
77412 +               /* restore overwrite set from wander record content */
77413 +               for (i = 0; i < wander_record_capacity(s); i++) {
77414 +                       reiser4_block_nr block;
77415 +                       jnode *node;
77416 +
77417 +                       block = le64_to_cpu(get_unaligned(&entry->wandered));
77418 +                       if (block == 0) /* a zero wandered block ends the entries of this record */
77419 +                               break;
77420 +
77421 +                       node = reiser4_alloc_io_head(&block);
77422 +                       if (node == NULL) {
77423 +                               ret = RETERR(-ENOMEM);
77424 +                               /*
77425 +                                * FIXME-VS:???
77426 +                                */
77427 +                               jrelse(log);
77428 +                               reiser4_drop_io_head(log);
77429 +                               goto free_ow_set;
77430 +                       }
77431 +
77432 +                       ret = jload(node);
77433 +
77434 +                       if (ret < 0) {
77435 +                               reiser4_drop_io_head(node);
77436 +                               /*
77437 +                                * FIXME-VS:???
77438 +                                */
77439 +                               jrelse(log);
77440 +                               reiser4_drop_io_head(log);
77441 +                               goto free_ow_set;
77442 +                       }
77443 +                       /* the data was loaded from the wandered block; now point the j-node at the original block */
77444 +                       block = le64_to_cpu(get_unaligned(&entry->original));
77445 +
77446 +                       assert("zam-603", block != 0);
77447 +
77448 +                       jnode_set_block(node, &block);
77449 +
77450 +                       list_add_tail(&node->capture_link, ch.overwrite_set);
77451 +
77452 +                       ++entry;
77453 +               }
77454 +
77455 +               jrelse(log);
77456 +               reiser4_drop_io_head(log);
77457 +
77458 +               --nr_wander_records;
77459 +       }
77460 +
77461 +       if (nr_wander_records != 0) {
77462 +               warning("zam-632", "number of wander records in the linked list"
77463 +                       " less than number stored in tx head.\n");
77464 +               ret = RETERR(-EIO);
77465 +               goto free_ow_set;
77466 +       }
77467 +
77468 +       {                       /* write wandered set in place; the result of write_jnode_list() is not checked, only page errors are */
77469 +               write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
77470 +               ret = wait_on_jnode_list(ch.overwrite_set);
77471 +
77472 +               if (ret) {
77473 +                       ret = RETERR(-EIO);
77474 +                       goto free_ow_set;
77475 +               }
77476 +       }
77477 +
77478 +       ret = update_journal_footer(&ch, 0);
77479 +
77480 +      free_ow_set:
77481 +       /* release and drop every j-node collected so far; this path is also taken on success */
77482 +       while (!list_empty(ch.overwrite_set)) {
77483 +               jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
77484 +               list_del_init(&cur->capture_link);
77485 +               jrelse(cur);
77486 +               reiser4_drop_io_head(cur);
77487 +       }
77488 +       /* tx_head was linked into ch.tx_list by restore_commit_handle(); unlink it, the caller still owns it */
77489 +       list_del_init(&tx_head->capture_link);
77490 +
77491 +       done_commit_handle(&ch);
77492 +
77493 +       return ret;
77494 +}
77495 +
77496 +/* Find the oldest committed but not yet replayed transaction and replay it. The
77497 + * transaction was committed and the journal header block was updated, but writing
77498 + * the atom's overwrite set in place and updating the journal footer block were
77499 + * not completed. This function completes the process by recovering the atom's
77500 + * overwrite set from its wandered locations, writing it in place and updating the
77501 + * journal footer. Returns 0 if nothing is left to replay, -E_REPEAT after a replay, else an error. */
77502 +static int replay_oldest_transaction(struct super_block *s)
77503 +{
77504 +       reiser4_super_info_data *sbinfo = get_super_private(s);
77505 +       jnode *jf = sbinfo->journal_footer;
77506 +       unsigned int total;
77507 +       struct journal_footer *F;
77508 +       struct tx_header *T;
77509 +
77510 +       reiser4_block_nr prev_tx;
77511 +       reiser4_block_nr last_flushed_tx;
77512 +       reiser4_block_nr log_rec_block = 0;
77513 +
77514 +       jnode *tx_head;
77515 +
77516 +       int ret;
77517 +
77518 +       if ((ret = jload(jf)) < 0)
77519 +               return ret;
77520 +
77521 +       F = (struct journal_footer *)jdata(jf);
77522 +
77523 +       last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
77524 +
77525 +       jrelse(jf);
77526 +
77527 +       if (sbinfo->last_committed_tx == last_flushed_tx) {
77528 +               /* all transactions are replayed */
77529 +               return 0;
77530 +       }
77531 +
77532 +       prev_tx = sbinfo->last_committed_tx;
77533 +       /* follow the prev_tx links back, starting from the last committed tx head */
77534 +       /* searching for oldest not flushed transaction */
77535 +       while (1) {
77536 +               tx_head = reiser4_alloc_io_head(&prev_tx);
77537 +               if (!tx_head)
77538 +                       return RETERR(-ENOMEM);
77539 +
77540 +               ret = jload(tx_head);
77541 +               if (ret < 0) {
77542 +                       reiser4_drop_io_head(tx_head);
77543 +                       return ret;
77544 +               }
77545 +
77546 +               ret = check_tx_head(tx_head);
77547 +               if (ret) {
77548 +                       jrelse(tx_head);
77549 +                       reiser4_drop_io_head(tx_head);
77550 +                       return ret;
77551 +               }
77552 +
77553 +               T = (struct tx_header *)jdata(tx_head);
77554 +
77555 +               prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
77556 +
77557 +               if (prev_tx == last_flushed_tx)
77558 +                       break; /* tx_head stays loaded here; T is read below */
77559 +
77560 +               jrelse(tx_head);
77561 +               reiser4_drop_io_head(tx_head);
77562 +       }
77563 +
77564 +       total = le32_to_cpu(get_unaligned(&T->total));
77565 +       log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
77566 +       /* pin before dropping the load reference; presumably this keeps the data in memory — confirm */
77567 +       pin_jnode_data(tx_head);
77568 +       jrelse(tx_head);
77569 +
77570 +       ret =
77571 +           replay_transaction(s, tx_head, &log_rec_block,
77572 +                              jnode_get_block(tx_head), total - 1); /* presumably total counts the tx head too — confirm */
77573 +
77574 +       unpin_jnode_data(tx_head);
77575 +       reiser4_drop_io_head(tx_head);
77576 +       /* a successful replay is reported as -E_REPEAT, presumably so that the caller calls again */
77577 +       if (ret)
77578 +               return ret;
77579 +       return -E_REPEAT;
77580 +}
77581 +
77582 +/* The current reiser4 journal implementation is optimized not to capture the
77583 +   super block when only certain super block fields are modified. Currently the
77584 +   set is (<free block count>, <OID allocator>). These fields are logged in a
77585 +   special way: they are stored in each transaction head block at atom commit
77586 +   time, and written to the journal footer block at atom flush time.  To get
77587 +   this information from the journal footer block into the in-memory super
77588 +   block there is a special function, reiser4_journal_recover_sb_data(), which
77589 +   should be called after the disk format plugin re-reads the super block
77590 +   following journal replay.
77591 +*/
77592 +
77593 +/* Copy the free block counter and OID allocator state from the journal footer block into the in-memory super block of @s. Returns 0, or the non-zero result of jload()/check_journal_footer(). */
77594 +int reiser4_journal_recover_sb_data(struct super_block *s)
77595 +{
77596 +       reiser4_super_info_data *sbinfo = get_super_private(s);
77597 +       struct journal_footer *jf;
77598 +       int ret;
77599 +
77600 +       assert("zam-673", sbinfo->journal_footer != NULL);
77601 +
77602 +       ret = jload(sbinfo->journal_footer);
77603 +       if (ret != 0)
77604 +               return ret;     /* nothing was loaded, so there is nothing to jrelse() */
77605 +
77606 +       ret = check_journal_footer(sbinfo->journal_footer);
77607 +       if (ret != 0)
77608 +               goto out;
77609 +
77610 +       jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
77611 +
77612 +       /* was there at least one flushed transaction? If not, the in-memory values are left untouched */
77613 +       if (jf->last_flushed_tx) {
77614 +
77615 +               /* restore free block counter logged in this transaction */
77616 +               reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
77617 +
77618 +               /* restore oid allocator state (number of files, next OID) */
77619 +               oid_init_allocator(s,
77620 +                                  le64_to_cpu(get_unaligned(&jf->nr_files)),
77621 +                                  le64_to_cpu(get_unaligned(&jf->next_oid)));
77622 +       }
77623 +      out:     /* reached on success as well: pairs jrelse() with the successful jload() above */
77624 +       jrelse(sbinfo->journal_footer);
77625 +       return ret;
77626 +}
77627 +
77628 +/* reiser4 replay journal procedure: validates the journal footer and header blocks, records last_committed_tx in the in-memory super block, then calls replay_oldest_transaction() until it stops returning -E_REPEAT. Returns that final result, or 0 if the journal control blocks are absent. */
77629 +int reiser4_journal_replay(struct super_block *s)
77630 +{
77631 +       reiser4_super_info_data *sbinfo = get_super_private(s);
77632 +       jnode *jh, *jf;
77633 +       struct journal_header *header;
77634 +       int nr_tx_replayed = 0; /* incremented per replayed transaction; not read anywhere in this function */
77635 +       int ret;
77636 +
77637 +       assert("zam-582", sbinfo != NULL);
77638 +
77639 +       jh = sbinfo->journal_header;
77640 +       jf = sbinfo->journal_footer;
77641 +
77642 +       if (!jh || !jf) {
77643 +               /* it is possible that disk layout does not support journal
77644 +                  structures, we just warn about this and report success */
77645 +               warning("zam-583",
77646 +                       "journal control blocks were not loaded by disk layout plugin.  "
77647 +                       "journal replaying is not possible.\n");
77648 +               return 0;
77649 +       }
77650 +
77651 +       /* Load and validate the journal footer block. Nothing is copied out of
77652 +          it here; the free block counter is read by reiser4_journal_recover_sb_data() */
77653 +       ret = jload(jf);
77654 +       if (ret < 0)
77655 +               return ret;
77656 +
77657 +       ret = check_journal_footer(jf);
77658 +       if (ret) {
77659 +               jrelse(jf);
77660 +               return ret;
77661 +       }
77662 +
77663 +       jrelse(jf);
77664 +
77665 +       /* store last committed transaction info in reiser4 in-memory super
77666 +          block, after validating the journal header block */
77667 +       ret = jload(jh);
77668 +       if (ret < 0)
77669 +               return ret;
77670 +
77671 +       ret = check_journal_header(jh);
77672 +       if (ret) {
77673 +               jrelse(jh);
77674 +               return ret;
77675 +       }
77676 +
77677 +       header = (struct journal_header *)jdata(jh);
77678 +       sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
77679 +
77680 +       jrelse(jh);
77681 +
77682 +       /* replay committed transactions one at a time; -E_REPEAT means "one was replayed, call again" */
77683 +       while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
77684 +               nr_tx_replayed++;
77685 +
77686 +       return ret;
77687 +}
77688 +
77689 +/* load journal control block (either journal header or journal footer block) at @block into *@node; returns 0 on success, otherwise an error with *@node left NULL */
77690 +static int
77691 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
77692 +{
77693 +       int ret;
77694 +
77695 +       *node = reiser4_alloc_io_head(block);
77696 +       if (!(*node))
77697 +               return RETERR(-ENOMEM);
77698 +
77699 +       ret = jload(*node);
77700 +
77701 +       if (ret) {
77702 +               reiser4_drop_io_head(*node);
77703 +               *node = NULL;   /* do not leave a pointer to the dropped io head with the caller */
77704 +               return ret;
77705 +       }
77706 +
77707 +       pin_jnode_data(*node);  /* presumably keeps the data resident after the jrelse() below -- confirm in pin_jnode_data() */
77708 +       jrelse(*node);
77709 +
77710 +       return 0;
77711 +}
77712 +
77713 +/* unload journal header or footer and free jnode; a NULL *@node is ignored, and *@node is reset to NULL afterwards */
77714 +static void unload_journal_control_block(jnode ** node)
77715 +{
77716 +       if (*node) {
77717 +               unpin_jnode_data(*node);        /* undoes the pin_jnode_data() done in load_journal_control_block() */
77718 +               reiser4_drop_io_head(*node);
77719 +               *node = NULL;
77720 +       }
77721 +}
77722 +
77723 +/* release journal control blocks (journal header and journal footer) of super block @s */
77724 +void reiser4_done_journal_info(struct super_block *s)
77725 +{
77726 +       reiser4_super_info_data *sbinfo = get_super_private(s);
77727 +
77728 +       assert("zam-476", sbinfo != NULL);
77729 +
77730 +       unload_journal_control_block(&sbinfo->journal_header);
77731 +       unload_journal_control_block(&sbinfo->journal_footer);
77732 +       rcu_barrier();  /* wait for pending RCU callbacks; presumably the dropped io heads are freed via RCU -- confirm */
77733 +}
77734 +
77735 +/* load journal control blocks (header and footer) from the locations in sbinfo->jloc; returns 0 on success, otherwise the error from load_journal_control_block() with neither block left loaded */
77736 +int reiser4_init_journal_info(struct super_block *s)
77737 +{
77738 +       reiser4_super_info_data *sbinfo = get_super_private(s);
77739 +       journal_location *loc;
77740 +       int ret;
77741 +
77742 +       loc = &sbinfo->jloc;
77743 +
77744 +       assert("zam-651", loc != NULL);
77745 +       assert("zam-652", loc->header != 0);
77746 +       assert("zam-653", loc->footer != 0);
77747 +
77748 +       ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
77749 +
77750 +       if (ret)
77751 +               return ret;
77752 +
77753 +       ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
77754 +
77755 +       if (ret) {
77756 +               unload_journal_control_block(&sbinfo->journal_header);  /* roll back the header loaded above */
77757 +       }
77758 +
77759 +       return ret;
77760 +}
77761 +
77762 +/* Make Linus happy.
77763 +   Local variables:
77764 +   c-indentation-style: "K&R"
77765 +   mode-name: "LC"
77766 +   c-basic-offset: 8
77767 +   tab-width: 8
77768 +   fill-column: 80
77769 +   End:
77770 +*/
77771 diff -puN /dev/null fs/reiser4/wander.h
77772 --- /dev/null
77773 +++ a/fs/reiser4/wander.h
77774 @@ -0,0 +1,135 @@
77775 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
77776 +
77777 +#if !defined (__FS_REISER4_WANDER_H__)
77778 +#define __FS_REISER4_WANDER_H__
77779 +
77780 +#include "dformat.h"
77781 +
77782 +#include <linux/fs.h>          /* for struct super_block  */
77783 +
77784 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES   */
77785 +
77786 +#define TX_HEADER_MAGIC  "TxMagic4"    /* 8 characters, see TX_HEADER_MAGIC_SIZE */
77787 +#define WANDER_RECORD_MAGIC "LogMagc4" /* 8 characters, see WANDER_RECORD_MAGIC_SIZE */
77788 +
77789 +#define TX_HEADER_MAGIC_SIZE  (8)      /* size of tx_header.magic[] */
77790 +#define WANDER_RECORD_MAGIC_SIZE (8)   /* size of wander_record_header.magic[] */
77791 +
77792 +/* journal header block format (on-disk, little-endian) */
77793 +struct journal_header {
77794 +       /* last written transaction head location (block number of the most recently committed tx head) */
77795 +       d64 last_committed_tx;
77796 +};
77797 +
77798 +typedef struct journal_location {
77799 +       reiser4_block_nr footer;        /* block number of the journal footer block */
77800 +       reiser4_block_nr header;        /* block number of the journal header block */
77801 +} journal_location;
77802 +
77803 +/* The wander.c head comment describes usage and semantics of all these structures */
77804 +/* journal footer block format */
77805 +struct journal_footer {
77806 +       /* last flushed transaction location. */
77807 +       /* This block number is no longer valid after the transaction it points
77808 +          to gets flushed; this number is used only at journal replaying time
77809 +          for detection of the end of the on-disk list of committed transactions
77810 +          which were not flushed completely */
77811 +       d64 last_flushed_tx;
77812 +
77813 +       /* free block counter is written in journal footer at transaction
77814 +          flushing, not in super block, because free blocks counter is logged
77815 +          in a different way than super block fields (root pointer, for
77816 +          example). */
77817 +       d64 free_blocks;
77818 +
77819 +       /* number of used OIDs and maximal used OID are logged separately from
77820 +          super block */
77821 +       d64 nr_files;
77822 +       d64 next_oid;
77823 +};
77824 +
77825 +/* Each wander record (except the first one) has a unified format: a wander
77826 +   record header followed by an array of log entries */
77827 +struct wander_record_header {
77828 +       /* when there is no predefined location for wander records, this magic
77829 +          string should help reiser4fsck. */
77830 +       char magic[WANDER_RECORD_MAGIC_SIZE];
77831 +
77832 +       /* transaction id */
77833 +       d64 id;
77834 +
77835 +       /* total number of wander records in current transaction  */
77836 +       d32 total;
77837 +
77838 +       /* this block number in transaction */
77839 +       d32 serial;
77840 +
77841 +       /* number of previous block in commit -- NOTE(review): the field is named next_block; confirm the direction of the chain */
77842 +       d64 next_block;
77843 +};
77844 +
77845 +/* The first wander record (transaction head) of a written transaction has a
77846 +   special format */
77847 +struct tx_header {
77848 +       /* magic string makes first block in transaction different from other
77849 +          logged blocks, it should help fsck. */
77850 +       char magic[TX_HEADER_MAGIC_SIZE];
77851 +
77852 +       /* transaction id */
77853 +       d64 id;
77854 +
77855 +       /* total number of records (including this first tx head) in the
77856 +          transaction */
77857 +       d32 total;
77858 +
77859 +       /* align next field to 8-byte boundary; this field is always zero */
77860 +       d32 padding;
77861 +
77862 +       /* block number of previous transaction head */
77863 +       d64 prev_tx;
77864 +
77865 +       /* next wander record location */
77866 +       d64 next_block;
77867 +
77868 +       /* committed version of the free blocks counter */
77869 +       d64 free_blocks;
77870 +
77871 +       /* number of used OIDs (nr_files) and maximal used OID are logged
77872 +          separately from super block */
77873 +       d64 nr_files;
77874 +       d64 next_oid;
77875 +};
77876 +
77877 +/* A transaction gets written to disk as a set of wander records (each wander
77878 +   record size is fs block) */
77879 +
77880 +/* As described above, the rest of a wander record (after its header) is filled
77881 +   with these log entries; unused space is filled with zeroes */
77882 +struct wander_entry {
77883 +       d64 original;           /* block original location */
77884 +       d64 wandered;           /* block wandered location */
77885 +};
77886 +
77887 +/* REISER4 JOURNAL WRITER FUNCTIONS   */
77888 +
77889 +extern int reiser4_write_logs(long *);
77890 +extern int reiser4_journal_replay(struct super_block *);
77891 +extern int reiser4_journal_recover_sb_data(struct super_block *);
77892 +
77893 +extern int reiser4_init_journal_info(struct super_block *);
77894 +extern void reiser4_done_journal_info(struct super_block *);
77895 +
77896 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
77897 +
77898 +#endif                         /* __FS_REISER4_WANDER_H__ */
77899 +
77900 +/* Make Linus happy.
77901 +   Local variables:
77902 +   c-indentation-style: "K&R"
77903 +   mode-name: "LC"
77904 +   c-basic-offset: 8
77905 +   tab-width: 8
77906 +   fill-column: 80
77907 +   scroll-step: 1
77908 +   End:
77909 +*/
77910 diff -puN /dev/null fs/reiser4/writeout.h
77911 --- /dev/null
77912 +++ a/fs/reiser4/writeout.h
77913 @@ -0,0 +1,21 @@
77914 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README  */
77915 +
77916 +#if !defined (__FS_REISER4_WRITEOUT_H__)
77917 +#define __FS_REISER4_WRITEOUT_H__      /* include guard: must be defined here or the #if above never excludes a second inclusion */
77918 +#define WRITEOUT_SINGLE_STREAM (0x1)   /* writeout flag bit 0 */
77919 +#define WRITEOUT_FOR_PAGE_RECLAIM  (0x2)       /* writeout flag bit 1 */
77920 +#define WRITEOUT_BARRIER (0x4) /* writeout flag bit 2 */
77921 +
77922 +extern int reiser4_get_writeout_flags(void);
77923 +
77924 +#endif                         /* __FS_REISER4_WRITEOUT_H__ */
77925 +
77926 +/* Make Linus happy.
77927 +   Local variables:
77928 +   c-indentation-style: "K&R"
77929 +   mode-name: "LC"
77930 +   c-basic-offset: 8
77931 +   tab-width: 8
77932 +   fill-column: 80
77933 +   End:
77934 +*/
77935 diff -puN /dev/null fs/reiser4/znode.c
77936 --- /dev/null
77937 +++ a/fs/reiser4/znode.c
77938 @@ -0,0 +1,1029 @@
77939 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77940 + * reiser4/README */
77941 +/* Znode manipulation functions. */
77942 +/* Znode is the in-memory header for a tree node. It is stored
77943 +   separately from the node itself so that it does not get written to
77944 +   disk.  In this respect znode is like buffer head or page head. We
77945 +   also use znodes for additional reiser4 specific purposes:
77946 +
77947 +    . they are organized into tree structure which is a part of whole
77948 +      reiser4 tree.
77949 +    . they are used to implement node grained locking
77950 +    . they are used to keep additional state associated with a
77951 +      node
77952 +    . they contain links to lists used by the transaction manager
77953 +
77954 +   Znode is attached to some variable "block number" which is instance of
77955 +   fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
77956 +   appropriate node being actually loaded in memory. Existence of znode itself
77957 +   is regulated by reference count (->x_count) in it. Each time thread
77958 +   acquires reference to znode through call to zget(), ->x_count is
77959 +   incremented and decremented on call to zput().  Data (content of node) are
77960 +   brought in memory through call to zload(), which also increments ->d_count
77961 +   reference counter.  zload can block waiting on IO.  Call to zrelse()
77962 +   decreases this counter. Also, ->c_count keeps track of number of child
77963 +   znodes and prevents parent znode from being recycled until all of its
77964 +   children are. ->c_count is decremented whenever child goes out of existence
77965 +   (being actually recycled in zdestroy()) which can be some time after last
77966 +   reference to this child dies if we support some form of LRU cache for
77967 +   znodes.
77968 +
77969 +*/
77970 +/* EVERY ZNODE'S STORY
77971 +
77972 +   1. His infancy.
77973 +
77974 +   Once upon a time, the znode was born deep inside of zget() by call to
77975 +   zalloc(). At the return from zget() znode had:
77976 +
77977 +    . reference counter (x_count) of 1
77978 +    . assigned block number, marked as used in bitmap
77979 +    . pointer to parent znode. Root znode parent pointer points
77980 +      to its father: "fake" znode. This, in turn, has NULL parent pointer.
77981 +    . hash table linkage
77982 +    . no data loaded from disk
77983 +    . no node plugin
77984 +    . no sibling linkage
77985 +
77986 +   2. His childhood
77987 +
77988 +   Each node is either brought into memory as a result of tree traversal, or
77989 +   created afresh, creation of the root being a special case of the latter. In
77990 +   either case it's inserted into sibling list. This will typically require
77991 +   some ancillary tree traversing, but ultimately both sibling pointers will
77992 +   exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
77993 +   zjnode.state.
77994 +
77995 +   3. His youth.
77996 +
77997 +   If znode is bound to already existing node in a tree, its content is read
77998 +   from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
77999 +   in zjnode.state and zdata() function starts to return non null for this
78000 +   znode. zload() further calls zparse() that determines which node layout
78001 +   this node is rendered in, and sets ->nplug on success.
78002 +
78003 +   If znode is for new node just created, memory for it is allocated and
78004 +   zinit_new() function is called to initialise data, according to selected
78005 +   node layout.
78006 +
78007 +   4. His maturity.
78008 +
78009 +   After this point, znode lingers in memory for some time. Threads can
78010 +   acquire references to znode either by blocknr through call to zget(), or by
78011 +   following a pointer to unallocated znode from internal item. Each time
78012 +   reference to znode is obtained, x_count is increased. Thread can read/write
78013 +   lock znode. Znode data can be loaded through calls to zload(), d_count will
78014 +   be increased appropriately. If all references to znode are released
78015 +   (x_count drops to 0), znode is not recycled immediately. Rather, it is
78016 +   still cached in the hash table in the hope that it will be accessed
78017 +   shortly.
78018 +
78019 +   There are two ways in which znode existence can be terminated:
78020 +
78021 +    . sudden death: node bound to this znode is removed from the tree
78022 +    . overpopulation: znode is purged out of memory due to memory pressure
78023 +
78024 +   5. His death.
78025 +
78026 +   Death is complex process.
78027 +
78028 +   When we irrevocably commit ourselves to decision to remove node from the
78029 +   tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
78030 +   znode. This is done either in ->kill_hook() of internal item or in
78031 +   reiser4_kill_root() function when tree root is removed.
78032 +
78033 +   At this moment znode still has:
78034 +
78035 +    . locks held on it, necessary write ones
78036 +    . references to it
78037 +    . disk block assigned to it
78038 +    . data loaded from the disk
78039 +    . pending requests for lock
78040 +
78041 +   But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
78042 +   deletion. Node deletion includes two phases. First all ways to get
78043 +   references to that znode (sibling and parent links and hash lookup using
78044 +   block number stored in parent node) should be deleted -- it is done through
78045 +   sibling_list_remove(), also we assume that nobody uses down link from
78046 +   parent node due to its nonexistence or proper parent node locking and
78047 +   nobody uses parent pointers from children due to absence of them. Second we
78048 +   invalidate all pending lock requests which still are on znode's lock
78049 +   request queue, this is done by reiser4_invalidate_lock(). Another
78050 +   JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
78051 +   Once it set all requesters are forced to return -EINVAL from
78052 +   longterm_lock_znode(). Future locking attempts are not possible because all
78053 +   ways to get references to that znode are removed already. Last, node is
78054 +   uncaptured from transaction.
78055 +
78056 +   When last reference to the dying znode is just about to be released,
78057 +   block number for this lock is released and znode is removed from the
78058 +   hash table.
78059 +
78060 +   Now znode can be recycled.
78061 +
78062 +   [it's possible to free bitmap block and remove znode from the hash
78063 +   table when last lock is released. This will result in having
78064 +   referenced but completely orphaned znode]
78065 +
78066 +   6. Limbo
78067 +
78068 +   As have been mentioned above znodes with reference counter 0 are
78069 +   still cached in a hash table. Once memory pressure increases they are
78070 +   purged out of there [this requires something like LRU list for
78071 +   efficient implementation. LRU list would also greatly simplify
78072 +   implementation of coord cache that would in this case morph to just
78073 +   scanning some initial segment of LRU list]. Data loaded into
78074 +   unreferenced znode are flushed back to the durable storage if
78075 +   necessary and memory is freed. Znodes themselves can be recycled at
78076 +   this point too.
78077 +
78078 +*/
78079 +
78080 +#include "debug.h"
78081 +#include "dformat.h"
78082 +#include "key.h"
78083 +#include "coord.h"
78084 +#include "plugin/plugin_header.h"
78085 +#include "plugin/node/node.h"
78086 +#include "plugin/plugin.h"
78087 +#include "txnmgr.h"
78088 +#include "jnode.h"
78089 +#include "znode.h"
78090 +#include "block_alloc.h"
78091 +#include "tree.h"
78092 +#include "tree_walk.h"
78093 +#include "super.h"
78094 +#include "reiser4.h"
78095 +
78096 +#include <linux/pagemap.h>
78097 +#include <linux/spinlock.h>
78098 +#include <linux/slab.h>
78099 +#include <linux/err.h>
78100 +
78101 +static z_hash_table *get_htable(reiser4_tree *,
78102 +                               const reiser4_block_nr * const blocknr);
78103 +static z_hash_table *znode_get_htable(const znode *);
78104 +static void zdrop(znode *);
78105 +
78106 +/* hash table support */
78107 +
78108 +/* compare two block numbers for equality (non-zero if *b1 == *b2). Used by hash-table macros */
78109 +static inline int
78110 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
78111 +{
78112 +       assert("nikita-534", b1 != NULL);
78113 +       assert("nikita-535", b2 != NULL);
78114 +
78115 +       return *b1 == *b2;
78116 +}
78117 +
78118 +/* Hash znode by block number: the low bits of *b selected by (REISER4_ZNODE_HASH_TABLE_SIZE - 1). @table is not used. Used by hash-table macros */
78119 +/* Audited by: umka (2002.06.11) */
78120 +static inline __u32
78121 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
78122 +{
78123 +       assert("nikita-536", b != NULL);
78124 +
78125 +       return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);        /* assumes the table size is a power of two -- TODO confirm */
78126 +}
78127 +
78128 +/* The hash table definition. KMALLOC/KFREE exist only for the macro expansion below and are undefined right after it */
78129 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
78130 +#define KFREE(ptr, size) kfree(ptr)    /* size argument is ignored */
78131 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
78132 +                     blknrhashfn, blknreq);
78133 +#undef KFREE
78134 +#undef KMALLOC
78135 +
78136 +/* slab for znodes; created by init_znodes() */
78137 +static struct kmem_cache *znode_cache;
78138 +
78139 +int znode_shift_order; /* computed from sizeof(znode) in init_znodes() */
78140 +
78141 +/**
78142 + * init_znodes - create znode cache
78143 + *
78144 + * Initializes slab cache of znodes and computes znode_shift_order. It is part of reiser4 module initialization. Returns 0, or RETERR(-ENOMEM) if the cache cannot be created.
78145 + */
78146 +int init_znodes(void)
78147 +{
78148 +       znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
78149 +                                       SLAB_HWCACHE_ALIGN |
78150 +                                       SLAB_RECLAIM_ACCOUNT, NULL);
78151 +       if (znode_cache == NULL)
78152 +               return RETERR(-ENOMEM);
78153 +
78154 +       for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
78155 +            ++znode_shift_order);      /* empty body: stops at the smallest order with (1 << order) >= sizeof(znode) */
78156 +       --znode_shift_order;    /* step back to the largest order with (1 << order) < sizeof(znode) */
78157 +       return 0;
78158 +}
78159 +
78160 +/**
78161 + * done_znodes - delete znode cache
78162 + *
78163 + * This is called on reiser4 module unloading or system shutdown. Hands znode_cache to destroy_reiser4_cache().
78164 + */
78165 +void done_znodes(void)
78166 +{
78167 +       destroy_reiser4_cache(&znode_cache);
78168 +}
78169 +
78170 +/* call this to initialise tree of znodes: sets up dk_lock and both znode hash tables of @tree; returns 0 or the first non-zero z_hash_init() result */
78171 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
78172 +{
78173 +       int result;
78174 +       assert("umka-050", tree != NULL);
78175 +
78176 +       rwlock_init(&tree->dk_lock);
78177 +
78178 +       result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
78179 +       if (result != 0)
78180 +               return result;
78181 +       result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE); /* on failure zhash_table is not torn down here; znodes_tree_done() handles each table separately */
78182 +       return result;
78183 +}
78184 +
78185 +/* free this znode: return it to znode_cache. The assertions check that it has no page, no lock owners or requestors, is not captured, and has no sibling links */
78186 +void zfree(znode * node /* znode to free */ )
78187 +{
78188 +       assert("nikita-465", node != NULL);
78189 +       assert("nikita-2120", znode_page(node) == NULL);
78190 +       assert("nikita-2301", list_empty_careful(&node->lock.owners));
78191 +       assert("nikita-2302", list_empty_careful(&node->lock.requestors));
78192 +       assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
78193 +                              NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
78194 +       assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
78195 +       assert("nikita-3293", !znode_is_right_connected(node));
78196 +       assert("nikita-3294", !znode_is_left_connected(node));
78197 +       assert("nikita-3295", node->left == NULL);
78198 +       assert("nikita-3296", node->right == NULL);
78199 +
78200 +       /* not yet phash_jnode_destroy(ZJNODE(node)); */
78201 +
78202 +       kmem_cache_free(znode_cache, node);
78203 +}
78204 +
78205 +/* call this to free tree of znodes: drops every znode left in zhash_table and zfake_table of @tree, then destroys the tables. A table whose _table is NULL is skipped */
78206 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
78207 +{
78208 +       znode *node;
78209 +       znode *next;
78210 +       z_hash_table *ztable;
78211 +
78212 +       /* scan znode hash-tables and kill all znodes, then free hash tables
78213 +        * themselves. */
78214 +
78215 +       assert("nikita-795", tree != NULL);
78216 +
78217 +       ztable = &tree->zhash_table;
78218 +
78219 +       if (ztable->_table != NULL) {
78220 +               for_all_in_htable(ztable, z, node, next) {
78221 +                       node->c_count = 0;      /* presumably so teardown order does not trip child accounting -- confirm */
78222 +                       node->in_parent.node = NULL;
78223 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
78224 +                       zdrop(node);
78225 +               }
78226 +
78227 +               z_hash_done(&tree->zhash_table);
78228 +       }
78229 +
78230 +       ztable = &tree->zfake_table;    /* same procedure for the second table */
78231 +
78232 +       if (ztable->_table != NULL) {
78233 +               for_all_in_htable(ztable, z, node, next) {
78234 +                       node->c_count = 0;
78235 +                       node->in_parent.node = NULL;
78236 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
78237 +                       zdrop(node);
78238 +               }
78239 +
78240 +               z_hash_done(&tree->zfake_table);
78241 +       }
78242 +}
78243 +
78244 +/* ZNODE STRUCTURES */
78245 +
78246 +/* allocate fresh znode from znode_cache; returns the kmem_cache_alloc() result as is (no fields are initialised here, see zinit()) */
78247 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
78248 +{
78249 +       znode *node;
78250 +
78251 +       node = kmem_cache_alloc(znode_cache, gfp_flag);
78252 +       return node;
78253 +}
78254 +
78255 +/* Initialize fields of znode: zero it, then set up its jnode, lock and parent coord
78256 +   @node:    znode to initialize;
78257 +   @parent:  parent znode;
78258 +   @tree:    tree we are in. */
78259 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
78260 +{
78261 +       assert("nikita-466", node != NULL);
78262 +       assert("umka-268", current_tree != NULL);
78263 +
78264 +       memset(node, 0, sizeof *node);  /* every field starts out zeroed before the initialisers below run */
78265 +
78266 +       assert("umka-051", tree != NULL);
78267 +
78268 +       jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
78269 +       reiser4_init_lock(&node->lock);
78270 +       init_parent_coord(&node->in_parent, parent);
78271 +}
78272 +
78273 +/*
78274 + * remove znode from indices. This is called by jput() when last reference on
78275 + * znode is released. Caller must hold tree->tree_lock for writing.
78276 + */
78277 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
78278 +{
78279 +       assert("nikita-2108", node != NULL);
78280 +       assert("nikita-470", node->c_count == 0);
78281 +       assert_rw_write_locked(&(tree->tree_lock));
78282 +
78283 +       /* remove reference to this znode from cbk cache */
78284 +       cbk_cache_invalidate(node, tree);
78285 +
78286 +       /* update c_count of parent and cut the link to it */
78287 +       if (znode_parent(node) != NULL) {
78288 +               assert("nikita-472", znode_parent(node)->c_count > 0);
78289 +               /* father, onto your hands I forward my spirit... */
78290 +               znode_parent(node)->c_count--;
78291 +               node->in_parent.node = NULL;
78292 +       } else {
78293 +               /* orphaned znode?! Root? */
78294 +       }
78295 +
78296 +       /* remove znode from hash-table */
78297 +       z_hash_remove_rcu(znode_get_htable(node), node);
78298 +}
78299 +
78300 +/* zdrop() -- Remove znode from the tree. Thin wrapper: calls jdrop() on the znode's embedded jnode.
78301 +
78302 +   This is called when znode is removed from the memory. */
78303 +static void zdrop(znode * node /* znode to finish with */ )
78304 +{
78305 +       jdrop(ZJNODE(node));
78306 +}
78307 +
78308 +/*
78309 + * put znode into right place in the hash table. This is called by relocate
78310 + * code. Takes the tree write lock itself; always returns 0.
78311 + */
78312 +int znode_rehash(znode * node /* node to rehash */ ,
78313 +                const reiser4_block_nr * new_block_nr /* new block number */ )
78314 +{
78315 +       z_hash_table *oldtable;
78316 +       z_hash_table *newtable;
78317 +       reiser4_tree *tree;
78318 +
78319 +       assert("nikita-2018", node != NULL);
78320 +
78321 +       tree = znode_get_tree(node);
78322 +       oldtable = znode_get_htable(node);      /* table for the current block number */
78323 +       newtable = get_htable(tree, new_block_nr);      /* table for the new block number; may be a different table */
78324 +
78325 +       write_lock_tree(tree);
78326 +       /* remove znode from hash-table */
78327 +       z_hash_remove_rcu(oldtable, node);
78328 +
78329 +       /* assertion no longer valid due to RCU */
78330 +       /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
78331 +
78332 +       /* update blocknr and the hash key (zjnode.key.z) together */
78333 +       znode_set_block(node, new_block_nr);
78334 +       node->zjnode.key.z = *new_block_nr;
78335 +
78336 +       /* insert it into hash */
78337 +       z_hash_insert_rcu(newtable, node);
78338 +       write_unlock_tree(tree);
78339 +       return 0;
78340 +}
78341 +
78342 +/* ZNODE LOOKUP, GET, PUT */
78343 +
78344 +/* zlook() - get znode with given block_nr in a hash table or return NULL
78345 +
78346 +   If a znode is found, its x_count is incremented via add_x_ref() and the
78347 +   value returned is whatever znode_rip_check() yields for it.  The lookup runs
78348 +   inside an rcu_read_lock() section (NOTE(review): older text named tree->hash_lock).
78349 +*/
78350 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
78351 +{
78352 +       znode *result;
78353 +       __u32 hash;
78354 +       z_hash_table *htable;
78355 +
78356 +       assert("jmacd-506", tree != NULL);
78357 +       assert("jmacd-507", blocknr != NULL);
78358 +
78359 +       htable = get_htable(tree, blocknr);
78360 +       hash = blknrhashfn(htable, blocknr);
78361 +
78362 +       rcu_read_lock();
78363 +       result = z_hash_find_index(htable, hash, blocknr);
78364 +
78365 +       if (result != NULL) {
78366 +               add_x_ref(ZJNODE(result));
78367 +               result = znode_rip_check(tree, result); /* may replace result; presumably NULL for a znode being destroyed -- confirm */
78368 +       }
78369 +       rcu_read_unlock();
78370 +
78371 +       return result;
78372 +}
78373 +
78374 +/* return hash table where znode with block @blocknr is (or should be)
78375 + * stored: zfake_table for unallocated disk addresses, zhash_table otherwise */
78376 +static z_hash_table *get_htable(reiser4_tree * tree,
78377 +                               const reiser4_block_nr * const blocknr)
78378 +{
78379 +       z_hash_table *table;
78380 +       if (is_disk_addr_unallocated(blocknr))
78381 +               table = &tree->zfake_table;
78382 +       else
78383 +               table = &tree->zhash_table;
78384 +       return table;
78385 +}
78386 +
78387 +/* return hash table where znode @node is (or should be) stored, chosen by get_htable() from the node's tree and current block number */
78388 +static z_hash_table *znode_get_htable(const znode * node)
78389 +{
78390 +       return get_htable(znode_get_tree(node), znode_get_block(node));
78391 +}
78392 +
78393 +/* zget() - get znode from hash table, allocating it if necessary.
78394 +
78395 +   The hash table is searched first, under rcu_read_lock().  If no usable znode
78396 +   is found, a new one is allocated and inserted under the tree write lock.
78397 +   Result is returned with x_count reference increased.  On failure returns
78398 +   ERR_PTR(-ENOMEM) (allocation) or ERR_PTR(-EIO) (znode level != @level).
78399 +   LOCKS TAKEN:   TREE_LOCK, ZNODE_LOCK
78400 +   LOCK ORDERING: NONE
78401 +*/
78402 +znode *zget(reiser4_tree * tree,
78403 +           const reiser4_block_nr * const blocknr,
78404 +           znode * parent, tree_level level, gfp_t gfp_flag)
78405 +{
78406 +       znode *result;
78407 +       __u32 hashi;
78408 +
78409 +       z_hash_table *zth;
78410 +
78411 +       assert("jmacd-512", tree != NULL);
78412 +       assert("jmacd-513", blocknr != NULL);
78413 +       assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
78414 +
78415 +       zth = get_htable(tree, blocknr);
78416 +       hashi = blknrhashfn(zth, blocknr);
78417 +
78418 +       /* NOTE-NIKITA address-as-unallocated-blocknr still is not
78419 +          implemented. */
78420 +
78421 +       z_hash_prefetch_bucket(zth, hashi);
78422 +
78423 +       rcu_read_lock();
78424 +       /* Find a matching BLOCKNR in the hash table.  If the znode is found,
78425 +          we obtain a reference (x_count) but the znode remains unlocked.
78426 +          Have to worry about race conditions later. */
78427 +       result = z_hash_find_index(zth, hashi, blocknr);
78428 +       /* The reference is taken inside the RCU read-side section, before
78429 +          znode_rip_check() is applied to the znode. */
78430 +       if (result != NULL) {
78431 +               add_x_ref(ZJNODE(result));
78432 +               /* NOTE-NIKITA it should be so, but special case during
78433 +                  creation of new root makes such assertion highly
78434 +                  complicated.  */
78435 +               assert("nikita-2131", 1 || znode_parent(result) == parent ||
78436 +                      (ZF_ISSET(result, JNODE_ORPHAN)
78437 +                       && (znode_parent(result) == NULL)));
78438 +               result = znode_rip_check(tree, result);
78439 +       }
78440 +
78441 +       rcu_read_unlock();
78442 +
78443 +       if (!result) {
78444 +               znode *shadow;
78445 +
78446 +               result = zalloc(gfp_flag);
78447 +               if (!result) {
78448 +                       return ERR_PTR(RETERR(-ENOMEM));
78449 +               }
78450 +
78451 +               zinit(result, parent, tree);
78452 +               ZJNODE(result)->blocknr = *blocknr;
78453 +               ZJNODE(result)->key.z = *blocknr;
78454 +               result->level = level;
78455 +
78456 +               write_lock_tree(tree);
78457 +               /* look again under the tree write lock; a hashed non-RIP znode wins over the new one */
78458 +               shadow = z_hash_find_index(zth, hashi, blocknr);
78459 +               if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
78460 +                       jnode_list_remove(ZJNODE(result));
78461 +                       zfree(result);
78462 +                       result = shadow;
78463 +               } else {
78464 +                       result->version = znode_build_version(tree);
78465 +                       z_hash_insert_index_rcu(zth, hashi, result);
78466 +
78467 +                       if (parent != NULL)
78468 +                               ++parent->c_count;
78469 +               }
78470 +
78471 +               add_x_ref(ZJNODE(result));
78472 +
78473 +               write_unlock_tree(tree);
78474 +       }
78475 +#if REISER4_DEBUG
78476 +       if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
78477 +               reiser4_check_block(blocknr, 1);
78478 +#endif
78479 +       /* Check for invalid tree level, return -EIO */
78480 +       if (unlikely(znode_get_level(result) != level)) {
78481 +               warning("jmacd-504",
78482 +                       "Wrong level for cached block %llu: %i expecting %i",
78483 +                       (unsigned long long)(*blocknr), znode_get_level(result),
78484 +                       level);
78485 +               zput(result);
78486 +               return ERR_PTR(RETERR(-EIO));
78487 +       }
78488 +
78489 +       assert("nikita-1227", znode_invariant(result));
78490 +
78491 +       return result;
78492 +}
78493 +
78494 +/* ZNODE PLUGINS/DATA */
78495 +
78496 +/* "guess" plugin for node loaded from the disk: tree->nplug when REISER4_ONE_NODE_PLUGIN is set on the
78497 +   super block, otherwise the plugin looked up by the plugin_id read from the common_node_header in zdata(). */
78498 +static node_plugin *znode_guess_plugin(const znode * node      /* znode to guess
78499 +                                                                * plugin of */ )
78500 +{
78501 +       reiser4_tree *tree;
78502 +
78503 +       assert("nikita-1053", node != NULL);
78504 +       assert("nikita-1055", zdata(node) != NULL);
78505 +
78506 +       tree = znode_get_tree(node);
78507 +       assert("umka-053", tree != NULL);
78508 +
78509 +       if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
78510 +               return tree->nplug;
78511 +       } else {
78512 +               return node_plugin_by_disk_id
78513 +                   (tree, &((common_node_header *) zdata(node))->plugin_id);
78514 +#ifdef GUESS_EXISTS
78515 +               reiser4_plugin *plugin;
78516 +               /* NOTE(review): this section follows an unconditional return and cannot be reached */
78517 +               /* NOTE-NIKITA add locking here when dynamic plugins will be
78518 +                * implemented */
78519 +               for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
78520 +                       if ((plugin->u.node.guess != NULL)
78521 +                           && plugin->u.node.guess(node))
78522 +                               return plugin;
78523 +               }
78524 +               warning("nikita-1057", "Cannot guess node plugin");
78525 +               print_znode("node", node);
78526 +               return NULL;
78527 +#endif
78528 +       }
78529 +}
78530 +
78531 +/* parse node header and install ->nplug; returns 0 (also when ->nplug is already set), ->parse() error or -EIO */
78532 +int zparse(znode * node /* znode to parse */ )
78533 +{
78534 +       int result;
78535 +
78536 +       assert("nikita-1233", node != NULL);
78537 +       assert("nikita-2370", zdata(node) != NULL);
78538 +
78539 +       if (node->nplug == NULL) {
78540 +               node_plugin *nplug;
78541 +
78542 +               nplug = znode_guess_plugin(node);
78543 +               if (likely(nplug != NULL)) {
78544 +                       result = nplug->parse(node);
78545 +                       if (likely(result == 0))
78546 +                               node->nplug = nplug;
78547 +               } else {
78548 +                       result = RETERR(-EIO);
78549 +               }
78550 +       } else
78551 +               result = 0;
78552 +       return result;
78553 +}
78554 +
78555 +/* zload with readahead */
78556 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
78557 +{
78558 +       int result;
78559 +
78560 +       assert("nikita-484", node != NULL);
78561 +       assert("nikita-1377", znode_invariant(node));
78562 +       assert("jmacd-7771", !znode_above_root(node));
78563 +       assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
78564 +       assert("nikita-3016", reiser4_schedulable());
78565 +
78566 +       if (info)
78567 +               formatted_readahead(node, info);
78568 +
78569 +       result = jload(ZJNODE(node));
78570 +       assert("nikita-1378", znode_invariant(node));
78571 +       return result;
78572 +}
78573 +
78574 +/* load content of node into memory */
78575 +int zload(znode * node)
78576 +{
78577 +       return zload_ra(node, NULL);
78578 +}
78579 +
78580 +/* call node plugin to initialise newly allocated node. */
78581 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
78582 +{
78583 +       return jinit_new(ZJNODE(node), gfp_flags);
78584 +}
78585 +
78586 +/* drop reference to node data. When last reference is dropped, data are
78587 +   unloaded. */
78588 +void zrelse(znode * node /* znode to release references to */ )
78589 +{
78590 +       assert("nikita-1381", znode_invariant(node));
78591 +
78592 +       jrelse(ZJNODE(node));
78593 +}
78594 +
78595 +/* returns free space in node */
78596 +unsigned znode_free_space(znode * node /* znode to query */ )
78597 +{
78598 +       assert("nikita-852", node != NULL);
78599 +       return node_plugin_by_node(node)->free_space(node);
78600 +}
78601 +
78602 +/* right delimiting key of znode; asserts that the tree's dk_lock is held */
78603 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
78604 +{
78605 +       assert("nikita-958", node != NULL);
78606 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
78607 +       assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
78608 +       assert("nikita-30671", node->rd_key_version != 0);
78609 +       return &node->rd_key;
78610 +}
78611 +
78612 +/* left delimiting key of znode; asserts that the tree's dk_lock is held */
78613 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
78614 +{
78615 +       assert("nikita-974", node != NULL);
78616 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
78617 +       assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
78618 +       assert("nikita-30681", node->ld_key_version != 0);
78619 +       return &node->ld_key;
78620 +}
78621 +
78622 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
78623 +    )
78624 +
78625 +/* update right-delimiting key of @node */
78626 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
78627 +{
78628 +       assert("nikita-2937", node != NULL);
78629 +       assert("nikita-2939", key != NULL);
78630 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
78631 +       assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
78632 +       assert("nikita-2944",
78633 +              znode_is_any_locked(node) ||
78634 +              znode_get_level(node) != LEAF_LEVEL ||
78635 +              keyge(key, &node->rd_key) ||
78636 +              keyeq(&node->rd_key, reiser4_min_key()) ||
78637 +              ZF_ISSET(node, JNODE_HEARD_BANSHEE));
78638 +
78639 +       node->rd_key = *key;
78640 +       ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
78641 +       return &node->rd_key;
78642 +}
78643 +
78644 +/* update left-delimiting key of @node */
78645 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
78646 +{
78647 +       assert("nikita-2940", node != NULL);
78648 +       assert("nikita-2941", key != NULL);
78649 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
78650 +       assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
78651 +       assert("nikita-2943",
78652 +              znode_is_any_locked(node) || keyeq(&node->ld_key,
78653 +                                                 reiser4_min_key()));
78654 +
78655 +       node->ld_key = *key;
78656 +       ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
78657 +       return &node->ld_key;
78658 +}
78659 +
78660 +/* true if @key is inside key range for @node */
78661 +int znode_contains_key(znode * node /* znode to look in */ ,
78662 +                      const reiser4_key * key /* key to look for */ )
78663 +{
78664 +       assert("nikita-1237", node != NULL);
78665 +       assert("nikita-1238", key != NULL);
78666 +
78667 +       /* left_delimiting_key <= key <= right_delimiting_key */
78668 +       return keyle(znode_get_ld_key(node), key)
78669 +           && keyle(key, znode_get_rd_key(node));
78670 +}
78671 +
78672 +/* same as znode_contains_key(), but lock dk lock */
78673 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
78674 +                           const reiser4_key * key /* key to look for */ )
78675 +{
78676 +       int result;
78677 +
78678 +       assert("umka-056", node != NULL);
78679 +       assert("umka-057", key != NULL);
78680 +
78681 +       read_lock_dk(znode_get_tree(node));
78682 +       result = znode_contains_key(node, key);
78683 +       read_unlock_dk(znode_get_tree(node));
78684 +       return result;
78685 +}
78686 +
78687 +/* get parent pointer, assuming tree is not locked */
78688 +znode *znode_parent_nolock(const znode * node /* child znode */ )
78689 +{
78690 +       assert("nikita-1444", node != NULL);
78691 +       return node->in_parent.node;
78692 +}
78693 +
78694 +/* get parent pointer of znode */
78695 +znode *znode_parent(const znode * node /* child znode */ )
78696 +{
78697 +       assert("nikita-1226", node != NULL);
78698 +       assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
78699 +       return znode_parent_nolock(node);
78700 +}
78701 +
78702 +/* detect uber znode used to protect in-superblock tree root pointer */
78703 +int znode_above_root(const znode * node /* znode to query */ )
78704 +{
78705 +       assert("umka-059", node != NULL);
78706 +
78707 +       return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
78708 +}
78709 +
78710 +/* check that @node is root---that its block number is recorded in the tree as
78711 +   that of root node (compares znode_get_block() with tree->root_block) */
78712 +#if REISER4_DEBUG
78713 +static int znode_is_true_root(const znode * node /* znode to query */ )
78714 +{
78715 +       assert("umka-060", node != NULL);
78716 +       assert("umka-061", current_tree != NULL);
78717 +
78718 +       return disk_addr_eq(znode_get_block(node),
78719 +                           &znode_get_tree(node)->root_block);
78720 +}
78721 +#endif
78722 +
78723 +/* check that @node is root, i.e. that its level equals the height of its tree */
78724 +int znode_is_root(const znode * node /* znode to query */ )
78725 +{
78726 +       assert("nikita-1206", node != NULL);
78727 +
78728 +       return znode_get_level(node) == znode_get_tree(node)->height;
78729 +}
78730 +
78731 +/* Returns true if @node was just created by zget() and wasn't ever loaded
78732 +   into memory. The test used is that no page is attached: znode_page() == NULL. */
78733 +/* NIKITA-HANS: yes */
78734 +int znode_just_created(const znode * node)
78735 +{
78736 +       assert("nikita-2188", node != NULL);
78737 +       return (znode_page(node) == NULL);
78738 +}
78739 +
78740 +/* obtain updated ->znode_epoch. See seal.c for description. */
78741 +__u64 znode_build_version(reiser4_tree * tree)
78742 +{
78743 +       __u64 result;
78744 +
78745 +       spin_lock(&tree->epoch_lock);
78746 +       result = ++tree->znode_epoch;
78747 +       spin_unlock(&tree->epoch_lock);
78748 +       return result;
78749 +}
78750 +
78751 +void init_load_count(load_count * dh)
78752 +{
78753 +       assert("nikita-2105", dh != NULL);
78754 +       memset(dh, 0, sizeof *dh);
78755 +}
78756 +
78757 +void done_load_count(load_count * dh)
78758 +{
78759 +       assert("nikita-2106", dh != NULL);
78760 +       if (dh->node != NULL) {
78761 +               for (; dh->d_ref > 0; --dh->d_ref)
78762 +                       zrelse(dh->node);
78763 +               dh->node = NULL;
78764 +       }
78765 +}
78766 +
78767 +static int incr_load_count(load_count * dh)
78768 +{
78769 +       int result;
78770 +
78771 +       assert("nikita-2110", dh != NULL);
78772 +       assert("nikita-2111", dh->node != NULL);
78773 +
78774 +       result = zload(dh->node);
78775 +       if (result == 0)
78776 +               ++dh->d_ref;
78777 +       return result;
78778 +}
78779 +
78780 +int incr_load_count_znode(load_count * dh, znode * node)
78781 +{
78782 +       assert("nikita-2107", dh != NULL);
78783 +       assert("nikita-2158", node != NULL);
78784 +       assert("nikita-2109",
78785 +              ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
78786 +
78787 +       dh->node = node;
78788 +       return incr_load_count(dh);
78789 +}
78790 +
78791 +int incr_load_count_jnode(load_count * dh, jnode * node)
78792 +{
78793 +       if (jnode_is_znode(node)) {
78794 +               return incr_load_count_znode(dh, JZNODE(node));
78795 +       }
78796 +       return 0;
78797 +}
78798 +
78799 +void copy_load_count(load_count * new, load_count * old)
78800 +{
78801 +       int ret = 0;
78802 +       done_load_count(new);
78803 +       new->node = old->node;
78804 +       new->d_ref = 0;
78805 +
78806 +       while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
78807 +       }
78808 +
78809 +       assert("jmacd-87589", ret == 0);
78810 +}
78811 +
78812 +void move_load_count(load_count * new, load_count * old)
78813 +{
78814 +       done_load_count(new);
78815 +       new->node = old->node;
78816 +       new->d_ref = old->d_ref;
78817 +       old->node = NULL;
78818 +       old->d_ref = 0;
78819 +}
78820 +
78821 +/* convert parent pointer into coord */
78822 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
78823 +{
78824 +       assert("nikita-3204", pcoord != NULL);
78825 +       assert("nikita-3205", coord != NULL);
78826 +
78827 +       coord_init_first_unit_nocheck(coord, pcoord->node);
78828 +       coord_set_item_pos(coord, pcoord->item_pos);
78829 +       coord->between = AT_UNIT;
78830 +}
78831 +
78832 +/* pack coord into parent_coord_t */
78833 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
78834 +{
78835 +       assert("nikita-3206", pcoord != NULL);
78836 +       assert("nikita-3207", coord != NULL);
78837 +
78838 +       pcoord->node = coord->node;
78839 +       pcoord->item_pos = coord->item_pos;
78840 +}
78841 +
78842 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
78843 +   look for comments there) */
78844 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
78845 +{
78846 +       pcoord->node = (znode *) node;
78847 +       pcoord->item_pos = (unsigned short)~0;
78848 +}
78849 +
78850 +#if REISER4_DEBUG
78851 +
78852 +/* debugging aid: znode invariant */
78853 +static int znode_invariant_f(const znode * node /* znode to check */ ,
78854 +                            char const **msg   /* where to store error
78855 +                                                * message, if any */ )
78856 +{
78857 +#define _ergo(ant, con)                                                \
78858 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
78859 +
78860 +#define _equi(e1, e2)                                          \
78861 +       ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
78862 +
78863 +#define _check(exp) ((*msg) = #exp, (exp))
78864 +
78865 +       return jnode_invariant_f(ZJNODE(node), msg) &&
78866 +           /* [znode-fake] invariant */
78867 +           /* fake znode doesn't have a parent, and */
78868 +           _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
78869 +           /* there is another way to express this very check, and */
78870 +           _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
78871 +           /* it has special block number, and */
78872 +           _ergo(znode_get_level(node) == 0,
78873 +                 disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
78874 +           /* it is the only znode with such block number, and */
78875 +           _ergo(!znode_above_root(node) && znode_is_loaded(node),
78876 +                 !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
78877 +           /* it is parent of the tree root node */
78878 +           _ergo(znode_is_true_root(node),
78879 +                 znode_above_root(znode_parent(node))) &&
78880 +           /* [znode-level] invariant */
78881 +           /* level of parent znode is one larger than that of child,
78882 +              except for the fake znode, and */
78883 +           _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
78884 +                 znode_get_level(znode_parent(node)) ==
78885 +                 znode_get_level(node) + 1) &&
78886 +           /* left neighbor is at the same level, and */
78887 +           _ergo(znode_is_left_connected(node) && node->left != NULL,
78888 +                 znode_get_level(node) == znode_get_level(node->left)) &&
78889 +           /* right neighbor is at the same level */
78890 +           _ergo(znode_is_right_connected(node) && node->right != NULL,
78891 +                 znode_get_level(node) == znode_get_level(node->right)) &&
78892 +           /* [znode-connected] invariant */
78893 +           _ergo(node->left != NULL, znode_is_left_connected(node)) &&
78894 +           _ergo(node->right != NULL, znode_is_right_connected(node)) &&
78895 +           _ergo(!znode_is_root(node) && node->left != NULL,
78896 +                 znode_is_right_connected(node->left) &&
78897 +                 node->left->right == node) &&
78898 +           _ergo(!znode_is_root(node) && node->right != NULL,
78899 +                 znode_is_left_connected(node->right) &&
78900 +                 node->right->left == node) &&
78901 +           /* [znode-c_count] invariant */
78902 +           /* for any znode, c_count of its parent is greater than 0 */
78903 +           _ergo(znode_parent(node) != NULL &&
78904 +                 !znode_above_root(znode_parent(node)),
78905 +                 znode_parent(node)->c_count > 0) &&
78906 +           /* leaves don't have children */
78907 +           _ergo(znode_get_level(node) == LEAF_LEVEL,
78908 +                 node->c_count == 0) &&
78909 +           _check(node->zjnode.jnodes.prev != NULL) &&
78910 +           _check(node->zjnode.jnodes.next != NULL) &&
78911 +           /* orphan doesn't have a parent */
78912 +           _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
78913 +           /* [znode-modify] invariant */
78914 +           /* if znode is not write-locked, its checksum remains
78915 +            * invariant */
78916 +           /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
78917 +            * cannot check this. */
78918 +           /* [znode-refs] invariant */
78919 +           /* only referenced znode can be long-term locked */
78920 +           _ergo(znode_is_locked(node),
78921 +                 atomic_read(&ZJNODE(node)->x_count) != 0);
78922 +}
78923 +
78924 +/* debugging aid: check znode invariant; warn (naming the failed condition) and return 0 if it doesn't hold */
78925 +int znode_invariant(znode * node /* znode to check */ )
78926 +{
78927 +       char const *failed_msg;
78928 +       int result;
78929 +
78930 +       assert("umka-063", node != NULL);
78931 +       assert("umka-064", current_tree != NULL);
78932 +
78933 +       spin_lock_znode(node);
78934 +       read_lock_tree(znode_get_tree(node));
78935 +       result = znode_invariant_f(node, &failed_msg);
78936 +       if (!result) {
78937 +               /* print_znode("corrupted node", node); */
78938 +               warning("jmacd-555", "Condition %s failed", failed_msg);
78939 +       }
78940 +       read_unlock_tree(znode_get_tree(node));
78941 +       spin_unlock_znode(node);
78942 +       return result;
78943 +}
78944 +
78945 +/* return non-0 iff data are loaded into znode */
78946 +int znode_is_loaded(const znode * node /* znode to query */ )
78947 +{
78948 +       assert("nikita-497", node != NULL);
78949 +       return jnode_is_loaded(ZJNODE(node));
78950 +}
78951 +
78952 +unsigned long znode_times_locked(const znode * z)
78953 +{
78954 +       return z->times_locked;
78955 +}
78956 +
78957 +#endif                         /* REISER4_DEBUG */
78958 +
78959 +/* Make Linus happy.
78960 +   Local variables:
78961 +   c-indentation-style: "K&R"
78962 +   mode-name: "LC"
78963 +   c-basic-offset: 8
78964 +   tab-width: 8
78965 +   fill-column: 120
78966 +   End:
78967 +*/
78968 diff -puN /dev/null fs/reiser4/znode.h
78969 --- /dev/null
78970 +++ a/fs/reiser4/znode.h
78971 @@ -0,0 +1,434 @@
78972 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
78973 + * reiser4/README */
78974 +
78975 +/* Declaration of znode (Zam's node). See znode.c for more details. */
78976 +
78977 +#ifndef __ZNODE_H__
78978 +#define __ZNODE_H__
78979 +
78980 +#include "forward.h"
78981 +#include "debug.h"
78982 +#include "dformat.h"
78983 +#include "key.h"
78984 +#include "coord.h"
78985 +#include "plugin/node/node.h"
78986 +#include "jnode.h"
78987 +#include "lock.h"
78988 +#include "readahead.h"
78989 +
78990 +#include <linux/types.h>
78991 +#include <linux/spinlock.h>
78992 +#include <linux/semaphore.h>
78993 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
78994 +#include <asm/atomic.h>
78995 +
78996 +/* znode tracks its position within parent (internal item in a parent node,
78997 + * that contains znode's block number). */
78998 +typedef struct parent_coord {
78999 +       znode *node;
79000 +       pos_in_node_t item_pos;
79001 +} parent_coord_t;
79002 +
79003 +/* &znode - node in a reiser4 tree.
79004 +
79005 +   NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
79006 +   cacheline pressure.
79007 +
79008 +   Locking:
79009 +
79010 +   Long term: data in a disk node attached to this znode are protected
79011 +   by long term, deadlock aware lock ->lock;
79012 +
79013 +   Spin lock: the following fields are protected by the spin lock:
79014 +
79015 +    ->lock
79016 +
79017 +   Following fields are protected by the global tree lock:
79018 +
79019 +    ->left
79020 +    ->right
79021 +    ->in_parent
79022 +    ->c_count
79023 +
79024 +   Following fields are protected by the global delimiting key lock (dk_lock):
79025 +
79026 +    ->ld_key (to update ->ld_key long-term lock on the node is also required)
79027 +    ->rd_key
79028 +
79029 +   Following fields are protected by the long term lock:
79030 +
79031 +    ->nr_items
79032 +
79033 +   ->nplug (node plugin) is never changed once set. This means that once code
79034 +   has made sure that the field is valid, it can be accessed without any
79035 +   additional locking.
79036 +
79037 +   ->level is immutable.
79038 +
79039 +   Invariants involving this data-type:
79040 +
79041 +      [znode-fake]
79042 +      [znode-level]
79043 +      [znode-connected]
79044 +      [znode-c_count]
79045 +      [znode-refs]
79046 +      [jnode-refs]
79047 +      [jnode-queued]
79048 +      [znode-modify]
79049 +
79050 +    For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
79051 +    Suggestions for how to do that are desired.*/
79052 +struct znode {
79053 +       /* Embedded jnode. */
79054 +       jnode zjnode;
79055 +
79056 +       /* contains two subfields, node and item_pos (see parent_coord_t).
79057 +
79058 +          item_pos is only a hint that is cached to
79059 +          speed up lookups during balancing. It is not required to be up to
79060 +          date. Synched in find_child_ptr().
79061 +
79062 +          This value allows us to avoid expensive binary searches.
79063 +
79064 +          in_parent.node points to the parent of this node, and is NOT a
79065 +          hint.
79066 +        */
79067 +       parent_coord_t in_parent;
79068 +
79069 +       /*
79070 +        * sibling list pointers
79071 +        */
79072 +
79073 +       /* left-neighbor */
79074 +       znode *left;
79075 +       /* right-neighbor */
79076 +       znode *right;
79077 +
79078 +       /* long term lock on node content. This lock supports deadlock
79079 +          detection. See lock.c
79080 +        */
79081 +       zlock lock;
79082 +
79083 +       /* You cannot remove from memory a node that has children in
79084 +          memory. This is because we rely on the fact that parent of given
79085 +          node can always be reached without blocking for io. When reading a
79086 +          node into memory you must increase the c_count of its parent, when
79087 +          removing it from memory you must decrease the c_count.  This makes
79088 +          the code simpler, and the cases where it is suboptimal are truly
79089 +          obscure.
79090 +        */
79091 +       int c_count;
79092 +
79093 +       /* plugin of node attached to this znode. NULL if znode is not
79094 +          loaded. */
79095 +       node_plugin *nplug;
79096 +
79097 +       /* version of znode data. This is increased on each modification. This
79098 +        * is necessary to implement seals (see seal.[ch]) efficiently. */
79099 +       __u64 version;
79100 +
79101 +       /* left delimiting key. Necessary to efficiently perform
79102 +          balancing with node-level locking. Kept in memory only. */
79103 +       reiser4_key ld_key;
79104 +       /* right delimiting key. */
79105 +       reiser4_key rd_key;
79106 +
79107 +       /* znode's tree level */
79108 +       __u16 level;
79109 +       /* number of items in this node. This field is modified by node
79110 +        * plugin. */
79111 +       __u16 nr_items;
79112 +
79113 +#if REISER4_DEBUG
79114 +       void *creator;
79115 +       reiser4_key first_key;
79116 +       unsigned long times_locked;
79117 +       int left_version;       /* when node->left was updated */
79118 +       int right_version;      /* when node->right was updated */
79119 +       int ld_key_version;     /* when node->ld_key was updated */
79120 +       int rd_key_version;     /* when node->rd_key was updated */
79121 +#endif
79122 +
79123 +} __attribute__ ((aligned(16)));
79124 +
79125 +ON_DEBUG(extern atomic_t delim_key_version;
79126 +    )
79127 +
79128 +/* In general I think these macros should not be exposed. Arguments are parenthesised so any expression may be passed. */
79129 +#define znode_is_locked(node)          (lock_is_locked(&(node)->lock))
79130 +#define znode_is_rlocked(node)         (lock_is_rlocked(&(node)->lock))
79131 +#define znode_is_wlocked(node)         (lock_is_wlocked(&(node)->lock))
79132 +#define znode_is_wlocked_once(node)    (lock_is_wlocked_once(&(node)->lock))
79133 +#define znode_can_be_rlocked(node)     (lock_can_be_rlocked(&(node)->lock))
79134 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&(node)->lock, (mode)))
79135 +/* Macros for accessing the znode state. */
79136 +#define        ZF_CLR(p,f)             JF_CLR  (ZJNODE(p), (f))
79137 +#define        ZF_ISSET(p,f)           JF_ISSET(ZJNODE(p), (f))
79138 +#define        ZF_SET(p,f)             JF_SET  (ZJNODE(p), (f))
79139 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
79140 +                  znode * parent, tree_level level, gfp_t gfp_flag);
79141 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
79142 +extern int zload(znode * node);
79143 +extern int zload_ra(znode * node, ra_info_t * info);
79144 +extern int zinit_new(znode * node, gfp_t gfp_flags);
79145 +extern void zrelse(znode * node);
79146 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
79147 +
79148 +/* size of data in znode */
79149 +static inline unsigned
79150 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
79151 +{
79152 +       assert("nikita-1416", node != NULL);
79153 +       return PAGE_CACHE_SIZE;
79154 +}
79155 +
79156 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
79157 +                                 coord_t * coord);
79158 +extern void coord_to_parent_coord(const coord_t * coord,
79159 +                                 parent_coord_t * pcoord);
79160 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
79161 +
79162 +extern unsigned znode_free_space(znode * node);
79163 +
79164 +extern reiser4_key *znode_get_rd_key(znode * node);
79165 +extern reiser4_key *znode_get_ld_key(znode * node);
79166 +
79167 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
79168 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
79169 +
79170 +/* `connected' state checks */
79171 +static inline int znode_is_right_connected(const znode * node)
79172 +{
79173 +       return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
79174 +}
79175 +
79176 +static inline int znode_is_left_connected(const znode * node)
79177 +{
79178 +       return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
79179 +}
79180 +
79181 +static inline int znode_is_connected(const znode * node)
79182 +{
79183 +       return znode_is_right_connected(node) && znode_is_left_connected(node);
79184 +}
79185 +
79186 +extern int znode_shift_order;
79187 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
79188 +extern void znode_remove(znode *, reiser4_tree *);
79189 +extern znode *znode_parent(const znode * node);
79190 +extern znode *znode_parent_nolock(const znode * node);
79191 +extern int znode_above_root(const znode * node);
79192 +extern int init_znodes(void);
79193 +extern void done_znodes(void);
79194 +extern int znodes_tree_init(reiser4_tree * ztree);
79195 +extern void znodes_tree_done(reiser4_tree * ztree);
79196 +extern int znode_contains_key(znode * node, const reiser4_key * key);
79197 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
79198 +extern unsigned znode_save_free_space(znode * node);
79199 +extern unsigned znode_recover_free_space(znode * node);
79200 +extern znode *zalloc(gfp_t gfp_flag);
79201 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
79202 +extern int zparse(znode * node);
79203 +
79204 +extern int znode_just_created(const znode * node);
79205 +
79206 +extern void zfree(znode * node);
79207 +
79208 +#if REISER4_DEBUG
79209 +extern void print_znode(const char *prefix, const znode * node);
79210 +#else
79211 +#define print_znode( p, n ) noop
79212 +#endif
79213 +
79214 +/* Make it look like various znode functions exist instead of treating znodes as
79215 +   jnodes in znode-specific code. */
79216 +#define znode_page(x)               jnode_page ( ZJNODE(x) )
79217 +#define zdata(x)                    jdata ( ZJNODE(x) )
79218 +#define znode_get_block(x)          jnode_get_block ( ZJNODE(x) )
79219 +#define znode_created(x)            jnode_created ( ZJNODE(x) )
79220 +#define znode_set_created(x)        jnode_set_created ( ZJNODE(x) )
79221 +#define znode_convertible(x)        jnode_convertible (ZJNODE(x))
79222 +#define znode_set_convertible(x)    jnode_set_convertible (ZJNODE(x))
79223 +
79224 +#define znode_is_dirty(x)           jnode_is_dirty    ( ZJNODE(x) )
79225 +#define znode_check_dirty(x)        jnode_check_dirty ( ZJNODE(x) )
79226 +#define znode_make_clean(x)         jnode_make_clean   ( ZJNODE(x) )
79227 +#define znode_set_block(x, b)       jnode_set_block ( ZJNODE(x), (b) )
79228 +
79229 +#define spin_lock_znode(x)          spin_lock_jnode ( ZJNODE(x) )
79230 +#define spin_unlock_znode(x)        spin_unlock_jnode ( ZJNODE(x) )
79231 +#define spin_trylock_znode(x)       spin_trylock_jnode ( ZJNODE(x) )
79232 +#define spin_znode_is_locked(x)     spin_jnode_is_locked ( ZJNODE(x) )
79233 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
79234 +
79235 +#if REISER4_DEBUG
79236 +extern int znode_x_count_is_protected(const znode * node);
79237 +extern int znode_invariant(znode * node);
79238 +#endif
79239 +
79240 +/* acquire reference to @node: jref() on ZJNODE(node); returns jref()'s result converted back with JZNODE() */
79241 +static inline znode *zref(znode * node)
79242 +{
79243 +       /* change of x_count from 0 to 1 is protected by tree spin-lock */
79244 +       return JZNODE(jref(ZJNODE(node)));
79245 +}
79246 +
79247 +/* release reference to @node with jput(ZJNODE(node)), after asserting znode_invariant(node) */
79248 +static inline void zput(znode * node)
79249 +{
79250 +       assert("nikita-3564", znode_invariant(node));
79251 +       jput(ZJNODE(node));
79252 +}
79253 +
79254 +/* get the level field for a znode (plain read of node->level) */
79255 +static inline tree_level znode_get_level(const znode * node)
79256 +{
79257 +       return node->level;
79258 +}
79259 +
79260 +/* get the level for a jnode: the znode's level when @node is a znode, LEAF_LEVEL for every other jnode */
79261 +static inline tree_level jnode_get_level(const jnode * node)
79262 +{
79263 +       if (jnode_is_znode(node))
79264 +               return znode_get_level(JZNODE(node));
79265 +       else
79266 +               /* unformatted nodes are all at the LEAF_LEVEL and for
79267 +                  "semi-formatted" nodes like bitmaps, level doesn't matter. */
79268 +               return LEAF_LEVEL;
79269 +}
79270 +
79271 +/* true if jnode is on leaf level: a znode at LEAF_LEVEL or a JNODE_UNFORMATTED_BLOCK jnode; 0 for any other type */
79272 +static inline int jnode_is_leaf(const jnode * node)
79273 +{
79274 +       if (jnode_is_znode(node))
79275 +               return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
79276 +       if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
79277 +               return 1;
79278 +       return 0;       /* note: jnode_get_level() still reports LEAF_LEVEL for these jnodes */
79279 +}
79280 +
79281 +/* return znode's tree: jnode_get_tree() of ZJNODE(node); @node is asserted to be non-NULL */
79282 +static inline reiser4_tree *znode_get_tree(const znode * node)
79283 +{
79284 +       assert("nikita-2692", node != NULL);
79285 +       return jnode_get_tree(ZJNODE(node));
79286 +}
79287 +
79288 +/* resolve race with zput: returns the znode of jnode_rip_sync()'s result, or NULL when that call returns NULL */
79289 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
79290 +{
79291 +       jnode *j;
79292 +
79293 +       j = jnode_rip_sync(tree, ZJNODE(node));
79294 +       if (likely(j != NULL))
79295 +               node = JZNODE(j);
79296 +       else
79297 +               node = NULL;
79298 +       return node;
79299 +}
79300 +
79301 +#if defined(REISER4_DEBUG)
79302 +int znode_is_loaded(const znode * node /* znode to query */ );
79303 +#endif
79304 +
79305 +extern __u64 znode_build_version(reiser4_tree * tree);
79306 +
79307 +/* Data-handles.  A data handle object manages pairing calls to zload() and zrelse().  We
79308 +   must load the data for a node in many places.  We could do this by simply calling
79309 +   zload() everywhere; the difficulty arises when we must release the loaded data by
79310 +   calling zrelse.  In a function with many possible error/return paths, it requires extra
79311 +   work to figure out which exit paths must call zrelse and which do not.  The data
79312 +   handle automatically calls zrelse for every zload that it is responsible for.  In that
79313 +   sense, it acts much like a lock_handle.
79314 +*/
79315 +typedef struct load_count {
79316 +       znode *node;    /* node the handle currently tracks; NULL after init_load_count() */
79317 +       int d_ref;      /* presumably the number of zload() calls still owed a zrelse() -- TODO confirm */
79318 +} load_count;
79319 +
79320 +extern void init_load_count(load_count * lc);  /* Initialize a load_count: set the current node to NULL. */
79321 +extern void done_load_count(load_count * dh);  /* Finalize a load_count: call zrelse() if necessary. */
79322 +extern int incr_load_count_znode(load_count * dh, znode * node);       /* Make the argument znode the current node, then call zload(). */
79323 +extern int incr_load_count_jnode(load_count * dh, jnode * node);       /* If the argument jnode is formatted, do the same as
79324 +                                                                        * incr_load_count_znode; otherwise do nothing (unformatted nodes
79325 +                                                                        * don't require zload/zrelse treatment). */
79326 +extern void move_load_count(load_count * new, load_count * old);       /* Move the contents of a load_count.  Old handle is released. */
79327 +extern void copy_load_count(load_count * new, load_count * old);       /* Copy the contents of a load_count.  Old handle remains held. */
79328 +
79329 +/* Variable initializers for load_count: both expand to a compound literal of type load_count (a struct value, not a pointer). */
79330 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
79331 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
79332 +/* A convenience macro for use in assertions or debug-only code, where loaded
79333 +   data is only required to perform the debugging check.  Evaluates @node once, then @exp between
79334 +   zload()/zrelse(), yielding it as long; if zload() fails, @exp is skipped and zload()'s result is yielded. */
79335 +#define WITH_DATA( node, exp )                         \
79336 +({                                                     \
79337 +       long __with_dh_result;                          \
79338 +       znode *__with_dh_node;                          \
79339 +                                                       \
79340 +       __with_dh_node = ( node );                      \
79341 +       __with_dh_result = zload( __with_dh_node );     \
79342 +       if( __with_dh_result == 0 ) {                   \
79343 +               __with_dh_result = ( long )( exp );     \
79344 +               zrelse( __with_dh_node );               \
79345 +       }                                               \
79346 +       __with_dh_result;                               \
79347 +})
79348 +
79349 +/* Same as above, but the result is an int and @ret (not zload's own error) is yielded in case zload fails. */
79350 +#define WITH_DATA_RET( node, ret, exp )                        \
79351 +({                                                     \
79352 +       int __with_dh_result;                           \
79353 +       znode *__with_dh_node;                          \
79354 +                                                       \
79355 +       __with_dh_node = ( node );                      \
79356 +       __with_dh_result = zload( __with_dh_node );     \
79357 +       if( __with_dh_result == 0 ) {                   \
79358 +               __with_dh_result = ( int )( exp );      \
79359 +               zrelse( __with_dh_node );               \
79360 +       } else                                          \
79361 +               __with_dh_result = ( ret );             \
79362 +       __with_dh_result;                               \
79363 +})
79364 +/* Evaluates @coord once, calls coord_clear_iplug() on it, then runs @exp under WITH_DATA() on the coord's node. */
79365 +#define WITH_COORD(coord, exp)                 \
79366 +({                                             \
79367 +       coord_t *__coord;                       \
79368 +                                               \
79369 +       __coord = (coord);                      \
79370 +       coord_clear_iplug(__coord);             \
79371 +       WITH_DATA(__coord->node, exp);          \
79372 +})
79373 +/* STORE_COUNTERS snapshots *reiser4_lock_counters(); CHECK_COUNTERS asserts that only x_refs/t_refs/d_refs differ since. */
79374 +#if REISER4_DEBUG
79375 +#define STORE_COUNTERS                                         \
79376 +       reiser4_lock_cnt_info __entry_counters =                \
79377 +               *reiser4_lock_counters()
79378 +#define CHECK_COUNTERS                                                 \
79379 +ON_DEBUG_CONTEXT(                                                      \
79380 +({                                                                     \
79381 +       __entry_counters.x_refs = reiser4_lock_counters() -> x_refs;    \
79382 +       __entry_counters.t_refs = reiser4_lock_counters() -> t_refs;    \
79383 +       __entry_counters.d_refs = reiser4_lock_counters() -> d_refs;    \
79384 +       assert("nikita-2159",                                           \
79385 +              !memcmp(&__entry_counters, reiser4_lock_counters(),      \
79386 +                      sizeof __entry_counters));                       \
79387 +}) )
79388 +
79389 +#else
79390 +#define STORE_COUNTERS
79391 +#define CHECK_COUNTERS noop
79392 +#endif
79393 +
79394 +/* __ZNODE_H__ */
79395 +#endif
79396 +
79397 +/* Make Linus happy.
79398 +   Local variables:
79399 +   c-indentation-style: "K&R"
79400 +   mode-name: "LC"
79401 +   c-basic-offset: 8
79402 +   tab-width: 8
79403 +   fill-column: 120
79404 +   End:
79405 +*/
79406 _
79407 From: Edward Shishkin <edward.shishkin@gmail.com>
79408
79409 . adjust reiser4 to the new aops (->write_begin, ->write_end).
79410 . add support for loop devices over cryptcompress files.
79411
79412 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
79413 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
79414 ---
79415
79416  fs/reiser4/as_ops.c                      |   18 ----
79417  fs/reiser4/page_cache.c                  |    4 -
79418  fs/reiser4/plugin/file/cryptcompress.c   |   56 +++++++++++---
79419  fs/reiser4/plugin/file/file.c            |   61 +++------------
79420  fs/reiser4/plugin/file/file.h            |   31 ++++---
79421  fs/reiser4/plugin/file/file_conversion.c |   83 +++++++++++++++++++++
79422  fs/reiser4/plugin/file_ops.c             |   47 -----------
79423  fs/reiser4/plugin/object.c               |   16 ++--
79424  fs/reiser4/plugin/object.h               |    3
79425  fs/reiser4/plugin/plugin.h               |    8 +-
79426  10 files changed, 175 insertions(+), 152 deletions(-)
79427
79428 diff -puN fs/reiser4/as_ops.c~reiser4-adjust-to-the-new-aops fs/reiser4/as_ops.c
79429 --- a/fs/reiser4/as_ops.c~reiser4-adjust-to-the-new-aops
79430 +++ a/fs/reiser4/as_ops.c
79431 @@ -347,24 +347,6 @@ int reiser4_writepages(struct address_sp
79432         return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
79433  }
79434
79435 -int reiser4_prepare_write(struct file *file, struct page *page,
79436 -                         unsigned from, unsigned to)
79437 -{
79438 -       return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file,
79439 -                                                                        page,
79440 -                                                                        from,
79441 -                                                                        to);
79442 -}
79443 -
79444 -int reiser4_commit_write(struct file *file, struct page *page,
79445 -                        unsigned from, unsigned to)
79446 -{
79447 -       return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file,
79448 -                                                                       page,
79449 -                                                                       from,
79450 -                                                                       to);
79451 -}
79452 -
79453  /* Make Linus happy.
79454     Local variables:
79455     c-indentation-style: "K&R"
79456 diff -puN fs/reiser4/page_cache.c~reiser4-adjust-to-the-new-aops fs/reiser4/page_cache.c
79457 --- a/fs/reiser4/page_cache.c~reiser4-adjust-to-the-new-aops
79458 +++ a/fs/reiser4/page_cache.c
79459 @@ -560,8 +560,8 @@ static struct address_space_operations f
79460         .set_page_dirty = formatted_set_page_dirty,
79461         /* used for read-ahead. Not applicable */
79462         .readpages = NULL,
79463 -       .prepare_write = NULL,
79464 -       .commit_write = NULL,
79465 +       .write_begin = NULL,
79466 +       .write_end = NULL,
79467         .bmap = NULL,
79468         /* called just before page is being detached from inode mapping and
79469            removed from memory. Called on truncate, cut/squeeze, and
79470 diff -puN fs/reiser4/plugin/file/cryptcompress.c~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/file/cryptcompress.c
79471 --- a/fs/reiser4/plugin/file/cryptcompress.c~reiser4-adjust-to-the-new-aops
79472 +++ a/fs/reiser4/plugin/file/cryptcompress.c
79473 @@ -3405,11 +3405,12 @@ static int cryptcompress_truncate(struct
79474         return result;
79475  }
79476
79477 -/* Capture an anonymous pager cluster. (Page cluser is
79478 - * anonymous if it contains at least one anonymous page
79479 +/**
79480 + * Capture a page cluster.
79481 + * @clust must be set up by the caller.
79482   */
79483 -static int capture_anon_page_cluster(struct cluster_handle * clust,
79484 -                                    struct inode * inode)
79485 +static int capture_page_cluster(struct cluster_handle * clust,
79486 +                               struct inode * inode)
79487  {
79488         int result;
79489
79490 @@ -3420,6 +3421,7 @@ static int capture_anon_page_cluster(str
79491         result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
79492         if (result)
79493                 return result;
79494 +
79495         set_cluster_pages_dirty(clust, inode);
79496         result = checkin_logical_cluster(clust, inode);
79497         put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
79498 @@ -3502,7 +3504,7 @@ static int capture_anon_pages(struct add
79499                         break;
79500                 }
79501                 move_cluster_forward(&clust, inode, pages[0]->index);
79502 -               result = capture_anon_page_cluster(&clust, inode);
79503 +               result = capture_page_cluster(&clust, inode);
79504
79505                 put_found_pages(pages, found); /* find_anon_page_cluster */
79506                 if (result)
79507 @@ -3743,18 +3745,48 @@ int release_cryptcompress(struct inode *
79508  }
79509
79510  /* plugin->prepare_write */
79511 -int prepare_write_cryptcompress(struct file *file, struct page *page,
79512 -                               unsigned from, unsigned to)
79513 +int write_begin_cryptcompress(struct file *file, struct page *page,
79514 +                         unsigned from, unsigned to)
79515  {
79516 -       return -EINVAL;
79517 +       return do_prepare_write(file, page, from, to); /* same helper write_begin_unix_file() calls */
79518  }
79519
79520  /* plugin->commit_write */
79521 -int commit_write_cryptcompress(struct file *file, struct page *page,
79522 -                              unsigned from, unsigned to)
79523 +int write_end_cryptcompress(struct file *file, struct page *page,
79524 +                         unsigned from, unsigned to)
79525  {
79526 -       BUG();
79527 -       return 0;
79528 +       int ret;
79529 +       hint_t *hint;
79530 +       lock_handle *lh;
79531 +       struct inode * inode;
79532 +       struct cluster_handle clust;
79533 +
79534 +       unlock_page(page); /* unlocked on every path, including the -ENOMEM return below */
79535 +
79536 +       inode = page->mapping->host;
79537 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
79538 +       if (hint == NULL)
79539 +               return RETERR(-ENOMEM);
79540 +       hint_init_zero(hint);
79541 +       lh = &hint->lh;
79542 +
79543 +       cluster_init_read(&clust, NULL);
79544 +       clust.hint = hint;
79545 +
79546 +       ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
79547 +       if (ret)
79548 +               goto out;
79549 +       clust.index = pg_to_clust(page->index, inode); /* only @page's index is used; @from/@to are ignored */
79550 +       ret = capture_page_cluster(&clust, inode);
79551 +       if (ret) /* the failure is logged and still returned to the caller */
79552 +               warning("edward-1557",
79553 +                       "Capture failed (inode %llu, result=%i)",
79554 +                       (unsigned long long)get_inode_oid(inode), ret);
79555 + out:
79556 +       done_lh(lh);
79557 +       kfree(hint);
79558 +       put_cluster_handle(&clust); /* NOTE(review): clust.hint was just freed; confirm this never touches ->hint */
79559 +       return ret;
79560  }
79561
79562  /* plugin->bmap */
79563 diff -puN fs/reiser4/plugin/file/file.c~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/file/file.c
79564 --- a/fs/reiser4/plugin/file/file.c~reiser4-adjust-to-the-new-aops
79565 +++ a/fs/reiser4/plugin/file/file.c
79566 @@ -889,36 +889,12 @@ static int capture_page_and_create_exten
79567         return result;
79568  }
79569
79570 -/* this is implementation of method commit_write of struct
79571 -   address_space_operations for unix file plugin */
79572 -int
79573 -commit_write_unix_file(struct file *file, struct page *page,
79574 -                      unsigned from, unsigned to)
79575 +/* plugin->write_end(): unlocks @page, then returns capture_page_and_create_extent(page); @file/@from/@to unused */
79576 +int write_end_unix_file(struct file *file, struct page *page,
79577 +                       unsigned from, unsigned to)
79578  {
79579 -       reiser4_context *ctx;
79580 -       struct inode *inode;
79581 -       int result;
79582 -
79583 -       assert("umka-3101", file != NULL);
79584 -       assert("umka-3102", page != NULL);
79585 -       assert("umka-3093", PageLocked(page));
79586 -
79587 -       SetPageUptodate(page);
79588 -
79589 -       inode = page->mapping->host;
79590 -       ctx = reiser4_init_context(page->mapping->host->i_sb);
79591 -       if (IS_ERR(ctx))
79592 -               return PTR_ERR(ctx);
79593 -       page_cache_get(page);
79594         unlock_page(page);
79595 -       result = capture_page_and_create_extent(page);
79596 -       lock_page(page);
79597 -       page_cache_release(page);
79598 -
79599 -       /* don't commit transaction under inode semaphore */
79600 -       context_set_commit_async(ctx);
79601 -       reiser4_exit_context(ctx);
79602 -       return result;
79603 +       return capture_page_and_create_extent(page);
79604  }
79605
79606  /*
79607 @@ -2687,32 +2663,23 @@ int delete_object_unix_file(struct inode
79608         return reiser4_delete_object_common(inode);
79609  }
79610
79611 -int
79612 -prepare_write_unix_file(struct file *file, struct page *page,
79613 -                       unsigned from, unsigned to)
79614 +/* plugin->write_begin(): under exclusive access, -EINVAL for UF_CONTAINER_TAILS files, else do_prepare_write() */
79615 +int write_begin_unix_file(struct file *file, struct page *page,
79616 +                         unsigned from, unsigned to)
79617  {
79618 -       reiser4_context *ctx;
79619 -       struct unix_file_info *uf_info;
79620         int ret;
79621 +       struct unix_file_info *info;
79622
79623 -       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
79624 -       if (IS_ERR(ctx))
79625 -               return PTR_ERR(ctx);
79626 -
79627 -       uf_info = unix_file_inode_data(file->f_dentry->d_inode);
79628 -       get_exclusive_access(uf_info);
79629 -       ret = find_file_state(file->f_dentry->d_inode, uf_info);
79630 -       if (ret == 0) {
79631 -               if (uf_info->container == UF_CONTAINER_TAILS)
79632 +       info = unix_file_inode_data(file->f_dentry->d_inode);
79633 +       get_exclusive_access(info);
79634 +       ret = find_file_state(file->f_dentry->d_inode, info); /* a failure here is returned unchanged */
79635 +       if (likely(ret == 0)) {
79636 +               if (info->container == UF_CONTAINER_TAILS)
79637                         ret = -EINVAL;
79638                 else
79639                         ret = do_prepare_write(file, page, from, to);
79640         }
79641 -       drop_exclusive_access(uf_info);
79642 -
79643 -       /* don't commit transaction under inode semaphore */
79644 -       context_set_commit_async(ctx);
79645 -       reiser4_exit_context(ctx);
79646 +       drop_exclusive_access(info);
79647         return ret;
79648  }
79649
79650 diff -puN fs/reiser4/plugin/file/file.h~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/file/file.h
79651 --- a/fs/reiser4/plugin/file/file.h~reiser4-adjust-to-the-new-aops
79652 +++ a/fs/reiser4/plugin/file/file.h
79653 @@ -59,10 +59,14 @@ int reiser4_readpage(struct file *, stru
79654  int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
79655                       unsigned);
79656  int reiser4_writepages(struct address_space *, struct writeback_control *);
79657 -int reiser4_prepare_write(struct file *, struct page *, unsigned from,
79658 -                         unsigned to);
79659 -int reiser4_commit_write(struct file *, struct page *, unsigned from,
79660 -                        unsigned to);
79661 +int reiser4_write_begin_careful(struct file *file,
79662 +                               struct address_space *mapping,
79663 +                               loff_t pos, unsigned len, unsigned flags,
79664 +                               struct page **pagep, void **fsdata);
79665 +int reiser4_write_end_careful(struct file *file,
79666 +                             struct address_space *mapping,
79667 +                             loff_t pos, unsigned len, unsigned copied,
79668 +                             struct page *page, void *fsdata);
79669  sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
79670
79671  /*
79672 @@ -87,12 +91,13 @@ int release_unix_file(struct inode *, st
79673
79674  /* private address space operations */
79675  int readpage_unix_file(struct file *, struct page *);
79676 -int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
79677 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*,
79678 +                       unsigned);
79679  int writepages_unix_file(struct address_space *, struct writeback_control *);
79680 -int prepare_write_unix_file(struct file *, struct page *, unsigned from,
79681 -                           unsigned to);
79682 -int commit_write_unix_file(struct file *, struct page *, unsigned from,
79683 -                          unsigned to);
79684 +int write_begin_unix_file(struct file *file, struct page *page,
79685 +                         unsigned from, unsigned to);
79686 +int write_end_unix_file(struct file *file, struct page *page,
79687 +                       unsigned from, unsigned to);
79688  sector_t bmap_unix_file(struct address_space *, sector_t lblock);
79689
79690  /* other private methods */
79691 @@ -129,10 +134,10 @@ int readpages_cryptcompress(struct file*
79692                             struct list_head*, unsigned);
79693  int writepages_cryptcompress(struct address_space *,
79694                              struct writeback_control *);
79695 -int prepare_write_cryptcompress(struct file *, struct page *, unsigned from,
79696 -                               unsigned to);
79697 -int commit_write_cryptcompress(struct file *, struct page *, unsigned from,
79698 -                              unsigned to);
79699 +int write_begin_cryptcompress(struct file *file, struct page *page,
79700 +                             unsigned from, unsigned to);
79701 +int write_end_cryptcompress(struct file *file, struct page *page,
79702 +                           unsigned from, unsigned to);
79703  sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
79704
79705  /* other private methods */
79706 diff -puN fs/reiser4/plugin/file/file_conversion.c~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/file/file_conversion.c
79707 --- a/fs/reiser4/plugin/file/file_conversion.c~reiser4-adjust-to-the-new-aops
79708 +++ a/fs/reiser4/plugin/file/file_conversion.c
79709 @@ -667,6 +667,89 @@ sector_t reiser4_bmap_careful(struct add
79710         return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
79711  }
79712
79713 +int reiser4_write_begin_careful(struct file *file,
79714 +                               struct address_space *mapping,
79715 +                               loff_t pos,
79716 +                               unsigned len,
79717 +                               unsigned flags, /* not used below */
79718 +                               struct page **pagep, /* out: the grabbed page (NULL if none could be grabbed) */
79719 +                               void **fsdata) /* not used below */
79720 +{
79721 +       int ret = 0;
79722 +       unsigned start, end;
79723 +       struct page *page;
79724 +       pgoff_t index;
79725 +       reiser4_context *ctx;
79726 +       struct inode * inode = file->f_dentry->d_inode; /* NOTE(review): not named below; presumably used inside PROT_PASSIVE() -- confirm */
79727 +       /* ->write_begin() wrapper: grab the page, then call the plugin's write_begin inside a reiser4 context */
79728 +       index = pos >> PAGE_CACHE_SHIFT; /* index of the page containing @pos */
79729 +       start = pos & (PAGE_CACHE_SIZE - 1); /* offset of @pos inside that page */
79730 +       end = start + len;
79731 +
79732 +       page = __grab_cache_page(mapping, index);
79733 +       *pagep = page;
79734 +       if (!page)
79735 +               return -ENOMEM;
79736 +
79737 +       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
79738 +       if (IS_ERR(ctx)) {
79739 +               ret = PTR_ERR(ctx);
79740 +               goto out;
79741 +       }
79742 +       ret = PROT_PASSIVE(int, write_begin, (file, page, start, end));
79743 +
79744 +       /* don't commit transaction under inode semaphore */
79745 +       context_set_commit_async(ctx);
79746 +       reiser4_exit_context(ctx);
79747 + out:
79748 +       if (unlikely(ret)) { /* on failure the page is unlocked and released; *pagep still points at it */
79749 +               unlock_page(page);
79750 +               page_cache_release(page);
79751 +       }
79752 +       return ret;
79753 +}
79754 +/* ->write_end() wrapper: calls the plugin's write_end in a reiser4 context; returns @copied on success, else the error */
79755 +int reiser4_write_end_careful(struct file *file,
79756 +                             struct address_space *mapping,
79757 +                             loff_t pos,
79758 +                             unsigned len,
79759 +                             unsigned copied,
79760 +                             struct page *page,
79761 +                             void *fsdata)
79762 +{
79763 +       int ret;
79764 +       reiser4_context *ctx;
79765 +       unsigned start, end;
79766 +       struct inode *inode = page->mapping->host; /* NOTE(review): @page is dereferenced here, before the NULL assert below */
79767 +
79768 +       assert("umka-3101", file != NULL);
79769 +       assert("umka-3102", page != NULL);
79770 +       assert("umka-3093", PageLocked(page));
79771 +
79772 +       start = pos & (PAGE_CACHE_SIZE - 1);
79773 +       end = start + len; /* computed from @len, not from @copied */
79774 +
79775 +       flush_dcache_page(page);
79776 +       SetPageUptodate(page); /* done unconditionally, whatever the value of @copied */
79777 +
79778 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
79779 +       if (IS_ERR(ctx)){
79780 +               unlock_page(page); /* the plugin's write_end is not called on this path, so unlock here */
79781 +               ret = PTR_ERR(ctx);
79782 +               goto out;
79783 +       }
79784 +       ret = PROT_PASSIVE(int, write_end, (file, page, start, end));
79785 +
79786 +       /* don't commit transaction under inode semaphore */
79787 +       context_set_commit_async(ctx);
79788 +       reiser4_exit_context(ctx);
79789 + out:
79790 +       page_cache_release(page); /* reference is dropped on both the success and the error path */
79791 +       if (!ret)
79792 +               ret = copied;
79793 +       return ret;
79794 +}
79795 +
79796  /*
79797   * Wrappers without protection for:
79798   *
79799 diff -puN fs/reiser4/plugin/file_ops.c~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/file_ops.c
79800 --- a/fs/reiser4/plugin/file_ops.c~reiser4-adjust-to-the-new-aops
79801 +++ a/fs/reiser4/plugin/file_ops.c
79802 @@ -95,55 +95,12 @@ int reiser4_sync_file_common(struct file
79803         return 0;
79804  }
79805
79806 -/* this is common implementation of vfs's sendfile method of struct
79807 -   file_operations
79808 -
79809 -   Reads @count bytes from @file and calls @actor for every page read. This is
79810 -   needed for loop back devices support.
79811 -*/
79812 -#if 0
79813 -ssize_t
79814 -sendfile_common(struct file *file, loff_t *ppos, size_t count,
79815 -               read_actor_t actor, void *target)
79816 -{
79817 -       reiser4_context *ctx;
79818 -       ssize_t result;
79819 -
79820 -       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
79821 -       if (IS_ERR(ctx))
79822 -               return PTR_ERR(ctx);
79823 -       result = generic_file_sendfile(file, ppos, count, actor, target);
79824 -       reiser4_exit_context(ctx);
79825 -       return result;
79826 -}
79827 -#endif  /*  0  */
79828
79829  /* address space operations */
79830
79831 -/* this is common implementation of vfs's prepare_write method of struct
79832 -   address_space_operations
79833 -*/
79834 -int
79835 -prepare_write_common(struct file *file, struct page *page, unsigned from,
79836 -                    unsigned to)
79837 -{
79838 -       reiser4_context *ctx;
79839 -       int result;
79840
79841 -       ctx = reiser4_init_context(page->mapping->host->i_sb);
79842 -       result = do_prepare_write(file, page, from, to);
79843 -
79844 -       /* don't commit transaction under inode semaphore */
79845 -       context_set_commit_async(ctx);
79846 -       reiser4_exit_context(ctx);
79847 -
79848 -       return result;
79849 -}
79850 -
79851 -/* this is helper for prepare_write_common and prepare_write_unix_file
79852 - */
79853 -int
79854 -do_prepare_write(struct file *file, struct page *page, unsigned from,
79855 +/* this is helper for plugin->write_begin() */
79856 +int do_prepare_write(struct file *file, struct page *page, unsigned from,
79857                  unsigned to)
79858  {
79859         int result;
79860 diff -puN fs/reiser4/plugin/object.c~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/object.c
79861 --- a/fs/reiser4/plugin/object.c~reiser4-adjust-to-the-new-aops
79862 +++ a/fs/reiser4/plugin/object.c
79863 @@ -114,8 +114,8 @@ static struct address_space_operations r
79864         .writepages = reiser4_writepages,
79865         .set_page_dirty = reiser4_set_page_dirty,
79866         .readpages = reiser4_readpages,
79867 -       .prepare_write = reiser4_prepare_write,
79868 -       .commit_write = reiser4_commit_write,
79869 +       .write_begin = reiser4_write_begin_careful,
79870 +       .write_end = reiser4_write_end_careful,
79871         .bmap = reiser4_bmap_careful,
79872         .invalidatepage = reiser4_invalidatepage,
79873         .releasepage = reiser4_releasepage
79874 @@ -165,8 +165,8 @@ static struct address_space_operations d
79875         .writepages = dummyop,
79876         .set_page_dirty = bugop,
79877         .readpages = bugop,
79878 -       .prepare_write = bugop,
79879 -       .commit_write = bugop,
79880 +       .write_begin = bugop,
79881 +       .write_end = bugop,
79882         .bmap = bugop,
79883         .invalidatepage = bugop,
79884         .releasepage = bugop
79885 @@ -209,8 +209,8 @@ file_plugin file_plugins[LAST_FILE_PLUGI
79886                 .readpage = readpage_unix_file,
79887                 .readpages = readpages_unix_file,
79888                 .writepages = writepages_unix_file,
79889 -               .prepare_write = prepare_write_unix_file,
79890 -               .commit_write = commit_write_unix_file,
79891 +               .write_begin = write_begin_unix_file,
79892 +               .write_end = write_end_unix_file,
79893                 /*
79894                  * private a_ops
79895                  */
79896 @@ -403,8 +403,8 @@ file_plugin file_plugins[LAST_FILE_PLUGI
79897                 .readpage = readpage_cryptcompress,
79898                 .readpages = readpages_cryptcompress,
79899                 .writepages = writepages_cryptcompress,
79900 -               .prepare_write = prepare_write_cryptcompress,
79901 -               .commit_write = commit_write_cryptcompress,
79902 +               .write_begin = write_begin_cryptcompress,
79903 +               .write_end = write_end_cryptcompress,
79904
79905                 .bmap = bmap_cryptcompress,
79906
79907 diff -puN fs/reiser4/plugin/object.h~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/object.h
79908 --- a/fs/reiser4/plugin/object.h~reiser4-adjust-to-the-new-aops
79909 +++ a/fs/reiser4/plugin/object.h
79910 @@ -36,9 +36,6 @@ int reiser4_readdir_common(struct file *
79911  int reiser4_release_dir_common(struct inode *, struct file *);
79912  int reiser4_sync_common(struct file *, struct dentry *, int datasync);
79913
79914 -/* common implementations of address space operations */
79915 -int prepare_write_common(struct file *, struct page *, unsigned from,
79916 -                        unsigned to);
79917
79918  /* file plugin operations: common implementations */
79919  int write_sd_by_inode_common(struct inode *);
79920 diff -puN fs/reiser4/plugin/plugin.h~reiser4-adjust-to-the-new-aops fs/reiser4/plugin/plugin.h
79921 --- a/fs/reiser4/plugin/plugin.h~reiser4-adjust-to-the-new-aops
79922 +++ a/fs/reiser4/plugin/plugin.h
79923 @@ -248,10 +248,10 @@ typedef struct file_plugin {
79924                           struct list_head *pages, unsigned nr_pages);
79925         int (*writepages)(struct address_space *mapping,
79926                           struct writeback_control *wbc);
79927 -       int (*prepare_write)(struct file *file, struct page *page,
79928 -                            unsigned from, unsigned to);
79929 -       int (*commit_write)(struct file *file, struct page *page,
79930 -                           unsigned from, unsigned to);
79931 +       int (*write_begin)(struct file *file, struct page *page,
79932 +                         unsigned from, unsigned to);
79933 +       int (*write_end)(struct file *file, struct page *page,
79934 +                         unsigned from, unsigned to);
79935         sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
79936         /* other private methods */
79937         /* save inode cached stat-data onto disk. It was called
79938 _
79939 From: Edward Shishkin <edward.shishkin@gmail.com>
79940
79941 Make sure that reiser4_write_begin() is not called for interruptible
79942 copies.
79943
79944 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
79945 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
79946 ---
79947
79948  fs/reiser4/plugin/file/file_conversion.c |    6 ++++++
79949  1 file changed, 6 insertions(+)
79950
79951 diff -puN fs/reiser4/plugin/file/file_conversion.c~reiser4-adjust-to-the-new-aops-fixup fs/reiser4/plugin/file/file_conversion.c
79952 --- a/fs/reiser4/plugin/file/file_conversion.c~reiser4-adjust-to-the-new-aops-fixup
79953 +++ a/fs/reiser4/plugin/file/file_conversion.c
79954 @@ -682,6 +682,12 @@ int reiser4_write_begin_careful(struct f
79955         reiser4_context *ctx;
79956         struct inode * inode = file->f_dentry->d_inode;
79957
79958 +       /**
79959 +        * reiser4_write_end() can not cope with
79960 +        * short writes for now
79961 +        */
79962 +       BUG_ON(!(flags & AOP_FLAG_UNINTERRUPTIBLE));
79963 +
79964         index = pos >> PAGE_CACHE_SHIFT;
79965         start = pos & (PAGE_CACHE_SIZE - 1);
79966         end = start + len;
79967 _
79968 From: Laurent Riffard <laurent.riffard@free.fr>
79969
79970 Reiser4 can't be built as a module when EXPORT_UNUSED_SYMBOL is not set.
79971
79972 It's broken because reiser4_write_extent is calling
79973 simple_prepare_write() and commit
79974 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 scheduled it for unexporting.
79975
79976 Signed-off-by: Laurent Riffard <laurent.riffard@free.fr>
79977 Acked-by: Edward Shishkin <edward.shishkin@gmail.com>
79978 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
79979 ---
79980
79981  fs/reiser4/plugin/item/extent_file_ops.c |    5 +++--
79982  1 file changed, 3 insertions(+), 2 deletions(-)
79983
79984 diff -puN fs/reiser4/plugin/item/extent_file_ops.c~reiser4-remove-simple_prepare_write-usage fs/reiser4/plugin/item/extent_file_ops.c
79985 --- a/fs/reiser4/plugin/item/extent_file_ops.c~reiser4-remove-simple_prepare_write-usage
79986 +++ a/fs/reiser4/plugin/item/extent_file_ops.c
79987 @@ -1059,8 +1059,9 @@ ssize_t reiser4_write_extent(struct file
79988
79989                 lock_page(page);
79990                 if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
79991 -                       simple_prepare_write(file, page, page_off,
79992 -                                            page_off + to_page);
79993 +                       zero_user_segments(page, 0, page_off,
79994 +                                          page_off + to_page,
79995 +                                          PAGE_CACHE_SIZE);
79996
79997                 written = filemap_copy_from_user(page, page_off, buf, to_page);
79998                 if (unlikely(written != to_page)) {
79999 _
80000 From: Andrew Morton <akpm@linux-foundation.org>
80001
80002 ERROR: code indent should use tabs where possible
80003 #27: FILE: fs/reiser4/plugin/item/extent_file_ops.c:1063:
80004 +^I^I^I                   page_off + to_page,$
80005
80006 ERROR: code indent should use tabs where possible
80007 #28: FILE: fs/reiser4/plugin/item/extent_file_ops.c:1064:
80008 +^I^I^I                   PAGE_CACHE_SIZE);$
80009
80010 total: 2 errors, 0 warnings, 11 lines checked
80011
80012 ./patches/reiser4-remove-simple_prepare_write-usage.patch has style problems, please review.  If any of these errors
80013 are false positives report them to the maintainer, see
80014 CHECKPATCH in MAINTAINERS.
80015
80016 Please run checkpatch prior to sending patches
80017
80018 Cc: Edward Shishkin <edward.shishkin@gmail.com>
80019 Cc: Laurent Riffard <laurent.riffard@free.fr>
80020 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80021 ---
80022
80023  fs/reiser4/plugin/item/extent_file_ops.c |    4 ++--
80024  1 file changed, 2 insertions(+), 2 deletions(-)
80025
80026 diff -puN fs/reiser4/plugin/item/extent_file_ops.c~reiser4-remove-simple_prepare_write-usage-checkpatch-fixes fs/reiser4/plugin/item/extent_file_ops.c
80027 --- a/fs/reiser4/plugin/item/extent_file_ops.c~reiser4-remove-simple_prepare_write-usage-checkpatch-fixes
80028 +++ a/fs/reiser4/plugin/item/extent_file_ops.c
80029 @@ -1060,8 +1060,8 @@ ssize_t reiser4_write_extent(struct file
80030                 lock_page(page);
80031                 if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
80032                         zero_user_segments(page, 0, page_off,
80033 -                                          page_off + to_page,
80034 -                                          PAGE_CACHE_SIZE);
80035 +                                          page_off + to_page,
80036 +                                          PAGE_CACHE_SIZE);
80037
80038                 written = filemap_copy_from_user(page, page_off, buf, to_page);
80039                 if (unlikely(written != to_page)) {
80040 _
80041 From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
80042
80043 build fix.
80044
80045 Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
80046 Cc: Nick Piggin <npiggin@suse.de>
80047 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80048 ---
80049
80050  fs/reiser4/plugin/file/file_conversion.c |    3 ++-
80051  1 file changed, 2 insertions(+), 1 deletion(-)
80052
80053 diff -puN fs/reiser4/plugin/file/file_conversion.c~fs-symlink-write_begin-allocation-context-fix-reiser4-fix fs/reiser4/plugin/file/file_conversion.c
80054 --- a/fs/reiser4/plugin/file/file_conversion.c~fs-symlink-write_begin-allocation-context-fix-reiser4-fix
80055 +++ a/fs/reiser4/plugin/file/file_conversion.c
80056 @@ -692,7 +692,8 @@ int reiser4_write_begin_careful(struct f
80057         start = pos & (PAGE_CACHE_SIZE - 1);
80058         end = start + len;
80059
80060 -       page = __grab_cache_page(mapping, index);
80061 +       page = grab_cache_page_write_begin(mapping, index,
80062 +                                          flags & AOP_FLAG_NOFS);
80063         *pagep = page;
80064         if (!page)
80065                 return -ENOMEM;
80066 _
80067 From: Edward Shishkin <edward.shishkin@gmail.com>
80068
80069 . Fix up incorrect handling of errors returned by d_obtain_alias.
80070 . Make quilt, checkpatch happy:
80071   remove comment with "joke not for everyone".
80072
80073 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
80074 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80075 ---
80076
80077  fs/reiser4/plugin/dir_plugin_common.c  |   10 +---------
80078  fs/reiser4/plugin/file_plugin_common.c |    5 +----
80079  2 files changed, 2 insertions(+), 13 deletions(-)
80080
80081 diff -puN fs/reiser4/plugin/dir_plugin_common.c~reiser4-handling-error-returned-by-d_obtain_alias-fixup fs/reiser4/plugin/dir_plugin_common.c
80082 --- a/fs/reiser4/plugin/dir_plugin_common.c~reiser4-handling-error-returned-by-d_obtain_alias-fixup
80083 +++ a/fs/reiser4/plugin/dir_plugin_common.c
80084 @@ -56,10 +56,7 @@ struct dentry *get_parent_common(struct
80085                 check_light_weight(parent, child);
80086                 reiser4_iget_complete(parent);
80087                 dentry = d_obtain_alias(parent);
80088 -               if (dentry == NULL) {
80089 -                       iput(parent);
80090 -                       dentry = ERR_PTR(RETERR(-ENOMEM));
80091 -               } else
80092 +               if (!IS_ERR(dentry))
80093                         dentry->d_op = &get_super_private(s)->ops.dentry;
80094         } else if (PTR_ERR(parent) == -ENOENT)
80095                 dentry = ERR_PTR(RETERR(-ESTALE));
80096 @@ -353,11 +350,6 @@ int reiser4_dir_done_common(struct inode
80097         result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
80098         reiser4_free_dentry_fsdata(&goodby_dots);
80099         if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
80100 -               /* only worth a warning
80101 -
80102 -                  "values of \ eB\ f will give rise to dom!\n"
80103 -                  -- v6src/s2/mv.c:89
80104 -                */
80105                 warning("nikita-2252", "Cannot remove dot of %lli: %i",
80106                         (unsigned long long)get_inode_oid(object), result);
80107         return 0;
80108 diff -puN fs/reiser4/plugin/file_plugin_common.c~reiser4-handling-error-returned-by-d_obtain_alias-fixup fs/reiser4/plugin/file_plugin_common.c
80109 --- a/fs/reiser4/plugin/file_plugin_common.c~reiser4-handling-error-returned-by-d_obtain_alias-fixup
80110 +++ a/fs/reiser4/plugin/file_plugin_common.c
80111 @@ -476,10 +476,7 @@ struct dentry *wire_get_common(struct su
80112         if (!IS_ERR(inode)) {
80113                 reiser4_iget_complete(inode);
80114                 dentry = d_obtain_alias(inode);
80115 -               if (dentry == NULL) {
80116 -                       iput(inode);
80117 -                       dentry = ERR_PTR(-ENOMEM);
80118 -               } else
80119 +               if (!IS_ERR(dentry))
80120                         dentry->d_op = &get_super_private(sb)->ops.dentry;
80121         } else if (PTR_ERR(inode) == -ENOENT)
80122                 /*
80123 _
80124 From: Edward Shishkin <edward.shishkin@gmail.com>
80125
80126 Update names of quota methods
80127
80128 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
80129 Acked-by: Jan Kara <jack@suse.cz>
80130 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80131 ---
80132
80133  fs/reiser4/plugin/file_plugin_common.c   |    4 ++--
80134  fs/reiser4/plugin/inode_ops.c            |   20 ++++++++++----------
80135  fs/reiser4/plugin/item/cde.c             |    4 ++--
80136  fs/reiser4/plugin/item/extent_file_ops.c |   10 ++++++----
80137  fs/reiser4/plugin/item/extent_item_ops.c |    2 +-
80138  fs/reiser4/plugin/item/sde.c             |    4 ++--
80139  fs/reiser4/plugin/item/tail.c            |   16 ++++++++--------
80140  7 files changed, 31 insertions(+), 29 deletions(-)
80141
80142 diff -puN fs/reiser4/plugin/file_plugin_common.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/file_plugin_common.c
80143 --- a/fs/reiser4/plugin/file_plugin_common.c~reiser4-update-names-of-quota-methods
80144 +++ a/fs/reiser4/plugin/file_plugin_common.c
80145 @@ -944,8 +944,8 @@ common_object_delete_no_reserve(struct i
80146         if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
80147                 reiser4_key sd_key;
80148
80149 -               DQUOT_FREE_INODE(inode);
80150 -               DQUOT_DROP(inode);
80151 +               vfs_dq_free_inode(inode);
80152 +               vfs_dq_drop(inode);
80153
80154                 build_sd_key(inode, &sd_key);
80155                 result =
80156 diff -puN fs/reiser4/plugin/inode_ops.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/inode_ops.c
80157 --- a/fs/reiser4/plugin/inode_ops.c~reiser4-update-names-of-quota-methods
80158 +++ a/fs/reiser4/plugin/inode_ops.c
80159 @@ -453,7 +453,7 @@ int reiser4_setattr_common(struct dentry
80160                 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
80161                     || (attr->ia_valid & ATTR_GID
80162                         && attr->ia_gid != inode->i_gid)) {
80163 -                       result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
80164 +                       result = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
80165                         if (result) {
80166                                 context_set_commit_async(ctx);
80167                                 reiser4_exit_context(ctx);
80168 @@ -593,8 +593,8 @@ static int do_create_vfs_child(reiser4_o
80169         /* So that on error iput will be called. */
80170         *retobj = object;
80171
80172 -       if (DQUOT_ALLOC_INODE(object)) {
80173 -               DQUOT_DROP(object);
80174 +       if (vfs_dq_alloc_inode(object)) {
80175 +               vfs_dq_drop(object);
80176                 object->i_flags |= S_NOQUOTA;
80177                 return RETERR(-EDQUOT);
80178         }
80179 @@ -608,7 +608,7 @@ static int do_create_vfs_child(reiser4_o
80180         if (result) {
80181                 warning("nikita-431", "Cannot install plugin %i on %llx",
80182                         data->id, (unsigned long long)get_inode_oid(object));
80183 -               DQUOT_FREE_INODE(object);
80184 +               vfs_dq_free_inode(object);
80185                 object->i_flags |= S_NOQUOTA;
80186                 return result;
80187         }
80188 @@ -617,7 +617,7 @@ static int do_create_vfs_child(reiser4_o
80189         obj_plug = inode_file_plugin(object);
80190
80191         if (obj_plug->create_object == NULL) {
80192 -               DQUOT_FREE_INODE(object);
80193 +               vfs_dq_free_inode(object);
80194                 object->i_flags |= S_NOQUOTA;
80195                 return RETERR(-EPERM);
80196         }
80197 @@ -636,7 +636,7 @@ static int do_create_vfs_child(reiser4_o
80198                 warning("nikita-432", "Cannot inherit from %llx to %llx",
80199                         (unsigned long long)get_inode_oid(parent),
80200                         (unsigned long long)get_inode_oid(object));
80201 -               DQUOT_FREE_INODE(object);
80202 +               vfs_dq_free_inode(object);
80203                 object->i_flags |= S_NOQUOTA;
80204                 return result;
80205         }
80206 @@ -652,7 +652,7 @@ static int do_create_vfs_child(reiser4_o
80207         /* obtain directory plugin (if any) for new object. */
80208         obj_dir = inode_dir_plugin(object);
80209         if (obj_dir != NULL && obj_dir->init == NULL) {
80210 -               DQUOT_FREE_INODE(object);
80211 +               vfs_dq_free_inode(object);
80212                 object->i_flags |= S_NOQUOTA;
80213                 return RETERR(-EPERM);
80214         }
80215 @@ -661,7 +661,7 @@ static int do_create_vfs_child(reiser4_o
80216
80217         reserve = estimate_create_vfs_object(parent, object);
80218         if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
80219 -               DQUOT_FREE_INODE(object);
80220 +               vfs_dq_free_inode(object);
80221                 object->i_flags |= S_NOQUOTA;
80222                 return RETERR(-ENOSPC);
80223         }
80224 @@ -692,7 +692,7 @@ static int do_create_vfs_child(reiser4_o
80225                         warning("nikita-2219",
80226                                 "Failed to create sd for %llu",
80227                                 (unsigned long long)get_inode_oid(object));
80228 -               DQUOT_FREE_INODE(object);
80229 +               vfs_dq_free_inode(object);
80230                 object->i_flags |= S_NOQUOTA;
80231                 return result;
80232         }
80233 @@ -735,7 +735,7 @@ static int do_create_vfs_child(reiser4_o
80234          */
80235         reiser4_update_sd(object);
80236         if (result != 0) {
80237 -               DQUOT_FREE_INODE(object);
80238 +               vfs_dq_free_inode(object);
80239                 object->i_flags |= S_NOQUOTA;
80240                 /* if everything was ok (result == 0), parent stat-data is
80241                  * already updated above (update_parent_dir()) */
80242 diff -puN fs/reiser4/plugin/item/cde.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/item/cde.c
80243 --- a/fs/reiser4/plugin/item/cde.c~reiser4-update-names-of-quota-methods
80244 +++ a/fs/reiser4/plugin/item/cde.c
80245 @@ -932,7 +932,7 @@ int add_entry_cde(struct inode *dir /* d
80246         data.length = estimate_cde(result ? coord : NULL, &data);
80247
80248         /* NOTE-NIKITA quota plugin? */
80249 -       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
80250 +       if (vfs_dq_alloc_space_nodirty(dir, cde_bytes(result, &data)))
80251                 return RETERR(-EDQUOT);
80252
80253         if (result)
80254 @@ -983,7 +983,7 @@ int rem_entry_cde(struct inode *dir /* d
80255             kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
80256         if (result == 0) {
80257                 /* NOTE-NIKITA quota plugin? */
80258 -               DQUOT_FREE_SPACE_NODIRTY(dir, length);
80259 +               vfs_dq_free_space_nodirty(dir, length);
80260         }
80261         return result;
80262  }
80263 diff -puN fs/reiser4/plugin/item/extent_file_ops.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/item/extent_file_ops.c
80264 --- a/fs/reiser4/plugin/item/extent_file_ops.c~reiser4-update-names-of-quota-methods
80265 +++ a/fs/reiser4/plugin/item/extent_file_ops.c
80266 @@ -260,8 +260,8 @@ static int append_last_extent(uf_coord_t
80267
80268         assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
80269
80270 -       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
80271 -                                          count);
80272 +       result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host,
80273 +                                           count);
80274         BUG_ON(result != 0);
80275
80276         switch (state_of_extent(ext)) {
80277 @@ -408,7 +408,8 @@ static int insert_first_extent(uf_coord_
80278         if (count == 0)
80279                 return 0;
80280
80281 -       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
80282 +       result = vfs_dq_alloc_block_nodirty(mapping_jnode(jnodes[0])->host,
80283 +                                           count);
80284         BUG_ON(result != 0);
80285
80286         /*
80287 @@ -622,7 +623,8 @@ static int overwrite_one_block(uf_coord_
80288                 break;
80289
80290         case HOLE_EXTENT:
80291 -               result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
80292 +               result = vfs_dq_alloc_block_nodirty(mapping_jnode(node)->host,
80293 +                                                   1);
80294                 BUG_ON(result != 0);
80295                 result = plug_hole(uf_coord, key, &how);
80296                 if (result)
80297 diff -puN fs/reiser4/plugin/item/extent_item_ops.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/item/extent_item_ops.c
80298 --- a/fs/reiser4/plugin/item/extent_item_ops.c~reiser4-update-names-of-quota-methods
80299 +++ a/fs/reiser4/plugin/item/extent_item_ops.c
80300 @@ -468,7 +468,7 @@ kill_hook_extent(const coord_t * coord,
80301                         length = to_off - offset;
80302                 }
80303
80304 -               DQUOT_FREE_BLOCK_NODIRTY(inode, length);
80305 +               vfs_dq_free_block_nodirty(inode, length);
80306
80307                 if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
80308                         /* some jnodes corresponding to this unallocated extent */
80309 diff -puN fs/reiser4/plugin/item/sde.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/item/sde.c
80310 --- a/fs/reiser4/plugin/item/sde.c~reiser4-update-names-of-quota-methods
80311 +++ a/fs/reiser4/plugin/item/sde.c
80312 @@ -120,7 +120,7 @@ int add_entry_de(struct inode *dir /* di
80313         data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
80314
80315         /* NOTE-NIKITA quota plugin */
80316 -       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
80317 +       if (vfs_dq_alloc_space_nodirty(dir, data.length))
80318                 return -EDQUOT;
80319
80320         result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
80321 @@ -168,7 +168,7 @@ int rem_entry_de(struct inode *dir /* di
80322             kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
80323         if (result == 0) {
80324                 /* NOTE-NIKITA quota plugin */
80325 -               DQUOT_FREE_SPACE_NODIRTY(dir, length);
80326 +               vfs_dq_free_space_nodirty(dir, length);
80327         }
80328         return result;
80329  }
80330 diff -puN fs/reiser4/plugin/item/tail.c~reiser4-update-names-of-quota-methods fs/reiser4/plugin/item/tail.c
80331 --- a/fs/reiser4/plugin/item/tail.c~reiser4-update-names-of-quota-methods
80332 +++ a/fs/reiser4/plugin/item/tail.c
80333 @@ -494,11 +494,11 @@ static ssize_t insert_first_tail(struct
80334                  * were real data which are all zeros. Therefore we have to
80335                  * allocate quota here as well
80336                  */
80337 -               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
80338 +               if (vfs_dq_alloc_space_nodirty(inode, flow->length))
80339                         return RETERR(-EDQUOT);
80340                 result = reiser4_insert_flow(coord, lh, flow);
80341                 if (flow->length)
80342 -                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
80343 +                       vfs_dq_free_space_nodirty(inode, flow->length);
80344
80345                 uf_info = unix_file_inode_data(inode);
80346
80347 @@ -518,13 +518,13 @@ static ssize_t insert_first_tail(struct
80348         }
80349
80350         /* check quota before appending data */
80351 -       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
80352 +       if (vfs_dq_alloc_space_nodirty(inode, flow->length))
80353                 return RETERR(-EDQUOT);
80354
80355         to_write = flow->length;
80356         result = reiser4_insert_flow(coord, lh, flow);
80357         if (flow->length)
80358 -               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
80359 +               vfs_dq_free_space_nodirty(inode, flow->length);
80360         return (to_write - flow->length) ? (to_write - flow->length) : result;
80361  }
80362
80363 @@ -553,22 +553,22 @@ static ssize_t append_tail(struct inode
80364                  * were real data which are all zeros. Therefore we have to
80365                  * allocate quota here as well
80366                  */
80367 -               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
80368 +               if (vfs_dq_alloc_space_nodirty(inode, flow->length))
80369                         return RETERR(-EDQUOT);
80370                 result = reiser4_insert_flow(coord, lh, flow);
80371                 if (flow->length)
80372 -                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
80373 +                       vfs_dq_free_space_nodirty(inode, flow->length);
80374                 return result;
80375         }
80376
80377         /* check quota before appending data */
80378 -       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
80379 +       if (vfs_dq_alloc_space_nodirty(inode, flow->length))
80380                 return RETERR(-EDQUOT);
80381
80382         to_write = flow->length;
80383         result = reiser4_insert_flow(coord, lh, flow);
80384         if (flow->length)
80385 -               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
80386 +               vfs_dq_free_space_nodirty(inode, flow->length);
80387         return (to_write - flow->length) ? (to_write - flow->length) : result;
80388  }
80389
80390 _
80391 From: Jiri Slaby <jirislaby@gmail.com>
80392
80393 Change !X & Y to !(X & Y) to avoid compiler confusion and fix a bug.
80394
80395 Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
80396 Cc: Edward Shishkin <edward.shishkin@gmail.com>
80397 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80398 ---
80399
80400  fs/reiser4/carry_ops.c |    4 ++--
80401  1 file changed, 2 insertions(+), 2 deletions(-)
80402
80403 diff -puN fs/reiser4/carry_ops.c~fs-reiser4-add-parenths-around-x-y fs/reiser4/carry_ops.c
80404 --- a/fs/reiser4/carry_ops.c~fs-reiser4-add-parenths-around-x-y
80405 +++ a/fs/reiser4/carry_ops.c
80406 @@ -79,7 +79,7 @@ static carry_node *find_left_neighbor(ca
80407         left->free = 1;
80408
80409         flags = GN_TRY_LOCK;
80410 -       if (!op->u.insert.flags & COPI_LOAD_LEFT)
80411 +       if (!(op->u.insert.flags & COPI_LOAD_LEFT))
80412                 flags |= GN_NO_ALLOC;
80413
80414         /* then, feeling lucky, peek left neighbor in the cache. */
80415 @@ -203,7 +203,7 @@ static carry_node *find_right_neighbor(c
80416         read_unlock_tree(tree);
80417
80418         flags = GN_CAN_USE_UPPER_LEVELS;
80419 -       if (!op->u.insert.flags & COPI_LOAD_RIGHT)
80420 +       if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
80421                 flags = GN_NO_ALLOC;
80422
80423         /* then, try to lock right neighbor */
80424 _
80425 From: Andrew Morton <akpm@linux-foundation.org>
80426
80427 No longer in linux-next.
80428
80429 Cc: Edward Shishkin <edward.shishkin@gmail.com>
80430 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80431 ---
80432
80433  fs/reiser4/context.c |    3 +--
80434  fs/reiser4/txnmgr.c  |    2 +-
80435  2 files changed, 2 insertions(+), 3 deletions(-)
80436
80437 diff -puN fs/reiser4/context.c~fs-reiser4-contextc-current_is_pdflush-got-removed fs/reiser4/context.c
80438 --- a/fs/reiser4/context.c~fs-reiser4-contextc-current_is_pdflush-got-removed
80439 +++ a/fs/reiser4/context.c
80440 @@ -151,8 +151,7 @@ static void balance_dirty_pages_at(reise
80441          */
80442         if (sbinfo != NULL && sbinfo->fake != NULL &&
80443             context->nr_marked_dirty != 0 &&
80444 -           !(current->flags & PF_MEMALLOC) &&
80445 -           !current_is_pdflush())
80446 +           !(current->flags & PF_MEMALLOC))
80447                 balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
80448  }
80449
80450 diff -puN fs/reiser4/txnmgr.c~fs-reiser4-contextc-current_is_pdflush-got-removed fs/reiser4/txnmgr.c
80451 --- a/fs/reiser4/txnmgr.c~fs-reiser4-contextc-current_is_pdflush-got-removed
80452 +++ a/fs/reiser4/txnmgr.c
80453 @@ -1410,7 +1410,7 @@ flush_some_atom(jnode * start, long *nr_
80454                  * Write throttling is case of no one atom can be
80455                  * flushed/committed.
80456                  */
80457 -               if (!current_is_pdflush() && !wbc->nonblocking) {
80458 +               if (!wbc->nonblocking) {
80459                         list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
80460                                 spin_lock_atom(atom);
80461                                 /* Repeat the check from the above. */
80462 _
80463 From: Andrew Morton <akpm@linux-foundation.org>
80464
80465 generic_sync_sb_inodes() changed
80466
80467 Cc: Edward Shishkin <edward.shishkin@gmail.com>
80468 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80469 ---
80470
80471  fs/reiser4/entd.c      |    2 +-
80472  fs/reiser4/super_ops.c |    2 +-
80473  2 files changed, 2 insertions(+), 2 deletions(-)
80474
80475 diff -puN fs/reiser4/entd.c~reiser4-fix fs/reiser4/entd.c
80476 --- a/fs/reiser4/entd.c~reiser4-fix
80477 +++ a/fs/reiser4/entd.c
80478 @@ -241,7 +241,7 @@ static void entd_flush(struct super_bloc
80479         if (rq->wbc->nr_to_write > 0) {
80480                 rq->wbc->range_start = 0;
80481                 rq->wbc->range_end = LLONG_MAX;
80482 -               generic_sync_sb_inodes(super, rq->wbc);
80483 +               generic_sync_sb_inodes(rq->wbc);
80484         }
80485         rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
80486         reiser4_writeout(super, rq->wbc);
80487 diff -puN fs/reiser4/super_ops.c~reiser4-fix fs/reiser4/super_ops.c
80488 --- a/fs/reiser4/super_ops.c~reiser4-fix
80489 +++ a/fs/reiser4/super_ops.c
80490 @@ -412,7 +412,7 @@ static void reiser4_sync_inodes(struct s
80491          * call reiser4_writepages for each of dirty inodes to turn dirty pages
80492          * into transactions if they were not yet.
80493          */
80494 -       generic_sync_sb_inodes(super, wbc);
80495 +       generic_sync_sb_inodes(wbc);
80496
80497         /* flush goes here */
80498         wbc->nr_to_write = to_write;
80499 _
80500 From: Edward Shishkin <edward.shishkin@gmail.com>
80501
80502 . Rename confusing "psched" to "dispatch";
80503 . Cleanups.
80504
80505 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
80506 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80507 ---
80508
80509  fs/reiser4/plugin/file/cryptcompress.c   |   27 +---
80510  fs/reiser4/plugin/file/cryptcompress.h   |    8 -
80511  fs/reiser4/plugin/file/file.c            |    6
80512  fs/reiser4/plugin/file/file.h            |   20 +--
80513  fs/reiser4/plugin/file/file_conversion.c |  136 ++++++++-------------
80514  fs/reiser4/plugin/plugin.h               |    2
80515  6 files changed, 83 insertions(+), 116 deletions(-)
80516
80517 diff -puN fs/reiser4/plugin/file/cryptcompress.c~reiser4-rename-psched-to-dispatch fs/reiser4/plugin/file/cryptcompress.c
80518 --- a/fs/reiser4/plugin/file/cryptcompress.c~reiser4-rename-psched-to-dispatch
80519 +++ a/fs/reiser4/plugin/file/cryptcompress.c
80520 @@ -2642,7 +2642,7 @@ void reset_cluster_params(struct cluster
80521  /* the heart of write_cryptcompress */
80522  static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
80523                                      const char __user *buf, size_t to_write,
80524 -                                    loff_t pos, struct psched_context *cont)
80525 +                                    loff_t pos, struct dispatch_context *cont)
80526  {
80527         int i;
80528         hint_t *hint;
80529 @@ -2683,10 +2683,8 @@ static loff_t do_write_cryptcompress(str
80530         if (next_window_stat(&win) == HOLE_WINDOW) {
80531                 /* write hole in this iteration
80532                    separated from the loop below */
80533 -               result = write_pschedule_hook(file, inode,
80534 -                                             pos,
80535 -                                             &clust,
80536 -                                             cont);
80537 +               result = write_dispatch_hook(file, inode,
80538 +                                            pos, &clust, cont);
80539                 if (result)
80540                         goto out;
80541                 result = prepare_logical_cluster(inode, pos, count, &clust,
80542 @@ -2700,14 +2698,13 @@ static loff_t do_write_cryptcompress(str
80543
80544                 assert("edward-750", reiser4_schedulable());
80545
80546 -               result = write_pschedule_hook(file, inode,
80547 -                                             pos + to_write - count,
80548 -                                             &clust,
80549 -                                             cont);
80550 +               result = write_dispatch_hook(file, inode,
80551 +                                            pos + to_write - count,
80552 +                                            &clust, cont);
80553                 if (result)
80554                         goto out;
80555 -               if (cont->state == PSCHED_ASSIGNED_NEW)
80556 -                       /* done_lh was called in write_pschedule_hook */
80557 +               if (cont->state == DISPATCH_ASSIGNED_NEW)
80558 +                       /* done_lh was called in write_dispatch_hook */
80559                         goto out_no_longterm_lock;
80560
80561                 result = prepare_logical_cluster(inode, pos, count, &clust,
80562 @@ -2787,7 +2784,7 @@ static loff_t do_write_cryptcompress(str
80563         put_cluster_handle(&clust);
80564         assert("edward-195",
80565                ergo((to_write == count),
80566 -                   (result < 0 || cont->state == PSCHED_ASSIGNED_NEW)));
80567 +                   (result < 0 || cont->state == DISPATCH_ASSIGNED_NEW)));
80568         return (to_write - count) ? (to_write - count) : result;
80569  }
80570
80571 @@ -2800,7 +2797,7 @@ static loff_t do_write_cryptcompress(str
80572   */
80573  ssize_t write_cryptcompress(struct file *file, const char __user *buf,
80574                             size_t count, loff_t *off,
80575 -                           struct psched_context *cont)
80576 +                           struct dispatch_context *cont)
80577  {
80578         ssize_t result;
80579         struct inode *inode;
80580 @@ -2808,7 +2805,7 @@ ssize_t write_cryptcompress(struct file
80581         loff_t pos = *off;
80582         struct cryptcompress_info *info;
80583
80584 -       assert("edward-1449", cont->state == PSCHED_INVAL_STATE);
80585 +       assert("edward-1449", cont->state == DISPATCH_INVAL_STATE);
80586
80587         inode = file->f_dentry->d_inode;
80588         assert("edward-196", cryptcompress_inode_ok(inode));
80589 @@ -3701,7 +3698,7 @@ int setattr_cryptcompress(struct dentry
80590                         ctx = reiser4_init_context(dentry->d_inode->i_sb);
80591                         if (IS_ERR(ctx))
80592                                 return PTR_ERR(ctx);
80593 -                       result = setattr_pschedule_hook(inode);
80594 +                       result = setattr_dispatch_hook(inode);
80595                         if (result) {
80596                                 context_set_commit_async(ctx);
80597                                 reiser4_exit_context(ctx);
80598 diff -puN fs/reiser4/plugin/file/cryptcompress.h~reiser4-rename-psched-to-dispatch fs/reiser4/plugin/file/cryptcompress.h
80599 --- a/fs/reiser4/plugin/file/cryptcompress.h~reiser4-rename-psched-to-dispatch
80600 +++ a/fs/reiser4/plugin/file/cryptcompress.h
80601 @@ -562,10 +562,10 @@ int bind_cryptcompress(struct inode *chi
80602  void destroy_inode_cryptcompress(struct inode * inode);
80603  int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
80604                       rw_op rw);
80605 -int write_pschedule_hook(struct file *file, struct inode * inode,
80606 -                        loff_t pos, struct cluster_handle * clust,
80607 -                        struct psched_context * cont);
80608 -int setattr_pschedule_hook(struct inode * inode);
80609 +int write_dispatch_hook(struct file *file, struct inode * inode,
80610 +                       loff_t pos, struct cluster_handle * clust,
80611 +                       struct dispatch_context * cont);
80612 +int setattr_dispatch_hook(struct inode * inode);
80613  struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
80614  void inherit_crypto_info_common(struct inode * parent, struct inode * object,
80615                                 int (*can_inherit)(struct inode * child,
80616 diff -puN fs/reiser4/plugin/file/file.c~reiser4-rename-psched-to-dispatch fs/reiser4/plugin/file/file.c
80617 --- a/fs/reiser4/plugin/file/file.c~reiser4-rename-psched-to-dispatch
80618 +++ a/fs/reiser4/plugin/file/file.c
80619 @@ -2059,8 +2059,10 @@ static void drop_access(struct unix_file
80620   * @cont: unused argument, as we don't perform plugin conversion when being
80621   * managed by unix_file plugin.
80622   */
80623 -ssize_t write_unix_file(struct file *file, const char __user *buf,
80624 -                       size_t count, loff_t *pos, struct psched_context *cont)
80625 +ssize_t write_unix_file(struct file *file,
80626 +                       const char __user *buf,
80627 +                       size_t count, loff_t *pos,
80628 +                       struct dispatch_context *cont)
80629  {
80630         int result;
80631         reiser4_context *ctx;
80632 diff -puN fs/reiser4/plugin/file/file.h~reiser4-rename-psched-to-dispatch fs/reiser4/plugin/file/file.h
80633 --- a/fs/reiser4/plugin/file/file.h~reiser4-rename-psched-to-dispatch
80634 +++ a/fs/reiser4/plugin/file/file.h
80635 @@ -8,18 +8,18 @@
80636  #if !defined( __REISER4_FILE_H__ )
80637  #define __REISER4_FILE_H__
80638
80639 -/* possible states when scheduling a new file plugin */
80640 +/* possible states in dispatching process */
80641  typedef enum {
80642 -       PSCHED_INVAL_STATE,    /* invalid state */
80643 -       PSCHED_SCHED_POINT,    /* scheduling point has been achieved */
80644 -       PSCHED_REMAINS_OLD,    /* made a decision to be managed by old plugin */
80645 -       PSCHED_ASSIGNED_NEW    /* new plugin has been scheduled */
80646 -} psched_state;
80647 +       DISPATCH_INVAL_STATE,  /* invalid state */
80648 +       DISPATCH_POINT,        /* dispatching point has been reached */
80649 +       DISPATCH_REMAINS_OLD,  /* decided to stay with the old plugin */
80650 +       DISPATCH_ASSIGNED_NEW  /* a new plugin has been assigned */
80651 +} dispatch_state;
80652
80653 -struct psched_context {
80654 +struct dispatch_context {
80655         int nr_pages;
80656         struct page **pages;
80657 -       psched_state state;
80658 +       dispatch_state state;
80659  };
80660
80661  /**
80662 @@ -82,7 +82,7 @@ int setattr_unix_file(struct dentry *, s
80663  ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
80664                        loff_t *off);
80665  ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
80666 -                       loff_t * off, struct psched_context * cont);
80667 +                       loff_t * off, struct dispatch_context * cont);
80668  int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
80669                     unsigned long arg);
80670  int mmap_unix_file(struct file *, struct vm_area_struct *);
80671 @@ -121,7 +121,7 @@ ssize_t read_cryptcompress(struct file *
80672                            size_t count, loff_t *off);
80673  ssize_t write_cryptcompress(struct file *, const char __user *buf,
80674                             size_t count, loff_t * off,
80675 -                           struct psched_context *cont);
80676 +                           struct dispatch_context *cont);
80677  int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
80678                         unsigned long arg);
80679  int mmap_cryptcompress(struct file *, struct vm_area_struct *);
80680 diff -puN fs/reiser4/plugin/file/file_conversion.c~reiser4-rename-psched-to-dispatch fs/reiser4/plugin/file/file_conversion.c
80681 --- a/fs/reiser4/plugin/file/file_conversion.c~reiser4-rename-psched-to-dispatch
80682 +++ a/fs/reiser4/plugin/file/file_conversion.c
80683 @@ -2,62 +2,30 @@
80684     licensing governed by reiser4/README */
80685
80686  /**
80687 - * This file contains plugin schedule hooks, and plugin conversion methods.
80688 + * This file contains dispatching hooks, and conversion methods, which
80689 + * implement transitions in the FILE interface.
80690   *
80691 - * Plugin schedule hook makes a decision (at plugin schedule point) about the
80692 - * most reasonable plugins for managing a regular file. Usually such decisions
80693 - * is made by some O(1)-heuristic.
80694 - *
80695 - * By default we assign a unix_file plugin id when writing incompressible file
80696 - * managed by cryptcompress plugin id. Currently used heuristic for estimating
80697 - * compressibility is very simple: if first complete logical cluster (64K by
80698 - * default) of a file is incompressible, then we make a decision, that the whole
80699 - * file is incompressible (*).
80700 - *
80701 - * To enable a conversion we install a special "magic" compression mode plugin
80702 - * (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for details)
80703 - * at file creation time (**).
80704 - *
80705 - * Note, that we don't perform back conversion (unix_file->cryptcompress)
80706 - * because of compatibility reasons (see http://dev.namesys.com/Version4.X.Y
80707 - * for details).
80708 - *
80709 - * The conversion is accompanied by rebuilding disk structures of a file, so it
80710 - * is important to protect them from being interacted with other plugins which
80711 - * don't expect them to be in such inconsistent state. For this to be protected
80712 - * we serialize readers and writers of a file's conversion set (FCS).
80713 - *
80714 - * We define FCS as a file plugin installed in inode's pset plus file's data
80715 - * and metadata that this file plugin manipulates with (items, etc).
80716 - * Note, that FCS is defined per file.
80717 - * FCS reader is defined as a set of instruction of the following type:
80718 - * {inode_file_plugin(inode)->method()} (I.e. retrieving a file plugin id
80719 - * conjoined with all method's instructions should be atomic).
80720 - * FCS writer is a set of instructions that perform file plugin conversion
80721 - * (convert items, update pset, etc).
80722 - * Example:
80723 - * reiser4_write_careful() supplied to VFS as a ->write() file operation is
80724 - * composed of the following (optional) instructions:
80725 - *             1              2                         3
80726 - * *********************** ####### -------------------------------------------->
80727 - *
80728 - * 1) "****" are instructions performed on behalf of cryptcompress file plugin;
80729 - * 2) "####" is a FCS writer (performing a conversion cryptcompress->unix_file);
80730 - * 3) "----" are instructions performed on behalf of unix_file plugin;
80731 - * Here (1) and (3) are FCS readers.
80732 - *
80733 - * In this example FCS readers and writers are already serialized (by design),
80734 - * however there can be readers and writers executing at the same time in
80735 - * different contexts, so we need a common mechanism of serialization.
80736 - *
80737 - * Currently serialization of FCS readers and writers is performed via acquiring
80738 - * a special per-inode rw-semaphore (conv_sem). And yes, {down, up}_read is for
80739 - * FCS readers, and  {down, up}_write is for FCS writers, see the macros below
80740 - * for passive/active protection.
80741 - *
80742 - * ---
80743 - * (*)  This heuristic can be changed to a better one (benchmarking is needed).
80744 - * (**) Such technique allows to keep enable/disable state on disk.
80745 + * Dispatching hook makes a decision (at dispatching point) about the
80746 + * most reasonable plugin. Such decision is made in accordance with some
80747 + * O(1)-heuristic.
80748 + *
80749 + * We implement a transition CRYPTCOMPRESS -> UNIX_FILE for files with
80750 + * incompressible data. Current heuristic to estimate compressibility is
80751 + * very simple: if first complete logical cluster (64K by default) of a
80752 + * file is incompressible, then we make a decision, that the whole file
80753 + * is incompressible.
80754 + *
80755 + * To enable dispatching we install a special "magic" compression mode
80756 + * plugin CONVX_COMPRESSION_MODE_ID at file creation time.
80757 + *
80758 + * Note, that we don't perform back conversion (UNIX_FILE->CRYPTCOMPRESS)
80759 + * because of compatibility reasons.
80760 + *
80761 + * At conversion time we protect CS, the conversion set (file's (meta)data
80762 + * and plugin table (pset)) via special per-inode rw-semaphore (conv_sem).
80763 + * The methods which implement conversion are CS writers. The methods of FS
80764 + * interface (file_operations, inode_operations, address_space_operations)
80765 + * are CS readers.
80766   */
80767
80768  #include "../../inode.h"
80769 @@ -212,11 +180,11 @@ static int disable_conversion(struct ino
80770  /**
80771   * Check if we really have achieved plugin scheduling point
80772   */
80773 -static int check_psched_point(struct inode * inode,
80774 -                             loff_t pos /* position in the
80775 -                                           file to write from */,
80776 -                             struct cluster_handle * clust,
80777 -                             struct psched_context * cont)
80778 +static int check_dispatch_point(struct inode * inode,
80779 +                               loff_t pos /* position in the
80780 +                                             file to write from */,
80781 +                               struct cluster_handle * clust,
80782 +                               struct dispatch_context * cont)
80783  {
80784         assert("edward-1505", conversion_enabled(inode));
80785         /*
80786 @@ -241,9 +209,9 @@ static int check_psched_point(struct ino
80787                pos == inode->i_size &&
80788                pos == inode_cluster_size(inode));
80789         assert("edward-1539", cont != NULL);
80790 -       assert("edward-1540", cont->state == PSCHED_INVAL_STATE);
80791 +       assert("edward-1540", cont->state == DISPATCH_INVAL_STATE);
80792
80793 -       cont->state = PSCHED_SCHED_POINT;
80794 +       cont->state = DISPATCH_POINT;
80795         return 0;
80796  }
80797
80798 @@ -301,14 +269,14 @@ static int prepped_dclust_ok(hint_t * hi
80799   */
80800  static int read_check_compressibility(struct inode * inode,
80801                                       struct cluster_handle * clust,
80802 -                                     struct psched_context * cont)
80803 +                                     struct dispatch_context * cont)
80804  {
80805         int i;
80806         int result;
80807         __u32 dst_len;
80808         hint_t tmp_hint;
80809         hint_t * cur_hint = clust->hint;
80810 -       assert("edward-1541", cont->state == PSCHED_SCHED_POINT);
80811 +       assert("edward-1541", cont->state == DISPATCH_POINT);
80812
80813         start_check_compressibility(inode, clust, &tmp_hint);
80814
80815 @@ -373,8 +341,8 @@ static int read_check_compressibility(st
80816         finish_check_compressibility(inode, clust, cur_hint);
80817         cont->state =
80818                 (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
80819 -                PSCHED_REMAINS_OLD :
80820 -                PSCHED_ASSIGNED_NEW);
80821 +                DISPATCH_REMAINS_OLD :
80822 +                DISPATCH_ASSIGNED_NEW);
80823         return 0;
80824   error:
80825         put_page_cluster(clust, inode, READ_OP);
80826 @@ -433,8 +401,8 @@ static int reserve_cryptcompress2unixfil
80827  /**
80828   * Convert cryptcompress file plugin to unix_file plugin.
80829   */
80830 -static int cryptcompress2unixfile(struct file * file, struct inode * inode,
80831 -                                 struct psched_context * cont)
80832 +static int cryptcompress2unixfile(struct file *file, struct inode *inode,
80833 +                                 struct dispatch_context *cont)
80834  {
80835         int i;
80836         int result = 0;
80837 @@ -490,28 +458,28 @@ static int cryptcompress2unixfile(struct
80838   * Make a decision about the most reasonable file plugin id to manage
80839   * the file.
80840   */
80841 -int write_pschedule_hook(struct file * file, struct inode * inode,
80842 -                        loff_t pos, struct cluster_handle * clust,
80843 -                        struct psched_context * cont)
80844 +int write_dispatch_hook(struct file *file, struct inode *inode,
80845 +                       loff_t pos, struct cluster_handle *clust,
80846 +                       struct dispatch_context *cont)
80847  {
80848         int result;
80849         if (!conversion_enabled(inode))
80850                 return 0;
80851 -       result = check_psched_point(inode, pos, clust, cont);
80852 -       if (result || cont->state != PSCHED_SCHED_POINT)
80853 +       result = check_dispatch_point(inode, pos, clust, cont);
80854 +       if (result || cont->state != DISPATCH_POINT)
80855                 return result;
80856         result = read_check_compressibility(inode, clust, cont);
80857         if (result)
80858                 return result;
80859 -       if (cont->state == PSCHED_REMAINS_OLD) {
80860 +       if (cont->state == DISPATCH_REMAINS_OLD) {
80861                 put_page_cluster(clust, inode, READ_OP);
80862                 return disable_conversion(inode);
80863         }
80864 -       assert("edward-1543", cont->state == PSCHED_ASSIGNED_NEW);
80865 +       assert("edward-1543", cont->state == DISPATCH_ASSIGNED_NEW);
80866         /*
80867          * page cluster is grabbed and uptodate. It will be
80868          * released with a pgset after plugin conversion is
80869 -        * finished, see put_psched_context().
80870 +        * finished, see put_dispatch_context().
80871          */
80872         reiser4_unset_hint(clust->hint);
80873         move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
80874 @@ -521,20 +489,20 @@ int write_pschedule_hook(struct file * f
80875  /**
80876   * This is called by ->setattr() method of cryptcompress file plugin.
80877   */
80878 -int setattr_pschedule_hook(struct inode * inode)
80879 +int setattr_dispatch_hook(struct inode * inode)
80880  {
80881         if (conversion_enabled(inode))
80882                 return disable_conversion(inode);
80883         return 0;
80884  }
80885
80886 -static inline void init_psched_context(struct psched_context * cont)
80887 +static inline void init_dispatch_context(struct dispatch_context * cont)
80888  {
80889         memset(cont, 0, sizeof(*cont));
80890  }
80891
80892 -static inline void done_psched_context(struct psched_context * cont,
80893 -                                      struct inode * inode)
80894 +static inline void done_dispatch_context(struct dispatch_context * cont,
80895 +                                        struct inode * inode)
80896  {
80897         if (cont->pages) {
80898                 __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
80899 @@ -564,13 +532,13 @@ ssize_t reiser4_write_careful(struct fil
80900         reiser4_context *ctx;
80901         ssize_t written_old = 0; /* bytes written with initial plugin */
80902         ssize_t written_new = 0; /* bytes written with new plugin */
80903 -       struct psched_context cont;
80904 +       struct dispatch_context cont;
80905         struct inode * inode = file->f_dentry->d_inode;
80906
80907         ctx = reiser4_init_context(inode->i_sb);
80908         if (IS_ERR(ctx))
80909                 return PTR_ERR(ctx);
80910 -       init_psched_context(&cont);
80911 +       init_dispatch_context(&cont);
80912         mutex_lock(&inode->i_mutex);
80913         /**
80914          * First step.
80915 @@ -582,7 +550,7 @@ ssize_t reiser4_write_careful(struct fil
80916                                                       count,
80917                                                       off,
80918                                                       &cont);
80919 -       if (cont.state != PSCHED_ASSIGNED_NEW || written_old < 0)
80920 +       if (cont.state != DISPATCH_ASSIGNED_NEW || written_old < 0)
80921                 goto exit;
80922         /**
80923          * Second step.
80924 @@ -616,7 +584,7 @@ ssize_t reiser4_write_careful(struct fil
80925                                                       NULL);
80926   exit:
80927         mutex_unlock(&inode->i_mutex);
80928 -       done_psched_context(&cont, inode);
80929 +       done_dispatch_context(&cont, inode);
80930         reiser4_exit_context(ctx);
80931
80932         return written_old + (written_new < 0 ? 0 : written_new);
80933 diff -puN fs/reiser4/plugin/plugin.h~reiser4-rename-psched-to-dispatch fs/reiser4/plugin/plugin.h
80934 --- a/fs/reiser4/plugin/plugin.h~reiser4-rename-psched-to-dispatch
80935 +++ a/fs/reiser4/plugin/plugin.h
80936 @@ -235,7 +235,7 @@ typedef struct file_plugin {
80937          * in @cont */
80938         ssize_t (*write) (struct file *, const char __user *buf,
80939                           size_t write_amount, loff_t * off,
80940 -                         struct psched_context * cont);
80941 +                         struct dispatch_context * cont);
80942         int (*ioctl) (struct inode *inode, struct file *filp,
80943                       unsigned int cmd, unsigned long arg);
80944         int (*mmap) (struct file *, struct vm_area_struct *);
80945 _
80946 From: Edward Shishkin <edward.shishkin@gmail.com>
80947
80948 1. Fix up the problem:
80949 Reiser4 steps into the journal code of other journalling file systems:
80950
80951 EXT3 complains: "called recursively, non-PF_MEMALLOC"
80952 Call Trace:
80953  [<c0535873>] ext3_write_inode+0x1e/0x3a
80954  [<c04b16bc>] __writeback_single_inode+0x193/0x2ad
80955  [<c0508d86>] ? flush_some_atom+0x427/0x44d
80956  [<c04b1bb5>] ? generic_sync_sb_inodes+0x27c/0x338
80957  [<c04b1b68>] generic_sync_sb_inodes+0x22f/0x338
80958  [<c04b1c8e>] sync_sb_inodes+0x1d/0x20
80959  [<c04b1e1e>] writeback_inodes+0x79/0xb9
80960  [<c047c76a>] balance_dirty_pages_ratelimited_nr+0x119/0x21d
80961  [<c0504803>] reiser4_exit_context+0x5f/0xf6
80962  [<c051e5a3>] reiser4_write_careful+0x3ba/0x3cc
80963  [<c049bc3c>] ? do_sync_read+0xab/0xe9
80964  [<c0613dae>] ? selinux_file_permission+0x44/0x48
80965  [<c060fea6>] ? security_file_permission+0xf/0x11
80966  [<c051e1e9>] ? reiser4_write_careful+0x0/0x3cc
80967  [<c049c494>] vfs_write+0x84/0xdf
80968  [<c049c588>] sys_write+0x3b/0x60
80969  [<c0403178>] sysenter_do_call+0x12/0x2d
80970
80971 Solution:
80972 Drop current->journal_info before calling
80973 balance_dirty_pages_ratelimited().
80974
80975 2. Update comments in plugin_header.h
80976
80977 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
80978 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
80979 ---
80980
80981  fs/reiser4/context.c                     |   13 +++---
80982  fs/reiser4/plugin/file/cryptcompress.c   |    3 -
80983  fs/reiser4/plugin/file/file.c            |    9 +---
80984  fs/reiser4/plugin/file/tail_conversion.c |   14 +++++--
80985  fs/reiser4/plugin/item/tail.h            |    2 -
80986  fs/reiser4/plugin/plugin_header.h        |   42 ++++++++-------------
80987  fs/reiser4/txnmgr.c                      |    3 +
80988  fs/reiser4/vfs_ops.c                     |   12 ++++--
80989  fs/reiser4/vfs_ops.h                     |    2 -
80990  9 files changed, 49 insertions(+), 51 deletions(-)
80991
80992 diff -puN fs/reiser4/context.c~reiser4-drop-journal-info fs/reiser4/context.c
80993 --- a/fs/reiser4/context.c~reiser4-drop-journal-info
80994 +++ a/fs/reiser4/context.c
80995 @@ -37,8 +37,8 @@
80996  #include "debug.h"
80997  #include "super.h"
80998  #include "context.h"
80999 +#include "vfs_ops.h"   /* for reiser4_throttle_write() */
81000
81001 -#include <linux/writeback.h>   /* balance_dirty_pages() */
81002  #include <linux/hardirq.h>
81003
81004  static void _reiser4_init_context(reiser4_context * context,
81005 @@ -139,7 +139,7 @@ int is_in_reiser4_context(void)
81006   * because some important lock (like ->i_mutex on the parent directory) is
81007   * held. To achieve this, ->nobalance flag can be set in the current context.
81008   */
81009 -static void balance_dirty_pages_at(reiser4_context *context)
81010 +static void reiser4_throttle_write_at(reiser4_context *context)
81011  {
81012         reiser4_super_info_data *sbinfo = get_super_private(context->super);
81013
81014 @@ -152,7 +152,8 @@ static void balance_dirty_pages_at(reise
81015         if (sbinfo != NULL && sbinfo->fake != NULL &&
81016             context->nr_marked_dirty != 0 &&
81017             !(current->flags & PF_MEMALLOC))
81018 -               balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
81019 +               /* FIXME-EDWARD: throttle with nr_marked_dirty? */
81020 +               reiser4_throttle_write(sbinfo->fake, 1);
81021  }
81022
81023  /* release resources associated with context.
81024 @@ -225,10 +226,8 @@ void reiser4_exit_context(reiser4_contex
81025         assert("nikita-3021", reiser4_schedulable());
81026
81027         if (context->nr_children == 0) {
81028 -               if (!context->nobalance) {
81029 -                       reiser4_txn_restart(context);
81030 -                       balance_dirty_pages_at(context);
81031 -               }
81032 +               if (!context->nobalance)
81033 +                       reiser4_throttle_write_at(context);
81034
81035                 /* if filesystem is mounted with -o sync or -o dirsync - commit
81036                    transaction.  FIXME: TXNH_DONT_COMMIT is used to avoid
81037 diff -puN fs/reiser4/plugin/file/cryptcompress.c~reiser4-drop-journal-info fs/reiser4/plugin/file/cryptcompress.c
81038 --- a/fs/reiser4/plugin/file/cryptcompress.c~reiser4-drop-journal-info
81039 +++ a/fs/reiser4/plugin/file/cryptcompress.c
81040 @@ -1973,8 +1973,7 @@ static int balance_dirty_page_cluster(st
81041         info = cryptcompress_inode_data(inode);
81042
81043         mutex_unlock(&info->checkin_mutex);
81044 -       reiser4_txn_restart_current();
81045 -       balance_dirty_pages_ratelimited_nr(inode->i_mapping, nr_dirtied);
81046 +       reiser4_throttle_write(inode, nr_dirtied);
81047         mutex_lock(&info->checkin_mutex);
81048         return 0;
81049  }
81050 diff -puN fs/reiser4/plugin/file/file.c~reiser4-drop-journal-info fs/reiser4/plugin/file/file.c
81051 --- a/fs/reiser4/plugin/file/file.c~reiser4-drop-journal-info
81052 +++ a/fs/reiser4/plugin/file/file.c
81053 @@ -2227,16 +2227,13 @@ ssize_t write_unix_file(struct file *fil
81054                 }
81055                 drop_access(uf_info);
81056                 ea = NEITHER_OBTAINED;
81057 -               reiser4_txn_restart(ctx);
81058 -               current->journal_info = NULL;
81059 +
81060                 /*
81061                  * tell VM how many pages were dirtied. Maybe number of pages
81062                  * which were dirty already should not be counted
81063                  */
81064 -               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
81065 -                                                  (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
81066 -               current->journal_info = ctx;
81067 -
81068 +               reiser4_throttle_write(inode,
81069 +                            (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
81070                 left -= written;
81071                 buf += written;
81072                 *pos += written;
81073 diff -puN fs/reiser4/plugin/file/tail_conversion.c~reiser4-drop-journal-info fs/reiser4/plugin/file/tail_conversion.c
81074 --- a/fs/reiser4/plugin/file/tail_conversion.c~reiser4-drop-journal-info
81075 +++ a/fs/reiser4/plugin/file/tail_conversion.c
81076 @@ -486,8 +486,10 @@ int tail2extent(struct unix_file_info *u
81077                          * on partially converted files.
81078                          */
81079                         drop_exclusive_access(uf_info);
81080 -                       /* throttle the conversion */
81081 -                       reiser4_throttle_write(inode);
81082 +                       /* throttle the conversion
81083 +                          FIXME-EDWARD: Pass the precise number of pages
81084 +                          that were dirtied */
81085 +                       reiser4_throttle_write(inode, 1);
81086                         get_exclusive_access(uf_info);
81087
81088                         /*
81089 @@ -685,8 +687,12 @@ int extent2tail(struct file * file, stru
81090                 page_cache_release(page);
81091
81092                 drop_exclusive_access(uf_info);
81093 -               /* throttle the conversion */
81094 -               reiser4_throttle_write(inode);
81095 +               /*
81096 +                * throttle the conversion.
81097 +                * FIXME-EDWARD: Calculate and pass the precise number
81098 +                * of pages that were dirtied
81099 +                */
81100 +               reiser4_throttle_write(inode, 1);
81101                 get_exclusive_access(uf_info);
81102                 /*
81103                  * nobody is allowed to complete conversion but a process which
81104 diff -puN fs/reiser4/plugin/item/tail.h~reiser4-drop-journal-info fs/reiser4/plugin/item/tail.h
81105 --- a/fs/reiser4/plugin/item/tail.h~reiser4-drop-journal-info
81106 +++ a/fs/reiser4/plugin/item/tail.h
81107 @@ -40,8 +40,6 @@ int readpage_tail(void *vp, struct page
81108  reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
81109  void init_coord_extension_tail(uf_coord_t *, loff_t offset);
81110  int get_block_address_tail(const coord_t *, sector_t, sector_t *);
81111 -int item_balance_dirty_pages(struct address_space *, const flow_t *,
81112 -                            hint_t *, int back_to_dirty, int set_hint);
81113
81114  /* __REISER4_TAIL_H__ */
81115  #endif
81116 diff -puN fs/reiser4/plugin/plugin_header.h~reiser4-drop-journal-info fs/reiser4/plugin/plugin_header.h
81117 --- a/fs/reiser4/plugin/plugin_header.h~reiser4-drop-journal-info
81118 +++ a/fs/reiser4/plugin/plugin_header.h
81119 @@ -10,32 +10,24 @@
81120  #include "../debug.h"
81121  #include "../dformat.h"
81122
81123 -/* Every plugin type can be considered as a class of virtual objects
81124 -   {(type, i) | i = 0, 1, ...}, which has one the following categories
81125 -   of virtualization:
81126 -   A - no virtualization;
81127 -   F - per-file virtualization;
81128 -   S - per-superblock virtualization;
81129 -   FIXME-EDWARD: Define every such category */
81130 -
81131 -/* Supported plugin types: (id, (virtualization category), short description) */
81132 +/* The list of Reiser4 interfaces */
81133  typedef enum {
81134 -       REISER4_FILE_PLUGIN_TYPE,             /* (F) service VFS enry-points */
81135 -       REISER4_DIR_PLUGIN_TYPE,              /* (F) service VFS enry-points */
81136 -       REISER4_ITEM_PLUGIN_TYPE,             /* (F) manage items */
81137 -       REISER4_NODE_PLUGIN_TYPE,             /* (S) manage formatted nodes */
81138 -       REISER4_HASH_PLUGIN_TYPE,             /* (F) compute hash */
81139 -       REISER4_FIBRATION_PLUGIN_TYPE,        /* (F) directory fibrations */
81140 -       REISER4_FORMATTING_PLUGIN_TYPE,       /* (F) tail-packing policy */
81141 -       REISER4_PERM_PLUGIN_TYPE,             /*       stub (vacancy)     */
81142 -       REISER4_SD_EXT_PLUGIN_TYPE,           /* (A) stat-data extensions */
81143 -       REISER4_FORMAT_PLUGIN_TYPE,           /* (S) specify disk format */
81144 -       REISER4_JNODE_PLUGIN_TYPE,            /* (A) in-memory node headers */
81145 -       REISER4_CIPHER_PLUGIN_TYPE,           /* (F) cipher transform algs */
81146 -       REISER4_DIGEST_PLUGIN_TYPE,           /* (F) digest transform algs */
81147 -       REISER4_COMPRESSION_PLUGIN_TYPE,      /* (F) compression tfm algs */
81148 -       REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
81149 -       REISER4_CLUSTER_PLUGIN_TYPE,          /* (F) size of logical cluster */
81150 +       REISER4_FILE_PLUGIN_TYPE,             /* manage VFS objects */
81151 +       REISER4_DIR_PLUGIN_TYPE,              /* manage directories */
81152 +       REISER4_ITEM_PLUGIN_TYPE,             /* manage items */
81153 +       REISER4_NODE_PLUGIN_TYPE,             /* manage formatted nodes */
81154 +       REISER4_HASH_PLUGIN_TYPE,             /* hash methods */
81155 +       REISER4_FIBRATION_PLUGIN_TYPE,        /* directory fibrations */
81156 +       REISER4_FORMATTING_PLUGIN_TYPE,       /* dispatching policy */
81157 +       REISER4_PERM_PLUGIN_TYPE,             /* stub (vacancy) */
81158 +       REISER4_SD_EXT_PLUGIN_TYPE,           /* manage stat-data extensions */
81159 +       REISER4_FORMAT_PLUGIN_TYPE,           /* disk format specifications */
81160 +       REISER4_JNODE_PLUGIN_TYPE,            /* manage in-memory headers */
81161 +       REISER4_CIPHER_PLUGIN_TYPE,           /* cipher transform methods */
81162 +       REISER4_DIGEST_PLUGIN_TYPE,           /* digest transform methods */
81163 +       REISER4_COMPRESSION_PLUGIN_TYPE,      /* compression methods */
81164 +       REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* dispatching policies */
81165 +       REISER4_CLUSTER_PLUGIN_TYPE,          /* manage logical clusters */
81166         REISER4_PLUGIN_TYPES
81167  } reiser4_plugin_type;
81168
81169 diff -puN fs/reiser4/txnmgr.c~reiser4-drop-journal-info fs/reiser4/txnmgr.c
81170 --- a/fs/reiser4/txnmgr.c~reiser4-drop-journal-info
81171 +++ a/fs/reiser4/txnmgr.c
81172 @@ -2328,7 +2328,8 @@ static void do_jnode_make_dirty(jnode *
81173
81174         JF_SET(node, JNODE_DIRTY);
81175
81176 -       get_current_context()->nr_marked_dirty++;
81177 +       if (!JF_ISSET(node, JNODE_CLUSTER_PAGE))
81178 +               get_current_context()->nr_marked_dirty++;
81179
81180         /* We grab2flush_reserve one additional block only if node was
81181            not CREATED and jnode_flush did not sort it into neither
81182 diff -puN fs/reiser4/vfs_ops.c~reiser4-drop-journal-info fs/reiser4/vfs_ops.c
81183 --- a/fs/reiser4/vfs_ops.c~reiser4-drop-journal-info
81184 +++ a/fs/reiser4/vfs_ops.c
81185 @@ -201,10 +201,16 @@ void reiser4_writeout(struct super_block
81186         } while (wbc->nr_to_write > 0);
81187  }
81188
81189 -void reiser4_throttle_write(struct inode *inode)
81190 +/* tell VM how many pages were dirtied */
81191 +void reiser4_throttle_write(struct inode *inode, int nrpages)
81192  {
81193 -       reiser4_txn_restart_current();
81194 -       balance_dirty_pages_ratelimited(inode->i_mapping);
81195 +       reiser4_context *ctx;
81196 +
81197 +       ctx = get_current_context();
81198 +       reiser4_txn_restart(ctx);
81199 +       current->journal_info = NULL;
81200 +       balance_dirty_pages_ratelimited_nr(inode->i_mapping, nrpages);
81201 +       current->journal_info = ctx;
81202  }
81203
81204  const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
81205 diff -puN fs/reiser4/vfs_ops.h~reiser4-drop-journal-info fs/reiser4/vfs_ops.h
81206 --- a/fs/reiser4/vfs_ops.h~reiser4-drop-journal-info
81207 +++ a/fs/reiser4/vfs_ops.h
81208 @@ -30,7 +30,7 @@ extern int reiser4_add_nlink(struct inod
81209  extern int reiser4_del_nlink(struct inode *, struct inode *, int);
81210
81211  extern int reiser4_start_up_io(struct page *page);
81212 -extern void reiser4_throttle_write(struct inode *);
81213 +extern void reiser4_throttle_write(struct inode *, int nrpages);
81214  extern int jnode_is_releasable(jnode *);
81215
81216  #define CAPTURE_APAGE_BURST (1024l)
81217 _
81218 From: Edward Shishkin <edward.shishkin@gmail.com>
81219
81220 This is a multi-part message in MIME format.
81221 --------------020703030905060904070706
81222 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
81223 Content-Transfer-Encoding: 7bit
81224
81225 --------------020703030905060904070706
81226 Content-Type: text/plain;
81227  name="reiser4-fix-compile-warnings.patch"
81228 Content-Transfer-Encoding: 7bit
81229 Content-Disposition: inline;
81230  filename="reiser4-fix-compile-warnings.patch"
81231
81232 Fix compile warnings.
81233
81234 Prepared by: Brandon Berhent <cheater1034@gmail.com>
81235 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
81236 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81237 ---
81238
81239  fs/reiser4/plugin/compress/compress.c    |   16 ++++++++--------
81240  fs/reiser4/plugin/file/cryptcompress.c   |    4 ++--
81241  fs/reiser4/plugin/file/file_conversion.c |    2 +-
81242  fs/reiser4/plugin/item/item.h            |    4 ++--
81243  fs/reiser4/plugin/plugin.h               |    8 ++++----
81244  5 files changed, 17 insertions(+), 17 deletions(-)
81245
81246 diff -puN fs/reiser4/plugin/compress/compress.c~reiser4-fix-compile-warnings fs/reiser4/plugin/compress/compress.c
81247 --- a/fs/reiser4/plugin/compress/compress.c~reiser4-fix-compile-warnings
81248 +++ a/fs/reiser4/plugin/compress/compress.c
81249 @@ -122,8 +122,8 @@ static int gzip1_min_size_deflate(void)
81250  }
81251
81252  static void
81253 -gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
81254 -              __u8 * dst_first, unsigned *dst_len)
81255 +gzip1_compress(coa_t coa, __u8 * src_first, size_t src_len,
81256 +              __u8 * dst_first, size_t *dst_len)
81257  {
81258  #if REISER4_ZLIB
81259         int ret = 0;
81260 @@ -166,8 +166,8 @@ gzip1_compress(coa_t coa, __u8 * src_fir
81261  }
81262
81263  static void
81264 -gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
81265 -                __u8 * dst_first, unsigned *dst_len)
81266 +gzip1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
81267 +                __u8 * dst_first, size_t *dst_len)
81268  {
81269  #if REISER4_ZLIB
81270         int ret = 0;
81271 @@ -278,8 +278,8 @@ static int lzo1_min_size_deflate(void)
81272  }
81273
81274  static void
81275 -lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
81276 -             __u8 * dst_first, unsigned *dst_len)
81277 +lzo1_compress(coa_t coa, __u8 * src_first, size_t src_len,
81278 +             __u8 * dst_first, size_t *dst_len)
81279  {
81280         int result;
81281
81282 @@ -302,8 +302,8 @@ lzo1_compress(coa_t coa, __u8 * src_firs
81283  }
81284
81285  static void
81286 -lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
81287 -               __u8 * dst_first, unsigned *dst_len)
81288 +lzo1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
81289 +               __u8 * dst_first, size_t *dst_len)
81290  {
81291         int result;
81292
81293 diff -puN fs/reiser4/plugin/file/cryptcompress.c~reiser4-fix-compile-warnings fs/reiser4/plugin/file/cryptcompress.c
81294 --- a/fs/reiser4/plugin/file/cryptcompress.c~reiser4-fix-compile-warnings
81295 +++ a/fs/reiser4/plugin/file/cryptcompress.c
81296 @@ -1014,7 +1014,7 @@ int reiser4_deflate_cluster(struct clust
81297         coplug = inode_compression_plugin(inode);
81298         if (should_compress(tc, clust->index, inode)) {
81299                 /* try to compress, discard bad results */
81300 -               __u32 dst_len;
81301 +               size_t dst_len;
81302                 compression_mode_plugin * mplug =
81303                         inode_compression_mode_plugin(inode);
81304                 assert("edward-602", coplug != NULL);
81305 @@ -1164,7 +1164,7 @@ int reiser4_inflate_cluster(struct clust
81306                 transformed = 1;
81307         }
81308         if (need_inflate(clust, inode, 0)) {
81309 -               unsigned dst_len = inode_cluster_size(inode);
81310 +               size_t dst_len = inode_cluster_size(inode);
81311                 if(transformed)
81312                         alternate_streams(tc);
81313
81314 diff -puN fs/reiser4/plugin/file/file_conversion.c~reiser4-fix-compile-warnings fs/reiser4/plugin/file/file_conversion.c
81315 --- a/fs/reiser4/plugin/file/file_conversion.c~reiser4-fix-compile-warnings
81316 +++ a/fs/reiser4/plugin/file/file_conversion.c
81317 @@ -273,7 +273,7 @@ static int read_check_compressibility(st
81318  {
81319         int i;
81320         int result;
81321 -       __u32 dst_len;
81322 +       size_t dst_len;
81323         hint_t tmp_hint;
81324         hint_t * cur_hint = clust->hint;
81325         assert("edward-1541", cont->state == DISPATCH_POINT);
81326 diff -puN fs/reiser4/plugin/item/item.h~reiser4-fix-compile-warnings fs/reiser4/plugin/item/item.h
81327 --- a/fs/reiser4/plugin/item/item.h~reiser4-fix-compile-warnings
81328 +++ a/fs/reiser4/plugin/item/item.h
81329 @@ -233,8 +233,8 @@ struct dir_entry_iops {
81330
81331  /* operations specific to items regular (unix) file metadata are built of */
81332  struct file_iops{
81333 -       int (*write) (struct file *, struct inode *,
81334 -                     const char __user *, size_t, loff_t *pos);
81335 +       ssize_t (*write) (struct file *, struct inode *,
81336 +                         const char __user *, size_t, loff_t *pos);
81337         int (*read) (struct file *, flow_t *, hint_t *);
81338         int (*readpage) (void *, struct page *);
81339         int (*get_block) (const coord_t *, sector_t, sector_t *);
81340 diff -puN fs/reiser4/plugin/plugin.h~reiser4-fix-compile-warnings fs/reiser4/plugin/plugin.h
81341 --- a/fs/reiser4/plugin/plugin.h~reiser4-fix-compile-warnings
81342 +++ a/fs/reiser4/plugin/plugin.h
81343 @@ -560,10 +560,10 @@ typedef struct compression_plugin {
81344         int (*min_size_deflate) (void);
81345          __u32(*checksum) (char *data, __u32 length);
81346         /* main transform procedures */
81347 -       void (*compress) (coa_t coa, __u8 *src_first, unsigned src_len,
81348 -                         __u8 *dst_first, unsigned *dst_len);
81349 -       void (*decompress) (coa_t coa, __u8 *src_first, unsigned src_len,
81350 -                           __u8 *dst_first, unsigned *dst_len);
81351 +       void (*compress) (coa_t coa, __u8 *src_first, size_t src_len,
81352 +                         __u8 *dst_first, size_t *dst_len);
81353 +       void (*decompress) (coa_t coa, __u8 *src_first, size_t src_len,
81354 +                           __u8 *dst_first, size_t *dst_len);
81355  } compression_plugin;
81356
81357  typedef struct compression_mode_plugin {
81358 _
81359 From: Edward Shishkin <edward.shishkin@gmail.com>
81360
81361 Address a gcc warning on x86_64 about a large frame size in
81362 reiser4_init_super_data(): add a noinline helper, push_sb_field_opts().
81363
81364 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
81365
81366 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81367 ---
81368
81369  fs/reiser4/init_super.c |  126 +++++++++++++++++++-------------------
81370  1 file changed, 66 insertions(+), 60 deletions(-)
81371
81372 diff -puN fs/reiser4/init_super.c~reiser4-reduce-frame-size-of-reiser4_init_super_data fs/reiser4/init_super.c
81373 --- a/fs/reiser4/init_super.c~reiser4-reduce-frame-size-of-reiser4_init_super_data
81374 +++ a/fs/reiser4/init_super.c
81375 @@ -292,66 +292,6 @@ static int parse_options(char *opt_strin
81376
81377  #define MAX_NR_OPTIONS (30)
81378
81379 -/**
81380 - * reiser4_init_super_data - initialize reiser4 private super block
81381 - * @super: super block to initialize
81382 - * @opt_string: list of reiser4 mount options
81383 - *
81384 - * Sets various reiser4 parameters to default values. Parses mount options and
81385 - * overwrites default settings.
81386 - */
81387 -int reiser4_init_super_data(struct super_block *super, char *opt_string)
81388 -{
81389 -       int result;
81390 -       struct opt_desc *opts, *p;
81391 -       reiser4_super_info_data *sbinfo = get_super_private(super);
81392 -
81393 -       /* initialize super, export, dentry operations */
81394 -       sbinfo->ops.super = reiser4_super_operations;
81395 -       sbinfo->ops.export = reiser4_export_operations;
81396 -       sbinfo->ops.dentry = reiser4_dentry_operations;
81397 -       super->s_op = &sbinfo->ops.super;
81398 -       super->s_export_op = &sbinfo->ops.export;
81399 -
81400 -       /* initialize transaction manager parameters to default values */
81401 -       sbinfo->tmgr.atom_max_size = totalram_pages / 4;
81402 -       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
81403 -       sbinfo->tmgr.atom_min_size = 256;
81404 -       sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
81405 -
81406 -       /* initialize cbk cache parameter */
81407 -       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
81408 -
81409 -       /* initialize flush parameters */
81410 -       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
81411 -       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
81412 -       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
81413 -       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
81414 -
81415 -       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
81416 -
81417 -       /* preliminary tree initializations */
81418 -       sbinfo->tree.super = super;
81419 -       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
81420 -       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
81421 -       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
81422 -       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
81423 -       rwlock_init(&(sbinfo->tree.tree_lock));
81424 -       spin_lock_init(&(sbinfo->tree.epoch_lock));
81425 -
81426 -       /* initialize default readahead params */
81427 -       sbinfo->ra_params.max = num_physpages / 4;
81428 -       sbinfo->ra_params.flags = 0;
81429 -
81430 -       /* allocate memory for structure describing reiser4 mount options */
81431 -       opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
81432 -                      reiser4_ctx_gfp_mask_get());
81433 -       if (opts == NULL)
81434 -               return RETERR(-ENOMEM);
81435 -
81436 -       /* initialize structure describing reiser4 mount options */
81437 -       p = opts;
81438 -
81439  #if REISER4_DEBUG
81440  #  define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) {         \
81441                 warning("zam-1046", "opt array is overloaded"); break;  \
81442 @@ -370,6 +310,10 @@ do {                                               \
81443  #define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
81444  #define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
81445
81446 +static noinline void push_sb_field_opts(struct opt_desc *p,
81447 +                                       struct opt_desc *opts,
81448 +                                       reiser4_super_info_data *sbinfo)
81449 +{
81450         /*
81451          * tmgr.atom_max_size=N
81452          * Atoms containing more than N blocks will be forced to commit. N is
81453 @@ -435,7 +379,69 @@ do {                                               \
81454          */
81455         PUSH_SB_FIELD_OPT(altsuper, "%lu");
81456  #endif
81457 +}
81458 +
81459 +/**
81460 + * reiser4_init_super_data - initialize reiser4 private super block
81461 + * @super: super block to initialize
81462 + * @opt_string: list of reiser4 mount options
81463 + *
81464 + * Sets various reiser4 parameters to default values. Parses mount options and
81465 + * overwrites default settings.
81466 + */
81467 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
81468 +{
81469 +       int result;
81470 +       struct opt_desc *opts, *p;
81471 +       reiser4_super_info_data *sbinfo = get_super_private(super);
81472 +
81473 +       /* initialize super, export, dentry operations */
81474 +       sbinfo->ops.super = reiser4_super_operations;
81475 +       sbinfo->ops.export = reiser4_export_operations;
81476 +       sbinfo->ops.dentry = reiser4_dentry_operations;
81477 +       super->s_op = &sbinfo->ops.super;
81478 +       super->s_export_op = &sbinfo->ops.export;
81479 +
81480 +       /* initialize transaction manager parameters to default values */
81481 +       sbinfo->tmgr.atom_max_size = totalram_pages / 4;
81482 +       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
81483 +       sbinfo->tmgr.atom_min_size = 256;
81484 +       sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
81485 +
81486 +       /* initialize cbk cache parameter */
81487 +       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
81488 +
81489 +       /* initialize flush parameters */
81490 +       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
81491 +       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
81492 +       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
81493 +       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
81494 +
81495 +       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
81496 +
81497 +       /* preliminary tree initializations */
81498 +       sbinfo->tree.super = super;
81499 +       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
81500 +       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
81501 +       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
81502 +       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
81503 +       rwlock_init(&(sbinfo->tree.tree_lock));
81504 +       spin_lock_init(&(sbinfo->tree.epoch_lock));
81505 +
81506 +       /* initialize default readahead params */
81507 +       sbinfo->ra_params.max = num_physpages / 4;
81508 +       sbinfo->ra_params.flags = 0;
81509 +
81510 +       /* allocate memory for structure describing reiser4 mount options */
81511 +       opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
81512 +                      reiser4_ctx_gfp_mask_get());
81513 +       if (opts == NULL)
81514 +               return RETERR(-ENOMEM);
81515 +
81516 +       /* initialize structure describing reiser4 mount options */
81517 +       p = opts;
81518
81519 +       push_sb_field_opts(p, opts, sbinfo);
81520         /* turn on BSD-style gid assignment */
81521         PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
81522         /* turn on 32 bit times */
81523 _
81524
81525 This is a multi-part message in MIME format.
81526 --------------010200050204000200080203
81527 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
81528 Content-Transfer-Encoding: 7bit
81529
81530 Laurent Riffard wrote:
81531 > Hi Edward,
81532 >
81533
81534 Hello Laurent.
81535
81536 > This patch is buggy, isn't it ?
81537 >
81538
81539 Yes, sorry, my fault..
81540 I have sent the fixup already to the list yesterday..
81541 Resending for you and Akpm.
81542 Andrew, please apply.
81543
81544 Thanks.
81545 Edward.
81546
81547 > I've got 2 reiser4 FS in my /etc/fstab:
81548 >
81549 > /dev/vglinux1/lvkernel-r4 /home/laurent/kernel reiser4 defaults,noatime,nodiratime,tmgr.atom_max_size=2048 0 0
81550 > /dev/disk/by-uuid/b8dbe880-b664-49aa-8050-bddc91fd5e49 /mnt/diske reiser4 noauto,users,noatime,nodiratime 0 0
81551 >
81552 > The first FS can't be mounted:
81553 >
81554 > [  235.078342] reiser4[mount(4205)]: parse_options (fs/reiser4/init_super.c:253)[nikita-2307]:
81555 > [  235.078345] WARNING: Unrecognized option: "tmgr.atom_max_size=2048"
81556 >
81557
81558 --------------010200050204000200080203
81559 Content-Type: text/plain;
81560  name="reiser4-reduce-frame-size-fix.patch"
81561 Content-Transfer-Encoding: 7bit
81562 Content-Disposition: inline;
81563  filename="reiser4-reduce-frame-size-fix.patch"
81564
81565 . Fix up the bug in reiser4_init_super_data():
81566   The pointer "p" to the opt_desc array was passed by
81567   value, so the caller's copy was never incremented.
81568   Pass "&p" instead of "p" to push_sb_field_opts(),
81569   which is supposed to increment the pointer.
81570 . Modify macros PUSH_OPT, OPT_ARRAY_CHECK to accept
81571   arguments.
81572
81573 Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
81574 ---
81575  1 file changed, 16 insertions(+), 12 deletions(-)
81576
81577
81578 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81579 ---
81580
81581  fs/reiser4/init_super.c |   28 ++++++++++++++++------------
81582  1 file changed, 16 insertions(+), 12 deletions(-)
81583
81584 diff -puN fs/reiser4/init_super.c~reiser4-reduce-frame-size-of-reiser4_init_super_data-fixup fs/reiser4/init_super.c
81585 --- a/fs/reiser4/init_super.c~reiser4-reduce-frame-size-of-reiser4_init_super_data-fixup
81586 +++ a/fs/reiser4/init_super.c
81587 @@ -293,27 +293,27 @@ static int parse_options(char *opt_strin
81588  #define MAX_NR_OPTIONS (30)
81589
81590  #if REISER4_DEBUG
81591 -#  define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) {         \
81592 +#  define OPT_ARRAY_CHECK(opt, array)                                  \
81593 +       if ((opt) > (array) + MAX_NR_OPTIONS) {                         \
81594                 warning("zam-1046", "opt array is overloaded"); break;  \
81595         }
81596  #else
81597 -#   define OPT_ARRAY_CHECK noop
81598 +#   define OPT_ARRAY_CHECK(opt, array) noop
81599  #endif
81600
81601 -#define PUSH_OPT(...)                          \
81602 +#define PUSH_OPT(opt, array, ...)              \
81603  do {                                           \
81604         struct opt_desc o = __VA_ARGS__;        \
81605 -       OPT_ARRAY_CHECK;                        \
81606 -       *p ++ = o;                              \
81607 +       OPT_ARRAY_CHECK(opt, array);            \
81608 +       *(opt) ++ = o;                          \
81609  } while (0)
81610
81611 -#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
81612 -#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
81613 -
81614 -static noinline void push_sb_field_opts(struct opt_desc *p,
81615 +static noinline void push_sb_field_opts(struct opt_desc **p,
81616                                         struct opt_desc *opts,
81617                                         reiser4_super_info_data *sbinfo)
81618  {
81619 +#define PUSH_SB_FIELD_OPT(field, format)               \
81620 +       PUSH_OPT(*p, opts, SB_FIELD_OPT(field, format))
81621         /*
81622          * tmgr.atom_max_size=N
81623          * Atoms containing more than N blocks will be forced to commit. N is
81624 @@ -441,8 +441,12 @@ int reiser4_init_super_data(struct super
81625         /* initialize structure describing reiser4 mount options */
81626         p = opts;
81627
81628 -       push_sb_field_opts(p, opts, sbinfo);
81629 +       push_sb_field_opts(&p, opts, sbinfo);
81630         /* turn on BSD-style gid assignment */
81631 +
81632 +#define PUSH_BIT_OPT(name, bit)                        \
81633 +       PUSH_OPT(p, opts, BIT_OPT(name, bit))
81634 +
81635         PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
81636         /* turn on 32 bit times */
81637         PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
81638 @@ -456,7 +460,7 @@ int reiser4_init_super_data(struct super
81639         /* disable use of write barriers in the reiser4 log writer. */
81640         PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
81641
81642 -       PUSH_OPT(
81643 +       PUSH_OPT(p, opts,
81644         {
81645                 /*
81646                  * tree traversal readahead parameters:
81647 @@ -482,7 +486,7 @@ int reiser4_init_super_data(struct super
81648         );
81649
81650         /* What to do in case of fs error */
81651 -       PUSH_OPT(
81652 +       PUSH_OPT(p, opts,
81653         {
81654                 .name = "onerror",
81655                 .type = OPT_ONEOF,
81656 _
81657 From: Johannes Buchner <buchner.johannes@gmx.at>
81658
81659 > from
81660 > http://www.kernel.org/pub/linux/kernel/people/edward/reiser4/reiser4-for-2.6/
81661 I'm sorry, I did not see half of this is already covered by
81662 vfs-take-2add-set_page_dirty_notag.patch. With that part removed,
81663 this patch is only cosmetics now.
81664
81665 Cc: Edward Shishkin <edward.shishkin@gmail.com>
81666 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81667 ---
81668
81669  fs/reiser4/carry.c     |    2 +-
81670  fs/reiser4/carry_ops.c |    2 +-
81671  fs/reiser4/znode.h     |    1 -
81672  mm/filemap.c           |    2 +-
81673  4 files changed, 3 insertions(+), 4 deletions(-)
81674
81675 diff -puN fs/reiser4/carry.c~reiser4-some-changes-from-reiser4-2631-patch fs/reiser4/carry.c
81676 --- a/fs/reiser4/carry.c~reiser4-some-changes-from-reiser4-2631-patch
81677 +++ a/fs/reiser4/carry.c
81678 @@ -1095,7 +1095,7 @@ static void fatal_carry_error(carry_leve
81679   *
81680   * This function itself only manages changes in carry structures and delegates
81681   * all hard work (allocation of znode for new root, changes of parent and
81682 - * sibling pointers to the reiser4_add_tree_root().
81683 + * sibling pointers) to the reiser4_add_tree_root().
81684   *
81685   * Locking: old tree root is locked by carry at this point. Fake znode is also
81686   * locked.
81687 diff -puN fs/reiser4/carry_ops.c~reiser4-some-changes-from-reiser4-2631-patch fs/reiser4/carry_ops.c
81688 --- a/fs/reiser4/carry_ops.c~reiser4-some-changes-from-reiser4-2631-patch
81689 +++ a/fs/reiser4/carry_ops.c
81690 @@ -2015,7 +2015,7 @@ static int carry_estimate_bitmaps(void)
81691                 int bytes;
81692
81693                 bytes = capped_height() * (0 +  /* bnode should be added, but
81694 -                                                * its is private to bitmap.c,
81695 +                                                * it is private to bitmap.c,
81696                                                  * skip for now. */
81697                                            2 * sizeof(jnode));
81698                                                 /* working and commit jnodes */
81699 diff -puN fs/reiser4/znode.h~reiser4-some-changes-from-reiser4-2631-patch fs/reiser4/znode.h
81700 --- a/fs/reiser4/znode.h~reiser4-some-changes-from-reiser4-2631-patch
81701 +++ a/fs/reiser4/znode.h
81702 @@ -18,7 +18,6 @@
81703
81704  #include <linux/types.h>
81705  #include <linux/spinlock.h>
81706 -#include <linux/semaphore.h>
81707  #include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
81708  #include <asm/atomic.h>
81709
81710 diff -puN mm/filemap.c~reiser4-some-changes-from-reiser4-2631-patch mm/filemap.c
81711 --- a/mm/filemap.c~reiser4-some-changes-from-reiser4-2631-patch
81712 +++ a/mm/filemap.c
81713 @@ -779,7 +779,6 @@ repeat:
81714         rcu_read_unlock();
81715         return ret;
81716  }
81717 -EXPORT_SYMBOL(find_get_pages);
81718
81719  /**
81720   * find_get_pages_contig - gang contiguous pagecache lookup
81721 @@ -949,6 +948,7 @@ static void shrink_readahead_size_eio(st
81722  {
81723         ra->ra_pages /= 4;
81724  }
81725 +EXPORT_SYMBOL(find_get_pages);
81726
81727  /**
81728   * do_generic_file_read - generic file read routine
81729 _
81730 From: Johannes Buchner <buchner.johannes@gmx.at>
81731
81732 Cc: Edward Shishkin <edward.shishkin@gmail.com>
81733 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81734 ---
81735
81736  fs/reiser4/context.c |    2 +-
81737  fs/reiser4/txnmgr.c  |    4 ++--
81738  2 files changed, 3 insertions(+), 3 deletions(-)
81739
81740 diff -puN fs/reiser4/context.c~reiser4-some-comments-were-still-mentioning-pdflush fs/reiser4/context.c
81741 --- a/fs/reiser4/context.c~reiser4-some-comments-were-still-mentioning-pdflush
81742 +++ a/fs/reiser4/context.c
81743 @@ -147,7 +147,7 @@ static void reiser4_throttle_write_at(re
81744          * call balance_dirty_pages_ratelimited() to process formatted nodes
81745          * dirtied during this system call. Do that only if we are not in mount
81746          * and there were nodes dirtied in this context and we are not in
81747 -        * writepage (to avoid deadlock) and not in pdflush
81748 +        * writepage (to avoid deadlock)
81749          */
81750         if (sbinfo != NULL && sbinfo->fake != NULL &&
81751             context->nr_marked_dirty != 0 &&
81752 diff -puN fs/reiser4/txnmgr.c~reiser4-some-comments-were-still-mentioning-pdflush fs/reiser4/txnmgr.c
81753 --- a/fs/reiser4/txnmgr.c~reiser4-some-comments-were-still-mentioning-pdflush
81754 +++ a/fs/reiser4/txnmgr.c
81755 @@ -1360,7 +1360,7 @@ static int txn_try_to_fuse_small_atom(tx
81756     code tries to flush current atom.
81757
81758     flush_some_atom() is called as part of memory clearing process. It is
81759 -   invoked from balance_dirty_pages(), pdflushd, and entd.
81760 +   invoked from balance_dirty_pages() and entd.
81761
81762     If we can flush no nodes, atom is committed, because this frees memory.
81763
81764 @@ -1469,7 +1469,7 @@ flush_some_atom(jnode * start, long *nr_
81765                          * or atom is too old/large,
81766                          * we force current atom to commit */
81767                         /* wait for commit completion but only if this
81768 -                        * wouldn't stall pdflushd and ent thread. */
81769 +                        * wouldn't stall ent thread. */
81770                         if (!wbc->nonblocking && !ctx->entd)
81771                                 txnh->flags |= TXNH_WAIT_COMMIT;
81772                         atom->flags |= ATOM_FORCE_COMMIT;
81773 _
81774 From: Johannes Buchner <buchner.johannes@gmx.at>
81775
81776 > generic_sync_sb_inodes was removed in d8a8559cd7a9cc,
81777 > writeback_inodes_sb and sync_inodes_sb should replace them.
81778 Second version, now also fixes the last comments relating to
81779 generic_sync_sb_inodes.
81780
81781 Cc: Edward Shishkin <edward.shishkin@gmail.com>
81782 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81783 ---
81784
81785  fs/reiser4/context.h   |    2 +-
81786  fs/reiser4/entd.c      |    4 +++-
81787  fs/reiser4/super_ops.c |    6 ++++--
81788  3 files changed, 8 insertions(+), 4 deletions(-)
81789
81790 diff -puN fs/reiser4/context.h~reiser4-generic_sync_sb_inodes-doesnt-exist-anymore fs/reiser4/context.h
81791 --- a/fs/reiser4/context.h~reiser4-generic_sync_sb_inodes-doesnt-exist-anymore
81792 +++ a/fs/reiser4/context.h
81793 @@ -66,7 +66,7 @@ struct reiser4_context {
81794         /* count non-trivial jnode_set_dirty() calls */
81795         unsigned long nr_marked_dirty;
81796
81797 -       /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
81798 +       /* reiser4_sync_inodes calls (via writeback/sync_inodes_sb)
81799          * reiser4_writepages for each of dirty inodes. Reiser4_writepages
81800          * captures pages. When number of pages captured in one
81801          * reiser4_sync_inodes reaches some threshold - some atoms get
81802 diff -puN fs/reiser4/entd.c~reiser4-generic_sync_sb_inodes-doesnt-exist-anymore fs/reiser4/entd.c
81803 --- a/fs/reiser4/entd.c~reiser4-generic_sync_sb_inodes-doesnt-exist-anymore
81804 +++ a/fs/reiser4/entd.c
81805 @@ -241,7 +241,9 @@ static void entd_flush(struct super_bloc
81806         if (rq->wbc->nr_to_write > 0) {
81807                 rq->wbc->range_start = 0;
81808                 rq->wbc->range_end = LLONG_MAX;
81809 -               generic_sync_sb_inodes(rq->wbc);
81810 +               writeback_inodes_sb(super);
81811 +               if (rq->wbc->sync_mode == WB_SYNC_ALL)
81812 +                       sync_inodes_sb(super);
81813         }
81814         rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
81815         reiser4_writeout(super, rq->wbc);
81816 diff -puN fs/reiser4/super_ops.c~reiser4-generic_sync_sb_inodes-doesnt-exist-anymore fs/reiser4/super_ops.c
81817 --- a/fs/reiser4/super_ops.c~reiser4-generic_sync_sb_inodes-doesnt-exist-anymore
81818 +++ a/fs/reiser4/super_ops.c
81819 @@ -384,7 +384,7 @@ static void reiser4_clear_inode(struct i
81820   * @wbc:
81821   *
81822   * This method is called by background and non-backgound writeback. Reiser4's
81823 - * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
81824 + * implementation uses writeback/sync_inodes_sb to call reiser4_writepages for
81825   * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
81826   * mapping - dirty pages get into atoms. Writeout is called to flush some
81827   * atoms.
81828 @@ -412,7 +412,9 @@ static void reiser4_sync_inodes(struct s
81829          * call reiser4_writepages for each of dirty inodes to turn dirty pages
81830          * into transactions if they were not yet.
81831          */
81832 -       generic_sync_sb_inodes(wbc);
81833 +       writeback_inodes_sb(super);
81834 +       if (wbc->sync_mode == WB_SYNC_ALL)
81835 +               sync_inodes_sb(super);
81836
81837         /* flush goes here */
81838         wbc->nr_to_write = to_write;
81839 _
81840 From: Johannes Buchner <buchner.johannes@gmx.at>
81841
81842 Passing wbc == NULL caused a NULL pointer dereference in
81843 reiser4_sync_inodes().  This issue was introduced by the patch
81844 reiser4-vfs-add-super_operationssync_inodes.
81845
81846 Cc: Edward Shishkin <edward.shishkin@gmail.com>
81847 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
81848 ---
81849
81850  fs/reiser4/super_ops.c |    2 +-
81851  1 file changed, 1 insertion(+), 1 deletion(-)
81852
81853 diff -puN fs/reiser4/super_ops.c~reiser4-fixed-null-pointer-dereference fs/reiser4/super_ops.c
81854 --- a/fs/reiser4/super_ops.c~reiser4-fixed-null-pointer-dereference
81855 +++ a/fs/reiser4/super_ops.c
81856 @@ -395,7 +395,7 @@ static void reiser4_sync_inodes(struct s
81857         reiser4_context *ctx;
81858         long to_write;
81859
81860 -       if (wbc->for_kupdate)
81861 +       if (wbc == NULL || wbc->for_kupdate)
81862                 /* reiser4 has its own means of periodical write-out */
81863                 return;
81864
81865 _